diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 766d582c0b51..1bfa44a3884d 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -1,146 +1,152 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #ifndef _SYS_METASLAB_H #define _SYS_METASLAB_H #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct metaslab_ops { const char *msop_name; - uint64_t (*msop_alloc)(metaslab_t *, uint64_t); + uint64_t (*msop_alloc)(metaslab_t *, uint64_t, uint64_t, uint64_t *); } metaslab_ops_t; extern const metaslab_ops_t zfs_metaslab_ops; int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, metaslab_t **); void metaslab_fini(metaslab_t *); void metaslab_set_unflushed_dirty(metaslab_t *, boolean_t); void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *); void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *); boolean_t metaslab_unflushed_dirty(metaslab_t *); uint64_t metaslab_unflushed_txg(metaslab_t *); uint64_t metaslab_estimated_condensed_size(metaslab_t *); int metaslab_sort_by_flushed(const void *, const void *); void metaslab_unflushed_bump(metaslab_t *, dmu_tx_t *, boolean_t); uint64_t metaslab_unflushed_changes_memused(metaslab_t *); int metaslab_load(metaslab_t *); void metaslab_unload(metaslab_t *); boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *); uint64_t metaslab_allocated_space(metaslab_t *); void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); uint64_t metaslab_largest_allocatable(metaslab_t *); /* * metaslab alloc flags */ #define METASLAB_ZIL 0x1 #define METASLAB_GANG_HEADER 0x2 #define METASLAB_GANG_CHILD 0x4 #define METASLAB_ASYNC_ALLOC 0x8 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, int, const void *); +int metaslab_alloc_range(spa_t *, metaslab_class_t *, uint64_t, uint64_t, + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, + int, const void *, uint64_t *); int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *); void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_stat_init(void); void metaslab_stat_fini(void); +void metaslab_trace_move(zio_alloc_list_t *, zio_alloc_list_t *); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); metaslab_class_t *metaslab_class_create(spa_t *, const metaslab_ops_t *, boolean_t); void metaslab_class_destroy(metaslab_class_t *); void metaslab_class_validate(metaslab_class_t *); void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, zio_t *, boolean_t, boolean_t *); boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *); void metaslab_class_evict_old(metaslab_class_t *, uint64_t); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); uint64_t metaslab_class_get_deferred(metaslab_class_t *); void metaslab_space_update(vdev_t *, metaslab_class_t *, int64_t, int64_t, int64_t); metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); void metaslab_group_passivate(metaslab_group_t *); boolean_t metaslab_group_initialized(metaslab_group_t *); uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); +void metaslab_group_alloc_increment_all(spa_t *, blkptr_t *, int, int, + uint64_t, const void *); void metaslab_group_alloc_decrement(spa_t *, uint64_t, int, int, uint64_t, const void *); void metaslab_recalculate_weight_and_sort(metaslab_t *); void metaslab_disable(metaslab_t *); void metaslab_enable(metaslab_t *, boolean_t, boolean_t); void metaslab_set_selected_txg(metaslab_t *, uint64_t); extern int metaslab_debug_load; zfs_range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, uint64_t *start, uint64_t *shift); #ifdef __cplusplus } #endif #endif /* _SYS_METASLAB_H */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 5aad22dba6b3..7f457c3a0b76 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -1,235 +1,237 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. */ #ifndef _SYS_VDEV_H #define _SYS_VDEV_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef enum vdev_dtl_type { DTL_MISSING, /* 0% replication: no copies of the data */ DTL_PARTIAL, /* less than 100% replication: some copies missing */ DTL_SCRUB, /* unable to fully repair during scrub/resilver */ DTL_OUTAGE, /* temporarily missing (used to attempt detach) */ DTL_TYPES } vdev_dtl_type_t; extern int zfs_nocacheflush; typedef boolean_t vdev_open_children_func_t(vdev_t *vd); extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) __attribute__((format(printf, 2, 3))); extern void vdev_dbgmsg_print_tree(vdev_t *, int); extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); extern void vdev_open_children_subset(vdev_t *, vdev_open_children_func_t *); extern int vdev_validate(vdev_t *); extern int vdev_copy_path_strict(vdev_t *, vdev_t *); extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern void vdev_reopen(vdev_t *); extern int vdev_validate_aux(vdev_t *vd); extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); extern boolean_t vdev_is_concrete(vdev_t *vd); extern boolean_t vdev_is_bootable(vdev_t *vd); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); extern int vdev_count_leaves(spa_t *spa); extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); extern boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth); extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done); extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx); extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx); extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx); extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx); extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size); extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev, uint64_t offset, uint64_t size, dmu_tx_t *tx); extern boolean_t vdev_replace_in_progress(vdev_t *vdev); extern void vdev_hold(vdev_t *); extern void vdev_rele(vdev_t *); void vdev_update_nonallocating_space(vdev_t *vd, boolean_t add); extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd, const char *tag); typedef void vdev_xlate_func_t(void *arg, zfs_range_seg64_t *physical_rs); extern boolean_t vdev_xlate_is_empty(zfs_range_seg64_t *rs); extern void vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs); extern void vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg); extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); extern metaslab_group_t *vdev_get_mg(vdev_t *vd, metaslab_class_t *mc); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); extern void vdev_scan_stat_init(vdev_t *vd); extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); extern boolean_t vdev_children_are_offline(vdev_t *vd); extern void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); +extern uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, + uint64_t txg); extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); /* * Return the amount of space allocated for a gang block header. Note that * since the physical birth txg is not provided, this must be constant for * a given vdev. (e.g. raidz expansion can't change this) */ static inline uint64_t vdev_gang_header_asize(vdev_t *vd) { return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0)); } extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *); extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); extern int vdev_remove_wanted(spa_t *spa, uint64_t guid); extern void vdev_clear(spa_t *spa, vdev_t *vd); extern boolean_t vdev_is_dead(vdev_t *vd); extern boolean_t vdev_readable(vdev_t *vd); extern boolean_t vdev_writeable(vdev_t *vd); extern boolean_t vdev_allocatable(vdev_t *vd); extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd); extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); extern uint32_t vdev_queue_length(vdev_t *vd); extern uint64_t vdev_queue_last_offset(vdev_t *vd); extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); extern boolean_t vdev_queue_pool_busy(spa_t *spa); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); extern void vdev_defer_resilver(vdev_t *vd); extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx); typedef enum vdev_config_flag { VDEV_CONFIG_SPARE = 1 << 0, VDEV_CONFIG_L2CACHE = 1 << 1, VDEV_CONFIG_MOS = 1 << 2, VDEV_CONFIG_MISSING = 1 << 3 } vdev_config_flag_t; extern void vdev_post_kobj_evt(vdev_t *vd); extern void vdev_clear_kobj_evt(vdev_t *vd); extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags); /* * Label routines */ struct uberblock; extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); extern int vdev_label_number(uint64_t psise, uint64_t offset); extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg); extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv); extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, int flags); extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *); extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int); extern int vdev_check_boot_reserve(spa_t *, vdev_t *); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ VDEV_LABEL_REPLACE, /* replace an existing device */ VDEV_LABEL_SPARE, /* add a new hot spare */ VDEV_LABEL_REMOVE, /* remove an existing device */ VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */ VDEV_LABEL_SPLIT /* generating new label for split-off dev */ } vdev_labeltype_t; extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); extern int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl); extern int vdev_prop_get(vdev_t *vd, nvlist_t *nvprops, nvlist_t *outnvl); #ifdef __cplusplus } #endif #endif /* _SYS_VDEV_H */ diff --git a/include/sys/vdev_draid.h b/include/sys/vdev_draid.h index d44ab6681db9..e923092a39ad 100644 --- a/include/sys/vdev_draid.h +++ b/include/sys/vdev_draid.h @@ -1,112 +1,112 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2016, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. */ #ifndef _SYS_VDEV_DRAID_H #define _SYS_VDEV_DRAID_H #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Constants required to generate and use dRAID permutations. */ #define VDEV_DRAID_SEED 0xd7a1d5eed #define VDEV_DRAID_MAX_MAPS 254 #define VDEV_DRAID_ROWSHIFT SPA_MAXBLOCKSHIFT #define VDEV_DRAID_ROWHEIGHT (1ULL << VDEV_DRAID_ROWSHIFT) #define VDEV_DRAID_REFLOW_RESERVE (2 * VDEV_DRAID_ROWHEIGHT) /* * dRAID permutation map. */ typedef struct draid_map { uint64_t dm_children; /* # of permutation columns */ uint64_t dm_nperms; /* # of permutation rows */ uint64_t dm_seed; /* dRAID map seed */ uint64_t dm_checksum; /* Checksum of generated map */ uint8_t *dm_perms; /* base permutation array */ } draid_map_t; /* * dRAID configuration. */ typedef struct vdev_draid_config { /* * Values read from the dRAID nvlist configuration. */ uint64_t vdc_ndata; /* # of data devices in group */ uint64_t vdc_nparity; /* # of parity devices in group */ uint64_t vdc_nspares; /* # of distributed spares */ uint64_t vdc_children; /* # of children */ uint64_t vdc_ngroups; /* # groups per slice */ /* * Immutable derived constants. */ uint8_t *vdc_perms; /* permutation array */ uint64_t vdc_nperms; /* # of permutations */ uint64_t vdc_groupwidth; /* = data + parity */ uint64_t vdc_ndisks; /* = children - spares */ uint64_t vdc_groupsz; /* = groupwidth * DRAID_ROWSIZE */ uint64_t vdc_devslicesz; /* = (groupsz * groups) / ndisks */ } vdev_draid_config_t; /* * Functions for handling dRAID permutation maps. */ extern uint64_t vdev_draid_rand(uint64_t *); extern int vdev_draid_lookup_map(uint64_t, const draid_map_t **); extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **); /* * General dRAID support functions. */ extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t); -extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t); +extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t, uint64_t); extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *); extern int vdev_draid_map_verify_empty(zio_t *, struct raidz_row *); extern nvlist_t *vdev_draid_read_config_spare(vdev_t *); /* Functions for dRAID distributed spares. */ extern vdev_t *vdev_draid_spare_get_child(vdev_t *, uint64_t); extern vdev_t *vdev_draid_spare_get_parent(vdev_t *); extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t); #ifdef __cplusplus } #endif #endif /* _SYS_VDEV_DRAID_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index a2a3e25d14cc..385d7224f2c5 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -1,668 +1,670 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2023, Klara Inc. */ #ifndef _SYS_VDEV_IMPL_H #define _SYS_VDEV_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Virtual device descriptors. * * All storage pool operations go through the virtual device framework, * which provides data replication and I/O scheduling. */ /* * Forward declarations that lots of things need. */ typedef struct vdev_queue vdev_queue_t; struct abd; /* * Virtual device operations */ typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd); typedef void vdev_kobj_post_evt_func_t(vdev_t *vd); typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg); typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg); typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, vdev_remap_cb_t callback, void *arg); /* * Given a target vdev, translates the logical range "in" to the physical * range "res" */ typedef void vdev_xlation_func_t(vdev_t *cvd, const zfs_range_seg64_t *logical, zfs_range_seg64_t *physical, zfs_range_seg64_t *remain); typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start, uint64_t size, uint64_t max_segment); typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp, uint64_t *sizep); typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv); typedef uint64_t vdev_nparity_func_t(vdev_t *vd); typedef uint64_t vdev_ndisks_func_t(vdev_t *vd); typedef const struct vdev_ops { vdev_init_func_t *vdev_op_init; vdev_fini_func_t *vdev_op_fini; vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; - vdev_asize_func_t *vdev_op_asize; + vdev_asize_func_t *vdev_op_psize_to_asize; + vdev_asize_func_t *vdev_op_asize_to_psize; vdev_min_asize_func_t *vdev_op_min_asize; vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; vdev_need_resilver_func_t *vdev_op_need_resilver; vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; vdev_xlation_func_t *vdev_op_xlate; vdev_rebuild_asize_func_t *vdev_op_rebuild_asize; vdev_metaslab_init_func_t *vdev_op_metaslab_init; vdev_config_generate_func_t *vdev_op_config_generate; vdev_nparity_func_t *vdev_op_nparity; vdev_ndisks_func_t *vdev_op_ndisks; vdev_kobj_post_evt_func_t *vdev_op_kobj_evt_post; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; /* * Virtual device properties */ typedef union vdev_queue_class { struct { ulong_t vqc_list_numnodes; list_t vqc_list; }; avl_tree_t vqc_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; zio_priority_t vq_last_prio; /* Last sent I/O priority. */ uint32_t vq_cqueued; /* Classes with queued I/Os. */ uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE]; uint32_t vq_active; /* Number of active I/Os. */ uint32_t vq_ia_active; /* Active interactive I/Os. */ uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ list_t vq_active_list; /* List of active I/Os. */ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ kmutex_t vq_lock; }; typedef enum vdev_alloc_bias { VDEV_BIAS_NONE, VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ } vdev_alloc_bias_t; /* * On-disk indirect vdev state. * * An indirect vdev is described exclusively in the MOS config of a pool. * The config for an indirect vdev includes several fields, which are * accessed in memory by a vdev_indirect_config_t. */ typedef struct vdev_indirect_config { /* * Object (in MOS) which contains the indirect mapping. This object * contains an array of vdev_indirect_mapping_entry_phys_t ordered by * vimep_src. The bonus buffer for this object is a * vdev_indirect_mapping_phys_t. This object is allocated when a vdev * removal is initiated. * * Note that this object can be empty if none of the data on the vdev * has been copied yet. */ uint64_t vic_mapping_object; /* * Object (in MOS) which contains the birth times for the mapping * entries. This object contains an array of * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus * buffer for this object is a vdev_indirect_birth_phys_t. This object * is allocated when a vdev removal is initiated. * * Note that this object can be empty if none of the vdev has yet been * copied. */ uint64_t vic_births_object; /* * This is the vdev ID which was removed previous to this vdev, or * UINT64_MAX if there are no previously removed vdevs. */ uint64_t vic_prev_indirect_vdev; } vdev_indirect_config_t; /* * Virtual device descriptor */ struct vdev { /* * Common to all vdev types. */ uint64_t vdev_id; /* child number in vdev parent */ uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ uint64_t vdev_orig_guid; /* orig. guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_max_asize; /* max acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ /* * Logical block alignment shift * * The smallest sized/aligned I/O supported by the device. */ uint64_t vdev_logical_ashift; /* * Physical block alignment shift * * The device supports logical I/Os with vdev_logical_ashift * size/alignment, but optimum performance will be achieved by * aligning/sizing requests to vdev_physical_ashift. Smaller * requests may be inflated or incur device level read-modify-write * operations. * * May be 0 to indicate no preference (i.e. use vdev_logical_ashift). */ uint64_t vdev_physical_ashift; uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ vdev_ops_t *vdev_ops; /* vdev operations */ spa_t *vdev_spa; /* spa for this vdev */ void *vdev_tsd; /* type-specific data */ vdev_t *vdev_top; /* top-level vdev */ vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ vdev_stat_t vdev_stat; /* virtual device statistics */ vdev_stat_ex_t vdev_stat_ex; /* extended statistics */ boolean_t vdev_expanding; /* expand the vdev? */ boolean_t vdev_reopening; /* reopen in progress? */ boolean_t vdev_nonrot; /* true if solid state */ int vdev_load_error; /* error on last load */ int vdev_open_error; /* error on last open */ int vdev_validate_error; /* error on last validate */ kthread_t *vdev_open_thread; /* thread opening children */ kthread_t *vdev_validate_thread; /* thread validating children */ uint64_t vdev_crtxg; /* txg when top-level was added */ uint64_t vdev_root_zap; /* * Top-level vdev state. */ uint64_t vdev_ms_array; /* metaslab array object */ uint64_t vdev_ms_shift; /* metaslab size shift */ uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_fault_wanted; /* async faulted wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ uint64_t vdev_failfast; /* device failfast setting */ boolean_t vdev_rz_expanding; /* raidz is being expanded? */ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ /* Initialize related */ boolean_t vdev_initialize_exit_wanted; vdev_initializing_state_t vdev_initialize_state; list_node_t vdev_initialize_node; kthread_t *vdev_initialize_thread; /* Protects vdev_initialize_thread and vdev_initialize_state. */ kmutex_t vdev_initialize_lock; kcondvar_t vdev_initialize_cv; uint64_t vdev_initialize_offset[TXG_SIZE]; uint64_t vdev_initialize_last_offset; /* valid while initializing */ zfs_range_tree_t *vdev_initialize_tree; uint64_t vdev_initialize_bytes_est; uint64_t vdev_initialize_bytes_done; uint64_t vdev_initialize_action_time; /* start and end time */ /* TRIM related */ boolean_t vdev_trim_exit_wanted; boolean_t vdev_autotrim_exit_wanted; vdev_trim_state_t vdev_trim_state; list_node_t vdev_trim_node; kmutex_t vdev_autotrim_lock; kcondvar_t vdev_autotrim_cv; kcondvar_t vdev_autotrim_kick_cv; kthread_t *vdev_autotrim_thread; /* Protects vdev_trim_thread and vdev_trim_state. */ kmutex_t vdev_trim_lock; kcondvar_t vdev_trim_cv; kthread_t *vdev_trim_thread; uint64_t vdev_trim_offset[TXG_SIZE]; uint64_t vdev_trim_last_offset; uint64_t vdev_trim_bytes_est; uint64_t vdev_trim_bytes_done; uint64_t vdev_trim_rate; /* requested rate (bytes/sec) */ uint64_t vdev_trim_partial; /* requested partial TRIM */ uint64_t vdev_trim_secure; /* requested secure TRIM */ uint64_t vdev_trim_action_time; /* start and end time */ /* Rebuild related */ boolean_t vdev_rebuilding; boolean_t vdev_rebuild_exit_wanted; boolean_t vdev_rebuild_cancel_wanted; boolean_t vdev_rebuild_reset_wanted; kmutex_t vdev_rebuild_lock; kcondvar_t vdev_rebuild_cv; kthread_t *vdev_rebuild_thread; vdev_rebuild_t vdev_rebuild_config; /* For limiting outstanding I/Os (initialize, TRIM) */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; uint64_t vdev_trim_inflight[3]; /* * Values stored in the config for an indirect or removing vdev. */ vdev_indirect_config_t vdev_indirect_config; /* * The vdev_indirect_rwlock protects the vdev_indirect_mapping * pointer from changing on indirect vdevs (when it is condensed). * Note that removing (not yet indirect) vdevs have different * access patterns (the mapping is not accessed from open context, * e.g. from zio_read) and locking strategy (e.g. svr_lock). */ krwlock_t vdev_indirect_rwlock; vdev_indirect_mapping_t *vdev_indirect_mapping; vdev_indirect_births_t *vdev_indirect_births; /* * In memory data structures used to manage the obsolete sm, for * indirect or removing vdevs. * * The vdev_obsolete_segments is the in-core record of the segments * that are no longer referenced anywhere in the pool (due to * being freed or remapped and not referenced by any snapshots). * During a sync, segments are added to vdev_obsolete_segments * via vdev_indirect_mark_obsolete(); at the end of each sync * pass, this is appended to vdev_obsolete_sm via * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock * protects against concurrent modifications of vdev_obsolete_segments * from multiple zio threads. */ kmutex_t vdev_obsolete_lock; zfs_range_tree_t *vdev_obsolete_segments; space_map_t *vdev_obsolete_sm; /* * Protects the vdev_scan_io_queue field itself as well as the * structure's contents (when present). */ kmutex_t vdev_scan_io_queue_lock; struct dsl_scan_io_queue *vdev_scan_io_queue; /* * Leaf vdev state. */ zfs_range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */ space_map_t *vdev_dtl_sm; /* dirty time log space map */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_dtl_object; /* DTL object */ uint64_t vdev_psize; /* physical device capacity */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ uint64_t vdev_offline; /* persistent offline state */ uint64_t vdev_faulted; /* persistent faulted state */ uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ char *vdev_enc_sysfs_path; /* enclosure sysfs path */ char *vdev_fru; /* physical FRU location */ uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_unspare; /* unspare when resilvering done */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ boolean_t vdev_has_trim; /* TRIM is supported */ boolean_t vdev_has_securetrim; /* secure TRIM is supported */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ boolean_t vdev_splitting; /* split or repair in progress */ boolean_t vdev_delayed_close; /* delayed device close? */ boolean_t vdev_tmpoffline; /* device taken offline temporarily? */ boolean_t vdev_detached; /* device detached? */ boolean_t vdev_cant_read; /* vdev is failing all reads */ boolean_t vdev_cant_write; /* vdev is failing all writes */ boolean_t vdev_isspare; /* was a hot spare */ boolean_t vdev_isl2cache; /* was a l2cache device */ boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */ boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ boolean_t vdev_attaching; /* vdev attach ashift handling */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ uint64_t vdev_leaf_zap; hrtime_t vdev_mmp_pending; /* 0 if write finished */ uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ uint64_t vdev_expansion_time; /* vdev's last expansion time */ list_node_t vdev_leaf_node; /* leaf vdev list */ /* * For DTrace to work in userland (libzpool) context, these fields must * remain at the end of the structure. DTrace will use the kernel's * CTF definition for 'struct vdev', and since the size of a kmutex_t is * larger in userland, the offsets for the rest of the fields would be * incorrect. */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ kmutex_t vdev_stat_lock; /* vdev_stat */ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ /* * We rate limit ZIO delay, deadman, and checksum events, since they * can flood ZED with tons of events when a drive is acting up. * * We also rate limit Direct I/O write verify errors, since a user might * be continually manipulating a buffer that can flood ZED with tons of * events. */ zfs_ratelimit_t vdev_delay_rl; zfs_ratelimit_t vdev_deadman_rl; zfs_ratelimit_t vdev_dio_verify_rl; zfs_ratelimit_t vdev_checksum_rl; /* * Vdev properties for tuning ZED or zfsd */ uint64_t vdev_checksum_n; uint64_t vdev_checksum_t; uint64_t vdev_io_n; uint64_t vdev_io_t; uint64_t vdev_slow_io_n; uint64_t vdev_slow_io_t; }; #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_be) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) /* * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock * ring when MMP is enabled. */ #define MMP_BLOCKS_PER_LABEL 1 /* The largest uberblock we support is 8k. */ #define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ MAX_UBERBLOCK_SHIFT) #define VDEV_UBERBLOCK_COUNT(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_OFFSET(vd, n) \ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) typedef struct vdev_phys { char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; zio_eck_t vp_zbt; } vdev_phys_t; typedef enum vbe_vers { /* * The bootenv file is stored as ascii text in the envblock. * It is used by the GRUB bootloader used on Linux to store the * contents of the grubenv file. The file is stored as raw ASCII, * and is protected by an embedded checksum. By default, GRUB will * check if the boot filesystem supports storing the environment data * in a special location, and if so, will invoke filesystem specific * logic to retrieve it. This can be overridden by a variable, should * the user so desire. */ VB_RAW = 0, /* * The bootenv file is converted to an nvlist and then packed into the * envblock. */ VB_NVLIST = 1 } vbe_vers_t; typedef struct vdev_boot_envblock { uint64_t vbe_version; char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) - sizeof (zio_eck_t)]; zio_eck_t vbe_zbt; } vdev_boot_envblock_t; _Static_assert(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE, "vdev_boot_envblock_t wrong size"); typedef struct vdev_label { char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ vdev_boot_envblock_t vl_be; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ /* * vdev_dirty() flags */ #define VDD_METASLAB 0x01 #define VDD_DTL 0x02 /* Offset of embedded boot loader region on each label */ #define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) /* * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. * On RAIDZ, this space is overwritten during RAIDZ expansion. */ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ /* * Size of label regions at the start and end of each leaf device. */ #define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 #define VDEV_BEST_LABEL VDEV_LABELS #define VDEV_OFFSET_IS_LABEL(vd, off) \ (((off) < VDEV_LABEL_START_SIZE) || \ ((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE))) #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_L2CACHE 3 #define VDEV_ALLOC_ROOTPOOL 4 #define VDEV_ALLOC_SPLIT 5 #define VDEV_ALLOC_ATTACH 6 /* * Allocate or free a vdev */ extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops); extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, vdev_t *parent, uint_t id, int alloctype); extern void vdev_free(vdev_t *vd); /* * Add or remove children and parents */ extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); extern void vdev_compact_children(vdev_t *pvd); extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ extern boolean_t vdev_log_state_valid(vdev_t *vd); extern int vdev_load(vdev_t *vd); extern int vdev_dtl_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg); /* * Available vdev types. */ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; extern vdev_ops_t vdev_draid_ops; extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs); +extern uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_alloc(vdev_t *vd); extern uint64_t vdev_get_nparity(vdev_t *vd); extern uint64_t vdev_get_ndisks(vdev_t *vd); /* * Global variables */ extern int zfs_vdev_standard_sm_blksz; /* * Functions from vdev_indirect.c */ extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); extern boolean_t vdev_indirect_should_condense(vdev_t *vd); extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); extern int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj); extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); /* * Other miscellaneous functions */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); #if defined(__linux__) int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp); #endif int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS); /* * Vdev ashift optimization tunables */ extern uint_t zfs_vdev_min_auto_ashift; extern uint_t zfs_vdev_max_auto_ashift; int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); /* * VDEV checksum verification for Direct I/O writes */ extern uint_t zfs_vdev_direct_write_verify; #ifdef __cplusplus } #endif #endif /* _SYS_VDEV_IMPL_H */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 78adca4d7d00..ea3809ce097b 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -1,737 +1,738 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, 2023, 2024, Klara Inc. * Copyright (c) 2019-2020, Michael Niewöhner * Copyright (c) 2024 by George Melikov. All rights reserved. */ #ifndef _ZIO_H #define _ZIO_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Embedded checksum */ #define ZEC_MAGIC 0x210da7ab10c7a11ULL typedef struct zio_eck { uint64_t zec_magic; /* for validation, endianness */ zio_cksum_t zec_cksum; /* 256-bit checksum */ } zio_eck_t; /* * Gang block headers are self-checksumming and contain an array * of block pointers. */ #define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE #define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t)) / sizeof (blkptr_t)) #define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t) - \ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; zio_eck_t zg_tail; } zio_gbh_phys_t; enum zio_checksum { ZIO_CHECKSUM_INHERIT = 0, ZIO_CHECKSUM_ON, ZIO_CHECKSUM_OFF, ZIO_CHECKSUM_LABEL, ZIO_CHECKSUM_GANG_HEADER, ZIO_CHECKSUM_ZILOG, ZIO_CHECKSUM_FLETCHER_2, ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_NOPARITY, ZIO_CHECKSUM_SHA512, ZIO_CHECKSUM_SKEIN, ZIO_CHECKSUM_EDONR, ZIO_CHECKSUM_BLAKE3, ZIO_CHECKSUM_FUNCTIONS }; /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2 #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON #define ZIO_CHECKSUM_MASK 0xffULL #define ZIO_CHECKSUM_VERIFY (1U << 8) #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 /* macros defining encryption lengths */ #define ZIO_OBJSET_MAC_LEN 32 #define ZIO_DATA_IV_LEN 12 #define ZIO_DATA_SALT_LEN 8 #define ZIO_DATA_MAC_LEN 16 /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4 /* * The meaning of "compress = on" selected by the compression features enabled * on a given pool. */ #define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4 #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_ON #define BOOTFS_COMPRESS_VALID(compress) \ ((compress) == ZIO_COMPRESS_LZJB || \ (compress) == ZIO_COMPRESS_LZ4 || \ (compress) == ZIO_COMPRESS_GZIP_1 || \ (compress) == ZIO_COMPRESS_GZIP_2 || \ (compress) == ZIO_COMPRESS_GZIP_3 || \ (compress) == ZIO_COMPRESS_GZIP_4 || \ (compress) == ZIO_COMPRESS_GZIP_5 || \ (compress) == ZIO_COMPRESS_GZIP_6 || \ (compress) == ZIO_COMPRESS_GZIP_7 || \ (compress) == ZIO_COMPRESS_GZIP_8 || \ (compress) == ZIO_COMPRESS_GZIP_9 || \ (compress) == ZIO_COMPRESS_ZLE || \ (compress) == ZIO_COMPRESS_ZSTD || \ (compress) == ZIO_COMPRESS_ON || \ (compress) == ZIO_COMPRESS_OFF) #define ZIO_COMPRESS_ALGO(x) (x & SPA_COMPRESSMASK) #define ZIO_COMPRESS_LEVEL(x) ((x & ~SPA_COMPRESSMASK) >> SPA_COMPRESSBITS) #define ZIO_COMPRESS_RAW(type, level) (type | ((level) << SPA_COMPRESSBITS)) #define ZIO_COMPLEVEL_ZSTD(level) \ ZIO_COMPRESS_RAW(ZIO_COMPRESS_ZSTD, level) #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 typedef enum zio_suspend_reason { ZIO_SUSPEND_NONE = 0, ZIO_SUSPEND_IOERR, ZIO_SUSPEND_MMP, } zio_suspend_reason_t; /* * This was originally an enum type. However, those are 32-bit and there is no * way to make a 64-bit enum type. Since we ran out of bits for flags, we were * forced to upgrade it to a uint64_t. * * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER * FLAG. */ typedef uint64_t zio_flag_t; /* * Flags inherited by gang, ddt, and vdev children, * and that must be equal for two zios to aggregate */ #define ZIO_FLAG_DONT_AGGREGATE (1ULL << 0) #define ZIO_FLAG_IO_REPAIR (1ULL << 1) #define ZIO_FLAG_SELF_HEAL (1ULL << 2) #define ZIO_FLAG_RESILVER (1ULL << 3) #define ZIO_FLAG_SCRUB (1ULL << 4) #define ZIO_FLAG_SCAN_THREAD (1ULL << 5) #define ZIO_FLAG_PHYSICAL (1ULL << 6) #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ #define ZIO_FLAG_CANFAIL (1ULL << 7) /* must be first for INHERIT */ #define ZIO_FLAG_SPECULATIVE (1ULL << 8) #define ZIO_FLAG_CONFIG_WRITER (1ULL << 9) #define ZIO_FLAG_DONT_RETRY (1ULL << 10) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) /* * Flags inherited by vdev children. */ #define ZIO_FLAG_IO_RETRY (1ULL << 15) /* must be first for INHERIT */ #define ZIO_FLAG_PROBE (1ULL << 16) #define ZIO_FLAG_TRYHARD (1ULL << 17) #define ZIO_FLAG_OPTIONAL (1ULL << 18) #define ZIO_FLAG_DIO_READ (1ULL << 19) #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ #define ZIO_FLAG_DONT_QUEUE (1ULL << 20) /* must be first for INHERIT */ #define ZIO_FLAG_DONT_PROPAGATE (1ULL << 21) #define ZIO_FLAG_IO_BYPASS (1ULL << 22) #define ZIO_FLAG_IO_REWRITE (1ULL << 23) #define ZIO_FLAG_RAW_COMPRESS (1ULL << 24) #define ZIO_FLAG_RAW_ENCRYPT (1ULL << 25) #define ZIO_FLAG_GANG_CHILD (1ULL << 26) #define ZIO_FLAG_DDT_CHILD (1ULL << 27) #define ZIO_FLAG_GODFATHER (1ULL << 28) #define ZIO_FLAG_NOPWRITE (1ULL << 29) #define ZIO_FLAG_REEXECUTED (1ULL << 30) #define ZIO_FLAG_DELEGATED (1ULL << 31) #define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 32) +#define ZIO_FLAG_PREALLOCATED (1ULL << 33) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) #define ZIO_DDT_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_GANG_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_VDEV_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL) #define ZIO_CHILD_BIT(x) (1U << (x)) #define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1U << (x))) enum zio_child { ZIO_CHILD_VDEV = 0, ZIO_CHILD_GANG, ZIO_CHILD_DDT, ZIO_CHILD_LOGICAL, ZIO_CHILD_TYPES }; #define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV) #define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG) #define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT) #define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL) #define ZIO_CHILD_ALL_BITS \ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT) enum zio_wait_type { ZIO_WAIT_READY = 0, ZIO_WAIT_DONE, ZIO_WAIT_TYPES }; typedef void zio_done_func_t(zio_t *zio); extern int zio_exclude_metadata; extern int zio_dva_throttle_enabled; extern const char *const zio_type_name[ZIO_TYPES]; /* * A bookmark is a four-tuple that uniquely * identifies any block in the pool. By convention, the meta-objset (MOS) * is objset 0, and the meta-dnode is object 0. This covers all blocks * except root blocks and ZIL blocks, which are defined as follows: * * Root blocks (objset_phys_t) are object 0, level -1: . * ZIL blocks are bookmarked . * dmu_sync()ed ZIL data blocks are bookmarked . * dnode visit bookmarks are . * * Note: this structure is called a bookmark because its original purpose * was to remember where to resume a pool-wide traverse. * * Note: this structure is passed between userland and the kernel, and is * stored on disk (by virtue of being incorporated into other on-disk * structures, e.g. dsl_scan_phys_t). * * If the head_errlog feature is enabled a different on-disk format for error * logs is used. This introduces the use of an error bookmark, a four-tuple * that uniquely identifies any error block * in the pool. The birth transaction group is used to track whether the block * has been overwritten by newer data or added to a snapshot since its marking * as an error. */ struct zbookmark_phys { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; }; struct zbookmark_err_phys { uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; uint64_t zb_birth; }; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ (zb)->zb_objset = objset; \ (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ } #define ZB_DESTROYED_OBJSET (-1ULL) #define ZB_ROOT_OBJECT (0ULL) #define ZB_ROOT_LEVEL (-1LL) #define ZB_ROOT_BLKID (0ULL) #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) #define ZB_DNODE_LEVEL (-3LL) #define ZB_DNODE_BLKID (0ULL) #define ZB_IS_ZERO(zb) \ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ (zb)->zb_level == 0 && (zb)->zb_blkid == 0) #define ZB_IS_ROOT(zb) \ ((zb)->zb_object == ZB_ROOT_OBJECT && \ (zb)->zb_level == ZB_ROOT_LEVEL && \ (zb)->zb_blkid == ZB_ROOT_BLKID) typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; uint8_t zp_complevel; uint8_t zp_level; uint8_t zp_copies; uint8_t zp_gang_copies; dmu_object_type_t zp_type; boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; boolean_t zp_brtwrite; boolean_t zp_encrypt; boolean_t zp_byteorder; boolean_t zp_direct_write; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; uint32_t zp_zpl_smallblk; dmu_object_type_t zp_storage_type; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, const abd_t *good_data); typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ struct dnode_phys; struct abd; struct zio_cksum_report { struct zio_cksum_report *zcr_next; nvlist_t *zcr_ereport; nvlist_t *zcr_detector; void *zcr_cbdata; size_t zcr_cbinfo; /* passed to zcr_free() */ uint64_t zcr_sector; uint64_t zcr_align; uint64_t zcr_length; zio_cksum_finish_f *zcr_finish; zio_cksum_free_f *zcr_free; /* internal use only */ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ }; typedef struct zio_vsd_ops { zio_done_func_t *vsd_free; } zio_vsd_ops_t; typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, zio_gang_node_t *gn, struct abd *data, uint64_t offset); typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); typedef struct zio_transform { struct abd *zt_orig_abd; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; struct zio_transform *zt_next; } zio_transform_t; typedef zio_t *zio_pipe_stage_t(zio_t *zio); /* * The io_reexecute flags are distinct from io_flags because the child must * be able to propagate them to the parent. The normal io_flags are local * to the zio, not protected by any lock, and not modifiable by children; * the reexecute flags are protected by io_lock, modifiable by children, * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. */ #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 /* * The io_trim flags are used to specify the type of TRIM to perform. They * only apply to ZIO_TYPE_TRIM zios are distinct from io_flags. */ enum trim_flag { ZIO_TRIM_SECURE = 1U << 0, }; typedef struct zio_alloc_list { list_t zal_list; uint64_t zal_size; } zio_alloc_list_t; typedef struct zio_link { zio_t *zl_parent; zio_t *zl_child; list_node_t zl_parent_node; list_node_t zl_child_node; } zio_link_t; enum zio_qstate { ZIO_QS_NONE = 0, ZIO_QS_QUEUED, ZIO_QS_ACTIVE, }; struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; zio_prop_t io_prop; zio_type_t io_type; enum zio_child io_child_type; enum trim_flag io_trim_flags; zio_priority_t io_priority; uint8_t io_reexecute; uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; blkptr_t *io_bp_override; blkptr_t io_bp_copy; list_t io_parent_list; list_t io_child_list; zio_t *io_logical; zio_transform_t *io_transform_stack; /* Callback info */ zio_done_func_t *io_ready; zio_done_func_t *io_children_ready; zio_done_func_t *io_done; void *io_private; int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; /* io_lsize != io_orig_size iff this is a raw write */ uint64_t io_lsize; /* Data represented by this I/O */ struct abd *io_abd; struct abd *io_orig_abd; uint64_t io_size; uint64_t io_orig_size; /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ enum zio_qstate io_queue_state; /* vdev queue state */ union { list_node_t l; avl_node_t a; } io_queue_node ____cacheline_aligned; /* allocator and vdev queues */ avl_node_t io_offset_node; /* vdev offset queues */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; hrtime_t io_target_timestamp; hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). */ zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ zio_flag_t io_flags; enum zio_stage io_stage; enum zio_stage io_pipeline; zio_flag_t io_orig_flags; enum zio_stage io_orig_stage; enum zio_stage io_orig_pipeline; enum zio_stage io_pipeline_trace; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; void *io_bio; kmutex_t io_lock; kcondvar_t io_cv; int io_allocator; /* FMA state */ zio_cksum_report_t *io_cksum_report; uint64_t io_ena; /* Taskq dispatching state */ taskq_ent_t io_tqent; }; enum blk_verify_flag { BLK_VERIFY_ONLY, BLK_VERIFY_LOG, BLK_VERIFY_HALT }; enum blk_config_flag { BLK_CONFIG_HELD, // SCL_VDEV held for writer BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader BLK_CONFIG_NEEDED_TRY, // Try with SCL_VDEV for reader BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV }; extern int zio_bookmark_compare(const void *, const void *); extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *priv, zio_flag_t flags); extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *priv, zio_flag_t flags); extern void zio_destroy(zio_t *zio); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies, boolean_t nopwrite, boolean_t brtwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *priv, zio_flag_t flags); extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, enum trim_flag trim_flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_flag_t flags); extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); extern size_t zio_get_compression_max_size(enum zio_compress compress, uint64_t gcd_alloc, uint64_t min_alloc, size_t s_len); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); extern void zio_execute(void *zio); extern void zio_interrupt(void *zio); extern void zio_delay_init(zio_t *zio); extern void zio_delay_interrupt(zio_t *zio); extern void zio_deadman(zio_t *zio, const char *tag); extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); extern void zio_add_child_first(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform); extern void zio_pop_transforms(zio_t *zio); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, int type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *priv); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *priv); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); extern void zio_change_priority(zio_t *pio, zio_priority_t priority); extern void zio_checksum_verified(zio_t *zio); extern void zio_dio_chksum_verify_error_report(zio_t *zio); extern int zio_worst_error(int e1, int e2); extern enum zio_checksum zio_checksum_select(enum zio_checksum child, enum zio_checksum parent); extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, enum zio_checksum parent); extern enum zio_compress zio_compress_select(spa_t *spa, enum zio_compress child, enum zio_compress parent); extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress, uint8_t child, uint8_t parent); extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); extern int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify); /* * Initial setup and teardown. */ extern void zio_init(void); extern void zio_fini(void); /* * Fault injection */ struct zinject_record; extern uint32_t zio_injection_enabled; extern int zio_inject_fault(char *name, int flags, int *id, struct zinject_record *record); extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); extern void zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type); extern int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, uint64_t type, int error); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed); extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed); /* * Checksum ereport functions */ extern int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, struct zio_bad_cksum *info); extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical); extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); /* If we have the good data in hand, this function can be used */ extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, struct zio_bad_cksum *info); void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr); extern void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name); /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); /* zbookmark_phys functions */ boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); boolean_t zbookmark_subtree_tbd(const struct dnode_phys *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); #ifdef __cplusplus } #endif #endif /* _ZIO_H */ diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 3da8976cae17..68aa2f2cb0c7 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -1,5702 +1,5702 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright 2016 Igor Kozhukhov * Copyright 2017-2018 RackTop Systems. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, loli10K * Copyright (c) 2021 Matt Fiddaman */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_IDMAP #include #include #include #endif /* HAVE_IDMAP */ #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_deleg.h" static __thread struct passwd gpwd; static __thread struct group ggrp; static __thread char rpbuf[2048]; static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); /* * Given a single type (not a mask of types), return the type in a human * readable form. */ const char * zfs_type_to_name(zfs_type_t type) { switch (type) { case ZFS_TYPE_FILESYSTEM: return (dgettext(TEXT_DOMAIN, "filesystem")); case ZFS_TYPE_SNAPSHOT: return (dgettext(TEXT_DOMAIN, "snapshot")); case ZFS_TYPE_VOLUME: return (dgettext(TEXT_DOMAIN, "volume")); case ZFS_TYPE_POOL: return (dgettext(TEXT_DOMAIN, "pool")); case ZFS_TYPE_BOOKMARK: return (dgettext(TEXT_DOMAIN, "bookmark")); default: assert(!"unhandled zfs_type_t"); } return (NULL); } /* * Validate a ZFS path. This is used even before trying to open the dataset, to * provide a more meaningful error message. We call zfs_error_aux() to * explain exactly why the name was not valid. */ int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, boolean_t modifying) { namecheck_err_t why; char what; if (!(type & ZFS_TYPE_SNAPSHOT) && strchr(path, '@') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshot delimiter '@' is not expected here")); return (0); } if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing '@' delimiter in snapshot name")); return (0); } if (!(type & ZFS_TYPE_BOOKMARK) && strchr(path, '#') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bookmark delimiter '#' is not expected here")); return (0); } if (type == ZFS_TYPE_BOOKMARK && strchr(path, '#') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing '#' delimiter in bookmark name")); return (0); } if (modifying && strchr(path, '%') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character %c in name"), '%'); return (0); } if (entity_namecheck(path, &why, &what) != 0) { if (hdl != NULL) { switch (why) { case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is too long")); break; case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "leading slash in name")); break; case NAME_ERR_EMPTY_COMPONENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty component or misplaced '@'" " or '#' delimiter in name")); break; case NAME_ERR_TRAILING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "trailing slash in name")); break; case NAME_ERR_INVALCHAR: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character " "'%c' in name"), what); break; case NAME_ERR_MULTIPLE_DELIMITERS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple '@' and/or '#' delimiters in " "name")); break; case NAME_ERR_NOLETTER: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool doesn't begin with a letter")); break; case NAME_ERR_RESERVED: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); break; case NAME_ERR_DISKLIKE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "reserved disk name")); break; case NAME_ERR_SELF_REF: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "self reference, '.' is found in name")); break; case NAME_ERR_PARENT_REF: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent reference, '..' is found in name")); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } } return (0); } return (-1); } int zfs_name_valid(const char *name, zfs_type_t type) { if (type == ZFS_TYPE_POOL) return (zpool_name_valid(NULL, B_FALSE, name)); return (zfs_validate_name(NULL, name, type, B_FALSE)); } /* * This function takes the raw DSL properties, and filters out the user-defined * properties into a separate nvlist. */ static nvlist_t * process_user_props(zfs_handle_t *zhp, nvlist_t *props) { libzfs_handle_t *hdl = zhp->zfs_hdl; nvpair_t *elem; nvlist_t *nvl; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { if (!zfs_prop_user(nvpair_name(elem))) continue; nvlist_t *propval = fnvpair_value_nvlist(elem); if (nvlist_add_nvlist(nvl, nvpair_name(elem), propval) != 0) { nvlist_free(nvl); (void) no_memory(hdl); return (NULL); } } return (nvl); } static zpool_handle_t * zpool_add_handle(zfs_handle_t *zhp, const char *pool_name) { libzfs_handle_t *hdl = zhp->zfs_hdl; zpool_handle_t *zph; if ((zph = zpool_open_canfail(hdl, pool_name)) != NULL) { if (hdl->libzfs_pool_handles != NULL) zph->zpool_next = hdl->libzfs_pool_handles; hdl->libzfs_pool_handles = zph; } return (zph); } static zpool_handle_t * zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len) { libzfs_handle_t *hdl = zhp->zfs_hdl; zpool_handle_t *zph = hdl->libzfs_pool_handles; while ((zph != NULL) && (strncmp(pool_name, zpool_get_name(zph), len) != 0)) zph = zph->zpool_next; return (zph); } /* * Returns a handle to the pool that contains the provided dataset. * If a handle to that pool already exists then that handle is returned. * Otherwise, a new handle is created and added to the list of handles. */ static zpool_handle_t * zpool_handle(zfs_handle_t *zhp) { char *pool_name; int len; zpool_handle_t *zph; len = strcspn(zhp->zfs_name, "/@#") + 1; pool_name = zfs_alloc(zhp->zfs_hdl, len); (void) strlcpy(pool_name, zhp->zfs_name, len); zph = zpool_find_handle(zhp, pool_name, len); if (zph == NULL) zph = zpool_add_handle(zhp, pool_name); free(pool_name); return (zph); } void zpool_free_handles(libzfs_handle_t *hdl) { zpool_handle_t *next, *zph = hdl->libzfs_pool_handles; while (zph != NULL) { next = zph->zpool_next; zpool_close(zph); zph = next; } hdl->libzfs_pool_handles = NULL; } /* * Utility function to gather stats (objset and zpl) for the given object. */ static int get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc) { libzfs_handle_t *hdl = zhp->zfs_hdl; (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, zc) != 0) { if (errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, zc); else return (-1); } return (0); } /* * Utility function to get the received properties of the given object. */ static int get_recvd_props_ioctl(zfs_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *recvdprops; zfs_cmd_t zc = {"\0"}; int err; zcmd_alloc_dst_nvlist(hdl, &zc, 0); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) { if (errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); else { zcmd_free_nvlists(&zc); return (-1); } } err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops); zcmd_free_nvlists(&zc); if (err != 0) return (-1); nvlist_free(zhp->zfs_recvd_props); zhp->zfs_recvd_props = recvdprops; return (0); } static int put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc) { nvlist_t *allprops, *userprops; zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */ if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) { return (-1); } /* * XXX Why do we store the user props separately, in addition to * storing them in zfs_props? */ if ((userprops = process_user_props(zhp, allprops)) == NULL) { nvlist_free(allprops); return (-1); } nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); zhp->zfs_props = allprops; zhp->zfs_user_props = userprops; return (0); } static int get_stats(zfs_handle_t *zhp) { int rc = 0; zfs_cmd_t zc = {"\0"}; zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0); if (get_stats_ioctl(zhp, &zc) != 0) rc = -1; else if (put_stats_zhdl(zhp, &zc) != 0) rc = -1; zcmd_free_nvlists(&zc); return (rc); } /* * Refresh the properties currently stored in the handle. */ void zfs_refresh_properties(zfs_handle_t *zhp) { (void) get_stats(zhp); } /* * Makes a handle from the given dataset name. Used by zfs_open() and * zfs_iter_* to create child handles on the fly. */ static int make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) { if (put_stats_zhdl(zhp, zc) != 0) return (-1); /* * We've managed to open the dataset and gather statistics. Determine * the high-level type. */ if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) { zhp->zfs_head_type = ZFS_TYPE_VOLUME; } else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) { zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM; } else if (zhp->zfs_dmustats.dds_type == DMU_OST_OTHER) { errno = EINVAL; return (-1); } else if (zhp->zfs_dmustats.dds_inconsistent) { errno = EBUSY; return (-1); } else { abort(); } if (zhp->zfs_dmustats.dds_is_snapshot) zhp->zfs_type = ZFS_TYPE_SNAPSHOT; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) zhp->zfs_type = ZFS_TYPE_VOLUME; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) zhp->zfs_type = ZFS_TYPE_FILESYSTEM; else abort(); /* we should never see any other types */ if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) return (-1); return (0); } zfs_handle_t * make_dataset_handle(libzfs_handle_t *hdl, const char *path) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); zcmd_alloc_dst_nvlist(hdl, &zc, 0); if (get_stats_ioctl(zhp, &zc) == -1) { zcmd_free_nvlists(&zc); free(zhp); return (NULL); } if (make_dataset_handle_common(zhp, &zc) == -1) { free(zhp); zhp = NULL; } zcmd_free_nvlists(&zc); return (zhp); } zfs_handle_t * make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); if (make_dataset_handle_common(zhp, zc) == -1) { free(zhp); return (NULL); } return (zhp); } zfs_handle_t * make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = pzhp->zfs_hdl; (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); zhp->zfs_head_type = pzhp->zfs_type; zhp->zfs_type = ZFS_TYPE_SNAPSHOT; zhp->zpool_hdl = zpool_handle(zhp); if (zc->zc_objset_stats.dds_creation_txg != 0) { /* structure assignment */ zhp->zfs_dmustats = zc->zc_objset_stats; } else { if (get_stats_ioctl(zhp, zc) == -1) { zcmd_free_nvlists(zc); free(zhp); return (NULL); } if (make_dataset_handle_common(zhp, zc) == -1) { zcmd_free_nvlists(zc); free(zhp); return (NULL); } } if (zhp->zfs_dmustats.dds_is_snapshot || strchr(zc->zc_name, '@') != NULL) zhp->zfs_type = ZFS_TYPE_SNAPSHOT; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) zhp->zfs_type = ZFS_TYPE_VOLUME; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) zhp->zfs_type = ZFS_TYPE_FILESYSTEM; return (zhp); } zfs_handle_t * zfs_handle_dup(zfs_handle_t *zhp_orig) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = zhp_orig->zfs_hdl; zhp->zpool_hdl = zhp_orig->zpool_hdl; (void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name, sizeof (zhp->zfs_name)); zhp->zfs_type = zhp_orig->zfs_type; zhp->zfs_head_type = zhp_orig->zfs_head_type; zhp->zfs_dmustats = zhp_orig->zfs_dmustats; if (zhp_orig->zfs_props != NULL) { if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } if (zhp_orig->zfs_user_props != NULL) { if (nvlist_dup(zhp_orig->zfs_user_props, &zhp->zfs_user_props, 0) != 0) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } if (zhp_orig->zfs_recvd_props != NULL) { if (nvlist_dup(zhp_orig->zfs_recvd_props, &zhp->zfs_recvd_props, 0)) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck; if (zhp_orig->zfs_mntopts != NULL) { zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl, zhp_orig->zfs_mntopts); } zhp->zfs_props_table = zhp_orig->zfs_props_table; return (zhp); } boolean_t zfs_bookmark_exists(const char *path) { nvlist_t *bmarks; nvlist_t *props; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmark_name; char *pound; int err; boolean_t rv; (void) strlcpy(fsname, path, sizeof (fsname)); pound = strchr(fsname, '#'); if (pound == NULL) return (B_FALSE); *pound = '\0'; bmark_name = pound + 1; props = fnvlist_alloc(); err = lzc_get_bookmarks(fsname, props, &bmarks); nvlist_free(props); if (err != 0) { nvlist_free(bmarks); return (B_FALSE); } rv = nvlist_exists(bmarks, bmark_name); nvlist_free(bmarks); return (rv); } zfs_handle_t * make_bookmark_handle(zfs_handle_t *parent, const char *path, nvlist_t *bmark_props) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); /* Fill in the name. */ zhp->zfs_hdl = parent->zfs_hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); /* Set the property lists. */ if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) { free(zhp); return (NULL); } /* Set the types. */ zhp->zfs_head_type = parent->zfs_head_type; zhp->zfs_type = ZFS_TYPE_BOOKMARK; if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) { nvlist_free(zhp->zfs_props); free(zhp); return (NULL); } return (zhp); } struct zfs_open_bookmarks_cb_data { const char *path; zfs_handle_t *zhp; }; static int zfs_open_bookmarks_cb(zfs_handle_t *zhp, void *data) { struct zfs_open_bookmarks_cb_data *dp = data; /* * Is it the one we are looking for? */ if (strcmp(dp->path, zfs_get_name(zhp)) == 0) { /* * We found it. Save it and let the caller know we are done. */ dp->zhp = zhp; return (EEXIST); } /* * Not found. Close the handle and ask for another one. */ zfs_close(zhp); return (0); } /* * Opens the given snapshot, bookmark, filesystem, or volume. The 'types' * argument is a mask of acceptable types. The function will print an * appropriate error message and return NULL if it can't be opened. */ zfs_handle_t * zfs_open(libzfs_handle_t *hdl, const char *path, int types) { zfs_handle_t *zhp; char errbuf[ERRBUFLEN]; char *bookp; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); /* * Validate the name before we even try to open it. */ if (!zfs_validate_name(hdl, path, types, B_FALSE)) { (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); errno = EINVAL; return (NULL); } /* * Bookmarks needs to be handled separately. */ bookp = strchr(path, '#'); if (bookp == NULL) { /* * Try to get stats for the dataset, which will tell us if it * exists. */ errno = 0; if ((zhp = make_dataset_handle(hdl, path)) == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); return (NULL); } } else { char dsname[ZFS_MAX_DATASET_NAME_LEN]; zfs_handle_t *pzhp; struct zfs_open_bookmarks_cb_data cb_data = {path, NULL}; /* * We need to cut out '#' and everything after '#' * to get the parent dataset name only. */ assert(bookp - path < sizeof (dsname)); (void) strlcpy(dsname, path, MIN(sizeof (dsname), bookp - path + 1)); /* * Create handle for the parent dataset. */ errno = 0; if ((pzhp = make_dataset_handle(hdl, dsname)) == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); return (NULL); } /* * Iterate bookmarks to find the right one. */ errno = 0; if ((zfs_iter_bookmarks_v2(pzhp, 0, zfs_open_bookmarks_cb, &cb_data) == 0) && (cb_data.zhp == NULL)) { (void) zfs_error(hdl, EZFS_NOENT, errbuf); zfs_close(pzhp); errno = ENOENT; return (NULL); } if (cb_data.zhp == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); zfs_close(pzhp); return (NULL); } zhp = cb_data.zhp; /* * Cleanup. */ zfs_close(pzhp); } if (!(types & zhp->zfs_type)) { (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); zfs_close(zhp); errno = EINVAL; return (NULL); } return (zhp); } /* * Release a ZFS handle. Nothing to do but free the associated memory. */ void zfs_close(zfs_handle_t *zhp) { if (zhp->zfs_mntopts) free(zhp->zfs_mntopts); nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); nvlist_free(zhp->zfs_recvd_props); free(zhp); } typedef struct mnttab_node { struct mnttab mtn_mt; avl_node_t mtn_node; } mnttab_node_t; static int libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) { const mnttab_node_t *mtn1 = (const mnttab_node_t *)arg1; const mnttab_node_t *mtn2 = (const mnttab_node_t *)arg2; int rv; rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); return (TREE_ISIGN(rv)); } void libzfs_mnttab_init(libzfs_handle_t *hdl) { pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL); assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0); avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); } static int libzfs_mnttab_update(libzfs_handle_t *hdl) { FILE *mnttab; struct mnttab entry; if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); while (getmntent(mnttab, &entry) == 0) { mnttab_node_t *mtn; avl_index_t where; if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special); mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp); mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype); mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); /* Exclude duplicate mounts */ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, &where) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); continue; } avl_add(&hdl->libzfs_mnttab_cache, mtn); } (void) fclose(mnttab); return (0); } void libzfs_mnttab_fini(libzfs_handle_t *hdl) { void *cookie = NULL; mnttab_node_t *mtn; while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); } avl_destroy(&hdl->libzfs_mnttab_cache); (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock); } void libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable) { hdl->libzfs_mnttab_enable = enable; } int libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, struct mnttab *entry) { FILE *mnttab; mnttab_node_t find; mnttab_node_t *mtn; int ret = ENOENT; if (!hdl->libzfs_mnttab_enable) { struct mnttab srch = { 0 }; if (avl_numnodes(&hdl->libzfs_mnttab_cache)) libzfs_mnttab_fini(hdl); if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); srch.mnt_special = (char *)fsname; srch.mnt_fstype = (char *)MNTTYPE_ZFS; ret = getmntany(mnttab, entry, &srch) ? ENOENT : 0; (void) fclose(mnttab); return (ret); } pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) { int error; if ((error = libzfs_mnttab_update(hdl)) != 0) { pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); return (error); } } find.mtn_mt.mnt_special = (char *)fsname; mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); if (mtn) { *entry = mtn->mtn_mt; ret = 0; } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); return (ret); } void libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special, const char *mountp, const char *mntopts) { mnttab_node_t *mtn; pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); if (avl_numnodes(&hdl->libzfs_mnttab_cache) != 0) { mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special); mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp); mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS); mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts); /* * Another thread may have already added this entry * via libzfs_mnttab_update. If so we should skip it. */ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, NULL) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); } else { avl_add(&hdl->libzfs_mnttab_cache, mtn); } } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); } void libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) { mnttab_node_t find; mnttab_node_t *ret; pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); find.mtn_mt.mnt_special = (char *)fsname; if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) != NULL) { avl_remove(&hdl->libzfs_mnttab_cache, ret); free(ret->mtn_mt.mnt_special); free(ret->mtn_mt.mnt_mountp); free(ret->mtn_mt.mnt_fstype); free(ret->mtn_mt.mnt_mntopts); free(ret); } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); } int zfs_spa_version(zfs_handle_t *zhp, int *spa_version) { zpool_handle_t *zpool_handle = zhp->zpool_hdl; if (zpool_handle == NULL) return (-1); *spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); return (0); } /* * The choice of reservation property depends on the SPA version. */ static int zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop) { int spa_version; if (zfs_spa_version(zhp, &spa_version) < 0) return (-1); if (spa_version >= SPA_VERSION_REFRESERVATION) *resv_prop = ZFS_PROP_REFRESERVATION; else *resv_prop = ZFS_PROP_RESERVATION; return (0); } /* * Given an nvlist of properties to set, validates that they are correct, and * parses any numeric properties (index, boolean, etc) if they are specified as * strings. */ nvlist_t * zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl, boolean_t key_params_ok, const char *errbuf) { nvpair_t *elem; uint64_t intval; const char *strval; zfs_prop_t prop; nvlist_t *ret; int chosen_normal = -1; int chosen_utf = -1; int set_maxbs = 0; if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } /* * Make sure this property is valid and applies to this type. */ elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { const char *propname = nvpair_name(elem); prop = zfs_name_to_prop(propname); if (prop == ZPROP_USERPROP && zfs_prop_user(propname)) { /* * This is a user property: make sure it's a * string, and that it's less than ZAP_MAXNAMELEN. */ if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property name '%s' is too long"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (nvlist_add_string(ret, propname, strval) != 0) { (void) no_memory(hdl); goto error; } continue; } /* * Currently, only user properties can be modified on * snapshots. */ if (type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "this property can not be modified for snapshots")); (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); goto error; } if (prop == ZPROP_USERPROP && zfs_prop_userquota(propname)) { zfs_userquota_prop_t uqtype; char *newpropname = NULL; char domain[128]; uint64_t rid; uint64_t valary[3]; int rc; if (userquota_propname_decode(propname, zoned, &uqtype, domain, sizeof (domain), &rid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' has an invalid user/group name"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (uqtype != ZFS_PROP_USERQUOTA && uqtype != ZFS_PROP_GROUPQUOTA && uqtype != ZFS_PROP_USEROBJQUOTA && uqtype != ZFS_PROP_GROUPOBJQUOTA && uqtype != ZFS_PROP_PROJECTQUOTA && uqtype != ZFS_PROP_PROJECTOBJQUOTA) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (nvpair_type(elem) == DATA_TYPE_STRING) { (void) nvpair_value_string(elem, &strval); if (strcmp(strval, "none") == 0) { intval = 0; } else if (zfs_nicestrtonum(hdl, strval, &intval) != 0) { (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { (void) nvpair_value_uint64(elem, &intval); if (intval == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "use 'none' to disable " "{user|group|project}quota")); goto error; } } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a number"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* * Encode the prop name as * userquota@-domain, to make it easy * for the kernel to decode. */ rc = asprintf(&newpropname, "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype], (longlong_t)rid, domain); if (rc == -1 || newpropname == NULL) { (void) no_memory(hdl); goto error; } valary[0] = uqtype; valary[1] = rid; valary[2] = intval; if (nvlist_add_uint64_array(ret, newpropname, valary, 3) != 0) { free(newpropname); (void) no_memory(hdl); goto error; } free(newpropname); continue; } else if (prop == ZPROP_USERPROP && zfs_prop_written(propname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (prop == ZPROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (!zfs_prop_valid_for_type(prop, type, B_FALSE)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' does not " "apply to datasets of this type"), propname); (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); goto error; } if (zfs_prop_readonly(prop) && !(zfs_prop_setonce(prop) && zhp == NULL) && !(zfs_prop_encryption_key_param(prop) && key_params_ok)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (zprop_parse_value(hdl, elem, prop, type, ret, &strval, &intval, errbuf) != 0) goto error; /* * Perform some additional checks for specific properties. */ switch (prop) { case ZFS_PROP_VERSION: { int version; if (zhp == NULL) break; version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); if (intval < version) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Can not downgrade; already at version %u"), version); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: { int maxbs = SPA_MAXBLOCKSIZE; char buf[64]; if (zpool_hdl != NULL) { maxbs = zpool_get_prop_int(zpool_hdl, ZPOOL_PROP_MAXBLOCKSIZE, NULL); } /* * The value must be a power of two between * SPA_MINBLOCKSIZE and maxbs. */ if (intval < SPA_MINBLOCKSIZE || intval > maxbs || !ISP2(intval)) { zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be power of 2 from 512B " "to %s"), propname, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* save the ZFS_PROP_RECORDSIZE during create op */ if (zpool_hdl == NULL && prop == ZFS_PROP_RECORDSIZE) { set_maxbs = intval; } break; } case ZFS_PROP_SPECIAL_SMALL_BLOCKS: { int maxbs = set_maxbs == 0 ? SPA_OLD_MAXBLOCKSIZE : set_maxbs; char buf[64]; if (zpool_hdl != NULL) { char state[64] = ""; maxbs = zpool_get_prop_int(zpool_hdl, ZPOOL_PROP_MAXBLOCKSIZE, NULL); /* * Issue a warning but do not fail so that * tests for settable properties succeed. */ if (zpool_prop_get_feature(zpool_hdl, "feature@allocation_classes", state, sizeof (state)) != 0 || strcmp(state, ZFS_FEATURE_ACTIVE) != 0) { (void) fprintf(stderr, gettext( "%s: property requires a special " "device in the pool\n"), propname); } } if (intval != 0 && (intval < SPA_MINBLOCKSIZE || intval > maxbs || !ISP2(intval))) { zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid '%s=%llu' property: must be zero " "or a power of 2 from 512B to %s"), propname, (unsigned long long)intval, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_MLSLABEL: { #ifdef HAVE_MLSLABEL /* * Verify the mlslabel string and convert to * internal hex label string. */ m_label_t *new_sl; char *hex = NULL; /* internal label string */ /* Default value is already OK. */ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) break; /* Verify the label can be converted to binary form */ if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) || (str_to_label(strval, &new_sl, MAC_LABEL, L_NO_CORRECTION, NULL) == -1)) { goto badlabel; } /* Now translate to hex internal label string */ if (label_to_str(new_sl, &hex, M_INTERNAL, DEF_NAMES) != 0) { if (hex) free(hex); goto badlabel; } m_label_free(new_sl); /* If string is already in internal form, we're done. */ if (strcmp(strval, hex) == 0) { free(hex); break; } /* Replace the label string with the internal form. */ (void) nvlist_remove(ret, zfs_prop_to_name(prop), DATA_TYPE_STRING); fnvlist_add_string(ret, zfs_prop_to_name(prop), hex); free(hex); break; badlabel: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid mlslabel '%s'"), strval); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); m_label_free(new_sl); /* OK if null */ goto error; #else zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "mlslabels are unsupported")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; #endif /* HAVE_MLSLABEL */ } case ZFS_PROP_MOUNTPOINT: { namecheck_err_t why; if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 || strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0) break; if (mountpoint_namecheck(strval, &why)) { switch (why) { case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be an absolute path, " "'none', or 'legacy'"), propname); break; case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "component of '%s' is too long"), propname); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } zfs_fallthrough; } case ZFS_PROP_SHARESMB: case ZFS_PROP_SHARENFS: /* * For the mountpoint and sharenfs or sharesmb * properties, check if it can be set in a * global/non-global zone based on * the zoned property value: * * global zone non-global zone * -------------------------------------------------- * zoned=on mountpoint (no) mountpoint (yes) * sharenfs (no) sharenfs (no) * sharesmb (no) sharesmb (no) * * zoned=off mountpoint (yes) N/A * sharenfs (yes) * sharesmb (yes) */ if (zoned) { if (getzoneid() == GLOBAL_ZONEID) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set on " "dataset in a non-global zone"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } else if (prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set in " "a non-global zone"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } } else if (getzoneid() != GLOBAL_ZONEID) { /* * If zoned property is 'off', this must be in * a global zone. If not, something is wrong. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set while dataset " "'zoned' property is set"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } /* * At this point, it is legitimate to set the * property. Now we want to make sure that the * property value is valid if it is sharenfs. */ if ((prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) && strcmp(strval, "on") != 0 && strcmp(strval, "off") != 0) { enum sa_protocol proto; if (prop == ZFS_PROP_SHARESMB) proto = SA_PROTOCOL_SMB; else proto = SA_PROTOCOL_NFS; if (sa_validate_shareopts(strval, proto) != SA_OK) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set to invalid " "options"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } break; case ZFS_PROP_KEYLOCATION: if (!zfs_prop_valid_keylocation(strval, B_FALSE)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid keylocation")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zhp != NULL) { uint64_t crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); if (crypt == ZIO_CRYPT_OFF && strcmp(strval, "none") != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation must be 'none' " "for unencrypted datasets")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } else if (crypt != ZIO_CRYPT_OFF && strcmp(strval, "none") == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation must not be 'none' " "for encrypted datasets")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } break; case ZFS_PROP_PBKDF2_ITERS: if (intval < MIN_PBKDF2_ITERATIONS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "minimum pbkdf2 iterations is %u"), MIN_PBKDF2_ITERATIONS); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZFS_PROP_UTF8ONLY: chosen_utf = (int)intval; break; case ZFS_PROP_NORMALIZE: chosen_normal = (int)intval; break; default: break; } /* * For changes to existing volumes, we have some additional * checks to enforce. */ if (type == ZFS_TYPE_VOLUME && zhp != NULL) { uint64_t blocksize = zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE); char buf[64]; switch (prop) { case ZFS_PROP_VOLSIZE: if (intval % blocksize != 0) { zfs_nicebytes(blocksize, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a multiple of " "volume block size (%s)"), propname, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (intval == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be zero"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; default: break; } } /* check encryption properties */ if (zhp != NULL) { int64_t crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); switch (prop) { case ZFS_PROP_COPIES: if (crypt != ZIO_CRYPT_OFF && intval > 2) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encrypted datasets cannot have " "3 copies")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; default: break; } } } /* * If normalization was chosen, but no UTF8 choice was made, * enforce rejection of non-UTF8 names. * * If normalization was chosen, but rejecting non-UTF8 names * was explicitly not chosen, it is an error. * * If utf8only was turned off, but the parent has normalization, * turn off normalization. */ if (chosen_normal > 0 && chosen_utf < 0) { if (nvlist_add_uint64(ret, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) { (void) no_memory(hdl); goto error; } } else if (chosen_normal > 0 && chosen_utf == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be set 'on' if normalization chosen"), zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } else if (chosen_normal < 0 && chosen_utf == 0) { if (nvlist_add_uint64(ret, zfs_prop_to_name(ZFS_PROP_NORMALIZE), 0) != 0) { (void) no_memory(hdl); goto error; } } return (ret); error: nvlist_free(ret); return (NULL); } static int zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) { uint64_t old_volsize; uint64_t new_volsize; uint64_t old_reservation; uint64_t new_reservation; zfs_prop_t resv_prop; nvlist_t *props; zpool_handle_t *zph = zpool_handle(zhp); /* * If this is an existing volume, and someone is setting the volsize, * make sure that it matches the reservation, or add it if necessary. */ old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_reservation = zfs_prop_get_int(zhp, resv_prop); props = fnvlist_alloc(); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); if ((zvol_volsize_to_reservation(zph, old_volsize, props) != old_reservation) || nvlist_exists(nvl, zfs_prop_to_name(resv_prop))) { fnvlist_free(props); return (0); } if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &new_volsize) != 0) { fnvlist_free(props); return (-1); } new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props); fnvlist_free(props); if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), new_reservation) != 0) { (void) no_memory(zhp->zfs_hdl); return (-1); } return (1); } /* * Helper for 'zfs {set|clone} refreservation=auto'. Must be called after * zfs_valid_proplist(), as it is what sets the UINT64_MAX sentinel value. * Return codes must match zfs_add_synthetic_resv(). */ static int zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl) { uint64_t volsize; uint64_t resvsize; zfs_prop_t prop; nvlist_t *props; if (!ZFS_IS_VOLUME(zhp)) { return (0); } if (zfs_which_resv_prop(zhp, &prop) != 0) { return (-1); } if (prop != ZFS_PROP_REFRESERVATION) { return (0); } if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(prop), &resvsize) != 0) { /* No value being set, so it can't be "auto" */ return (0); } if (resvsize != UINT64_MAX) { /* Being set to a value other than "auto" */ return (0); } props = fnvlist_alloc(); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) { volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); } resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize, props); fnvlist_free(props); (void) nvlist_remove_all(nvl, zfs_prop_to_name(prop)); if (nvlist_add_uint64(nvl, zfs_prop_to_name(prop), resvsize) != 0) { (void) no_memory(zhp->zfs_hdl); return (-1); } return (1); } static boolean_t zfs_is_namespace_prop(zfs_prop_t prop) { switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_SETUID: case ZFS_PROP_READONLY: case ZFS_PROP_XATTR: case ZFS_PROP_NBMAND: return (B_TRUE); default: return (B_FALSE); } } /* * Given a property name and value, set the property for the given dataset. */ int zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) { int ret = -1; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *nvl = NULL; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zfs_name); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_string(nvl, propname, propval) != 0) { (void) no_memory(hdl); goto error; } ret = zfs_prop_set_list(zhp, nvl); error: nvlist_free(nvl); return (ret); } /* * Given an nvlist of property names and values, set the properties for the * given dataset. */ int zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) { return (zfs_prop_set_list_flags(zhp, props, 0)); } /* * Given an nvlist of property names, values and flags, set the properties * for the given dataset. If ZFS_SET_NOMOUNT is set, it allows to update * mountpoint, sharenfs and sharesmb properties without (un/re)mounting * and (un/re)sharing the dataset. */ int zfs_prop_set_list_flags(zfs_handle_t *zhp, nvlist_t *props, int flags) { zfs_cmd_t zc = {"\0"}; int ret = -1; prop_changelist_t **cls = NULL; int cl_idx; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *nvl; int nvl_len = 0; int added_resv = 0; zfs_prop_t prop; boolean_t nsprop = B_FALSE; nvpair_t *elem; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zfs_name); if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl, B_FALSE, errbuf)) == NULL) goto error; /* * We have to check for any extra properties which need to be added * before computing the length of the nvlist. */ for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) { if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE && (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) { goto error; } } if (added_resv != 1 && (added_resv = zfs_fix_auto_resv(zhp, nvl)) == -1) { goto error; } /* * Check how many properties we're setting and allocate an array to * store changelist pointers for postfix(). */ for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) nvl_len++; if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL) goto error; cl_idx = 0; for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); nsprop |= zfs_is_namespace_prop(prop); assert(cl_idx < nvl_len); /* * We don't want to unmount & remount the dataset when changing * its canmount property to 'on' or 'noauto'. We only use * the changelist logic to unmount when setting canmount=off. */ if (prop != ZFS_PROP_CANMOUNT || (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF && zfs_is_mounted(zhp, NULL))) { cls[cl_idx] = changelist_gather(zhp, prop, ((flags & ZFS_SET_NOMOUNT) ? CL_GATHER_DONT_UNMOUNT : 0), 0); if (cls[cl_idx] == NULL) goto error; } if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cls[cl_idx])) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); ret = zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } if (cls[cl_idx] != NULL && (ret = changelist_prefix(cls[cl_idx])) != 0) goto error; cl_idx++; } assert(cl_idx == nvl_len); /* * Execute the corresponding ioctl() to set this list of properties. */ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zcmd_write_src_nvlist(hdl, &zc, nvl); zcmd_alloc_dst_nvlist(hdl, &zc, 0); ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); if (ret != 0) { if (zc.zc_nvlist_dst_filled == B_FALSE) { (void) zfs_standard_error(hdl, errno, errbuf); goto error; } /* Get the list of unset properties back and report them. */ nvlist_t *errorprops = NULL; if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0) goto error; for (nvpair_t *elem = nvlist_next_nvpair(errorprops, NULL); elem != NULL; elem = nvlist_next_nvpair(errorprops, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); zfs_setprop_error(hdl, prop, errno, errbuf); } nvlist_free(errorprops); if (added_resv && errno == ENOSPC) { /* clean up the volsize property we tried to set */ uint64_t old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); nvlist_free(nvl); nvl = NULL; zcmd_free_nvlists(&zc); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) goto error; if (nvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), old_volsize) != 0) goto error; zcmd_write_src_nvlist(hdl, &zc, nvl); (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); } } else { for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { if (cls[cl_idx] != NULL) { int clp_err = changelist_postfix(cls[cl_idx]); if (clp_err != 0) ret = clp_err; } } if (ret == 0) { /* * Refresh the statistics so the new property * value is reflected. */ (void) get_stats(zhp); /* * Remount the filesystem to propagate the change * if one of the options handled by the generic * Linux namespace layer has been modified. */ if (nsprop && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } } error: nvlist_free(nvl); zcmd_free_nvlists(&zc); if (cls != NULL) { for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { if (cls[cl_idx] != NULL) changelist_free(cls[cl_idx]); } free(cls); } return (ret); } /* * Given a property, inherit the value from the parent dataset, or if received * is TRUE, revert to the received value, if any. */ int zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) { zfs_cmd_t zc = {"\0"}; int ret; prop_changelist_t *cl; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; zfs_prop_t prop; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot inherit %s for '%s'"), propname, zhp->zfs_name); zc.zc_cookie = received; if ((prop = zfs_name_to_prop(propname)) == ZPROP_USERPROP) { /* * For user properties, the amount of work we have to do is very * small, so just do it here. */ if (!zfs_prop_user(propname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) return (zfs_standard_error(hdl, errno, errbuf)); (void) get_stats(zhp); return (0); } /* * Verify that this property is inheritable. */ if (zfs_prop_readonly(prop)) return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf)); if (!zfs_prop_inheritable(prop) && !received) return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf)); /* * Check to see if the value applies to this type */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) return (zfs_error(hdl, EZFS_PROPTYPE, errbuf)); /* * Normalize the name, to get rid of shorthand abbreviations. */ propname = zfs_prop_to_name(prop); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID && zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is used in a non-global zone")); return (zfs_error(hdl, EZFS_ZONED, errbuf)); } /* * Determine datasets which will be affected by this change, if any. */ if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL) return (-1); if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); ret = zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } if ((ret = changelist_prefix(cl)) != 0) goto error; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) { changelist_free(cl); return (zfs_standard_error(hdl, errno, errbuf)); } else { if ((ret = changelist_postfix(cl)) != 0) goto error; /* * Refresh the statistics so the new property is reflected. */ (void) get_stats(zhp); /* * Remount the filesystem to propagate the change * if one of the options handled by the generic * Linux namespace layer has been modified. */ if (zfs_is_namespace_prop(prop) && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } error: changelist_free(cl); return (ret); } /* * True DSL properties are stored in an nvlist. The following two functions * extract them appropriately. */ uint64_t getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, const char **source) { nvlist_t *nv; uint64_t value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { value = fnvlist_lookup_uint64(nv, ZPROP_VALUE); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_numeric(prop); *source = ""; } return (value); } static const char * getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, const char **source) { nvlist_t *nv; const char *value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { value = fnvlist_lookup_string(nv, ZPROP_VALUE); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_string(prop); *source = ""; } return (value); } static boolean_t zfs_is_recvd_props_mode(zfs_handle_t *zhp) { return (zhp->zfs_props != NULL && zhp->zfs_props == zhp->zfs_recvd_props); } static void zfs_set_recvd_props_mode(zfs_handle_t *zhp, uintptr_t *cookie) { *cookie = (uintptr_t)zhp->zfs_props; zhp->zfs_props = zhp->zfs_recvd_props; } static void zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uintptr_t *cookie) { zhp->zfs_props = (nvlist_t *)*cookie; *cookie = 0; } /* * Internal function for getting a numeric property. Both zfs_prop_get() and * zfs_prop_get_int() are built using this interface. * * Certain properties can be overridden using 'mount -o'. In this case, scan * the contents of the /proc/self/mounts entry, searching for the * appropriate options. If they differ from the on-disk values, report the * current values and mark the source "temporary". */ static int get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, const char **source, uint64_t *val) { zfs_cmd_t zc = {"\0"}; nvlist_t *zplprops = NULL; struct mnttab mnt; const char *mntopt_on = NULL; const char *mntopt_off = NULL; boolean_t received = zfs_is_recvd_props_mode(zhp); *source = NULL; /* * If the property is being fetched for a snapshot, check whether * the property is valid for the snapshot's head dataset type. */ if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT && !zfs_prop_valid_for_type(prop, zhp->zfs_head_type, B_TRUE)) { *val = zfs_prop_default_numeric(prop); return (-1); } switch (prop) { case ZFS_PROP_ATIME: mntopt_on = MNTOPT_ATIME; mntopt_off = MNTOPT_NOATIME; break; case ZFS_PROP_RELATIME: mntopt_on = MNTOPT_RELATIME; mntopt_off = MNTOPT_NORELATIME; break; case ZFS_PROP_DEVICES: mntopt_on = MNTOPT_DEVICES; mntopt_off = MNTOPT_NODEVICES; break; case ZFS_PROP_EXEC: mntopt_on = MNTOPT_EXEC; mntopt_off = MNTOPT_NOEXEC; break; case ZFS_PROP_READONLY: mntopt_on = MNTOPT_RO; mntopt_off = MNTOPT_RW; break; case ZFS_PROP_SETUID: mntopt_on = MNTOPT_SETUID; mntopt_off = MNTOPT_NOSETUID; break; case ZFS_PROP_XATTR: mntopt_on = MNTOPT_XATTR; mntopt_off = MNTOPT_NOXATTR; break; case ZFS_PROP_NBMAND: mntopt_on = MNTOPT_NBMAND; mntopt_off = MNTOPT_NONBMAND; break; default: break; } /* * Because looking up the mount options is potentially expensive * (iterating over all of /proc/self/mounts), we defer its * calculation until we're looking up a property which requires * its presence. */ if (!zhp->zfs_mntcheck && (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) { libzfs_handle_t *hdl = zhp->zfs_hdl; struct mnttab entry; if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) zhp->zfs_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); zhp->zfs_mntcheck = B_TRUE; } if (zhp->zfs_mntopts == NULL) mnt.mnt_mntopts = (char *)""; else mnt.mnt_mntopts = zhp->zfs_mntopts; switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_READONLY: case ZFS_PROP_SETUID: #ifndef __FreeBSD__ case ZFS_PROP_XATTR: #endif case ZFS_PROP_NBMAND: *val = getprop_uint64(zhp, prop, source); if (received) break; if (hasmntopt(&mnt, mntopt_on) && !*val) { *val = B_TRUE; if (src) *src = ZPROP_SRC_TEMPORARY; } else if (hasmntopt(&mnt, mntopt_off) && *val) { *val = B_FALSE; if (src) *src = ZPROP_SRC_TEMPORARY; } break; case ZFS_PROP_CANMOUNT: case ZFS_PROP_VOLSIZE: case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: case ZFS_PROP_FILESYSTEM_COUNT: case ZFS_PROP_SNAPSHOT_COUNT: *val = getprop_uint64(zhp, prop, source); if (*source == NULL) { /* not default, must be local */ *source = zhp->zfs_name; } break; case ZFS_PROP_MOUNTED: *val = (zhp->zfs_mntopts != NULL); break; case ZFS_PROP_NUMCLONES: *val = zhp->zfs_dmustats.dds_num_clones; break; case ZFS_PROP_VERSION: case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: case ZFS_PROP_CASE: case ZFS_PROP_DEFAULTUSERQUOTA: case ZFS_PROP_DEFAULTGROUPQUOTA: case ZFS_PROP_DEFAULTPROJECTQUOTA: case ZFS_PROP_DEFAULTUSEROBJQUOTA: case ZFS_PROP_DEFAULTGROUPOBJQUOTA: case ZFS_PROP_DEFAULTPROJECTOBJQUOTA: zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) { zcmd_free_nvlists(&zc); if (prop == ZFS_PROP_VERSION && zhp->zfs_type == ZFS_TYPE_VOLUME) *val = zfs_prop_default_numeric(prop); return (-1); } if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 || nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop), val) != 0) { zcmd_free_nvlists(&zc); return (-1); } nvlist_free(zplprops); zcmd_free_nvlists(&zc); break; case ZFS_PROP_INCONSISTENT: *val = zhp->zfs_dmustats.dds_inconsistent; break; case ZFS_PROP_REDACTED: *val = zhp->zfs_dmustats.dds_redacted; break; case ZFS_PROP_GUID: if (zhp->zfs_dmustats.dds_guid != 0) *val = zhp->zfs_dmustats.dds_guid; else *val = getprop_uint64(zhp, prop, source); break; case ZFS_PROP_CREATETXG: /* * We can directly read createtxg property from zfs * handle for Filesystem, Snapshot and ZVOL types. */ if (((zhp->zfs_type == ZFS_TYPE_FILESYSTEM) || (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) || (zhp->zfs_type == ZFS_TYPE_VOLUME)) && (zhp->zfs_dmustats.dds_creation_txg != 0)) { *val = zhp->zfs_dmustats.dds_creation_txg; break; } else { *val = getprop_uint64(zhp, prop, source); } zfs_fallthrough; default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: *val = getprop_uint64(zhp, prop, source); /* * If we tried to use a default value for a * readonly property, it means that it was not * present. Note this only applies to "truly" * readonly properties, not set-once properties * like volblocksize. */ if (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop) && *source != NULL && (*source)[0] == '\0') { *source = NULL; return (-1); } break; case PROP_TYPE_STRING: default: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "cannot get non-numeric property")); return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "internal error"))); } } return (0); } /* * Calculate the source type, given the raw source string. */ static void get_source(zfs_handle_t *zhp, zprop_source_t *srctype, const char *source, char *statbuf, size_t statlen) { if (statbuf == NULL || srctype == NULL || *srctype == ZPROP_SRC_TEMPORARY) { return; } if (source == NULL) { *srctype = ZPROP_SRC_NONE; } else if (source[0] == '\0') { *srctype = ZPROP_SRC_DEFAULT; } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) { *srctype = ZPROP_SRC_RECEIVED; } else { if (strcmp(source, zhp->zfs_name) == 0) { *srctype = ZPROP_SRC_LOCAL; } else { (void) strlcpy(statbuf, source, statlen); *srctype = ZPROP_SRC_INHERITED; } } } int zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, size_t proplen, boolean_t literal) { zfs_prop_t prop; int err = 0; if (zhp->zfs_recvd_props == NULL) if (get_recvd_props_ioctl(zhp) != 0) return (-1); prop = zfs_name_to_prop(propname); if (prop != ZPROP_USERPROP) { uintptr_t cookie; if (!nvlist_exists(zhp->zfs_recvd_props, propname)) return (-1); zfs_set_recvd_props_mode(zhp, &cookie); err = zfs_prop_get(zhp, prop, propbuf, proplen, NULL, NULL, 0, literal); zfs_unset_recvd_props_mode(zhp, &cookie); } else { nvlist_t *propval; const char *recvdval; if (nvlist_lookup_nvlist(zhp->zfs_recvd_props, propname, &propval) != 0) return (-1); recvdval = fnvlist_lookup_string(propval, ZPROP_VALUE); (void) strlcpy(propbuf, recvdval, proplen); } return (err == 0 ? 0 : -1); } static int get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) { nvlist_t *value; nvpair_t *pair; value = zfs_get_clones_nvl(zhp); if (value == NULL || nvlist_empty(value)) return (-1); propbuf[0] = '\0'; for (pair = nvlist_next_nvpair(value, NULL); pair != NULL; pair = nvlist_next_nvpair(value, pair)) { if (propbuf[0] != '\0') (void) strlcat(propbuf, ",", proplen); (void) strlcat(propbuf, nvpair_name(pair), proplen); } return (0); } struct get_clones_arg { uint64_t numclones; nvlist_t *value; const char *origin; char buf[ZFS_MAX_DATASET_NAME_LEN]; }; static int get_clones_cb(zfs_handle_t *zhp, void *arg) { struct get_clones_arg *gca = arg; if (gca->numclones == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf), NULL, NULL, 0, B_TRUE) != 0) goto out; if (strcmp(gca->buf, gca->origin) == 0) { fnvlist_add_boolean(gca->value, zfs_get_name(zhp)); gca->numclones--; } out: (void) zfs_iter_children_v2(zhp, 0, get_clones_cb, gca); zfs_close(zhp); return (0); } nvlist_t * zfs_get_clones_nvl(zfs_handle_t *zhp) { nvlist_t *nv, *value; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) { struct get_clones_arg gca; /* * if this is a snapshot, then the kernel wasn't able * to get the clones. Do it by slowly iterating. */ if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) return (NULL); if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0) return (NULL); if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) { nvlist_free(nv); return (NULL); } gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES); gca.value = value; gca.origin = zhp->zfs_name; if (gca.numclones != 0) { zfs_handle_t *root; char pool[ZFS_MAX_DATASET_NAME_LEN]; char *cp = pool; /* get the pool name */ (void) strlcpy(pool, zhp->zfs_name, sizeof (pool)); (void) strsep(&cp, "/@"); root = zfs_open(zhp->zfs_hdl, pool, ZFS_TYPE_FILESYSTEM); if (root == NULL) { nvlist_free(nv); nvlist_free(value); return (NULL); } (void) get_clones_cb(root, &gca); } if (gca.numclones != 0 || nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 || nvlist_add_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) { nvlist_free(nv); nvlist_free(value); return (NULL); } nvlist_free(nv); nvlist_free(value); nv = fnvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES)); } return (fnvlist_lookup_nvlist(nv, ZPROP_VALUE)); } static int get_rsnaps_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) { nvlist_t *value; uint64_t *snaps; uint_t nsnaps; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &value) != 0) return (-1); if (nvlist_lookup_uint64_array(value, ZPROP_VALUE, &snaps, &nsnaps) != 0) return (-1); if (nsnaps == 0) { /* There's no redaction snapshots; pass a special value back */ (void) snprintf(propbuf, proplen, "none"); return (0); } propbuf[0] = '\0'; for (int i = 0; i < nsnaps; i++) { char buf[128]; if (propbuf[0] != '\0') (void) strlcat(propbuf, ",", proplen); (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)snaps[i]); (void) strlcat(propbuf, buf, proplen); } return (0); } /* * Accepts a property and value and checks that the value * matches the one found by the channel program. If they are * not equal, print both of them. */ static void zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval, const char *strval) { if (!zhp->zfs_hdl->libzfs_prop_debug) return; int error; char *poolname = zhp->zpool_hdl->zpool_name; const char *prop_name = zfs_prop_to_name(prop); const char *program = "args = ...\n" "ds = args['dataset']\n" "prop = args['property']\n" "value, setpoint = zfs.get_prop(ds, prop)\n" "return {value=value, setpoint=setpoint}\n"; nvlist_t *outnvl; nvlist_t *retnvl; nvlist_t *argnvl = fnvlist_alloc(); fnvlist_add_string(argnvl, "dataset", zhp->zfs_name); fnvlist_add_string(argnvl, "property", zfs_prop_to_name(prop)); error = lzc_channel_program_nosync(poolname, program, 10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl); if (error == 0) { retnvl = fnvlist_lookup_nvlist(outnvl, "return"); if (zfs_prop_get_type(prop) == PROP_TYPE_NUMBER) { int64_t ans; error = nvlist_lookup_int64(retnvl, "value", &ans); if (error != 0) { (void) fprintf(stderr, "%s: zcp check error: " "%u\n", prop_name, error); return; } if (ans != intval) { (void) fprintf(stderr, "%s: zfs found %llu, " "but zcp found %llu\n", prop_name, (u_longlong_t)intval, (u_longlong_t)ans); } } else { const char *str_ans; error = nvlist_lookup_string(retnvl, "value", &str_ans); if (error != 0) { (void) fprintf(stderr, "%s: zcp check error: " "%u\n", prop_name, error); return; } if (strcmp(strval, str_ans) != 0) { (void) fprintf(stderr, "%s: zfs found '%s', but zcp found '%s'\n", prop_name, strval, str_ans); } } } else { (void) fprintf(stderr, "%s: zcp check failed, channel program " "error: %u\n", prop_name, error); } nvlist_free(argnvl); nvlist_free(outnvl); } /* * Retrieve a property from the given object. If 'literal' is specified, then * numbers are left as exact values. Otherwise, numbers are converted to a * human-readable form. * * Returns 0 on success, or -1 on error. */ int zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal) { const char *source = NULL; uint64_t val; const char *str; const char *strval; boolean_t received = zfs_is_recvd_props_mode(zhp); /* * Check to see if this property applies to our object */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) return (-1); if (received && zfs_prop_readonly(prop)) return (-1); if (src) *src = ZPROP_SRC_NONE; switch (prop) { case ZFS_PROP_CREATION: /* * 'creation' is a time_t stored in the statistics. We convert * this into a string unless 'literal' is specified. */ { val = getprop_uint64(zhp, prop, &source); time_t time = (time_t)val; struct tm t; if (literal || localtime_r(&time, &t) == NULL || strftime(propbuf, proplen, "%a %b %e %k:%M %Y", &t) == 0) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_MOUNTPOINT: /* * Getting the precise mountpoint can be tricky. * * - for 'none' or 'legacy', return those values. * - for inherited mountpoints, we want to take everything * after our ancestor and append it to the inherited value. * * If the pool has an alternate root, we want to prepend that * root to any values we return. */ str = getprop_string(zhp, prop, &source); if (str[0] == '/') { char buf[MAXPATHLEN]; char *root = buf; const char *relpath; /* * If we inherit the mountpoint, even from a dataset * with a received value, the source will be the path of * the dataset we inherit from. If source is * ZPROP_SOURCE_VAL_RECVD, the received value is not * inherited. */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { relpath = ""; } else { relpath = zhp->zfs_name + strlen(source); if (relpath[0] == '/') relpath++; } if ((zpool_get_prop(zhp->zpool_hdl, ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL, B_FALSE)) || (strcmp(root, "-") == 0)) root[0] = '\0'; /* * Special case an alternate root of '/'. This will * avoid having multiple leading slashes in the * mountpoint path. */ if (strcmp(root, "/") == 0) root++; /* * If the mountpoint is '/' then skip over this * if we are obtaining either an alternate root or * an inherited mountpoint. */ if (str[1] == '\0' && (root[0] != '\0' || relpath[0] != '\0')) str++; if (relpath[0] == '\0') (void) snprintf(propbuf, proplen, "%s%s", root, str); else (void) snprintf(propbuf, proplen, "%s%s%s%s", root, str, relpath[0] == '@' ? "" : "/", relpath); } else { /* 'legacy' or 'none' */ (void) strlcpy(propbuf, str, proplen); } zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_ORIGIN: if (*zhp->zfs_dmustats.dds_origin != '\0') { str = (char *)&zhp->zfs_dmustats.dds_origin; } else { str = getprop_string(zhp, prop, &source); } if (str == NULL || *str == '\0') str = zfs_prop_default_string(prop); if (str == NULL) return (-1); (void) strlcpy(propbuf, str, proplen); zcp_check(zhp, prop, 0, str); break; case ZFS_PROP_REDACT_SNAPS: if (get_rsnaps_string(zhp, propbuf, proplen) != 0) return (-1); break; case ZFS_PROP_CLONES: if (get_clones_string(zhp, propbuf, proplen) != 0) return (-1); break; case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); /* * If quota or reservation is 0, we translate this into 'none' * (unless literal is set), and indicate that it's the default * value. Otherwise, we print the number nicely and indicate * that its set locally. */ if (val == 0) { if (literal) (void) strlcpy(propbuf, "0", proplen); else (void) strlcpy(propbuf, "none", proplen); } else { if (literal) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); else zfs_nicebytes(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: case ZFS_PROP_FILESYSTEM_COUNT: case ZFS_PROP_SNAPSHOT_COUNT: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); /* * If limit is UINT64_MAX, we translate this into 'none', and * indicate that it's the default value. Otherwise, we print * the number nicely and indicate that it's set locally. */ if (val == UINT64_MAX) { (void) strlcpy(propbuf, "none", proplen); } else if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicenum(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_REFRATIO: case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (literal) (void) snprintf(propbuf, proplen, "%llu.%02llu", (u_longlong_t)(val / 100), (u_longlong_t)(val % 100)); else (void) snprintf(propbuf, proplen, "%llu.%02llux", (u_longlong_t)(val / 100), (u_longlong_t)(val % 100)); zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_TYPE: switch (zhp->zfs_type) { case ZFS_TYPE_FILESYSTEM: str = "filesystem"; break; case ZFS_TYPE_VOLUME: str = "volume"; break; case ZFS_TYPE_SNAPSHOT: str = "snapshot"; break; case ZFS_TYPE_BOOKMARK: str = "bookmark"; break; default: abort(); } (void) snprintf(propbuf, proplen, "%s", str); zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_MOUNTED: /* * The 'mounted' property is a pseudo-property that described * whether the filesystem is currently mounted. Even though * it's a boolean value, the typical values of "on" and "off" * don't make sense, so we translate to "yes" and "no". */ if (get_numeric_property(zhp, ZFS_PROP_MOUNTED, src, &source, &val) != 0) return (-1); if (val) (void) strlcpy(propbuf, "yes", proplen); else (void) strlcpy(propbuf, "no", proplen); break; case ZFS_PROP_NAME: /* * The 'name' property is a pseudo-property derived from the * dataset name. It is presented as a real property to simplify * consumers. */ (void) strlcpy(propbuf, zhp->zfs_name, proplen); zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_MLSLABEL: { #ifdef HAVE_MLSLABEL m_label_t *new_sl = NULL; char *ascii = NULL; /* human readable label */ (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), proplen); if (literal || (strcasecmp(propbuf, ZFS_MLSLABEL_DEFAULT) == 0)) break; /* * Try to translate the internal hex string to * human-readable output. If there are any * problems just use the hex string. */ if (str_to_label(propbuf, &new_sl, MAC_LABEL, L_NO_CORRECTION, NULL) == -1) { m_label_free(new_sl); break; } if (label_to_str(new_sl, &ascii, M_LABEL, DEF_NAMES) != 0) { if (ascii) free(ascii); m_label_free(new_sl); break; } m_label_free(new_sl); (void) strlcpy(propbuf, ascii, proplen); free(ascii); #else (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), proplen); #endif /* HAVE_MLSLABEL */ } break; case ZFS_PROP_GUID: case ZFS_PROP_KEY_GUID: case ZFS_PROP_IVSET_GUID: case ZFS_PROP_CREATETXG: case ZFS_PROP_OBJSETID: case ZFS_PROP_PBKDF2_ITERS: /* * These properties are stored as numbers, but they are * identifiers or counters. * We don't want them to be pretty printed, because pretty * printing truncates their values making them useless. */ if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_REFERENCED: case ZFS_PROP_AVAILABLE: case ZFS_PROP_USED: case ZFS_PROP_USEDSNAP: case ZFS_PROP_USEDDS: case ZFS_PROP_USEDREFRESERV: case ZFS_PROP_USEDCHILD: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicebytes(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_SNAPSHOTS_CHANGED: { if ((get_numeric_property(zhp, prop, src, &source, &val) != 0) || val == 0) { return (-1); } time_t time = (time_t)val; struct tm t; if (literal || localtime_r(&time, &t) == NULL || strftime(propbuf, proplen, "%a %b %e %k:%M:%S %Y", &t) == 0) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } zcp_check(zhp, prop, val, NULL); break; default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) { return (-1); } if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicenum(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case PROP_TYPE_STRING: str = getprop_string(zhp, prop, &source); if (str == NULL) return (-1); (void) strlcpy(propbuf, str, proplen); zcp_check(zhp, prop, 0, str); break; case PROP_TYPE_INDEX: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (zfs_prop_index_to_string(prop, val, &strval) != 0) return (-1); (void) strlcpy(propbuf, strval, proplen); zcp_check(zhp, prop, 0, strval); break; default: abort(); } } get_source(zhp, src, source, statbuf, statlen); return (0); } /* * Utility function to get the given numeric property. Does no validation that * the given property is the appropriate type; should only be used with * hard-coded property types. */ uint64_t zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop) { const char *source; uint64_t val = 0; (void) get_numeric_property(zhp, prop, NULL, &source, &val); return (val); } static int zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) { char buf[64]; (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val); return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); } /* * Similar to zfs_prop_get(), but returns the value as an integer. */ int zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, zprop_source_t *src, char *statbuf, size_t statlen) { const char *source; /* * Check to see if this property applies to our object */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) { return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, dgettext(TEXT_DOMAIN, "cannot get property '%s'"), zfs_prop_to_name(prop))); } if (src) *src = ZPROP_SRC_NONE; if (get_numeric_property(zhp, prop, src, &source, value) != 0) return (-1); get_source(zhp, src, source, statbuf, statlen); return (0); } #ifdef HAVE_IDMAP static int idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, char **domainp, idmap_rid_t *ridp) { idmap_get_handle_t *get_hdl = NULL; idmap_stat status; int err = EINVAL; if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS) goto out; if (isuser) { err = idmap_get_sidbyuid(get_hdl, id, IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); } else { err = idmap_get_sidbygid(get_hdl, id, IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); } if (err == IDMAP_SUCCESS && idmap_get_mappings(get_hdl) == IDMAP_SUCCESS && status == IDMAP_SUCCESS) err = 0; else err = EINVAL; out: if (get_hdl) idmap_get_destroy(get_hdl); return (err); } #endif /* HAVE_IDMAP */ /* * convert the propname into parameters needed by kernel * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829 * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789 * Eg: groupquota@staff -> ZFS_PROP_GROUPQUOTA, "", 1234 * Eg: groupused@staff -> ZFS_PROP_GROUPUSED, "", 1234 * Eg: projectquota@123 -> ZFS_PROP_PROJECTQUOTA, "", 123 * Eg: projectused@789 -> ZFS_PROP_PROJECTUSED, "", 789 */ static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) { zfs_userquota_prop_t type; char *cp; boolean_t isuser; boolean_t isgroup; boolean_t isproject; struct passwd *pw; struct group *gr; domain[0] = '\0'; /* Figure out the property type ({user|group|project}{quota|space}) */ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { if (strncmp(propname, zfs_userquota_prop_prefixes[type], strlen(zfs_userquota_prop_prefixes[type])) == 0) break; } if (type == ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); *typep = type; isuser = (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_USERUSED || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_USEROBJUSED); isgroup = (type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_GROUPUSED || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_GROUPOBJUSED); isproject = (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTOBJQUOTA || type == ZFS_PROP_PROJECTOBJUSED); cp = strchr(propname, '@') + 1; if (isuser && getpwnam_r(cp, &gpwd, rpbuf, sizeof (rpbuf), &pw) == 0 && pw != NULL) { if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); *ridp = pw->pw_uid; } else if (isgroup && getgrnam_r(cp, &ggrp, rpbuf, sizeof (rpbuf), &gr) == 0 && gr != NULL) { if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); *ridp = gr->gr_gid; } else if (!isproject && strchr(cp, '@')) { #ifdef HAVE_IDMAP /* * It's a SID name (eg "user@domain") that needs to be * turned into S-1-domainID-RID. */ directory_error_t e; char *numericsid = NULL; char *end; if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); if (isuser) { e = directory_sid_from_user_name(NULL, cp, &numericsid); } else { e = directory_sid_from_group_name(NULL, cp, &numericsid); } if (e != NULL) { directory_error_free(e); return (ENOENT); } if (numericsid == NULL) return (ENOENT); cp = numericsid; (void) strlcpy(domain, cp, domainlen); cp = strrchr(domain, '-'); *cp = '\0'; cp++; errno = 0; *ridp = strtoull(cp, &end, 10); free(numericsid); if (errno != 0 || *end != '\0') return (EINVAL); #else (void) domainlen; return (ENOSYS); #endif /* HAVE_IDMAP */ } else { /* It's a user/group/project ID (eg "12345"). */ uid_t id; char *end; id = strtoul(cp, &end, 10); if (*end != '\0') return (EINVAL); if (id > MAXUID && !isproject) { #ifdef HAVE_IDMAP /* It's an ephemeral ID. */ idmap_rid_t rid; char *mapdomain; if (idmap_id_to_numeric_domain_rid(id, isuser, &mapdomain, &rid) != 0) return (ENOENT); (void) strlcpy(domain, mapdomain, domainlen); *ridp = rid; #else return (ENOSYS); #endif /* HAVE_IDMAP */ } else { *ridp = id; } } return (0); } static int zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue, zfs_userquota_prop_t *typep) { int err; zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); err = userquota_propname_decode(propname, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid); zc.zc_objset_type = *typep; if (err) return (err); err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_USERSPACE_ONE, &zc); if (err) return (err); *propvalue = zc.zc_cookie; return (0); } int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) { zfs_userquota_prop_t type; return (zfs_prop_get_userquota_common(zhp, propname, propvalue, &type)); } int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal) { int err; uint64_t propvalue; zfs_userquota_prop_t type; err = zfs_prop_get_userquota_common(zhp, propname, &propvalue, &type); if (err) return (err); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)propvalue); } else if (propvalue == 0 && (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTOBJQUOTA)) { (void) strlcpy(propbuf, "none", proplen); } else if (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_USERUSED || type == ZFS_PROP_GROUPUSED || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTQUOTA) { zfs_nicebytes(propvalue, propbuf, proplen); } else { zfs_nicenum(propvalue, propbuf, proplen); } return (0); } /* * propname must start with "written@" or "written#". */ int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) { int err; zfs_cmd_t zc = {"\0"}; const char *snapname; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); assert(zfs_prop_written(propname)); snapname = propname + strlen("written@"); if (strchr(snapname, '@') != NULL || strchr(snapname, '#') != NULL) { /* full snapshot or bookmark name specified */ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); } else { /* snapname is the short name, append it to zhp's fsname */ char *cp; (void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value)); cp = strchr(zc.zc_value, '@'); if (cp != NULL) *cp = '\0'; (void) strlcat(zc.zc_value, snapname - 1, sizeof (zc.zc_value)); } err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SPACE_WRITTEN, &zc); if (err) return (err); *propvalue = zc.zc_cookie; return (0); } int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal) { int err; uint64_t propvalue; err = zfs_prop_get_written_int(zhp, propname, &propvalue); if (err) return (err); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)propvalue); } else { zfs_nicebytes(propvalue, propbuf, proplen); } return (0); } /* * Returns the name of the given zfs handle. */ const char * zfs_get_name(const zfs_handle_t *zhp) { return (zhp->zfs_name); } /* * Returns the name of the parent pool for the given zfs handle. */ const char * zfs_get_pool_name(const zfs_handle_t *zhp) { return (zhp->zpool_hdl->zpool_name); } /* * Returns the type of the given zfs handle. */ zfs_type_t zfs_get_type(const zfs_handle_t *zhp) { return (zhp->zfs_type); } /* * Returns the type of the given zfs handle, * or, if a snapshot, the type of the snapshotted dataset. */ zfs_type_t zfs_get_underlying_type(const zfs_handle_t *zhp) { return (zhp->zfs_head_type); } /* * Is one dataset name a child dataset of another? * * Needs to handle these cases: * Dataset 1 "a/foo" "a/foo" "a/foo" "a/foo" * Dataset 2 "a/fo" "a/foobar" "a/bar/baz" "a/foo/bar" * Descendant? No. No. No. Yes. */ static boolean_t is_descendant(const char *ds1, const char *ds2) { size_t d1len = strlen(ds1); /* ds2 can't be a descendant if it's smaller */ if (strlen(ds2) < d1len) return (B_FALSE); /* otherwise, compare strings and verify that there's a '/' char */ return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0)); } /* * Given a complete name, return just the portion that refers to the parent. * Will return -1 if there is no parent (path is just the name of the * pool). */ static int parent_name(const char *path, char *buf, size_t buflen) { char *slashp; (void) strlcpy(buf, path, buflen); if ((slashp = strrchr(buf, '/')) == NULL) return (-1); *slashp = '\0'; return (0); } int zfs_parent_name(zfs_handle_t *zhp, char *buf, size_t buflen) { return (parent_name(zfs_get_name(zhp), buf, buflen)); } /* * If accept_ancestor is false, then check to make sure that the given path has * a parent, and that it exists. If accept_ancestor is true, then find the * closest existing ancestor for the given path. In prefixlen return the * length of already existing prefix of the given path. We also fetch the * 'zoned' property, which is used to validate property settings when creating * new datasets. */ static int check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, boolean_t accept_ancestor, int *prefixlen) { zfs_cmd_t zc = {"\0"}; char parent[ZFS_MAX_DATASET_NAME_LEN]; char *slash; zfs_handle_t *zhp; char errbuf[ERRBUFLEN]; uint64_t is_zoned; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* get parent, and check to see if this is just a pool */ if (parent_name(path, parent, sizeof (parent)) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* check to see if the pool exists */ if ((slash = strchr(parent, '/')) == NULL) slash = parent + strlen(parent); (void) strlcpy(zc.zc_name, parent, MIN(sizeof (zc.zc_name), slash - parent + 1)); if (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0 && errno == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool '%s'"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* check to see if the parent dataset exists */ while ((zhp = make_dataset_handle(hdl, parent)) == NULL) { if (errno == ENOENT && accept_ancestor) { /* * Go deeper to find an ancestor, give up on top level. */ if (parent_name(parent, parent, sizeof (parent)) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool '%s'"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } } else if (errno == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent does not exist")); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } else return (zfs_standard_error(hdl, errno, errbuf)); } is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); if (zoned != NULL) *zoned = is_zoned; /* we are in a non-global zone, but parent is in the global zone */ if (getzoneid() != GLOBAL_ZONEID && !is_zoned) { (void) zfs_standard_error(hdl, EPERM, errbuf); zfs_close(zhp); return (-1); } /* make sure parent is a filesystem */ if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent is not a filesystem")); (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); zfs_close(zhp); return (-1); } zfs_close(zhp); if (prefixlen != NULL) *prefixlen = strlen(parent); return (0); } /* * Finds whether the dataset of the given type(s) exists. */ boolean_t zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types) { zfs_handle_t *zhp; if (!zfs_validate_name(hdl, path, types, B_FALSE)) return (B_FALSE); /* * Try to get stats for the dataset, which will tell us if it exists. */ if ((zhp = make_dataset_handle(hdl, path)) != NULL) { int ds_type = zhp->zfs_type; zfs_close(zhp); if (types & ds_type) return (B_TRUE); } return (B_FALSE); } /* * Given a path to 'target', create all the ancestors between * the prefixlen portion of the path, and the target itself. * Fail if the initial prefixlen-ancestor does not already exist. */ int create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) { zfs_handle_t *h; char *cp; const char *opname; /* make sure prefix exists */ cp = target + prefixlen; if (*cp != '/') { assert(strchr(cp, '/') == NULL); h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); } else { *cp = '\0'; h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); *cp = '/'; } if (h == NULL) return (-1); zfs_close(h); /* * Attempt to create, mount, and share any ancestor filesystems, * up to the prefixlen-long one. */ for (cp = target + prefixlen + 1; (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) { *cp = '\0'; h = make_dataset_handle(hdl, target); if (h) { /* it already exists, nothing to do here */ zfs_close(h); continue; } if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, NULL) != 0) { opname = dgettext(TEXT_DOMAIN, "create"); goto ancestorerr; } h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); if (h == NULL) { opname = dgettext(TEXT_DOMAIN, "open"); goto ancestorerr; } if (zfs_mount(h, NULL, 0) != 0) { opname = dgettext(TEXT_DOMAIN, "mount"); goto ancestorerr; } if (zfs_share(h, NULL) != 0) { opname = dgettext(TEXT_DOMAIN, "share"); goto ancestorerr; } zfs_close(h); } zfs_commit_shares(NULL); return (0); ancestorerr: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to %s ancestor '%s'"), opname, target); return (-1); } /* * Creates non-existing ancestors of the given path. */ int zfs_create_ancestors(libzfs_handle_t *hdl, const char *path) { int prefix; char *path_copy; char errbuf[ERRBUFLEN]; int rc = 0; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* * Check that we are not passing the nesting limit * before we start creating any ancestors. */ if (dataset_nestcheck(path) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "maximum name nesting depth exceeded")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0) return (-1); if ((path_copy = strdup(path)) != NULL) { rc = create_parents(hdl, path_copy, prefix); free(path_copy); } if (path_copy == NULL || rc != 0) return (-1); return (0); } /* * Create a new filesystem or volume. */ int zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, nvlist_t *props) { int ret; uint64_t size = 0; uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); uint64_t zoned; enum lzc_dataset_type ost; zpool_handle_t *zpool_handle; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; char errbuf[ERRBUFLEN]; char parent[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* validate the path, taking care to note the extended error message */ if (!zfs_validate_name(hdl, path, type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); if (dataset_nestcheck(path) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "maximum name nesting depth exceeded")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* validate parents exist */ if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0) return (-1); /* * The failure modes when creating a dataset of a different type over * one that already exists is a little strange. In particular, if you * try to create a dataset on top of an existing dataset, the ioctl() * will return ENOENT, not EEXIST. To prevent this from happening, we * first try to see if the dataset exists. */ if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (type == ZFS_TYPE_VOLUME) ost = LZC_DATSET_TYPE_ZVOL; else ost = LZC_DATSET_TYPE_ZFS; /* open zpool handle for prop validation */ char pool_path[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(pool_path, path, sizeof (pool_path)); /* truncate pool_path at first slash */ char *p = strchr(pool_path, '/'); if (p != NULL) *p = '\0'; if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL) return (-1); if (props && (props = zfs_valid_proplist(hdl, type, props, zoned, NULL, zpool_handle, B_TRUE, errbuf)) == 0) { zpool_close(zpool_handle); return (-1); } zpool_close(zpool_handle); if (type == ZFS_TYPE_VOLUME) { /* * If we are creating a volume, the size and block size must * satisfy a few restraints. First, the blocksize must be a * valid block size between SPA_{MIN,MAX}BLOCKSIZE. Second, the * volsize must be a multiple of the block size, and cannot be * zero. */ if (props == NULL || nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing volume size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if ((ret = nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &blocksize)) != 0) { if (ret == ENOENT) { blocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); } else { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing volume block size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } } if (size == 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "volume size cannot be zero")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if (size % blocksize != 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "volume size must be a multiple of volume block " "size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } } (void) parent_name(path, parent, sizeof (parent)); if (zfs_crypto_create(hdl, parent, props, NULL, B_TRUE, &wkeydata, &wkeylen) != 0) { nvlist_free(props); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); } /* create the dataset */ ret = lzc_create(path, ost, props, wkeydata, wkeylen); nvlist_free(props); if (wkeydata != NULL) free(wkeydata); /* check for failure */ if (ret != 0) { switch (errno) { case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such parent '%s'"), parent); return (zfs_error(hdl, EZFS_NOENT, errbuf)); case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to set this " "property or value")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption root's key is not loaded " "or provided")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ERANGE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property value(s) specified")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); #ifdef _ILP32 case EOVERFLOW: /* * This platform can't address a volume this big. */ if (type == ZFS_TYPE_VOLUME) return (zfs_error(hdl, EZFS_VOLTOOBIG, errbuf)); zfs_fallthrough; #endif default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (0); } /* * Destroys the given dataset. The caller must make sure that the filesystem * isn't mounted, and that there are no active dependents. If the file system * does not exist this function does nothing. */ int zfs_destroy(zfs_handle_t *zhp, boolean_t defer) { int error; if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT && defer) return (EINVAL); if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, zhp->zfs_name); error = lzc_destroy_bookmarks(nv, NULL); fnvlist_free(nv); if (error != 0) { return (zfs_standard_error_fmt(zhp->zfs_hdl, error, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); } return (0); } if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, zhp->zfs_name); error = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { error = lzc_destroy(zhp->zfs_name); } if (error != 0 && error != ENOENT) { return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); } remove_mountpoint(zhp); return (0); } struct destroydata { nvlist_t *nvl; const char *snapname; }; static int zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) { struct destroydata *dd = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, dd->snapname) >= sizeof (name)) return (EINVAL); if (lzc_exists(name)) fnvlist_add_boolean(dd->nvl, name); rv = zfs_iter_filesystems_v2(zhp, 0, zfs_check_snap_cb, dd); zfs_close(zhp); return (rv); } /* * Destroys all snapshots with the given name in zhp & descendants. */ int zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) { int ret; struct destroydata dd = { 0 }; dd.snapname = snapname; dd.nvl = fnvlist_alloc(); (void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd); if (nvlist_empty(dd.nvl)) { ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"), zhp->zfs_name, snapname); } else { ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer); } fnvlist_free(dd.nvl); return (ret); } /* * Destroys all the snapshots named in the nvlist. */ int zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) { nvlist_t *errlist = NULL; nvpair_t *pair; int ret = zfs_destroy_snaps_nvl_os(hdl, snaps); if (ret != 0) return (ret); ret = lzc_destroy_snaps(snaps, defer, &errlist); if (ret == 0) { nvlist_free(errlist); return (0); } if (nvlist_empty(errlist)) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy snapshots")); ret = zfs_standard_error(hdl, ret, errbuf); } for (pair = nvlist_next_nvpair(errlist, NULL); pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"), nvpair_name(pair)); switch (fnvpair_value_int32(pair)) { case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshot is cloned")); ret = zfs_error(hdl, EZFS_EXISTS, errbuf); break; case EBUSY: { nvlist_t *existing_holds; int err = lzc_get_holds(nvpair_name(pair), &existing_holds); /* check the presence of holders */ if (err == 0 && !nvlist_empty(existing_holds)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "it's being held. " "Run 'zfs holds -r %s' to see holders."), nvpair_name(pair)); ret = zfs_error(hdl, EBUSY, errbuf); } else { ret = zfs_standard_error(hdl, errno, errbuf); } if (err == 0) nvlist_free(existing_holds); break; } default: ret = zfs_standard_error(hdl, errno, errbuf); break; } } nvlist_free(errlist); return (ret); } /* * Clones the given dataset. The target must be of the same type as the source. */ int zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) { char parent[ZFS_MAX_DATASET_NAME_LEN]; int ret; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; uint64_t zoned; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), target); /* validate the target/clone name */ if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents exist */ if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0) return (-1); (void) parent_name(target, parent, sizeof (parent)); /* do the clone */ if (props) { zfs_type_t type = ZFS_TYPE_FILESYSTEM; if (ZFS_IS_VOLUME(zhp)) type = ZFS_TYPE_VOLUME; if ((props = zfs_valid_proplist(hdl, type, props, zoned, zhp, zhp->zpool_hdl, B_TRUE, errbuf)) == NULL) return (-1); if (zfs_fix_auto_resv(zhp, props) == -1) { nvlist_free(props); return (-1); } } if (zfs_crypto_clone_check(hdl, zhp, parent, props) != 0) { nvlist_free(props); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); } ret = lzc_clone(target, zhp->zfs_name, props); nvlist_free(props); if (ret != 0) { switch (errno) { case ENOENT: /* * The parent doesn't exist. We should have caught this * above, but there may a race condition that has since * destroyed the parent. * * At this point, we don't know whether it's the source * that doesn't exist anymore, or whether the target * dataset doesn't exist. */ zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "no such parent '%s'"), parent); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); case EXDEV: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "source and target pools differ")); return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET, errbuf)); default: return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } } return (ret); } /* * Promotes the given clone fs to be the clone parent. */ int zfs_promote(zfs_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zfs_hdl; char snapname[ZFS_MAX_DATASET_NAME_LEN]; int ret; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot promote '%s'"), zhp->zfs_name); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshots can not be promoted")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (zhp->zfs_dmustats.dds_origin[0] == '\0') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not a cloned filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); ret = lzc_promote(zhp->zfs_name, snapname, sizeof (snapname)); if (ret != 0) { switch (ret) { case EACCES: /* * Promoting encrypted dataset outside its * encryption root. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot promote dataset outside its " "encryption root")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); case EEXIST: /* There is a conflicting snapshot name. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "conflicting snapshot '%s' from parent '%s'"), snapname, zhp->zfs_dmustats.dds_origin); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); default: return (zfs_standard_error(hdl, ret, errbuf)); } } return (ret); } typedef struct snapdata { nvlist_t *sd_nvl; const char *sd_snapname; } snapdata_t; static int zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) { snapdata_t *sd = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) { if (snprintf(name, sizeof (name), "%s@%s", zfs_get_name(zhp), sd->sd_snapname) >= sizeof (name)) return (EINVAL); fnvlist_add_boolean(sd->sd_nvl, name); rv = zfs_iter_filesystems_v2(zhp, 0, zfs_snapshot_cb, sd); } zfs_close(zhp); return (rv); } /* * Creates snapshots. The keys in the snaps nvlist are the snapshots to be * created. */ int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props) { int ret; char errbuf[ERRBUFLEN]; nvpair_t *elem; nvlist_t *errors; zpool_handle_t *zpool_hdl; char pool[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshots ")); elem = NULL; while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) { const char *snapname = nvpair_name(elem); /* validate the target name */ if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT, B_TRUE)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s'"), snapname); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } } /* * get pool handle for prop validation. assumes all snaps are in the * same pool, as does lzc_snapshot (below). */ elem = nvlist_next_nvpair(snaps, NULL); if (elem == NULL) return (-1); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; zpool_hdl = zpool_open(hdl, pool); if (zpool_hdl == NULL) return (-1); if (props != NULL && (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, props, B_FALSE, NULL, zpool_hdl, B_FALSE, errbuf)) == NULL) { zpool_close(zpool_hdl); return (-1); } zpool_close(zpool_hdl); ret = lzc_snapshot(snaps, props, &errors); if (ret != 0) { boolean_t printed = B_FALSE; for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s'"), nvpair_name(elem)); (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); printed = B_TRUE; } if (!printed) { switch (ret) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple snapshots of same " "fs not allowed")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); break; default: (void) zfs_standard_error(hdl, ret, errbuf); } } } nvlist_free(props); nvlist_free(errors); return (ret); } int zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, nvlist_t *props) { int ret; snapdata_t sd = { 0 }; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *cp; zfs_handle_t *zhp; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot snapshot %s"), path); if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); (void) strlcpy(fsname, path, sizeof (fsname)); cp = strchr(fsname, '@'); *cp = '\0'; sd.sd_snapname = cp + 1; if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { return (-1); } sd.sd_nvl = fnvlist_alloc(); if (recursive) { (void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd); } else { fnvlist_add_boolean(sd.sd_nvl, path); } ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props); fnvlist_free(sd.sd_nvl); zfs_close(zhp); return (ret); } /* * Destroy any more recent snapshots. We invoke this callback on any dependents * of the snapshot first. If the 'cb_dependent' member is non-zero, then this * is a dependent and we should just destroy it without checking the transaction * group. */ typedef struct rollback_data { const char *cb_target; /* the snapshot */ uint64_t cb_create; /* creation time reference */ boolean_t cb_error; boolean_t cb_force; } rollback_data_t; static int rollback_destroy_dependent(zfs_handle_t *zhp, void *data) { rollback_data_t *cbp = data; prop_changelist_t *clp; /* We must destroy this clone; first unmount it */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, cbp->cb_force ? MS_FORCE: 0); if (clp == NULL || changelist_prefix(clp) != 0) { cbp->cb_error = B_TRUE; zfs_close(zhp); return (0); } if (zfs_destroy(zhp, B_FALSE) != 0) cbp->cb_error = B_TRUE; else changelist_remove(clp, zhp->zfs_name); (void) changelist_postfix(clp); changelist_free(clp); zfs_close(zhp); return (0); } static int rollback_destroy(zfs_handle_t *zhp, void *data) { rollback_data_t *cbp = data; if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { cbp->cb_error |= zfs_iter_dependents_v2(zhp, 0, B_FALSE, rollback_destroy_dependent, cbp); cbp->cb_error |= zfs_destroy(zhp, B_FALSE); } zfs_close(zhp); return (0); } /* * Given a dataset, rollback to a specific snapshot, discarding any * data changes since then and making it the active dataset. * * Any snapshots and bookmarks more recent than the target are * destroyed, along with their dependents (i.e. clones). */ int zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) { rollback_data_t cb = { 0 }; int err; boolean_t restore_resv = 0; uint64_t old_volsize = 0, new_volsize; zfs_prop_t resv_prop = { 0 }; uint64_t min_txg = 0; assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM || zhp->zfs_type == ZFS_TYPE_VOLUME); /* * Destroy all recent snapshots and their dependents. */ cb.cb_force = force; cb.cb_target = snap->zfs_name; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); if (cb.cb_create > 0) min_txg = cb.cb_create; (void) zfs_iter_snapshots_v2(zhp, 0, rollback_destroy, &cb, min_txg, 0); (void) zfs_iter_bookmarks_v2(zhp, 0, rollback_destroy, &cb); if (cb.cb_error) return (-1); /* * Now that we have verified that the snapshot is the latest, * rollback to the given snapshot. */ if (zhp->zfs_type == ZFS_TYPE_VOLUME) { if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); restore_resv = (old_volsize == zfs_prop_get_int(zhp, resv_prop)); } /* * Pass both the filesystem and the wanted snapshot names, * we would get an error back if the snapshot is destroyed or * a new snapshot is created before this request is processed. */ err = lzc_rollback_to(zhp->zfs_name, snap->zfs_name); if (err != 0) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rollback '%s'"), zhp->zfs_name); switch (err) { case EEXIST: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "there is a snapshot or bookmark more recent " "than '%s'"), snap->zfs_name); (void) zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf); break; case ESRCH: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "'%s' is not found among snapshots of '%s'"), snap->zfs_name, zhp->zfs_name); (void) zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf); break; case EINVAL: (void) zfs_error(zhp->zfs_hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(zhp->zfs_hdl, err, errbuf); } return (err); } /* * For volumes, if the pre-rollback volsize matched the pre- * rollback reservation and the volsize has changed then set * the reservation property to the post-rollback volsize. * Make a new handle since the rollback closed the dataset. */ if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { if (restore_resv) { new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (old_volsize != new_volsize) err = zfs_prop_set_int(zhp, resv_prop, new_volsize); } zfs_close(zhp); } return (err); } /* * Renames the given dataset. */ int zfs_rename(zfs_handle_t *zhp, const char *target, renameflags_t flags) { int ret = 0; zfs_cmd_t zc = {"\0"}; char *delim; prop_changelist_t *cl = NULL; char parent[ZFS_MAX_DATASET_NAME_LEN]; char property[ZFS_MAXPROPLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; /* if we have the same exact name, just return success */ if (strcmp(zhp->zfs_name, target) == 0) return (0); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename to '%s'"), target); /* make sure source name is valid */ if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* * Make sure the target name is valid */ if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { if ((strchr(target, '@') == NULL) || *target == '@') { /* * Snapshot target name is abbreviated, * reconstruct full dataset name */ (void) strlcpy(parent, zhp->zfs_name, sizeof (parent)); delim = strchr(parent, '@'); if (strchr(target, '@') == NULL) *(++delim) = '\0'; else *delim = '\0'; (void) strlcat(parent, target, sizeof (parent)); target = parent; } else { /* * Make sure we're renaming within the same dataset. */ delim = strchr(target, '@'); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || zhp->zfs_name[delim - target] != '@') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshots must be part of same " "dataset")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } } if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } else { if (flags.recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "recursive rename must be a snapshot")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents */ if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0) return (-1); /* make sure we're in the same pool */ verify((delim = strchr(target, '/')) != NULL); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || zhp->zfs_name[delim - target] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "datasets must be within same pool")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } /* new name cannot be a child of the current dataset name */ if (is_descendant(zhp->zfs_name, target)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "New dataset name cannot be a descendant of " "current dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } } (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name); if (getzoneid() == GLOBAL_ZONEID && zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is used in a non-global zone")); return (zfs_error(hdl, EZFS_ZONED, errbuf)); } /* * Avoid unmounting file systems with mountpoint property set to * 'legacy' or 'none' even if -u option is not given. */ if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && !flags.recursive && !flags.nounmount && zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property, sizeof (property), NULL, NULL, 0, B_FALSE) == 0 && (strcmp(property, "legacy") == 0 || strcmp(property, "none") == 0)) { flags.nounmount = B_TRUE; } if (flags.recursive) { char *parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); delim = strchr(parentname, '@'); *delim = '\0'; zfs_handle_t *zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET); free(parentname); if (zhrp == NULL) { ret = -1; goto error; } zfs_close(zhrp); } else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) { if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, flags.nounmount ? CL_GATHER_DONT_UNMOUNT : CL_GATHER_ITER_MOUNTED, flags.forceunmount ? MS_FORCE : 0)) == NULL) return (-1); if (changelist_haszonedchild(cl)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); (void) zfs_error(hdl, EZFS_ZONED, errbuf); ret = -1; goto error; } if ((ret = changelist_prefix(cl)) != 0) goto error; } if (ZFS_IS_VOLUME(zhp)) zc.zc_objset_type = DMU_OST_ZVOL; else zc.zc_objset_type = DMU_OST_ZFS; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); zc.zc_cookie = !!flags.recursive; zc.zc_cookie |= (!!flags.nounmount) << 1; if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { /* * if it was recursive, the one that actually failed will * be in zc.zc_name */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zc.zc_name); if (flags.recursive && errno == EEXIST) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "a child dataset already has a snapshot " "with the new name")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); } else if (errno == EACCES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot move encrypted child outside of " "its encryption root")); (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); } else { (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); } /* * On failure, we still want to remount any filesystems that * were previously mounted, so we don't alter the system state. */ if (cl != NULL) (void) changelist_postfix(cl); } else { if (cl != NULL) { changelist_rename(cl, zfs_get_name(zhp), target); ret = changelist_postfix(cl); } (void) strlcpy(zhp->zfs_name, target, sizeof (zhp->zfs_name)); } error: if (cl != NULL) { changelist_free(cl); } return (ret); } nvlist_t * zfs_get_all_props(zfs_handle_t *zhp) { return (zhp->zfs_props); } nvlist_t * zfs_get_recvd_props(zfs_handle_t *zhp) { if (zhp->zfs_recvd_props == NULL) if (get_recvd_props_ioctl(zhp) != 0) return (NULL); return (zhp->zfs_recvd_props); } nvlist_t * zfs_get_user_props(zfs_handle_t *zhp) { return (zhp->zfs_user_props); } /* * This function is used by 'zfs list' to determine the exact set of columns to * display, and their maximum widths. This does two main things: * * - If this is a list of all properties, then expand the list to include * all native properties, and set a flag so that for each dataset we look * for new unique user properties and add them to the list. * * - For non fixed-width properties, keep track of the maximum width seen * so that we can size the column appropriately. If the user has * requested received property values, we also need to compute the width * of the RECEIVED column. */ int zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received, boolean_t literal) { libzfs_handle_t *hdl = zhp->zfs_hdl; zprop_list_t *entry; zprop_list_t **last, **start; nvlist_t *userprops, *propval; nvpair_t *elem; const char *strval; char buf[ZFS_MAXPROPLEN]; if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0) return (-1); userprops = zfs_get_user_props(zhp); entry = *plp; if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) { /* * Go through and add any user properties as necessary. We * start by incrementing our list pointer to the first * non-native property. */ start = plp; while (*start != NULL) { if ((*start)->pl_prop == ZPROP_USERPROP) break; start = &(*start)->pl_next; } elem = NULL; while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) { /* * See if we've already found this property in our list. */ for (last = start; *last != NULL; last = &(*last)->pl_next) { if (strcmp((*last)->pl_user_prop, nvpair_name(elem)) == 0) break; } if (*last == NULL) { entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_user_prop = zfs_strdup(hdl, nvpair_name(elem)); entry->pl_prop = ZPROP_USERPROP; entry->pl_width = strlen(nvpair_name(elem)); entry->pl_all = B_TRUE; *last = entry; } } } /* * Now go through and check the width of any non-fixed columns */ for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed && !literal) continue; if (entry->pl_prop != ZPROP_USERPROP) { if (zfs_prop_get(zhp, entry->pl_prop, buf, sizeof (buf), NULL, NULL, 0, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } if (received && zfs_prop_get_recvd(zhp, zfs_prop_to_name(entry->pl_prop), buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } else { if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop, &propval) == 0) { strval = fnvlist_lookup_string(propval, ZPROP_VALUE); if (strlen(strval) > entry->pl_width) entry->pl_width = strlen(strval); } if (received && zfs_prop_get_recvd(zhp, entry->pl_user_prop, buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } } return (0); } void zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) { nvpair_t *curr; nvpair_t *next; /* * Keep a reference to the props-table against which we prune the * properties. */ zhp->zfs_props_table = props; curr = nvlist_next_nvpair(zhp->zfs_props, NULL); while (curr) { zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr)); next = nvlist_next_nvpair(zhp->zfs_props, curr); /* * User properties will result in ZPROP_USERPROP (an alias * for ZPROP_INVAL), and since we * only know how to prune standard ZFS properties, we always * leave these in the list. This can also happen if we * encounter an unknown DSL property (when running older * software, for example). */ if (zfs_prop != ZPROP_USERPROP && props[zfs_prop] == B_FALSE) (void) nvlist_remove(zhp->zfs_props, nvpair_name(curr), nvpair_type(curr)); curr = next; } } static int zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, zfs_smb_acl_op_t cmd, char *resource1, char *resource2) { zfs_cmd_t zc = {"\0"}; nvlist_t *nvlist = NULL; int error; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); zc.zc_cookie = (uint64_t)cmd; if (cmd == ZFS_SMB_ACL_RENAME) { if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (0); } } switch (cmd) { case ZFS_SMB_ACL_ADD: case ZFS_SMB_ACL_REMOVE: (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string)); break; case ZFS_SMB_ACL_RENAME: if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC, resource1) != 0) { (void) no_memory(hdl); return (-1); } if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET, resource2) != 0) { (void) no_memory(hdl); return (-1); } zcmd_write_src_nvlist(hdl, &zc, nvlist); break; case ZFS_SMB_ACL_PURGE: break; default: return (-1); } error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc); nvlist_free(nvlist); return (error); } int zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset, char *path, char *resource) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD, resource, NULL)); } int zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset, char *path, char *resource) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE, resource, NULL)); } int zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE, NULL, NULL)); } int zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path, char *oldname, char *newname) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, oldname, newname)); } int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, zfs_userspace_cb_t func, void *arg) { zfs_cmd_t zc = {"\0"}; zfs_useracct_t buf[100]; libzfs_handle_t *hdl = zhp->zfs_hdl; int ret; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_objset_type = type; zc.zc_nvlist_dst = (uintptr_t)buf; for (;;) { zfs_useracct_t *zua = buf; zc.zc_nvlist_dst_size = sizeof (buf); if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) { if ((errno == ENOTSUP && (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_PROJECTOBJUSED || type == ZFS_PROP_PROJECTOBJQUOTA || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTQUOTA))) break; return (zfs_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get used/quota for %s"), zc.zc_name)); } if (zc.zc_nvlist_dst_size == 0) break; while (zc.zc_nvlist_dst_size > 0) { if ((ret = func(arg, zua->zu_domain, zua->zu_rid, zua->zu_space, zc.zc_guid)) != 0) return (ret); zua++; zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); } } return (0); } struct holdarg { nvlist_t *nvl; const char *snapname; const char *tag; boolean_t recursive; int error; }; static int zfs_hold_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname) >= sizeof (name)) return (EINVAL); if (lzc_exists(name)) fnvlist_add_string(ha->nvl, name, ha->tag); if (ha->recursive) rv = zfs_iter_filesystems_v2(zhp, 0, zfs_hold_one, ha); zfs_close(zhp); return (rv); } int zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, boolean_t recursive, int cleanup_fd) { int ret; struct holdarg ha; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; (void) zfs_hold_one(zfs_handle_dup(zhp), &ha); if (nvlist_empty(ha.nvl)) { char errbuf[ERRBUFLEN]; fnvlist_free(ha.nvl); ret = ENOENT; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold snapshot '%s@%s'"), zhp->zfs_name, snapname); (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf); return (ret); } ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl); fnvlist_free(ha.nvl); return (ret); } int zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds) { int ret; nvlist_t *errors; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; nvpair_t *elem; errors = NULL; ret = lzc_hold(holds, cleanup_fd, &errors); if (ret == 0) { /* There may be errors even in the success case. */ fnvlist_free(errors); return (0); } if (nvlist_empty(errors)) { /* no hold-specific errors */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold")); switch (ret) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(hdl, ret, errbuf); } } for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold snapshot '%s'"), nvpair_name(elem)); switch (fnvpair_value_int32(elem)) { case E2BIG: /* * Temporary tags wind up having the ds object id * prepended. So even if we passed the length check * above, it's still possible for the tag to wind * up being slightly too long. */ (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case EEXIST: (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf); break; default: (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); } } fnvlist_free(errors); return (ret); } static int zfs_release_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; nvlist_t *existing_holds; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname) >= sizeof (name)) { ha->error = EINVAL; rv = EINVAL; } if (lzc_get_holds(name, &existing_holds) != 0) { ha->error = ENOENT; } else if (!nvlist_exists(existing_holds, ha->tag)) { ha->error = ESRCH; } else { nvlist_t *torelease = fnvlist_alloc(); fnvlist_add_boolean(torelease, ha->tag); fnvlist_add_nvlist(ha->nvl, name, torelease); fnvlist_free(torelease); } if (ha->recursive) rv = zfs_iter_filesystems_v2(zhp, 0, zfs_release_one, ha); zfs_close(zhp); return (rv); } int zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, boolean_t recursive) { int ret; struct holdarg ha; nvlist_t *errors = NULL; nvpair_t *elem; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; ha.error = 0; (void) zfs_release_one(zfs_handle_dup(zhp), &ha); if (nvlist_empty(ha.nvl)) { fnvlist_free(ha.nvl); ret = ha.error; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release hold from snapshot '%s@%s'"), zhp->zfs_name, snapname); if (ret == ESRCH) { (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); } else { (void) zfs_standard_error(hdl, ret, errbuf); } return (ret); } ret = lzc_release(ha.nvl, &errors); fnvlist_free(ha.nvl); if (ret == 0) { /* There may be errors even in the success case. */ fnvlist_free(errors); return (0); } if (nvlist_empty(errors)) { /* no hold-specific errors */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release")); switch (errno) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; default: (void) zfs_standard_error(hdl, errno, errbuf); } } for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release hold from snapshot '%s'"), nvpair_name(elem)); switch (fnvpair_value_int32(elem)) { case ESRCH: (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); } } fnvlist_free(errors); return (ret); } int zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; int nvsz = 2048; void *nvbuf; int err = 0; char errbuf[ERRBUFLEN]; assert(zhp->zfs_type == ZFS_TYPE_VOLUME || zhp->zfs_type == ZFS_TYPE_FILESYSTEM); tryagain: nvbuf = malloc(nvsz); if (nvbuf == NULL) { err = (zfs_error(hdl, EZFS_NOMEM, zfs_strerror(errno))); goto out; } zc.zc_nvlist_dst_size = nvsz; zc.zc_nvlist_dst = (uintptr_t)nvbuf; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_GET_FSACL, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); switch (errno) { case ENOMEM: free(nvbuf); nvsz = zc.zc_nvlist_dst_size; goto tryagain; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } else { /* success */ int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); if (rc) { err = zfs_standard_error_fmt(hdl, rc, dgettext( TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); } } free(nvbuf); out: return (err); } int zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; char *nvbuf; char errbuf[ERRBUFLEN]; size_t nvsz; int err; assert(zhp->zfs_type == ZFS_TYPE_VOLUME || zhp->zfs_type == ZFS_TYPE_FILESYSTEM); err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE); assert(err == 0); nvbuf = malloc(nvsz); err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0); assert(err == 0); zc.zc_nvlist_src_size = nvsz; zc.zc_nvlist_src = (uintptr_t)nvbuf; zc.zc_perm_action = un; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"), zc.zc_name); switch (errno) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } free(nvbuf); return (err); } int zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) { int err; char errbuf[ERRBUFLEN]; err = lzc_get_holds(zhp->zfs_name, nvl); if (err != 0) { libzfs_handle_t *hdl = zhp->zfs_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), zhp->zfs_name); switch (err) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } return (err); } /* * The theory of raidz space accounting * * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block * will "reference" 128KB, even though it allocates more than that, to store the * parity information (and perhaps skip sectors). This concept of the * "referenced" (and other DMU space accounting) being lower than the allocated * space by a constant factor is called "raidz deflation." * * As mentioned above, the constant factor for raidz deflation assumes a 128KB * block size. However, zvols typically have a much smaller block size (default * 8KB). These smaller blocks may require proportionally much more parity * information (and perhaps skip sectors). In this case, the change to the * "referenced" property may be much more than the logical block size. * * Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written * as follows. * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D8 | D16 | D24 | * | P1 | D1 | D9 | D17 | D25 | * | P2 | D2 | D10 | D18 | D26 | * | P3 | D3 | D11 | D19 | D27 | * | P4 | D4 | D12 | D20 | D28 | * | P5 | D5 | D13 | D21 | D29 | * | P6 | D6 | D14 | D22 | D30 | * | P7 | D7 | D15 | D23 | D31 | * +-------+-------+-------+-------+-------+ * * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data * sectors. The dataset's referenced will increase by 128k and the pool's * allocated and free properties will be adjusted by 160k. * * A 4k block written to the same raidz vdev will require two 4k sectors. The * blank cells represent unallocated space. * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | | | | * +-------+-------+-------+-------+-------+ * * Above, notice that the 4k block required one sector for parity and another - * for data. vdev_raidz_asize() will return 8k and as such the pool's allocated - * and free properties will be adjusted by 8k. The dataset will not be charged - * 8k. Rather, it will be charged a value that is scaled according to the - * overhead of the 128k block on the same vdev. This 8k allocation will be - * charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as - * calculated in the 128k block example above. + * for data. vdev_raidz_psize_to_asize() will return 8k and as such the pool's + * allocated and free properties will be adjusted by 8k. The dataset will not + * be charged 8k. Rather, it will be charged a value that is scaled according + * to the overhead of the 128k block on the same vdev. This 8k allocation will + * be charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is + * as calculated in the 128k block example above. * * Every raidz allocation is sized to be a multiple of nparity+1 sectors. That * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2 * allocations are a multiple of 3 sectors, and raidz3 allocations are a * multiple of of 4 sectors. When a block does not fill the required number of * sectors, skip blocks (sectors) are used. * * An 8k block being written to a raidz vdev may be written as follows: * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D1 | S0 | | * +-------+-------+-------+-------+-------+ * * In order to maintain the nparity+1 allocation size, a skip block (S0) was * added. For this 8k block, the pool's allocated and free properties are * adjusted by 16k and the dataset's referenced is increased by 16k * 128k / * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in * the 128k block example above. * * The situation is slightly different for dRAID since the minimum allocation * size is the full group width. The same 8K block above would be written as * follows in a dRAID group: * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D1 | S0 | S1 | * +-------+-------+-------+-------+-------+ * * Compression may lead to a variety of block sizes being written for the same * volume or file. There is no clear way to reserve just the amount of space * that will be required, so the worst case (no compression) is assumed. * Note that metadata blocks will typically be compressed, so the reservation * size returned by zvol_volsize_to_reservation() will generally be slightly * larger than the maximum that the volume can reference. */ /* * Derived from function of same name in module/zfs/vdev_raidz.c. Returns the * amount of space (in bytes) that will be allocated for the specified block * size. Note that the "referenced" space accounted will be less than this, but * not necessarily equal to "blksize", due to RAIDZ deflation. */ static uint64_t -vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, +vdev_raidz_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, uint64_t blksize) { uint64_t asize, ndata; ASSERT3U(ndisks, >, nparity); ndata = ndisks - nparity; asize = ((blksize - 1) >> ashift) + 1; asize += nparity * ((asize + ndata - 1) / ndata); asize = roundup(asize, nparity + 1) << ashift; return (asize); } /* * Derived from function of same name in module/zfs/vdev_draid.c. Returns the * amount of space (in bytes) that will be allocated for the specified block * size. */ static uint64_t -vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, +vdev_draid_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, uint64_t blksize) { ASSERT3U(ndisks, >, nparity); uint64_t ndata = ndisks - nparity; uint64_t rows = ((blksize - 1) / (ndata << ashift)) + 1; uint64_t asize = (rows * ndisks) << ashift; return (asize); } /* * Determine how much space will be allocated if it lands on the most space- * inefficient top-level vdev. Returns the size in bytes required to store one * copy of the volume data. See theory comment above. */ static uint64_t volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) { nvlist_t *config, *tree, **vdevs; uint_t nvdevs; uint64_t ret = 0; config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { return (nblocks * blksize); } for (int v = 0; v < nvdevs; v++) { const char *type; uint64_t nparity, ashift, asize, tsize; uint64_t volsize; if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE, &type) != 0) continue; if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 && strcmp(type, VDEV_TYPE_DRAID) != 0) continue; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY, &nparity) != 0) continue; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT, &ashift) != 0) continue; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { nvlist_t **disks; uint_t ndisks; if (nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0) continue; /* allocation size for the "typical" 128k block */ - tsize = vdev_raidz_asize(ndisks, nparity, ashift, - SPA_OLD_MAXBLOCKSIZE); + tsize = vdev_raidz_psize_to_asize(ndisks, nparity, + ashift, SPA_OLD_MAXBLOCKSIZE); /* allocation size for the blksize block */ - asize = vdev_raidz_asize(ndisks, nparity, ashift, - blksize); + asize = vdev_raidz_psize_to_asize(ndisks, nparity, + ashift, blksize); } else { uint64_t ndata; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_DRAID_NDATA, &ndata) != 0) continue; /* allocation size for the "typical" 128k block */ - tsize = vdev_draid_asize(ndata + nparity, nparity, - ashift, SPA_OLD_MAXBLOCKSIZE); + tsize = vdev_draid_psize_to_asize(ndata + nparity, + nparity, ashift, SPA_OLD_MAXBLOCKSIZE); /* allocation size for the blksize block */ - asize = vdev_draid_asize(ndata + nparity, nparity, - ashift, blksize); + asize = vdev_draid_psize_to_asize(ndata + nparity, + nparity, ashift, blksize); } /* * Scale this size down as a ratio of 128k / tsize. * See theory statement above. * * Bitshift is to avoid the case of nblocks * asize < tsize * producing a size of 0. */ volsize = (nblocks * asize) / (tsize >> SPA_MINBLOCKSHIFT); /* * If we would blow UINT64_MAX with this next multiplication, * don't. */ if (volsize > (UINT64_MAX / (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT))) volsize = UINT64_MAX; else volsize *= (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (volsize > ret) { ret = volsize; } } if (ret == 0) { ret = nblocks * blksize; } return (ret); } /* * Convert the zvol's volume size to an appropriate reservation. See theory * comment above. * * Note: If this routine is updated, it is necessary to update the ZFS test * suite's shell version in reservation.shlib. */ uint64_t zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize, nvlist_t *props) { uint64_t numdb; uint64_t nblocks, volblocksize; int ncopies; const char *strval; if (nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0) ncopies = atoi(strval); else ncopies = 1; if (nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = ZVOL_DEFAULT_BLOCKSIZE; nblocks = volsize / volblocksize; /* * Metadata defaults to using 128k blocks, not volblocksize blocks. For * this reason, only the data blocks are scaled based on vdev config. */ volsize = volsize_from_vdevs(zph, nblocks, volblocksize); /* start with metadnode L0-L6 */ numdb = 7; /* calculate number of indirects */ while (nblocks > 1) { nblocks += DNODES_PER_LEVEL - 1; nblocks /= DNODES_PER_LEVEL; numdb += nblocks; } numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1); volsize *= ncopies; /* * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't * compressed, but in practice they compress down to about * 1100 bytes */ numdb *= 1ULL << DN_MAX_INDBLKSHIFT; volsize += numdb; return (volsize); } /* * Wait for the given activity and return the status of the wait (whether or not * any waiting was done) in the 'waited' parameter. Non-existent fses are * reported via the 'missing' parameter, rather than by printing an error * message. This is convenient when this function is called in a loop over a * long period of time (as it is, for example, by zfs's wait cmd). In that * scenario, a fs being exported or destroyed should be considered a normal * event, so we don't want to print an error when we find that the fs doesn't * exist. */ int zfs_wait_status(zfs_handle_t *zhp, zfs_wait_activity_t activity, boolean_t *missing, boolean_t *waited) { int error = lzc_wait_fs(zhp->zfs_name, activity, waited); *missing = (error == ENOENT); if (*missing) return (0); if (error != 0) { (void) zfs_standard_error_fmt(zhp->zfs_hdl, error, dgettext(TEXT_DOMAIN, "error waiting in fs '%s'"), zhp->zfs_name); } return (error); } diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 7acf37ba9cd7..b75d1ccea685 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1,1297 +1,1298 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Portions Copyright (c) 2012 Martin Matuska */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef g_topology_locked #define g_topology_locked() sx_xlocked(&topology_lock) #endif /* * Virtual device vector for GEOM. */ static g_attrchanged_t vdev_geom_attrchanged; struct g_class zfs_vdev_class = { .name = "ZFS::VDEV", .version = G_VERSION, .attrchanged = vdev_geom_attrchanged, }; struct consumer_vdev_elem { SLIST_ENTRY(consumer_vdev_elem) elems; vdev_t *vd; }; SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); _Static_assert( sizeof (((struct g_consumer *)NULL)->private) == sizeof (struct consumer_priv_t *), "consumer_priv_t* can't be stored in g_consumer.private"); DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); SYSCTL_DECL(_vfs_zfs_vdev); /* Don't send BIO_FLUSH. */ static int vdev_geom_bio_flush_disable; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); /* Don't send BIO_DELETE. */ static int vdev_geom_bio_delete_disable; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); /* Declare local functions */ static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); /* * Thread local storage used to indicate when a thread is probing geoms * for their guids. If NULL, this thread is not tasting geoms. If non NULL, * it is looking for a replacement for the vdev_t* that is its value. */ uint_t zfs_geom_probe_vdev_key; static void vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, boolean_t do_null_update) { boolean_t needs_update = B_FALSE; char *physpath; int error, physpath_len; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); if (error == 0) { char *old_physpath; /* g_topology lock ensures that vdev has not been closed */ g_topology_assert(); old_physpath = vd->vdev_physpath; vd->vdev_physpath = spa_strdup(physpath); if (old_physpath != NULL) { needs_update = (strcmp(old_physpath, vd->vdev_physpath) != 0); spa_strfree(old_physpath); } else needs_update = do_null_update; } g_free(physpath); /* * If the physical path changed, update the config. * Only request an update for previously unset physpaths if * requested by the caller. */ if (needs_update) spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); } static void vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) { struct consumer_priv_t *priv; struct consumer_vdev_elem *elem; priv = (struct consumer_priv_t *)&cp->private; if (SLIST_EMPTY(priv)) return; SLIST_FOREACH(elem, priv, elems) { vdev_t *vd = elem->vd; if (strcmp(attr, "GEOM::physpath") == 0) { vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE); return; } } } static void vdev_geom_resize(struct g_consumer *cp) { struct consumer_priv_t *priv; struct consumer_vdev_elem *elem; spa_t *spa; vdev_t *vd; priv = (struct consumer_priv_t *)&cp->private; if (SLIST_EMPTY(priv)) return; SLIST_FOREACH(elem, priv, elems) { vd = elem->vd; if (vd->vdev_state != VDEV_STATE_HEALTHY) continue; spa = vd->vdev_spa; if (!spa->spa_autoexpand) continue; vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL); } } static void vdev_geom_orphan(struct g_consumer *cp) { struct consumer_priv_t *priv; // cppcheck-suppress uninitvar struct consumer_vdev_elem *elem; g_topology_assert(); priv = (struct consumer_priv_t *)&cp->private; if (SLIST_EMPTY(priv)) /* Vdev close in progress. Ignore the event. */ return; /* * Orphan callbacks occur from the GEOM event thread. * Concurrent with this call, new I/O requests may be * working their way through GEOM about to find out * (only once executed by the g_down thread) that we've * been orphaned from our disk provider. These I/Os * must be retired before we can detach our consumer. * This is most easily achieved by acquiring the * SPA ZIO configuration lock as a writer, but doing * so with the GEOM topology lock held would cause * a lock order reversal. Instead, rely on the SPA's * async removal support to invoke a close on this * vdev once it is safe to do so. */ SLIST_FOREACH(elem, priv, elems) { // cppcheck-suppress uninitvar vdev_t *vd = elem->vd; vd->vdev_remove_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); } } static struct g_consumer * vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) { struct g_geom *gp; struct g_consumer *cp; int error; g_topology_assert(); ZFS_LOG(1, "Attaching to %s.", pp->name); if (sanity) { if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { ZFS_LOG(1, "Failing attach of %s. " "Incompatible sectorsize %d\n", pp->name, pp->sectorsize); return (NULL); } else if (pp->mediasize < SPA_MINDEVSIZE) { ZFS_LOG(1, "Failing attach of %s. " "Incompatible mediasize %ju\n", pp->name, pp->mediasize); return (NULL); } } /* Do we have geom already? No? Create one. */ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; if (strcmp(gp->name, "zfs::vdev") != 0) continue; break; } if (gp == NULL) { gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); gp->orphan = vdev_geom_orphan; gp->attrchanged = vdev_geom_attrchanged; gp->resize = vdev_geom_resize; cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } error = g_access(cp, 1, 0, 1); if (error != 0) { ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); } else { /* Check if we are already connected to this provider. */ LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider == pp) { ZFS_LOG(1, "Found consumer for %s.", pp->name); break; } } if (cp == NULL) { cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } error = g_access(cp, 1, 0, 1); if (error != 0) { ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } ZFS_LOG(1, "Created consumer for %s.", pp->name); } else { error = g_access(cp, 1, 0, 1); if (error != 0) { ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, __LINE__, error); return (NULL); } ZFS_LOG(1, "Used existing consumer for %s.", pp->name); } } if (vd != NULL) vd->vdev_tsd = cp; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; return (cp); } static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) { struct g_geom *gp; g_topology_assert(); ZFS_LOG(1, "Detaching from %s.", cp->provider && cp->provider->name ? cp->provider->name : "NULL"); gp = cp->geom; if (open_for_read) g_access(cp, -1, 0, -1); /* Destroy consumer on last close. */ if (cp->acr == 0 && cp->ace == 0) { if (cp->acw > 0) g_access(cp, 0, -cp->acw, 0); if (cp->provider != NULL) { ZFS_LOG(1, "Destroying consumer for %s.", cp->provider->name ? cp->provider->name : "NULL"); g_detach(cp); } g_destroy_consumer(cp); } /* Destroy geom if there are no consumers left. */ if (LIST_EMPTY(&gp->consumer)) { ZFS_LOG(1, "Destroyed geom %s.", gp->name); g_wither_geom(gp, ENXIO); } } static void vdev_geom_close_locked(vdev_t *vd) { struct g_consumer *cp; struct consumer_priv_t *priv; struct consumer_vdev_elem *elem, *elem_temp; g_topology_assert(); cp = vd->vdev_tsd; vd->vdev_delayed_close = B_FALSE; if (cp == NULL) return; ZFS_LOG(1, "Closing access to %s.", cp->provider->name); KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); priv = (struct consumer_priv_t *)&cp->private; vd->vdev_tsd = NULL; SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { if (elem->vd == vd) { SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); g_free(elem); } } vdev_geom_detach(cp, B_TRUE); } /* * Issue one or more bios to the vdev in parallel * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO * operation is described by parallel entries from each array. There may be * more bios actually issued than entries in the array */ static void vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, off_t *sizes, int *errors, int ncmds) { struct bio **bios; uint8_t *p; off_t off, maxio, s, end; int i, n_bios, j; size_t bios_size; maxio = maxphys - (maxphys % cp->provider->sectorsize); n_bios = 0; /* How many bios are required for all commands ? */ for (i = 0; i < ncmds; i++) n_bios += (sizes[i] + maxio - 1) / maxio; /* Allocate memory for the bios */ bios_size = n_bios * sizeof (struct bio *); bios = kmem_zalloc(bios_size, KM_SLEEP); /* Prepare and issue all of the bios */ for (i = j = 0; i < ncmds; i++) { off = offsets[i]; p = datas[i]; s = sizes[i]; end = off + s; ASSERT0(off % cp->provider->sectorsize); ASSERT0(s % cp->provider->sectorsize); for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { bios[j] = g_alloc_bio(); bios[j]->bio_cmd = cmds[i]; bios[j]->bio_done = NULL; bios[j]->bio_offset = off; bios[j]->bio_length = MIN(s, maxio); bios[j]->bio_data = (caddr_t)p; g_io_request(bios[j], cp); } } ASSERT3S(j, ==, n_bios); /* Wait for all of the bios to complete, and clean them up */ for (i = j = 0; i < ncmds; i++) { off = offsets[i]; s = sizes[i]; end = off + s; for (; off < end; off += maxio, s -= maxio, j++) { errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i]; g_destroy_bio(bios[j]); } } kmem_free(bios, bios_size); } /* * Read the vdev config from a device. Return the number of valid labels that * were found. The vdev config will be returned in config if and only if at * least one valid label was found. */ static int vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) { struct g_provider *pp; nvlist_t *config; vdev_phys_t *vdev_lists[VDEV_LABELS]; char *buf; size_t buflen; uint64_t psize, state, txg; off_t offsets[VDEV_LABELS]; off_t size; off_t sizes[VDEV_LABELS]; int cmds[VDEV_LABELS]; int errors[VDEV_LABELS]; int l, nlabels; g_topology_assert_not(); pp = cp->provider; ZFS_LOG(1, "Reading config from %s...", pp->name); psize = pp->mediasize; psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t); size = sizeof (*vdev_lists[0]) + pp->sectorsize - ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1; buflen = sizeof (vdev_lists[0]->vp_nvlist); /* Create all of the IO requests */ for (l = 0; l < VDEV_LABELS; l++) { cmds[l] = BIO_READ; vdev_lists[l] = kmem_alloc(size, KM_SLEEP); offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; sizes[l] = size; errors[l] = 0; ASSERT0(offsets[l] % pp->sectorsize); } /* Issue the IO requests */ vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, VDEV_LABELS); /* Parse the labels */ config = *configp = NULL; nlabels = 0; for (l = 0; l < VDEV_LABELS; l++) { if (errors[l] != 0) continue; buf = vdev_lists[l]->vp_nvlist; if (nvlist_unpack(buf, buflen, &config, 0) != 0) continue; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state > POOL_STATE_L2CACHE) { nvlist_free(config); continue; } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { nvlist_free(config); continue; } if (*configp != NULL) nvlist_free(*configp); *configp = config; nlabels++; } /* Free the label storage */ for (l = 0; l < VDEV_LABELS; l++) kmem_free(vdev_lists[l], size); return (nlabels); } static void resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) { nvlist_t **new_configs; uint64_t i; if (id < *count) return; new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *), KM_SLEEP); for (i = 0; i < *count; i++) new_configs[i] = (*configs)[i]; if (*configs != NULL) kmem_free(*configs, *count * sizeof (void *)); *configs = new_configs; *count = id + 1; } static void process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, const char *name, uint64_t *known_pool_guid) { nvlist_t *vdev_tree; uint64_t pool_guid; uint64_t vdev_guid; uint64_t id, txg, known_txg; const char *pname; if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || strcmp(pname, name) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) goto ignore; if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) goto ignore; if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) goto ignore; txg = fnvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG); if (*known_pool_guid != 0) { if (pool_guid != *known_pool_guid) goto ignore; } else *known_pool_guid = pool_guid; resize_configs(configs, count, id); if ((*configs)[id] != NULL) { known_txg = fnvlist_lookup_uint64((*configs)[id], ZPOOL_CONFIG_POOL_TXG); if (txg <= known_txg) goto ignore; nvlist_free((*configs)[id]); } (*configs)[id] = cfg; return; ignore: nvlist_free(cfg); } int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, uint64_t *count) { struct g_class *mp; struct g_geom *gp; struct g_provider *pp; struct g_consumer *zcp; nvlist_t *vdev_cfg; uint64_t pool_guid; int nlabels; DROP_GIANT(); g_topology_lock(); *configs = NULL; *count = 0; pool_guid = 0; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->flags & G_PF_WITHER) continue; zcp = vdev_geom_attach(pp, NULL, B_TRUE); if (zcp == NULL) continue; g_topology_unlock(); nlabels = vdev_geom_read_config(zcp, &vdev_cfg); g_topology_lock(); vdev_geom_detach(zcp, B_TRUE); if (nlabels == 0) continue; ZFS_LOG(1, "successfully read vdev config"); process_vdev_config(configs, count, vdev_cfg, name, &pool_guid); } } } g_topology_unlock(); PICKUP_GIANT(); return (*count > 0 ? 0 : ENOENT); } enum match { NO_MATCH = 0, /* No matching labels found */ TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */ ZERO_MATCH = 1, /* Should never be returned */ ONE_MATCH = 2, /* 1 label matching the vdev_guid */ TWO_MATCH = 3, /* 2 label matching the vdev_guid */ THREE_MATCH = 4, /* 3 label matching the vdev_guid */ FULL_MATCH = 5 /* all labels match the vdev_guid */ }; static enum match vdev_attach_ok(vdev_t *vd, struct g_provider *pp) { nvlist_t *config; uint64_t pool_guid, top_guid, vdev_guid; struct g_consumer *cp; int nlabels; cp = vdev_geom_attach(pp, NULL, B_TRUE); if (cp == NULL) { ZFS_LOG(1, "Unable to attach tasting instance to %s.", pp->name); return (NO_MATCH); } g_topology_unlock(); nlabels = vdev_geom_read_config(cp, &config); g_topology_lock(); vdev_geom_detach(cp, B_TRUE); if (nlabels == 0) { ZFS_LOG(1, "Unable to read config from %s.", pp->name); return (NO_MATCH); } pool_guid = 0; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); top_guid = 0; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); vdev_guid = 0; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); nvlist_free(config); /* * Check that the label's pool guid matches the desired guid. * Inactive spares and L2ARCs do not have any pool guid in the label. */ if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", pp->name, (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); return (NO_MATCH); } /* * Check that the label's vdev guid matches the desired guid. * The second condition handles possible race on vdev detach, when * remaining vdev receives GUID of destroyed top level mirror vdev. */ if (vdev_guid == vd->vdev_guid) { ZFS_LOG(1, "guids match for provider %s.", pp->name); return (ZERO_MATCH + nlabels); } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); return (TOPGUID_MATCH); } ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); return (NO_MATCH); } static struct g_consumer * vdev_geom_attach_by_guids(vdev_t *vd) { struct g_class *mp; struct g_geom *gp; struct g_provider *pp, *best_pp; struct g_consumer *cp; const char *vdpath; enum match match, best_match; g_topology_assert(); vdpath = vd->vdev_path + sizeof ("/dev/") - 1; cp = NULL; best_pp = NULL; best_match = NO_MATCH; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { match = vdev_attach_ok(vd, pp); if (match > best_match) { best_match = match; best_pp = pp; } else if (match == best_match) { if (strcmp(pp->name, vdpath) == 0) { best_pp = pp; } } if (match == FULL_MATCH) goto out; } } } out: if (best_pp) { cp = vdev_geom_attach(best_pp, vd, B_TRUE); if (cp == NULL) { printf("ZFS WARNING: Unable to attach to %s.\n", best_pp->name); } } return (cp); } static struct g_consumer * vdev_geom_open_by_guids(vdev_t *vd) { struct g_consumer *cp; char *buf; size_t len; g_topology_assert(); ZFS_LOG(1, "Searching by guids [%ju:%ju].", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); cp = vdev_geom_attach_by_guids(vd); if (cp != NULL) { len = strlen(cp->provider->name) + strlen("/dev/") + 1; buf = kmem_alloc(len, KM_SLEEP); snprintf(buf, len, "/dev/%s", cp->provider->name); spa_strfree(vd->vdev_path); vd->vdev_path = buf; ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid, cp->provider->name); } else { ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); } return (cp); } static struct g_consumer * vdev_geom_open_by_path(vdev_t *vd, int check_guid) { struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); cp = NULL; pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1); if (pp != NULL) { ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) cp = vdev_geom_attach(pp, vd, B_FALSE); } return (cp); } static int vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { struct g_provider *pp; struct g_consumer *cp; int error, has_trim; uint16_t rate; /* * Set the TLS to indicate downstack that we * should not access zvols */ VERIFY0(tsd_set(zfs_geom_probe_vdev_key, vd)); /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if ((cp = vd->vdev_tsd) != NULL) { ASSERT(vd->vdev_reopening); goto skip_open; } DROP_GIANT(); g_topology_lock(); error = 0; if (vd->vdev_spa->spa_is_splitting || ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN && (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) { /* * We are dealing with a vdev that hasn't been previously * opened (since boot), and we are not loading an * existing pool configuration. This looks like a * vdev add operation to a new or existing pool. * Assume the user really wants to do this, and find * GEOM provider by its name, ignoring GUID mismatches. * * XXPOLICY: It would be safer to only allow a device * that is unlabeled or labeled but missing * GUID information to be opened in this fashion, * unless we are doing a split, in which case we * should allow any guid. */ cp = vdev_geom_open_by_path(vd, 0); } else { /* * Try using the recorded path for this device, but only * accept it if its label data contains the expected GUIDs. */ cp = vdev_geom_open_by_path(vd, 1); if (cp == NULL) { /* * The device at vd->vdev_path doesn't have the * expected GUIDs. The disks might have merely * moved around so try all other GEOM providers * to find one with the right GUIDs. */ cp = vdev_geom_open_by_guids(vd); } } /* Clear the TLS now that tasting is done */ VERIFY0(tsd_set(zfs_geom_probe_vdev_key, NULL)); if (cp == NULL) { ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); error = ENOENT; } else { struct consumer_priv_t *priv; struct consumer_vdev_elem *elem; int spamode; priv = (struct consumer_priv_t *)&cp->private; if (cp->private == NULL) SLIST_INIT(priv); elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO); elem->vd = vd; SLIST_INSERT_HEAD(priv, elem, elems); spamode = spa_mode(vd->vdev_spa); if (cp->provider->sectorsize > VDEV_PAD_SIZE || !ISP2(cp->provider->sectorsize)) { ZFS_LOG(1, "Provider %s has unsupported sectorsize.", cp->provider->name); vdev_geom_close_locked(vd); error = EINVAL; cp = NULL; } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { int i; for (i = 0; i < 5; i++) { error = g_access(cp, 0, 1, 0); if (error == 0) break; g_topology_unlock(); tsleep(vd, 0, "vdev", hz / 2); g_topology_lock(); } if (error != 0) { printf("ZFS WARNING: Unable to open %s for " "writing (error=%d).\n", cp->provider->name, error); vdev_geom_close_locked(vd); cp = NULL; } } } /* Fetch initial physical path information for this device. */ if (cp != NULL) { vdev_geom_attrchanged(cp, "GEOM::physpath"); /* Set other GEOM characteristics */ vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE); } g_topology_unlock(); PICKUP_GIANT(); if (cp == NULL) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", error); return (error); } skip_open: pp = cp->provider; /* * Determine the actual size of the device. */ *max_psize = *psize = pp->mediasize; /* * Determine the device's minimum transfer size and preferred * transfer size. */ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; *physical_ashift = 0; if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) && pp->stripeoffset == 0) *physical_ashift = highbit(pp->stripesize) - 1; /* * Clear the nowritecache settings, so that on a vdev_reopen() * we will try again. */ vd->vdev_nowritecache = B_FALSE; /* Inform the ZIO pipeline that we are non-rotational. */ error = g_getattr("GEOM::rotation_rate", cp, &rate); if (error == 0 && rate == DISK_RR_NON_ROTATING) vd->vdev_nonrot = B_TRUE; else vd->vdev_nonrot = B_FALSE; /* Set when device reports it supports TRIM. */ error = g_getattr("GEOM::candelete", cp, &has_trim); vd->vdev_has_trim = (error == 0 && has_trim); /* Set when device reports it supports secure TRIM. */ /* unavailable on FreeBSD */ vd->vdev_has_securetrim = B_FALSE; return (0); } static void vdev_geom_close(vdev_t *vd) { struct g_consumer *cp; boolean_t locked; cp = vd->vdev_tsd; DROP_GIANT(); locked = g_topology_locked(); if (!locked) g_topology_lock(); if (!vd->vdev_reopening || (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || (cp->provider != NULL && cp->provider->error != 0)))) vdev_geom_close_locked(vd); if (!locked) g_topology_unlock(); PICKUP_GIANT(); } static void vdev_geom_io_intr(struct bio *bp) { vdev_t *vd; zio_t *zio; zio = bp->bio_caller1; vd = zio->io_vd; zio->io_error = bp->bio_error; if (zio->io_error == 0 && bp->bio_resid != 0) zio->io_error = SET_ERROR(EIO); switch (zio->io_error) { case ENXIO: if (!vd->vdev_remove_wanted) { /* * If provider's error is set we assume it is being * removed. */ if (bp->bio_to->error != 0) { vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } else if (!vd->vdev_delayed_close) { vd->vdev_delayed_close = B_TRUE; } } break; } /* * We have to split bio freeing into two parts, because the ABD code * cannot be called in this context and vdev_op_io_done is not called * for ZIO_TYPE_FLUSH zio-s. */ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { g_destroy_bio(bp); zio->io_bio = NULL; } zio_delay_interrupt(zio); } struct vdev_geom_check_unmapped_cb_state { int pages; uint_t end; }; /* * Callback to check the ABD segment size/alignment and count the pages. * GEOM requires data buffer to look virtually contiguous. It means only * the first page of the buffer may not start and only the last may not * end on a page boundary. All other physical pages must be full. */ static int vdev_geom_check_unmapped_cb(void *buf, size_t len, void *priv) { struct vdev_geom_check_unmapped_cb_state *s = priv; vm_offset_t off = (vm_offset_t)buf & PAGE_MASK; if (s->pages != 0 && off != 0) return (1); if (s->end != 0) return (1); s->end = (off + len) & PAGE_MASK; s->pages += (off + len + PAGE_MASK) >> PAGE_SHIFT; return (0); } /* * Check whether we can use unmapped I/O for this ZIO on this device to * avoid data copying between scattered and/or gang ABD buffer and linear. */ static int vdev_geom_check_unmapped(zio_t *zio, struct g_consumer *cp) { struct vdev_geom_check_unmapped_cb_state s; /* If unmapped I/O is administratively disabled, respect that. */ if (!unmapped_buf_allowed) return (0); /* If the buffer is already linear, then nothing to do here. */ if (abd_is_linear(zio->io_abd)) return (0); /* * If unmapped I/O is not supported by the GEOM provider, * then we can't do anything and have to copy the data. */ if ((cp->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0) return (0); /* Check the buffer chunks sizes/alignments and count pages. */ s.pages = s.end = 0; if (abd_iterate_func(zio->io_abd, 0, zio->io_size, vdev_geom_check_unmapped_cb, &s)) return (0); return (s.pages); } /* * Callback to translate the ABD segment into array of physical pages. */ static int vdev_geom_fill_unmap_cb(void *buf, size_t len, void *priv) { struct bio *bp = priv; vm_offset_t addr = (vm_offset_t)buf; vm_offset_t end = addr + len; if (bp->bio_ma_n == 0) { bp->bio_ma_offset = addr & PAGE_MASK; addr &= ~PAGE_MASK; } else { ASSERT0(P2PHASE(addr, PAGE_SIZE)); } do { bp->bio_ma[bp->bio_ma_n++] = PHYS_TO_VM_PAGE(pmap_kextract(addr)); addr += PAGE_SIZE; } while (addr < end); return (0); } static void vdev_geom_io_start(zio_t *zio) { vdev_t *vd; struct g_consumer *cp; struct bio *bp; vd = zio->io_vd; if (zio->io_type == ZIO_TYPE_FLUSH) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } if (zfs_nocacheflush || vdev_geom_bio_flush_disable) { zio_execute(zio); return; } if (vd->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); zio_execute(zio); return; } } else if (zio->io_type == ZIO_TYPE_TRIM) { if (vdev_geom_bio_delete_disable) { zio_execute(zio); return; } } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM || zio->io_type == ZIO_TYPE_FLUSH); cp = vd->vdev_tsd; if (cp == NULL) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } bp = g_alloc_bio(); bp->bio_caller1 = zio; switch (zio->io_type) { case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; if (zio->io_type == ZIO_TYPE_READ) bp->bio_cmd = BIO_READ; else bp->bio_cmd = BIO_WRITE; /* * If possible, represent scattered and/or gang ABD buffer to * GEOM as an array of physical pages. It allows to satisfy * requirement of virtually contiguous buffer without copying. */ int pgs = vdev_geom_check_unmapped(zio, cp); if (pgs > 0) { bp->bio_ma = malloc(sizeof (struct vm_page *) * pgs, M_DEVBUF, M_WAITOK); bp->bio_ma_n = 0; bp->bio_ma_offset = 0; abd_iterate_func(zio->io_abd, 0, zio->io_size, vdev_geom_fill_unmap_cb, bp); bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; } else { if (zio->io_type == ZIO_TYPE_READ) { bp->bio_data = abd_borrow_buf(zio->io_abd, zio->io_size); } else { bp->bio_data = abd_borrow_buf_copy(zio->io_abd, zio->io_size); } } break; case ZIO_TYPE_TRIM: bp->bio_cmd = BIO_DELETE; bp->bio_data = NULL; bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; case ZIO_TYPE_FLUSH: bp->bio_cmd = BIO_FLUSH; bp->bio_data = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; break; default: panic("invalid zio->io_type: %d\n", zio->io_type); } bp->bio_done = vdev_geom_io_intr; zio->io_bio = bp; g_io_request(bp, cp); } static void vdev_geom_io_done(zio_t *zio) { struct bio *bp = zio->io_bio; if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { ASSERT3P(bp, ==, NULL); return; } if (bp == NULL) { ASSERT3S(zio->io_error, ==, ENXIO); return; } if (bp->bio_ma != NULL) { free(bp->bio_ma, M_DEVBUF); } else { if (zio->io_type == ZIO_TYPE_READ) { abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size); } else { abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size); } } g_destroy_bio(bp); zio->io_bio = NULL; } static void vdev_geom_hold(vdev_t *vd) { } static void vdev_geom_rele(vdev_t *vd) { } vdev_ops_t vdev_disk_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_geom_open, .vdev_op_close = vdev_geom_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_geom_io_start, .vdev_op_io_done = vdev_geom_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = vdev_geom_hold, .vdev_op_rele = vdev_geom_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 29e54b39aa1a..face4611d66c 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1,1652 +1,1653 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2023, 2024, Klara Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying * block_device. Since it carries the block_device inside, its convenient to * just use the handle as a proxy. * * Linux 6.9.x uses a file for the same purpose. * * For pre-6.8, we just emulate this with a cast, since we don't need any of * the other fields inside the handle. */ #if defined(HAVE_BDEV_OPEN_BY_PATH) typedef struct bdev_handle zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((bdh)->bdev) #define BDH_IS_ERR(bdh) (IS_ERR(bdh)) #define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) #define BDH_ERR_PTR(err) (ERR_PTR(err)) #elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH) typedef struct file zfs_bdev_handle_t; #define BDH_BDEV(bdh) (file_bdev(bdh)) #define BDH_IS_ERR(bdh) (IS_ERR(bdh)) #define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) #define BDH_ERR_PTR(err) (ERR_PTR(err)) #else typedef void zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((struct block_device *)bdh) #define BDH_IS_ERR(bdh) (IS_ERR(BDH_BDEV(bdh))) #define BDH_PTR_ERR(bdh) (PTR_ERR(BDH_BDEV(bdh))) #define BDH_ERR_PTR(err) (ERR_PTR(err)) #endif typedef struct vdev_disk { zfs_bdev_handle_t *vd_bdh; krwlock_t vd_lock; } vdev_disk_t; /* * Maximum number of segments to add to a bio (min 4). If this is higher than * the maximum allowed by the device queue or the kernel itself, it will be * clamped. Setting it to zero will cause the kernel's ideal size to be used. */ uint_t zfs_vdev_disk_max_segs = 0; /* * Unique identifier for the exclusive vdev holder. */ static void *zfs_vdev_holder = VDEV_HOLDER; /* * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the * device is missing. The missing path may be transient since the links * can be briefly removed and recreated in response to udev events. */ static uint_t zfs_vdev_open_timeout_ms = 1000; /* * Size of the "reserved" partition, in blocks. */ #define EFI_MIN_RESV_SIZE (16 * 1024) /* * BIO request failfast mask. */ static unsigned int zfs_vdev_failfast_mask = 1; /* * Convert SPA mode flags into bdev open mode flags. */ #ifdef HAVE_BLK_MODE_T typedef blk_mode_t vdev_bdev_mode_t; #define VDEV_BDEV_MODE_READ BLK_OPEN_READ #define VDEV_BDEV_MODE_WRITE BLK_OPEN_WRITE #define VDEV_BDEV_MODE_EXCL BLK_OPEN_EXCL #define VDEV_BDEV_MODE_MASK (BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL) #else typedef fmode_t vdev_bdev_mode_t; #define VDEV_BDEV_MODE_READ FMODE_READ #define VDEV_BDEV_MODE_WRITE FMODE_WRITE #define VDEV_BDEV_MODE_EXCL FMODE_EXCL #define VDEV_BDEV_MODE_MASK (FMODE_READ|FMODE_WRITE|FMODE_EXCL) #endif static vdev_bdev_mode_t vdev_bdev_mode(spa_mode_t smode) { ASSERT3U(smode, !=, SPA_MODE_UNINIT); ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE)); vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL; if (smode & SPA_MODE_READ) bmode |= VDEV_BDEV_MODE_READ; if (smode & SPA_MODE_WRITE) bmode |= VDEV_BDEV_MODE_WRITE; ASSERT(bmode & VDEV_BDEV_MODE_MASK); ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK); return (bmode); } /* * Returns the usable capacity (in bytes) for the partition or disk. */ static uint64_t bdev_capacity(struct block_device *bdev) { #ifdef HAVE_BDEV_NR_BYTES return (bdev_nr_bytes(bdev)); #else return (i_size_read(bdev->bd_inode)); #endif } #if !defined(HAVE_BDEV_WHOLE) static inline struct block_device * bdev_whole(struct block_device *bdev) { return (bdev->bd_contains); } #endif #if defined(HAVE_BDEVNAME) #define vdev_bdevname(bdev, name) bdevname(bdev, name) #else static inline void vdev_bdevname(struct block_device *bdev, char *name) { snprintf(name, BDEVNAME_SIZE, "%pg", bdev); } #endif /* * Returns the maximum expansion capacity of the block device (in bytes). * * It is possible to expand a vdev when it has been created as a wholedisk * and the containing block device has increased in capacity. Or when the * partition containing the pool has been manually increased in size. * * This function is only responsible for calculating the potential expansion * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is * responsible for verifying the expected partition layout in the wholedisk * case, and updating the partition table if appropriate. Once the partition * size has been increased the additional capacity will be visible using * bdev_capacity(). * * The returned maximum expansion capacity is always expected to be larger, or * at the very least equal, to its usable capacity to prevent overestimating * the pool expandsize. */ static uint64_t bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) { uint64_t psize; int64_t available; if (wholedisk && bdev != bdev_whole(bdev)) { /* * When reporting maximum expansion capacity for a wholedisk * deduct any capacity which is expected to be lost due to * alignment restrictions. Over reporting this value isn't * harmful and would only result in slightly less capacity * than expected post expansion. * The estimated available space may be slightly smaller than * bdev_capacity() for devices where the number of sectors is * not a multiple of the alignment size and the partition layout * is keeping less than PARTITION_END_ALIGNMENT bytes after the * "reserved" EFI partition: in such cases return the device * usable capacity. */ available = bdev_capacity(bdev_whole(bdev)) - ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + PARTITION_END_ALIGNMENT) << SECTOR_BITS); psize = MAX(available, bdev_capacity(bdev)); } else { psize = bdev_capacity(bdev); } return (psize); } static void vdev_disk_error(zio_t *zio) { /* * This function can be called in interrupt context, for instance while * handling IRQs coming from a misbehaving disk device; use printk() * which is safe from any context. */ printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa), zio->io_vd->vdev_path, zio->io_error, zio->io_type, (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, zio->io_flags); } static void vdev_disk_kobj_evt_post(vdev_t *v) { vdev_disk_t *vd = v->vdev_tsd; if (vd && vd->vd_bdh) { spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh)); } else { vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", v->vdev_path); } } static zfs_bdev_handle_t * vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder) { vdev_bdev_mode_t bmode = vdev_bdev_mode(smode); #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) return (bdev_file_open_by_path(path, bmode, holder, NULL)); #elif defined(HAVE_BDEV_OPEN_BY_PATH) return (bdev_open_by_path(path, bmode, holder, NULL)); #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) return (blkdev_get_by_path(path, bmode, holder, NULL)); #else return (blkdev_get_by_path(path, bmode, holder)); #endif } static void vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder) { #if defined(HAVE_BDEV_RELEASE) return (bdev_release(bdh)); #elif defined(HAVE_BLKDEV_PUT_HOLDER) return (blkdev_put(BDH_BDEV(bdh), holder)); #elif defined(HAVE_BLKDEV_PUT) return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode))); #else fput(bdh); #endif } static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { zfs_bdev_handle_t *bdh; spa_mode_t smode = spa_mode(v->vdev_spa); hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; /* Must have a pathname and it must be absolute. */ if (v->vdev_path == NULL || v->vdev_path[0] != '/') { v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; vdev_dbgmsg(v, "invalid vdev_path"); return (SET_ERROR(EINVAL)); } /* * Reopen the device if it is currently open. When expanding a * partition force re-scanning the partition table if userland * did not take care of this already. We need to do this while closed * in order to get an accurate updated block device size. Then * since udev may need to recreate the device links increase the * open retry timeout before reporting the device as unavailable. */ vd = v->vdev_tsd; if (vd) { char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; boolean_t reread_part = B_FALSE; rw_enter(&vd->vd_lock, RW_WRITER); bdh = vd->vd_bdh; vd->vd_bdh = NULL; if (bdh) { struct block_device *bdev = BDH_BDEV(bdh); if (v->vdev_expanding && bdev != bdev_whole(bdev)) { vdev_bdevname(bdev_whole(bdev), disk_name + 5); /* * If userland has BLKPG_RESIZE_PARTITION, * then it should have updated the partition * table already. We can detect this by * comparing our current physical size * with that of the device. If they are * the same, then we must not have * BLKPG_RESIZE_PARTITION or it failed to * update the partition table online. We * fallback to rescanning the partition * table from the kernel below. However, * if the capacity already reflects the * updated partition, then we skip * rescanning the partition table here. */ if (v->vdev_psize == bdev_capacity(bdev)) reread_part = B_TRUE; } vdev_blkdev_put(bdh, smode, zfs_vdev_holder); } if (reread_part) { bdh = vdev_blkdev_get_by_path(disk_name, smode, zfs_vdev_holder); if (!BDH_IS_ERR(bdh)) { int error = vdev_bdev_reread_part(BDH_BDEV(bdh)); vdev_blkdev_put(bdh, smode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); } } } } else { vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); rw_enter(&vd->vd_lock, RW_WRITER); } /* * Devices are always opened by the path provided at configuration * time. This means that if the provided path is a udev by-id path * then drives may be re-cabled without an issue. If the provided * path is a udev by-path path, then the physical location information * will be preserved. This can be critical for more complicated * configurations where drives are located in specific physical * locations to maximize the systems tolerance to component failure. * * Alternatively, you can provide your own udev rule to flexibly map * the drives as you see fit. It is not advised that you use the * /dev/[hd]d devices which may be reordered due to probing order. * Devices in the wrong locations will be detected by the higher * level vdev validation. * * The specified paths may be briefly removed and recreated in * response to udev events. This should be exceptionally unlikely * because the zpool command makes every effort to verify these paths * have already settled prior to reaching this point. Therefore, * a ENOENT failure at this point is highly likely to be transient * and it is reasonable to sleep and retry before giving up. In * practice delays have been observed to be on the order of 100ms. * * When ERESTARTSYS is returned it indicates the block device is * a zvol which could not be opened due to the deadlock detection * logic in zvol_open(). Extend the timeout and retry the open * subsequent attempts are expected to eventually succeed. */ hrtime_t start = gethrtime(); bdh = BDH_ERR_PTR(-ENXIO); while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { bdh = vdev_blkdev_get_by_path(v->vdev_path, smode, zfs_vdev_holder); if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { /* * There is no point of waiting since device is removed * explicitly */ if (v->vdev_removed) break; schedule_timeout_interruptible(MSEC_TO_TICK(10)); } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); continue; } else if (BDH_IS_ERR(bdh)) { break; } } if (BDH_IS_ERR(bdh)) { int error = -BDH_PTR_ERR(bdh); vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, (u_longlong_t)(gethrtime() - start), (u_longlong_t)timeout); vd->vd_bdh = NULL; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); return (SET_ERROR(error)); } else { vd->vd_bdh = bdh; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); } struct block_device *bdev = BDH_BDEV(vd->vd_bdh); /* Determine the physical block size */ int physical_block_size = bdev_physical_block_size(bdev); /* Determine the logical block size */ int logical_block_size = bdev_logical_block_size(bdev); /* * If the device has a write cache, clear the nowritecache flag, * so that we start issuing flush requests again. */ v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev); /* Set when device reports it supports TRIM. */ v->vdev_has_trim = bdev_discard_supported(bdev); /* Set when device reports it supports secure TRIM. */ v->vdev_has_securetrim = bdev_secure_discard_supported(bdev); /* Inform the ZIO pipeline that we are non-rotational */ v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev)); /* Physical volume size in bytes for the partition */ *psize = bdev_capacity(bdev); /* Physical volume size in bytes including possible expansion space */ *max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk); /* Based on the minimum sector size set the block size */ *physical_ashift = highbit64(MAX(physical_block_size, SPA_MINBLOCKSIZE)) - 1; *logical_ashift = highbit64(MAX(logical_block_size, SPA_MINBLOCKSIZE)) - 1; return (0); } static void vdev_disk_close(vdev_t *v) { vdev_disk_t *vd = v->vdev_tsd; if (v->vdev_reopening || vd == NULL) return; if (vd->vd_bdh != NULL) vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); v->vdev_tsd = NULL; } /* * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so * replace it with preempt_schedule under the following condition: */ #if defined(CONFIG_ARM64) && \ defined(CONFIG_PREEMPTION) && \ defined(CONFIG_BLK_CGROUP) #define preempt_schedule_notrace(x) preempt_schedule(x) #endif /* * As for the Linux 5.18 kernel bio_alloc() expects a block_device struct * as an argument removing the need to set it with bio_set_dev(). This * removes the need for all of the following compatibility code. */ #if !defined(HAVE_BIO_ALLOC_4ARG) #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) /* * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). * As a side effect the function was converted to GPL-only. Define our * own version when needed which uses rcu_read_lock_sched(). * * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public * part, moving blkg_tryget into the private one. Define our own version. */ #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET) static inline bool vdev_blkg_tryget(struct blkcg_gq *blkg) { struct percpu_ref *ref = &blkg->refcnt; unsigned long __percpu *count; bool rc; rcu_read_lock_sched(); if (__ref_is_percpu(ref, &count)) { this_cpu_inc(*count); rc = true; } else { #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA rc = atomic_long_inc_not_zero(&ref->data->count); #else rc = atomic_long_inc_not_zero(&ref->count); #endif } rcu_read_unlock_sched(); return (rc); } #else #define vdev_blkg_tryget(bg) blkg_tryget(bg) #endif #ifdef HAVE_BIO_SET_DEV_MACRO /* * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the * GPL-only bio_associate_blkg() symbol thus inadvertently converting * the entire macro. Provide a minimal version which always assigns the * request queue's root_blkg to the bio. */ static inline void vdev_bio_associate_blkg(struct bio *bio) { #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bio->bi_bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif ASSERT3P(q, !=, NULL); ASSERT3P(bio->bi_blkg, ==, NULL); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; } #define bio_associate_blkg vdev_bio_associate_blkg #else static inline void vdev_bio_set_dev(struct bio *bio, struct block_device *bdev) { #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif bio_clear_flag(bio, BIO_REMAPPED); if (bio->bi_bdev != bdev) bio_clear_flag(bio, BIO_THROTTLED); bio->bi_bdev = bdev; ASSERT3P(q, !=, NULL); ASSERT3P(bio->bi_blkg, ==, NULL); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; } #define bio_set_dev vdev_bio_set_dev #endif #endif #endif /* !HAVE_BIO_ALLOC_4ARG */ static inline void vdev_submit_bio(struct bio *bio) { struct bio_list *bio_list = current->bio_list; current->bio_list = NULL; (void) submit_bio(bio); current->bio_list = bio_list; } static inline struct bio * vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, unsigned short nr_vecs) { struct bio *bio; #ifdef HAVE_BIO_ALLOC_4ARG bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask); #else bio = bio_alloc(gfp_mask, nr_vecs); if (likely(bio != NULL)) bio_set_dev(bio, bdev); #endif return (bio); } static inline uint_t vdev_bio_max_segs(struct block_device *bdev) { /* * Smallest of the device max segs and the tuneable max segs. Minimum * 4, so there's room to finish split pages if they come up. */ const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); #ifdef HAVE_BIO_MAX_SEGS return (bio_max_segs(max_segs)); #else return (MIN(max_segs, BIO_MAX_PAGES)); #endif } static inline uint_t vdev_bio_max_bytes(struct block_device *bdev) { return (queue_max_sectors(bdev_get_queue(bdev)) << 9); } /* * Virtual block IO object (VBIO) * * Linux block IO (BIO) objects have a limit on how many data segments (pages) * they can hold. Depending on how they're allocated and structured, a large * ZIO can require more than one BIO to be submitted to the kernel, which then * all have to complete before we can return the completed ZIO back to ZFS. * * A VBIO is a wrapper around multiple BIOs, carrying everything needed to * translate a ZIO down into the kernel block layer and back again. * * Note that these are only used for data ZIOs (read/write). Meta-operations * (flush/trim) don't need multiple BIOs and so can just make the call * directly. */ typedef struct { zio_t *vbio_zio; /* parent zio */ struct block_device *vbio_bdev; /* blockdev to submit bios to */ abd_t *vbio_abd; /* abd carrying borrowed linear buf */ uint_t vbio_max_segs; /* max segs per bio */ uint_t vbio_max_bytes; /* max bytes per bio */ uint_t vbio_lbs_mask; /* logical block size mask */ uint64_t vbio_offset; /* start offset of next bio */ struct bio *vbio_bio; /* pointer to the current bio */ int vbio_flags; /* bio flags */ } vbio_t; static vbio_t * vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) { vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); vbio->vbio_zio = zio; vbio->vbio_bdev = bdev; vbio->vbio_abd = NULL; vbio->vbio_max_segs = vdev_bio_max_segs(bdev); vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); vbio->vbio_offset = zio->io_offset; vbio->vbio_bio = NULL; vbio->vbio_flags = flags; return (vbio); } static void vbio_completion(struct bio *bio); static int vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) { struct bio *bio = vbio->vbio_bio; uint_t ssize; while (size > 0) { if (bio == NULL) { /* New BIO, allocate and set up */ bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, vbio->vbio_max_segs); VERIFY(bio); BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; bio_set_op_attrs(bio, vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? WRITE : READ, vbio->vbio_flags); if (vbio->vbio_bio) { bio_chain(vbio->vbio_bio, bio); vdev_submit_bio(vbio->vbio_bio); } vbio->vbio_bio = bio; } /* * Only load as much of the current page data as will fit in * the space left in the BIO, respecting lbs alignment. Older * kernels will error if we try to overfill the BIO, while * newer ones will accept it and split the BIO. This ensures * everything works on older kernels, and avoids an additional * overhead on the new. */ ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & vbio->vbio_lbs_mask); if (ssize > 0 && bio_add_page(bio, page, ssize, offset) == ssize) { /* Accepted, adjust and load any remaining. */ size -= ssize; offset += ssize; continue; } /* No room, set up for a new BIO and loop */ vbio->vbio_offset += BIO_BI_SIZE(bio); /* Signal new BIO allocation wanted */ bio = NULL; } return (0); } /* Iterator callback to submit ABD pages to the vbio. */ static int vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) { vbio_t *vbio = priv; return (vbio_add_page(vbio, page, len, off)); } /* Create some BIOs, fill them with data and submit them */ static void vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) { /* * We plug so we can submit the BIOs as we go and only unplug them when * they are fully created and submitted. This is important; if we don't * plug, then the kernel may start executing earlier BIOs while we're * still creating and executing later ones, and if the device goes * away while that's happening, older kernels can get confused and * trample memory. */ struct blk_plug plug; blk_start_plug(&plug); (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); ASSERT(vbio->vbio_bio); vbio->vbio_bio->bi_end_io = vbio_completion; vbio->vbio_bio->bi_private = vbio; /* * Once submitted, vbio_bio now owns vbio (through bi_private) and we * can't touch it again. The bio may complete and vbio_completion() be * called and free the vbio before this task is run again, so we must * consider it invalid from this point. */ vdev_submit_bio(vbio->vbio_bio); blk_finish_plug(&plug); } /* IO completion callback */ static void vbio_completion(struct bio *bio) { vbio_t *vbio = bio->bi_private; zio_t *zio = vbio->vbio_zio; ASSERT(zio); /* Capture and log any errors */ zio->io_error = bi_status_to_errno(bio->bi_status); ASSERT3U(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); /* Return the BIO to the kernel */ bio_put(bio); /* * We're likely in an interrupt context so we can't do ABD/memory work * here; instead we stash vbio on the zio and take care of it in the * done callback. */ ASSERT3P(zio->io_bio, ==, NULL); zio->io_bio = vbio; zio_delay_interrupt(zio); } /* * Iterator callback to count ABD pages and check their size & alignment. * * On Linux, each BIO segment can take a page pointer, and an offset+length of * the data within that page. A page can be arbitrarily large ("compound" * pages) but we still have to ensure the data portion is correctly sized and * aligned to the logical block size, to ensure that if the kernel wants to * split the BIO, the two halves will still be properly aligned. * * NOTE: if you change this function, change the copy in * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test * data there to validate the change you're making. */ typedef struct { size_t blocksize; int seen_first; int seen_last; } vdev_disk_check_alignment_t; static int vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len, void *priv) { (void) page; vdev_disk_check_alignment_t *s = priv; /* * The cardinal rule: a single on-disk block must never cross an * physical (order-0) page boundary, as the kernel expects to be able * to split at both LBS and page boundaries. * * This implies various alignment rules for the blocks in this * (possibly compound) page, which we can check for. */ /* * If the previous page did not end on a page boundary, then we * can't proceed without creating a hole. */ if (s->seen_last) return (1); /* This page must contain only whole LBS-sized blocks. */ if (!IS_P2ALIGNED(len, s->blocksize)) return (1); /* * If this is not the first page in the ABD, then the data must start * on a page-aligned boundary (so the kernel can split on page * boundaries without having to deal with a hole). If it is, then * it can start on LBS-alignment. */ if (s->seen_first) { if (!IS_P2ALIGNED(off, PAGESIZE)) return (1); } else { if (!IS_P2ALIGNED(off, s->blocksize)) return (1); s->seen_first = 1; } /* * If this data does not end on a page-aligned boundary, then this * must be the last page in the ABD, for the same reason. */ s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE); return (0); } /* * Check if we can submit the pages in this ABD to the kernel as-is. Returns * the number of pages, or 0 if it can't be submitted like this. */ static boolean_t vdev_disk_check_alignment(abd_t *abd, uint64_t size, struct block_device *bdev) { vdev_disk_check_alignment_t s = { .blocksize = bdev_logical_block_size(bdev), }; if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_alignment_cb, &s)) return (B_FALSE); return (B_TRUE); } static int vdev_disk_io_rw(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; struct block_device *bdev = BDH_BDEV(vd->vd_bdh); int flags = 0; /* * Accessing outside the block device is never allowed. */ if (zio->io_offset + zio->io_size > bdev_capacity(bdev)) { vdev_dbgmsg(zio->io_vd, "Illegal access %llu size %llu, device size %llu", (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, (u_longlong_t)bdev_capacity(bdev)); return (SET_ERROR(EIO)); } if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && v->vdev_failfast == B_TRUE) { bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); } /* * Check alignment of the incoming ABD. If any part of it would require * submitting a page that is not aligned to both the logical block size * and the page size, then we take a copy into a new memory region with * correct alignment. This should be impossible on a 512b LBS. On * larger blocks, this can happen at least when a small number of * blocks (usually 1) are allocated from a shared slab, or when * abnormally-small data regions (eg gang headers) are mixed into the * same ABD as larger allocations (eg aggregations). */ abd_t *abd = zio->io_abd; if (!vdev_disk_check_alignment(abd, zio->io_size, bdev)) { /* Allocate a new memory region with guaranteed alignment */ abd = abd_alloc_for_io(zio->io_size, zio->io_abd->abd_flags & ABD_FLAG_META); /* If we're writing copy our data into it */ if (zio->io_type == ZIO_TYPE_WRITE) abd_copy(abd, zio->io_abd, zio->io_size); /* * False here would mean the new allocation has an invalid * alignment too, which would mean that abd_alloc() is not * guaranteeing this, or our logic in * vdev_disk_check_alignment() is wrong. In either case, * something in seriously wrong and its not safe to continue. */ VERIFY(vdev_disk_check_alignment(abd, zio->io_size, bdev)); } /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ vbio_t *vbio = vbio_alloc(zio, bdev, flags); if (abd != zio->io_abd) vbio->vbio_abd = abd; /* Fill it with data pages and submit it to the kernel */ vbio_submit(vbio, abd, zio->io_size); return (0); } /* ========== */ /* * This is the classic, battle-tested BIO submission code. Until we're totally * sure that the new code is safe and correct in all cases, this will remain * available and can be enabled by setting zfs_vdev_disk_classic=1 at module * load time. * * These functions have been renamed to vdev_classic_* to make it clear what * they belong to, but their implementations are unchanged. */ /* * Virtual device vector for disks. */ typedef struct dio_request { zio_t *dr_zio; /* Parent ZIO */ atomic_t dr_ref; /* References */ int dr_error; /* Bio error */ int dr_bio_count; /* Count of bio's */ struct bio *dr_bio[]; /* Attached bio's */ } dio_request_t; static dio_request_t * vdev_classic_dio_alloc(int bio_count) { dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + sizeof (struct bio *) * bio_count, KM_SLEEP); atomic_set(&dr->dr_ref, 0); dr->dr_bio_count = bio_count; dr->dr_error = 0; for (int i = 0; i < dr->dr_bio_count; i++) dr->dr_bio[i] = NULL; return (dr); } static void vdev_classic_dio_free(dio_request_t *dr) { int i; for (i = 0; i < dr->dr_bio_count; i++) if (dr->dr_bio[i]) bio_put(dr->dr_bio[i]); kmem_free(dr, sizeof (dio_request_t) + sizeof (struct bio *) * dr->dr_bio_count); } static void vdev_classic_dio_get(dio_request_t *dr) { atomic_inc(&dr->dr_ref); } static void vdev_classic_dio_put(dio_request_t *dr) { int rc = atomic_dec_return(&dr->dr_ref); /* * Free the dio_request when the last reference is dropped and * ensure zio_interpret is called only once with the correct zio */ if (rc == 0) { zio_t *zio = dr->dr_zio; int error = dr->dr_error; vdev_classic_dio_free(dr); if (zio) { zio->io_error = error; ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); zio_delay_interrupt(zio); } } } static void vdev_classic_physio_completion(struct bio *bio) { dio_request_t *dr = bio->bi_private; if (dr->dr_error == 0) { dr->dr_error = bi_status_to_errno(bio->bi_status); } /* Drop reference acquired by vdev_classic_physio */ vdev_classic_dio_put(dr); } static inline unsigned int vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) { unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, bio_size, abd_offset); #ifdef HAVE_BIO_MAX_SEGS return (bio_max_segs(nr_segs)); #else return (MIN(nr_segs, BIO_MAX_PAGES)); #endif } static int vdev_classic_physio(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; struct block_device *bdev = BDH_BDEV(vd->vd_bdh); size_t io_size = zio->io_size; uint64_t io_offset = zio->io_offset; int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; int flags = 0; dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; int bio_size; int bio_count = 16; int error = 0; struct blk_plug plug; unsigned short nr_vecs; /* * Accessing outside the block device is never allowed. */ if (io_offset + io_size > bdev_capacity(bdev)) { vdev_dbgmsg(zio->io_vd, "Illegal access %llu size %llu, device size %llu", (u_longlong_t)io_offset, (u_longlong_t)io_size, (u_longlong_t)bdev_capacity(bdev)); return (SET_ERROR(EIO)); } retry: dr = vdev_classic_dio_alloc(bio_count); if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && zio->io_vd->vdev_failfast == B_TRUE) { bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); } dr->dr_zio = zio; /* * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio * can cover at least 128KB and at most 1MB. When the required number * of iovec's exceeds this, we are forced to break the IO in multiple * bio's and wait for them all to complete. This is likely if the * recordsize property is increased beyond 1MB. The default * bio_count=16 should typically accommodate the maximum-size zio of * 16MB. */ abd_offset = 0; bio_offset = io_offset; bio_size = io_size; for (int i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ if (bio_size <= 0) break; /* * If additional bio's are required, we have to retry, but * this should be rare - see the comment above. */ if (dr->dr_bio_count == i) { vdev_classic_dio_free(dr); bio_count *= 2; goto retry; } nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { vdev_classic_dio_free(dr); return (SET_ERROR(ENOMEM)); } /* Matching put called by vdev_classic_physio_completion */ vdev_classic_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, bio_size, abd_offset); /* Advance in buffer and construct another bio if needed */ abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); } /* Extra reference to protect dio_request during vdev_submit_bio */ vdev_classic_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); /* Submit all bio's associated with this dio */ for (int i = 0; i < dr->dr_bio_count; i++) { if (dr->dr_bio[i]) vdev_submit_bio(dr->dr_bio[i]); } if (dr->dr_bio_count > 1) blk_finish_plug(&plug); vdev_classic_dio_put(dr); return (error); } /* ========== */ static void vdev_disk_io_flush_completion(struct bio *bio) { zio_t *zio = bio->bi_private; zio->io_error = bi_status_to_errno(bio->bi_status); if (zio->io_error == EOPNOTSUPP || zio->io_error == ENOTTY) zio->io_error = SET_ERROR(ENOTSUP); bio_put(bio); ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); zio_interrupt(zio); } static int vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) { struct request_queue *q; struct bio *bio; q = bdev_get_queue(bdev); if (!q) return (SET_ERROR(ENXIO)); bio = vdev_bio_alloc(bdev, GFP_NOIO, 0); if (unlikely(bio == NULL)) return (SET_ERROR(ENOMEM)); bio->bi_end_io = vdev_disk_io_flush_completion; bio->bi_private = zio; bio_set_flush(bio); vdev_submit_bio(bio); invalidate_bdev(bdev); return (0); } static void vdev_disk_discard_end_io(struct bio *bio) { zio_t *zio = bio->bi_private; zio->io_error = bi_status_to_errno(bio->bi_status); bio_put(bio); if (zio->io_error) vdev_disk_error(zio); zio_interrupt(zio); } /* * Wrappers for the different secure erase and discard APIs. We use async * when available; in this case, *biop is set to the last bio in the chain. */ static int vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector, sector_t nsect, struct bio **biop) { *biop = NULL; int error; #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) error = blkdev_issue_secure_erase(BDH_BDEV(bdh), sector, nsect, GFP_NOFS); #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) error = __blkdev_issue_discard(BDH_BDEV(bdh), sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop); #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) error = blkdev_issue_discard(BDH_BDEV(bdh), sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE); #else #error "unsupported kernel" #endif return (error); } static int vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector, sector_t nsect, struct bio **biop) { *biop = NULL; int error; #if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) error = __blkdev_issue_discard(BDH_BDEV(bdh), sector, nsect, GFP_NOFS, 0, biop); #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS) error = __blkdev_issue_discard(BDH_BDEV(bdh), sector, nsect, GFP_NOFS, biop); #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) error = blkdev_issue_discard(BDH_BDEV(bdh), sector, nsect, GFP_NOFS, 0); #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS) error = blkdev_issue_discard(BDH_BDEV(bdh), sector, nsect, GFP_NOFS); #else #error "unsupported kernel" #endif return (error); } /* * Entry point for TRIM ops. This calls the right wrapper for secure erase or * discard, and then does the appropriate finishing work for error vs success * and async vs sync. */ static int vdev_disk_io_trim(zio_t *zio) { int error; struct bio *bio; zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh; sector_t sector = zio->io_offset >> 9; sector_t nsects = zio->io_size >> 9; if (zio->io_trim_flags & ZIO_TRIM_SECURE) error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio); else error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio); if (error != 0) return (SET_ERROR(-error)); if (bio == NULL) { /* * This was a synchronous op that completed successfully, so * return it to ZFS immediately. */ zio_interrupt(zio); } else { /* * This was an asynchronous op; set up completion callback and * issue it. */ bio->bi_private = zio; bio->bi_end_io = vdev_disk_discard_end_io; vdev_submit_bio(bio); } return (0); } int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; static void vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; int error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. * Nothing to be done here but return failure. */ if (vd == NULL) { zio->io_error = ENXIO; zio_interrupt(zio); return; } rw_enter(&vd->vd_lock, RW_READER); /* * If the vdev is closed, it's likely due to a failed reopen and is * in the UNAVAIL state. Nothing to be done here but return failure. */ if (vd->vd_bdh == NULL) { rw_exit(&vd->vd_lock); zio->io_error = ENXIO; zio_interrupt(zio); return; } switch (zio->io_type) { case ZIO_TYPE_FLUSH: if (!vdev_readable(v)) { /* Drive not there, can't flush */ error = SET_ERROR(ENXIO); } else if (zfs_nocacheflush) { /* Flushing disabled by operator, declare success */ error = 0; } else if (v->vdev_nowritecache) { /* This vdev not capable of flushing */ error = SET_ERROR(ENOTSUP); } else { /* * Issue the flush. If successful, the response will * be handled in the completion callback, so we're done. */ error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); if (error == 0) { rw_exit(&vd->vd_lock); return; } } /* Couldn't issue the flush, so set the error and return it */ rw_exit(&vd->vd_lock); zio->io_error = error; zio_execute(zio); return; case ZIO_TYPE_TRIM: error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); if (error) { zio->io_error = error; zio_execute(zio); } return; case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); error = vdev_disk_io_rw_fn(zio); rw_exit(&vd->vd_lock); if (error) { zio->io_error = error; zio_interrupt(zio); } return; default: /* * Getting here means our parent vdev has made a very strange * request of us, and shouldn't happen. Assert here to force a * crash in dev builds, but in production return the IO * unhandled. The pool will likely suspend anyway but that's * nicer than crashing the kernel. */ ASSERT3S(zio->io_type, ==, -1); rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; } __builtin_unreachable(); } static void vdev_disk_io_done(zio_t *zio) { /* If this was a read or write, we need to clean up the vbio */ if (zio->io_bio != NULL) { vbio_t *vbio = zio->io_bio; zio->io_bio = NULL; /* * If we copied the ABD before issuing it, clean up and return * the copy to the ADB, with changes if appropriate. */ if (vbio->vbio_abd != NULL) { if (zio->io_type == ZIO_TYPE_READ) abd_copy(zio->io_abd, vbio->vbio_abd, zio->io_size); abd_free(vbio->vbio_abd); vbio->vbio_abd = NULL; } /* Final cleanup */ kmem_free(vbio, sizeof (vbio_t)); } /* * If the device returned EIO, we revalidate the media. If it is * determined the media has changed this triggers the asynchronous * removal of the device from the configuration. */ if (zio->io_error == EIO) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) { invalidate_bdev(BDH_BDEV(vd->vd_bdh)); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } } } static void vdev_disk_hold(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); /* We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') return; /* * Only prefetch path and devid info if the device has * never been opened. */ if (vd->vdev_tsd != NULL) return; } static void vdev_disk_rele(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); /* XXX: Implement me as a vnode rele for the device */ } /* * BIO submission method. See comment above about vdev_classic. * Set zfs_vdev_disk_classic=0 for new, =1 for classic */ static uint_t zfs_vdev_disk_classic = 0; /* default new */ /* Set submission function from module parameter */ static int vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) { int err = param_set_uint(buf, kp); if (err < 0) return (SET_ERROR(err)); vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; printk(KERN_INFO "ZFS: forcing %s BIO submission\n", zfs_vdev_disk_classic ? "classic" : "new"); return (0); } /* * At first use vdev use, set the submission function from the default value if * it hasn't been set already. */ static int vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) { (void) spa; (void) nv; (void) tsd; if (vdev_disk_io_rw_fn == NULL) vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; return (0); } vdev_ops_t vdev_disk_ops = { .vdev_op_init = vdev_disk_init, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, + .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, .vdev_op_io_done = vdev_disk_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = vdev_disk_hold, .vdev_op_rele = vdev_disk_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE, /* leaf vdev */ .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post }; /* * The zfs_vdev_scheduler module option has been deprecated. Setting this * value no longer has any effect. It has not yet been entirely removed * to allow the module to be loaded if this option is specified in the * /etc/modprobe.d/zfs.conf file. The following warning will be logged. */ static int param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) { int error = param_set_charp(val, kp); if (error == 0) { printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " "is not supported.\n"); } return (error); } static const char *zfs_vdev_scheduler = "unused"; module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, param_get_charp, &zfs_vdev_scheduler, 0644); MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); int param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { uint_t val; int error; error = kstrtouint(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) return (SET_ERROR(-EINVAL)); error = param_set_uint(buf, kp); if (error < 0) return (SET_ERROR(error)); return (0); } int param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { uint_t val; int error; error = kstrtouint(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) return (SET_ERROR(-EINVAL)); error = param_set_uint(buf, kp); if (error < 0) return (SET_ERROR(error)); return (0); } ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, "Timeout before determining that a device is missing"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, "Maximum number of data segments to add to an IO request (min 4)"); ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, "Use classic BIO submission method"); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 8e7138984e2e..58f0975e166f 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1,6285 +1,6395 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) /* * Metaslab group's per child vdev granularity, in bytes. This is roughly * similar to what would be referred to as the "stripe size" in traditional * RAID arrays. In normal operation, we will try to write this amount of * data to each disk before moving on to the next top-level vdev. */ static uint64_t metaslab_aliquot = 2 * 1024 * 1024; /* * For testing, make some blocks above a certain size be gang blocks. */ uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* * Of blocks of size >= metaslab_force_ganging, actually gang them this often. */ uint_t metaslab_force_ganging_pct = 3; /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered * around the disk. So a sane default for the space map block size * is 8~16K. */ int zfs_metaslab_sm_blksz_no_log = (1 << 14); /* * When the log space map feature is enabled, we accumulate a lot of * changes per metaslab that are flushed once in a while so we benefit * from a bigger block size like 128K for the metaslab space maps. */ int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ uint_t zfs_condense_pct = 200; /* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksz), so a metaslab might use the * same number of blocks after condensing. Since the goal of condensing is to * reduce the number of IOPs required to read the space map, we only want to * condense when we can be sure we will reduce the number of blocks used by the * space map. Unfortunately, we cannot precisely compute whether or not this is * the case in metaslab_should_condense since we are holding ms_lock. Instead, * we apply the following heuristic: do not condense a spacemap unless the * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold * blocks. */ static const int zfs_metaslab_condense_block_threshold = 4; /* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of * free space. Metaslab groups that have more free space than * zfs_mg_noalloc_threshold are always eligible for allocations. Once * a metaslab group's free space is less than or equal to the * zfs_mg_noalloc_threshold the allocator will avoid allocating to that * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. * Once all groups in the pool reach zfs_mg_noalloc_threshold then all * groups are allowed to accept allocations. Gang blocks are always * eligible to allocate on any metaslab group. The default value of 0 means * no metaslab group will be excluded based on this criterion. */ static uint_t zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their * fragmentation metric (measured as a percentage) is less than or * equal to zfs_mg_fragmentation_threshold. If a metaslab group * exceeds this threshold then it will be skipped unless all metaslab * groups within the metaslab class have also crossed this threshold. * * This tunable was introduced to avoid edge cases where we continue * allocating from very fragmented disks in our pool while other, less * fragmented disks, exists. On the other hand, if all disks in the * pool are uniformly approaching the threshold, the threshold can * be a speed bump in performance, where we keep switching the disks * that we allocate from (e.g. we allocate some segments from disk A * making it bypassing the threshold while freeing segments from disk * B getting its fragmentation below the threshold). * * Empirically, we've seen that our vdev selection for allocations is * good enough that fragmentation increases uniformly across all vdevs * the majority of the time. Thus we set the threshold percentage high * enough to avoid hitting the speed bump on pools that are being pushed * to the edge. */ static uint_t zfs_mg_fragmentation_threshold = 95; /* * Allow metaslabs to keep their active state as long as their fragmentation * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An * active metaslab that exceeds this threshold will no longer keep its active * status allowing better metaslabs to be selected. */ static uint_t zfs_metaslab_fragmentation_threshold = 77; /* * When set will load all metaslabs when pool is first opened. */ int metaslab_debug_load = B_FALSE; /* * When set will prevent metaslabs from being unloaded. */ static int metaslab_debug_unload = B_FALSE; /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ uint_t metaslab_df_free_pct = 4; /* * Maximum distance to search forward from the last offset. Without this * limit, fragmented pools can see >100,000 iterations and * metaslab_block_picker() becomes the performance limiting factor on * high-performance storage. * * With the default setting of 16MB, we typically see less than 500 * iterations, even with very fragmented, ashift=9 pools. The maximum number * of iterations possible is: * metaslab_df_max_search / (2 * (1<60KB (but fewer segments in this * bucket, and therefore a lower weight). */ static uint_t zfs_metaslab_find_max_tries = 100; static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); static void metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg); kmem_cache_t *metaslab_alloc_trace_cache; typedef struct metaslab_stats { kstat_named_t metaslabstat_trace_over_limit; kstat_named_t metaslabstat_reload_tree; kstat_named_t metaslabstat_too_many_tries; kstat_named_t metaslabstat_try_hard; } metaslab_stats_t; static metaslab_stats_t metaslab_stats = { { "trace_over_limit", KSTAT_DATA_UINT64 }, { "reload_tree", KSTAT_DATA_UINT64 }, { "too_many_tries", KSTAT_DATA_UINT64 }, { "try_hard", KSTAT_DATA_UINT64 }, }; #define METASLABSTAT_BUMP(stat) \ atomic_inc_64(&metaslab_stats.stat.value.ui64); static kstat_t *metaslab_ksp; void metaslab_stat_init(void) { ASSERT(metaslab_alloc_trace_cache == NULL); metaslab_alloc_trace_cache = kmem_cache_create( "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats", "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (metaslab_ksp != NULL) { metaslab_ksp->ks_data = &metaslab_stats; kstat_install(metaslab_ksp); } } void metaslab_stat_fini(void) { if (metaslab_ksp != NULL) { kstat_delete(metaslab_ksp); metaslab_ksp = NULL; } kmem_cache_destroy(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = NULL; } /* * ========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops, boolean_t is_log) { metaslab_class_t *mc; mc = kmem_zalloc(offsetof(metaslab_class_t, mc_allocator[spa->spa_alloc_count]), KM_SLEEP); mc->mc_spa = spa; mc->mc_ops = ops; mc->mc_is_log = is_log; mc->mc_alloc_io_size = SPA_OLD_MAXBLOCKSIZE; mc->mc_alloc_max = UINT64_MAX; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t), offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; mutex_init(&mca->mca_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&mca->mca_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); mca->mca_rotor = NULL; mca->mca_reserved = 0; } return (mc); } void metaslab_class_destroy(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; avl_destroy(&mca->mca_tree); mutex_destroy(&mca->mca_lock); ASSERT(mca->mca_rotor == NULL); ASSERT0(mca->mca_reserved); } mutex_destroy(&mc->mc_lock); multilist_destroy(&mc->mc_metaslab_txg_list); kmem_free(mc, offsetof(metaslab_class_t, mc_allocator[spa->spa_alloc_count])); } void metaslab_class_validate(metaslab_class_t *mc) { #ifdef ZFS_DEBUG spa_t *spa = mc->mc_spa; /* * Must hold one of the spa_config locks. */ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) || spa_config_held(spa, SCL_ALL, RW_WRITER)); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; metaslab_group_t *mg, *rotor; ASSERT0(avl_numnodes(&mca->mca_tree)); ASSERT0(mca->mca_reserved); if ((mg = rotor = mca->mca_rotor) == NULL) continue; do { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; vdev_t *vd = mg->mg_vd; ASSERT3P(vd->vdev_top, ==, vd); ASSERT(vd->vdev_mg == mg || vd->vdev_log_mg == mg); ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); ASSERT0(zfs_refcount_count(&mga->mga_queue_depth)); } while ((mg = mg->mg_next) != rotor); } #endif } /* * For each metaslab group in a class pre-calculate allocation quota and * target queue depth to balance their space usage and write performance. * Based on those pre-calculate class allocation throttle threshold for * optimal saturation. onsync is true once per TXG to enable/disable * allocation throttling and update moving average of maximum I/O size. */ void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync) { metaslab_group_t *mg, *first; /* * Must hold one of the spa_config locks. */ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); if (onsync) metaslab_class_validate(mc); if (mc->mc_groups == 0) { if (onsync) mc->mc_alloc_throttle_enabled = B_FALSE; mc->mc_alloc_max = UINT64_MAX; return; } if (onsync) { /* * Moving average of maximum allocation size, in absence of * large allocations shrinking to 1/8 of metaslab_aliquot. */ mc->mc_alloc_io_size = (3 * mc->mc_alloc_io_size + metaslab_aliquot / 8) / 4; mc->mc_alloc_throttle_enabled = mc->mc_is_log ? 0 : zio_dva_throttle_enabled; } mg = first = mc->mc_allocator[0].mca_rotor; uint64_t children = 0; do { children += vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd); } while ((mg = mg->mg_next) != first); uint64_t sum_aliquot = 0; do { vdev_stat_t *vs = &mg->mg_vd->vdev_stat; uint_t ratio; /* * Scale allocations per iteration with average number of * children. Wider vdevs need more sequential allocations * to keep decent per-child I/O size. */ uint64_t mg_aliquot = MAX(metaslab_aliquot * children / mc->mc_groups, mc->mc_alloc_io_size * 4); /* * Scale allocations per iteration with the vdev capacity, * relative to average. Bigger vdevs should get more to * fill up at the same time as smaller ones. */ if (mc->mc_space > 0 && vs->vs_space > 0) { ratio = vs->vs_space / (mc->mc_space / (mc->mc_groups * 256) + 1); mg_aliquot = mg_aliquot * ratio / 256; } /* * Scale allocations per iteration with the vdev's free space * fraction, relative to average. Despite the above, vdevs free * space fractions may get imbalanced, for example due to new * vdev addition or different performance. We want free space * fractions to be similar to postpone fragmentation. * * But same time we don't want to throttle vdevs still having * plenty of free space, that appear faster than others, even * if that cause temporary imbalance. Allow them to allocate * more by keeping their allocation queue depth equivalent to * 2.5 full iteration, even if they repeatedly drain it. Later * with the free space reduction gradually reduce the target * queue depth, stronger enforcing the free space balance. */ if (metaslab_bias_enabled && mc->mc_space > 0 && vs->vs_space > 0) { uint64_t vs_free = vs->vs_space > vs->vs_alloc ? vs->vs_space - vs->vs_alloc : 0; uint64_t mc_free = mc->mc_space > mc->mc_alloc ? mc->mc_space - mc->mc_alloc : 0; /* * vs_fr is 16 bit fixed-point free space fraction. * mc_fr is 8 bit fixed-point free space fraction. * ratio as their quotient is 8 bit fixed-point. */ uint_t vs_fr = vs_free / (vs->vs_space / 65536 + 1); uint_t mc_fr = mc_free / (mc->mc_space / 256 + 1); ratio = vs_fr / (mc_fr + 1); mg->mg_aliquot = mg_aliquot * ratio / 256; /* From 2.5x at 25% full to 1x at 75%. */ ratio = MIN(163840, vs_fr * 3 + 16384); mg->mg_queue_target = MAX(mg->mg_aliquot, mg->mg_aliquot * ratio / 65536); } else { mg->mg_aliquot = mg_aliquot; mg->mg_queue_target = mg->mg_aliquot * 2; } sum_aliquot += mg->mg_aliquot; } while ((mg = mg->mg_next) != first); /* * Set per-class allocation throttle threshold to 4 iterations through * all the vdevs. This should keep all vdevs busy even if some are * allocating more than we planned for them due to bigger blocks or * better performance. */ mc->mc_alloc_max = sum_aliquot * 4; } static void metaslab_class_rotate(metaslab_group_t *mg, int allocator, uint64_t psize, boolean_t success) { metaslab_class_t *mc = mg->mg_class; metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; /* * Exit fast if there is nothing to rotate, we are not following * the rotor (copies, gangs, etc) or somebody already rotated it. */ if (mc->mc_groups < 2 || mca->mca_rotor != mg) return; /* * Always rotate in case of allocation error or a log class. */ if (!success || mc->mc_is_log) goto rotate; /* * Allocate from this group if we expect next I/O of the same size to * mostly fit within the allocation quota. Rotate if we expect it to * mostly go over the target queue depth. Meanwhile, to stripe between * groups in configured amounts per child even if we can't reach the * target queue depth, i.e. can't saturate the group write performance, * always rotate after allocating the queue target bytes. */ uint64_t naq = atomic_add_64_nv(&mca->mca_aliquot, psize) + psize / 2; if (naq < mg->mg_aliquot) return; if (naq >= mg->mg_queue_target) goto rotate; if (zfs_refcount_count(&mga->mga_queue_depth) + psize + psize / 2 >= mg->mg_queue_target) goto rotate; /* * When the pool is not too busy, prefer restoring the vdev free space * balance instead of getting maximum speed we might not need, so that * we could have more flexibility during more busy times later. */ if (metaslab_perf_bias <= 0) goto rotate; if (metaslab_perf_bias >= 2) return; spa_t *spa = mc->mc_spa; dsl_pool_t *dp = spa_get_dsl(spa); if (dp == NULL) return; uint64_t busy_thresh = zfs_dirty_data_max * (zfs_vdev_async_write_active_min_dirty_percent + zfs_vdev_async_write_active_max_dirty_percent) / 200; if (dp->dp_dirty_total > busy_thresh || spa_has_pending_synctask(spa)) return; rotate: mca->mca_rotor = mg->mg_next; mca->mca_aliquot = 0; } static void metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { atomic_add_64(&mc->mc_alloc, alloc_delta); atomic_add_64(&mc->mc_deferred, defer_delta); atomic_add_64(&mc->mc_space, space_delta); atomic_add_64(&mc->mc_dspace, dspace_delta); } uint64_t metaslab_class_get_alloc(metaslab_class_t *mc) { return (mc->mc_alloc); } uint64_t metaslab_class_get_deferred(metaslab_class_t *mc) { return (mc->mc_deferred); } uint64_t metaslab_class_get_space(metaslab_class_t *mc) { return (mc->mc_space); } uint64_t metaslab_class_get_dspace(metaslab_class_t *mc) { return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } void metaslab_class_histogram_verify(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t *mc_hist; int i; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mc_hist = kmem_zalloc(sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); mutex_enter(&mc->mc_lock); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = vdev_get_mg(tvd, mc); /* * Skip any holes, uninitialized top-levels, or * vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); } mutex_exit(&mc->mc_lock); kmem_free(mc_hist, sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE); } /* * Calculate the metaslab class's fragmentation metric. The metric * is weighted based on the space contribution of each metaslab group. * The return value will be a number between 0 and 100 (inclusive), or * ZFS_FRAG_INVALID if the metric has not been set. See comment above the * zfs_frag_table for more information about the metric. */ uint64_t metaslab_class_fragmentation(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t fragmentation = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; /* * Skip any holes, uninitialized top-levels, * or vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * If a metaslab group does not contain a fragmentation * metric then just bail out. */ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (ZFS_FRAG_INVALID); } /* * Determine how much this metaslab_group is contributing * to the overall pool fragmentation metric. */ fragmentation += mg->mg_fragmentation * metaslab_group_get_space(mg); } fragmentation /= metaslab_class_get_space(mc); ASSERT3U(fragmentation, <=, 100); spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (fragmentation); } /* * Calculate the amount of expandable space that is available in * this metaslab class. If a device is expanded then its expandable * space will be the amount of allocatable space that is currently not * part of this metaslab class. */ uint64_t metaslab_class_expandable_space(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t space = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * Calculate if we have enough space to add additional * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. */ space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, 1ULL << tvd->vdev_ms_shift, uint64_t); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); } void metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; uint64_t now = gethrestime_sec(); /* Round delay up to next second. */ uint_t delay = (metaslab_unload_delay_ms + 999) / 1000; for (int i = 0; i < multilist_get_num_sublists(ml); i++) { multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { mutex_enter(&msp->ms_lock); /* * If the metaslab has been removed from the list * (which could happen if we were at the memory limit * and it was evicted during this loop), then we can't * proceed and we should restart the sublist. */ if (!multilist_link_active(&msp->ms_class_txg_node)) { mutex_exit(&msp->ms_lock); i--; break; } mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > msp->ms_selected_txg + metaslab_unload_delay && now > msp->ms_selected_time + delay && (msp->ms_allocator == -1 || !metaslab_preload_enabled)) { metaslab_evict(msp, txg); } else { /* * Once we've hit a metaslab selected too * recently to evict, we're done evicting for * now. */ mutex_exit(&msp->ms_lock); break; } mutex_exit(&msp->ms_lock); msp = next_msp; } } } static int metaslab_compare(const void *x1, const void *x2) { const metaslab_t *m1 = (const metaslab_t *)x1; const metaslab_t *m2 = (const metaslab_t *)x2; int sort1 = 0; int sort2 = 0; if (m1->ms_allocator != -1 && m1->ms_primary) sort1 = 1; else if (m1->ms_allocator != -1 && !m1->ms_primary) sort1 = 2; if (m2->ms_allocator != -1 && m2->ms_primary) sort2 = 1; else if (m2->ms_allocator != -1 && !m2->ms_primary) sort2 = 2; /* * Sort inactive metaslabs first, then primaries, then secondaries. When * selecting a metaslab to allocate from, an allocator first tries its * primary, then secondary active metaslab. If it doesn't have active * metaslabs, or can't allocate from them, it searches for an inactive * metaslab to activate. If it can't find a suitable one, it will steal * a primary or secondary metaslab from another allocator. */ if (sort1 < sort2) return (-1); if (sort1 > sort2) return (1); int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); return (TREE_CMP(m1->ms_start, m2->ms_start)); } /* * ========================================================================== * Metaslab groups * ========================================================================== */ /* * Update the allocatable flag and the metaslab group's capacity. * The allocatable flag is set to true if the capacity is below * the zfs_mg_noalloc_threshold or has a fragmentation value that is * greater than zfs_mg_fragmentation_threshold. If a metaslab group * transitions from allocatable to non-allocatable or vice versa then the * metaslab group's class is updated to reflect the transition. */ static void metaslab_group_alloc_update(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; metaslab_class_t *mc = mg->mg_class; vdev_stat_t *vs = &vd->vdev_stat; boolean_t was_allocatable; boolean_t was_initialized; ASSERT(vd == vd->vdev_top); ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, SCL_ALLOC); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; was_initialized = mg->mg_initialized; uint64_t free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); mutex_enter(&mc->mc_lock); /* * If the metaslab group was just added then it won't * have any space until we finish syncing out this txg. * At that point we will consider it initialized and available * for allocations. We also don't consider non-activated * metaslab groups (e.g. vdevs that are in the middle of being removed) * to be initialized, because they can't be used for allocation. */ mg->mg_initialized = metaslab_group_initialized(mg); if (!was_initialized && mg->mg_initialized) { mc->mc_groups++; } else if (was_initialized && !mg->mg_initialized) { ASSERT3U(mc->mc_groups, >, 0); mc->mc_groups--; } if (mg->mg_initialized) mg->mg_no_free_space = B_FALSE; /* * A metaslab group is considered allocatable if it has plenty * of free space or is not heavily fragmented. We only take * fragmentation into account if the metaslab group has a valid * fragmentation metric (i.e. a value between 0 and 100). */ mg->mg_allocatable = (mg->mg_activation_count > 0 && free_capacity > zfs_mg_noalloc_threshold && (mg->mg_fragmentation == ZFS_FRAG_INVALID || mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); /* * The mc_alloc_groups maintains a count of the number of * groups in this metaslab class that are still above the * zfs_mg_noalloc_threshold. This is used by the allocating * threads to determine if they should avoid allocations to * a given group. The allocator will avoid allocations to a group * if that group has reached or is below the zfs_mg_noalloc_threshold * and there are still other groups that are above the threshold. * When a group transitions from allocatable to non-allocatable or * vice versa we update the metaslab class to reflect that change. * When the mc_alloc_groups value drops to 0 that means that all * groups have reached the zfs_mg_noalloc_threshold making all groups * eligible for allocations. This effectively means that all devices * are balanced again. */ if (was_allocatable && !mg->mg_allocatable) mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } int metaslab_sort_by_flushed(const void *va, const void *vb) { const metaslab_t *a = va; const metaslab_t *b = vb; int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); if (likely(cmp)) return (cmp); uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; cmp = TREE_CMP(a_vdev_id, b_vdev_id); if (cmp) return (cmp); return (TREE_CMP(a->ms_id, b->ms_id)); } metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) { spa_t *spa = mc->mc_spa; metaslab_group_t *mg; mg = kmem_zalloc(offsetof(metaslab_group_t, mg_allocator[spa->spa_alloc_count]), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; mg->mg_initialized = B_FALSE; mg->mg_no_free_space = B_TRUE; for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_create_tracked(&mga->mga_queue_depth); } return (mg); } void metaslab_group_destroy(metaslab_group_t *mg) { spa_t *spa = mg->mg_class->mc_spa; ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); /* * We may have gone below zero with the activation count * either because we never activated in the first place or * because we're done, and possibly removing the vdev. */ ASSERT(mg->mg_activation_count <= 0); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_disabled_lock); cv_destroy(&mg->mg_ms_disabled_cv); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_destroy(&mga->mga_queue_depth); } kmem_free(mg, offsetof(metaslab_group_t, mg_allocator[spa->spa_alloc_count])); } void metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count <= 0); if (++mg->mg_activation_count <= 0) return; metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { mg->mg_prev = mg; mg->mg_next = mg; } else { mgnext = mgprev->mg_next; mg->mg_prev = mgprev; mg->mg_next = mgnext; mgprev->mg_next = mg; mgnext->mg_prev = mg; } for (int i = 0; i < spa->spa_alloc_count; i++) { mc->mc_allocator[i].mca_rotor = mg; mg = mg->mg_next; } metaslab_class_balance(mc, B_FALSE); } /* * Passivate a metaslab group and remove it from the allocation rotor. * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating * a metaslab group. This function will momentarily drop spa_config_locks * that are lower than the SCL_ALLOC lock (see comment below). */ void metaslab_group_passivate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { for (int i = 0; i < spa->spa_alloc_count; i++) ASSERT(mc->mc_allocator[i].mca_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count < 0); return; } /* * The spa_config_lock is an array of rwlocks, ordered as * follows (from highest to lowest): * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > * SCL_ZIO > SCL_FREE > SCL_VDEV * (For more information about the spa_config_lock see spa_misc.c) * The higher the lock, the broader its coverage. When we passivate * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO * config locks. However, the metaslab group's taskq might be trying * to preload metaslabs so we must drop the SCL_ZIO lock and any * lower locks to allow the I/O to complete. At a minimum, * we continue to hold the SCL_ALLOC lock, which prevents any future * allocations from taking place and any changes to the vdev tree. */ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; metaslab_t *msp = mga->mga_primary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } msp = mga->mga_secondary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } } mgprev = mg->mg_prev; mgnext = mg->mg_next; if (mg == mgnext) { mgnext = NULL; } else { mgprev->mg_next = mgnext; mgnext->mg_prev = mgprev; } for (int i = 0; i < spa->spa_alloc_count; i++) { if (mc->mc_allocator[i].mca_rotor == mg) mc->mc_allocator[i].mca_rotor = mgnext; } mg->mg_prev = NULL; mg->mg_next = NULL; metaslab_class_balance(mc, B_FALSE); } boolean_t metaslab_group_initialized(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; vdev_stat_t *vs = &vd->vdev_stat; return (vs->vs_space != 0 && mg->mg_activation_count > 0); } uint64_t metaslab_group_get_space(metaslab_group_t *mg) { /* * Note that the number of nodes in mg_metaslab_tree may be one less * than vdev_ms_count, due to the embedded log metaslab. */ mutex_enter(&mg->mg_lock); uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); mutex_exit(&mg->mg_lock); return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); } void metaslab_group_histogram_verify(metaslab_group_t *mg) { uint64_t *mg_hist; avl_tree_t *t = &mg->mg_metaslab_tree; uint64_t ashift = mg->mg_vd->vdev_ashift; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mg_hist = kmem_zalloc(sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); ASSERT3U(ZFS_RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); mutex_enter(&mg->mg_lock); for (metaslab_t *msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { VERIFY3P(msp->ms_group, ==, mg); /* skip if not active */ if (msp->ms_sm == NULL) continue; for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { mg_hist[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } } for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); mutex_exit(&mg->mg_lock); kmem_free(mg_hist, sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE); } static void metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } void metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { ASSERT3U(mg->mg_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { ASSERT(msp->ms_group == NULL); mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); mutex_enter(&msp->ms_lock); metaslab_group_histogram_add(mg, msp); mutex_exit(&msp->ms_lock); } static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { mutex_enter(&msp->ms_lock); metaslab_group_histogram_remove(mg, msp); mutex_exit(&msp->ms_lock); mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); multilist_sublist_unlock(mls); msp->ms_group = NULL; mutex_exit(&mg->mg_lock); } static void metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&mg->mg_lock)); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_weight = weight; avl_add(&mg->mg_metaslab_tree, msp); } static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { /* * Although in principle the weight can be any value, in * practice we do not use values in the range [1, 511]. */ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } /* * Calculate the fragmentation for a given metaslab group. Weight metaslabs * on the amount of free space. The return value will be between 0 and 100 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this * group have a fragmentation metric. */ uint64_t metaslab_group_fragmentation(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; uint64_t fragmentation = 0; uint64_t valid_ms = 0, total_ms = 0; uint64_t free, total_free = 0; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp->ms_group != mg) continue; total_ms++; if (msp->ms_fragmentation == ZFS_FRAG_INVALID) continue; valid_ms++; free = (msp->ms_size - metaslab_allocated_space(msp)) / SPA_MINBLOCKSIZE; /* To prevent overflows. */ total_free += free; fragmentation += msp->ms_fragmentation * free; } if (valid_ms < (total_ms + 1) / 2 || total_free == 0) return (ZFS_FRAG_INVALID); fragmentation /= total_free; ASSERT3U(fragmentation, <=, 100); return (fragmentation); } /* * ========================================================================== * Range tree callbacks * ========================================================================== */ /* * Comparison function for the private size-ordered tree using 32-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ __attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { const zfs_range_seg32_t *r1 = x1; const zfs_range_seg32_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Comparison function for the private size-ordered tree using 64-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ __attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { const zfs_range_seg64_t *r1 = x1; const zfs_range_seg64_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } typedef struct metaslab_rt_arg { zfs_btree_t *mra_bt; uint32_t mra_floor_shift; } metaslab_rt_arg_t; struct mssa_arg { zfs_range_tree_t *rt; metaslab_rt_arg_t *mra; }; static void metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) { struct mssa_arg *mssap = arg; zfs_range_tree_t *rt = mssap->rt; metaslab_rt_arg_t *mrap = mssap->mra; zfs_range_seg_max_t seg = {0}; zfs_rs_set_start(&seg, rt, start); zfs_rs_set_end(&seg, rt, start + size); metaslab_rt_add(rt, &seg, mrap); } static void metaslab_size_tree_full_load(zfs_range_tree_t *rt) { metaslab_rt_arg_t *mrap = rt->rt_arg; METASLABSTAT_BUMP(metaslabstat_reload_tree); ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); mrap->mra_floor_shift = 0; struct mssa_arg arg = {0}; arg.rt = rt; arg.mra = mrap; zfs_range_tree_walk(rt, metaslab_size_sorted_add, &arg); } ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, zfs_range_seg32_t, metaslab_rangesize32_compare) ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, zfs_range_seg64_t, metaslab_rangesize64_compare) /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered zfs_range_tree_t and an array of * uint64_t's. */ static void metaslab_rt_create(zfs_range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; size_t size; int (*compare) (const void *, const void *); bt_find_in_buf_f bt_find; switch (rt->rt_type) { case ZFS_RANGE_SEG32: size = sizeof (zfs_range_seg32_t); compare = metaslab_rangesize32_compare; bt_find = metaslab_rt_find_rangesize32_in_buf; break; case ZFS_RANGE_SEG64: size = sizeof (zfs_range_seg64_t); compare = metaslab_rangesize64_compare; bt_find = metaslab_rt_find_rangesize64_in_buf; break; default: panic("Invalid range seg type %d", rt->rt_type); } zfs_btree_create(size_tree, compare, bt_find, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } static void metaslab_rt_destroy(zfs_range_tree_t *rt, void *arg) { (void) rt; metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; zfs_btree_destroy(size_tree); kmem_free(mrap, sizeof (*mrap)); } static void metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_add(size_tree, rs); } static void metaslab_rt_remove(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_remove(size_tree, rs); } static void metaslab_rt_vacate(zfs_range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; zfs_btree_clear(size_tree); zfs_btree_destroy(size_tree); metaslab_rt_create(rt, arg); } static const zfs_range_tree_ops_t metaslab_rt_ops = { .rtop_create = metaslab_rt_create, .rtop_destroy = metaslab_rt_destroy, .rtop_add = metaslab_rt_add, .rtop_remove = metaslab_rt_remove, .rtop_vacate = metaslab_rt_vacate }; /* * ========================================================================== * Common allocator routines * ========================================================================== */ /* * Return the maximum contiguous segment within the metaslab. */ uint64_t metaslab_largest_allocatable(metaslab_t *msp) { zfs_btree_t *t = &msp->ms_allocatable_by_size; zfs_range_seg_t *rs; if (t == NULL) return (0); if (zfs_btree_numnodes(t) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); rs = zfs_btree_last(t, NULL); if (rs == NULL) return (0); return (zfs_rs_get_end(rs, msp->ms_allocatable) - zfs_rs_get_start(rs, msp->ms_allocatable)); } /* * Return the maximum contiguous segment within the unflushed frees of this * metaslab. */ static uint64_t metaslab_largest_unflushed_free(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_unflushed_frees == NULL) return (0); if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) metaslab_size_tree_full_load(msp->ms_unflushed_frees); zfs_range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, NULL); if (rs == NULL) return (0); /* * When a range is freed from the metaslab, that range is added to * both the unflushed frees and the deferred frees. While the block * will eventually be usable, if the metaslab were loaded the range * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE * txgs had passed. As a result, when attempting to estimate an upper * bound for the largest currently-usable free segment in the * metaslab, we need to not consider any ranges currently in the defer * trees. This algorithm approximates the largest available chunk in * the largest range in the unflushed_frees tree by taking the first * chunk. While this may be a poor estimate, it should only remain so * briefly and should eventually self-correct as frees are no longer * deferred. Similar logic applies to the ms_freed tree. See * metaslab_load() for more details. * * There are two primary sources of inaccuracy in this estimate. Both * are tolerated for performance reasons. The first source is that we * only check the largest segment for overlaps. Smaller segments may * have more favorable overlaps with the other trees, resulting in * larger usable chunks. Second, we only look at the first chunk in * the largest segment; there may be other usable chunks in the * largest segment, but we ignore them. */ uint64_t rstart = zfs_rs_get_start(rs, msp->ms_unflushed_frees); uint64_t rsize = zfs_rs_get_end(rs, msp->ms_unflushed_frees) - rstart; for (int t = 0; t < TXG_DEFER_SIZE; t++) { uint64_t start = 0; uint64_t size = 0; boolean_t found = zfs_range_tree_find_in(msp->ms_defer[t], rstart, rsize, &start, &size); if (found) { if (rstart == start) return (0); rsize = start - rstart; } } uint64_t start = 0; uint64_t size = 0; boolean_t found = zfs_range_tree_find_in(msp->ms_freed, rstart, rsize, &start, &size); if (found) rsize = start - rstart; return (rsize); } static zfs_range_seg_t * metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start, - uint64_t size, zfs_btree_index_t *where) + uint64_t size, uint64_t max_size, zfs_btree_index_t *where) { zfs_range_seg_t *rs; zfs_range_seg_max_t rsearch; zfs_rs_set_start(&rsearch, rt, start); - zfs_rs_set_end(&rsearch, rt, start + size); + zfs_rs_set_end(&rsearch, rt, start + max_size); rs = zfs_btree_find(t, &rsearch, where); if (rs == NULL) { - rs = zfs_btree_next(t, where, where); + if (size == max_size) { + rs = zfs_btree_next(t, where, where); + } else { + /* + * If we're searching for a range, get the largest + * segment in that range, or the smallest one bigger + * than it. + */ + rs = zfs_btree_prev(t, where, where); + if (rs == NULL || zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt) < size) { + rs = zfs_btree_next(t, where, where); + } + } } return (rs); } /* * This is a helper function that can be used by the allocator to find a * suitable block to allocate. This will search the specified B-tree looking * for a block that matches the specified criteria. */ static uint64_t metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size, - uint64_t max_search) + uint64_t max_size, uint64_t max_search, uint64_t *found_size) { if (*cursor == 0) *cursor = rt->rt_start; zfs_btree_t *bt = &rt->rt_root; zfs_btree_index_t where; zfs_range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, - &where); + max_size, &where); uint64_t first_found; int count_searched = 0; if (rs != NULL) first_found = zfs_rs_get_start(rs, rt); while (rs != NULL && (zfs_rs_get_start(rs, rt) - first_found <= max_search || count_searched < metaslab_min_search_count)) { uint64_t offset = zfs_rs_get_start(rs, rt); if (offset + size <= zfs_rs_get_end(rs, rt)) { - *cursor = offset + size; + *found_size = MIN(zfs_rs_get_end(rs, rt) - offset, + max_size); + *cursor = offset + *found_size; return (offset); } rs = zfs_btree_next(bt, &where, &where); count_searched++; } *cursor = 0; + *found_size = 0; return (-1ULL); } -static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size); -static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size); -static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size); +static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size, + uint64_t max_size, uint64_t *found_size); +static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size, + uint64_t max_size, uint64_t *found_size); +static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size, + uint64_t max_size, uint64_t *found_size); metaslab_ops_t *metaslab_allocator(spa_t *spa); static metaslab_ops_t metaslab_allocators[] = { { "dynamic", metaslab_df_alloc }, { "cursor", metaslab_cf_alloc }, { "new-dynamic", metaslab_ndf_alloc }, }; static int spa_find_allocator_byname(const char *val) { int a = ARRAY_SIZE(metaslab_allocators) - 1; if (strcmp("new-dynamic", val) == 0) return (-1); /* remove when ndf is working */ for (; a >= 0; a--) { if (strcmp(val, metaslab_allocators[a].msop_name) == 0) return (a); } return (-1); } void spa_set_allocator(spa_t *spa, const char *allocator) { int a = spa_find_allocator_byname(allocator); if (a < 0) a = 0; spa->spa_active_allocator = a; zfs_dbgmsg("spa allocator: %s", metaslab_allocators[a].msop_name); } int spa_get_allocator(spa_t *spa) { return (spa->spa_active_allocator); } #if defined(_KERNEL) int param_set_active_allocator_common(const char *val) { char *p; if (val == NULL) return (SET_ERROR(EINVAL)); if ((p = strchr(val, '\n')) != NULL) *p = '\0'; int a = spa_find_allocator_byname(val); if (a < 0) return (SET_ERROR(EINVAL)); zfs_active_allocator = metaslab_allocators[a].msop_name; return (0); } #endif metaslab_ops_t * metaslab_allocator(spa_t *spa) { int allocator = spa_get_allocator(spa); return (&metaslab_allocators[allocator]); } /* * ========================================================================== * Dynamic Fit (df) block allocator * * Search for a free chunk of at least this size, starting from the last * offset (for this alignment of block) looking for up to * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not * found within 16MB, then return a free chunk of exactly the requested size (or * larger). * * If it seems like searching from the last offset will be unproductive, skip * that and just return a free chunk of exactly the requested size (or larger). * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This * mechanism is probably not very useful and may be removed in the future. * * The behavior when not searching can be changed to return the largest free * chunk, instead of a free chunk of exactly the requested size, by setting * metaslab_df_use_largest_segment. * ========================================================================== */ static uint64_t -metaslab_df_alloc(metaslab_t *msp, uint64_t size) +metaslab_df_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t *found_size) { /* * Find the largest power of 2 block size that evenly divides the * requested size. This is used to try to allocate blocks with similar * alignment from the same area of the metaslab (i.e. same cursor * bucket) but it does not guarantee that other allocations sizes * may exist in the same region. */ - uint64_t align = size & -size; + uint64_t align = max_size & -max_size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; zfs_range_tree_t *rt = msp->ms_allocatable; uint_t free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size; uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If we're running low on space, find a segment based on size, * rather than iterating based on offset. */ if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { + align = size & -size; + cursor = &msp->ms_lbas[highbit64(align) - 1]; offset = -1; } else { - offset = metaslab_block_picker(rt, - cursor, size, metaslab_df_max_search); + offset = metaslab_block_picker(rt, cursor, size, max_size, + metaslab_df_max_search, found_size); + if (max_size != size && offset == -1) { + align = size & -size; + cursor = &msp->ms_lbas[highbit64(align) - 1]; + offset = metaslab_block_picker(rt, cursor, size, + max_size, metaslab_df_max_search, found_size); + } } if (offset == -1) { zfs_range_seg_t *rs; if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); if (metaslab_df_use_largest_segment) { /* use largest free segment */ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); } else { zfs_btree_index_t where; /* use segment of this size, or next largest */ rs = metaslab_block_find(&msp->ms_allocatable_by_size, - rt, msp->ms_start, size, &where); + rt, msp->ms_start, size, max_size, &where); } if (rs != NULL && zfs_rs_get_start(rs, rt) + size <= zfs_rs_get_end(rs, rt)) { offset = zfs_rs_get_start(rs, rt); - *cursor = offset + size; + *found_size = MIN(zfs_rs_get_end(rs, rt) - offset, + max_size); + *cursor = offset + *found_size; } } return (offset); } /* * ========================================================================== * Cursor fit block allocator - * Select the largest region in the metaslab, set the cursor to the beginning * of the range and the cursor_end to the end of the range. As allocations * are made advance the cursor. Continue allocating from the cursor until * the range is exhausted and then find a new range. * ========================================================================== */ static uint64_t -metaslab_cf_alloc(metaslab_t *msp, uint64_t size) +metaslab_cf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t *found_size) { zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(*cursor_end, >=, *cursor); if ((*cursor + size) > *cursor_end) { zfs_range_seg_t *rs; if (zfs_btree_numnodes(t) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); rs = zfs_btree_last(t, NULL); if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < size) return (-1ULL); *cursor = zfs_rs_get_start(rs, rt); *cursor_end = zfs_rs_get_end(rs, rt); } offset = *cursor; - *cursor += size; + *found_size = MIN(*cursor_end - offset, max_size); + *cursor = offset + *found_size; return (offset); } /* * ========================================================================== * New dynamic fit allocator - * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift * contiguous blocks. If no region is found then just use the largest segment * that remains. * ========================================================================== */ /* * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) * to request from the allocator. */ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t -metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) +metaslab_ndf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t *found_size) { zfs_btree_t *t = &msp->ms_allocatable->rt_root; zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_index_t where; zfs_range_seg_t *rs; zfs_range_seg_max_t rsearch; - uint64_t hbit = highbit64(size); + uint64_t hbit = highbit64(max_size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; - uint64_t max_size = metaslab_largest_allocatable(msp); + uint64_t max_possible_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); - if (max_size < size) + if (max_possible_size < size) return (-1ULL); zfs_rs_set_start(&rsearch, rt, *cursor); - zfs_rs_set_end(&rsearch, rt, *cursor + size); + zfs_rs_set_end(&rsearch, rt, *cursor + max_size); rs = zfs_btree_find(t, &rsearch, &where); + if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < + max_size) { + hbit = highbit64(size); + cursor = &msp->ms_lbas[hbit - 1]; + zfs_rs_set_start(&rsearch, rt, *cursor); + zfs_rs_set_end(&rsearch, rt, *cursor + size); + + rs = zfs_btree_find(t, &rsearch, &where); + } if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < size) { t = &msp->ms_allocatable_by_size; zfs_rs_set_start(&rsearch, rt, 0); - zfs_rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + - metaslab_ndf_clump_shift))); + zfs_rs_set_end(&rsearch, rt, MIN(max_possible_size, + 1ULL << (hbit + metaslab_ndf_clump_shift))); rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL) rs = zfs_btree_next(t, &where, &where); ASSERT(rs != NULL); } if ((zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) >= size) { - *cursor = zfs_rs_get_start(rs, rt) + size; + *found_size = MIN(zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt), max_size); + *cursor = zfs_rs_get_start(rs, rt) + *found_size; return (zfs_rs_get_start(rs, rt)); } return (-1ULL); } /* * ========================================================================== * Metaslabs * ========================================================================== */ /* * Wait for any in-progress metaslab loads to complete. */ static void metaslab_load_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_loading) { ASSERT(!msp->ms_loaded); cv_wait(&msp->ms_load_cv, &msp->ms_lock); } } /* * Wait for any in-progress flushing to complete. */ static void metaslab_flush_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_flushing) cv_wait(&msp->ms_flush_cv, &msp->ms_lock); } static unsigned int metaslab_idx_func(multilist_t *ml, void *arg) { metaslab_t *msp = arg; /* * ms_id values are allocated sequentially, so full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml)); } uint64_t metaslab_allocated_space(metaslab_t *msp) { return (msp->ms_allocated_space); } /* * Verify that the space accounting on disk matches the in-core range_trees. */ static void metaslab_verify_space(metaslab_t *msp, uint64_t txg) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t allocating = 0; uint64_t sm_free_space, msp_free_space; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(!msp->ms_condensing); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can only verify the metaslab space when we're called * from syncing context with a loaded metaslab that has an * allocated space map. Calling this in non-syncing context * does not provide a consistent view of the metaslab since * we're performing allocations in the future. */ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || !msp->ms_loaded) return; /* * Even though the smp_alloc field can get negative, * when it comes to a metaslab's space map, that should * never be the case. */ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); ASSERT3U(space_map_allocated(msp->ms_sm), >=, zfs_range_tree_space(msp->ms_unflushed_frees)); ASSERT3U(metaslab_allocated_space(msp), ==, space_map_allocated(msp->ms_sm) + zfs_range_tree_space(msp->ms_unflushed_allocs) - zfs_range_tree_space(msp->ms_unflushed_frees)); sm_free_space = msp->ms_size - metaslab_allocated_space(msp); /* * Account for future allocations since we would have * already deducted that space from the ms_allocatable. */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { allocating += zfs_range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, msp->ms_allocating_total); ASSERT3U(msp->ms_deferspace, ==, zfs_range_tree_space(msp->ms_defer[0]) + zfs_range_tree_space(msp->ms_defer[1])); msp_free_space = zfs_range_tree_space(msp->ms_allocatable) + allocating + msp->ms_deferspace + zfs_range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); } static void metaslab_aux_histograms_clear(metaslab_t *msp) { /* * Auxiliary histograms are only cleared when resetting them, * which can only happen while the metaslab is loaded. */ ASSERT(msp->ms_loaded); memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); for (int t = 0; t < TXG_DEFER_SIZE; t++) memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t])); } static void metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, zfs_range_tree_t *rt) { /* * This is modeled after space_map_histogram_add(), so refer to that * function for implementation details. We want this to work like * the space map histogram, and not the range tree histogram, as we * are essentially constructing a delta that will be later subtracted * from the space map histogram. */ int idx = 0; for (int i = shift; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT3U(i, >=, idx + shift); histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { ASSERT3U(idx + shift, ==, i); idx++; ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); } } } /* * Called at every sync pass that the metaslab gets synced. * * The reason is that we want our auxiliary histograms to be updated * wherever the metaslab's space map histogram is updated. This way * we stay consistent on which parts of the metaslab space map's * histogram are currently not available for allocations (e.g because * they are in the defer, freed, and freeing trees). */ static void metaslab_aux_histograms_update(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; ASSERT(sm != NULL); /* * This is similar to the metaslab's space map histogram updates * that take place in metaslab_sync(). The only difference is that * we only care about segments that haven't made it into the * ms_allocatable tree yet. */ if (msp->ms_loaded) { metaslab_aux_histograms_clear(msp); metaslab_aux_histogram_add(msp->ms_synchist, sm->sm_shift, msp->ms_freed); for (int t = 0; t < TXG_DEFER_SIZE; t++) { metaslab_aux_histogram_add(msp->ms_deferhist[t], sm->sm_shift, msp->ms_defer[t]); } } metaslab_aux_histogram_add(msp->ms_synchist, sm->sm_shift, msp->ms_freeing); } /* * Called every time we are done syncing (writing to) the metaslab, * i.e. at the end of each sync pass. * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] */ static void metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; space_map_t *sm = msp->ms_sm; if (sm == NULL) { /* * We came here from metaslab_init() when creating/opening a * pool, looking at a metaslab that hasn't had any allocations * yet. */ return; } /* * This is similar to the actions that we take for the ms_freed * and ms_defer trees in metaslab_sync_done(). */ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; if (defer_allowed) { memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist, sizeof (msp->ms_synchist)); } else { memset(msp->ms_deferhist[hist_index], 0, sizeof (msp->ms_deferhist[hist_index])); } memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); } /* * Ensure that the metaslab's weight and fragmentation are consistent * with the contents of the histogram (either the range tree's histogram * or the space map's depending whether the metaslab is loaded). */ static void metaslab_verify_weight_and_frag(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can end up here from vdev_remove_complete(), in which case we * cannot do these assertions because we hold spa config locks and * thus we are not allowed to read from the DMU. * * We check if the metaslab group has been removed and if that's * the case we return immediately as that would mean that we are * here from the aforementioned code path. */ if (msp->ms_group == NULL) return; /* * Devices being removed always return a weight of 0 and leave * fragmentation and ms_max_size as is - there is nothing for * us to verify here. */ vdev_t *vd = msp->ms_group->mg_vd; if (vd->vdev_removing) return; /* * If the metaslab is dirty it probably means that we've done * some allocations or frees that have changed our histograms * and thus the weight. */ for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&vd->vdev_ms_list, msp, t)) return; } /* * This verification checks that our in-memory state is consistent * with what's on disk. If the pool is read-only then there aren't * any changes and we just have the initially-loaded state. */ if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) return; /* some extra verification for in-core tree if you can */ if (msp->ms_loaded) { zfs_range_tree_stat_verify(msp->ms_allocatable); VERIFY(space_map_histogram_verify(msp->ms_sm, msp->ms_allocatable)); } uint64_t weight = msp->ms_weight; uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); uint64_t frag = msp->ms_fragmentation; uint64_t max_segsize = msp->ms_max_size; msp->ms_weight = 0; msp->ms_fragmentation = 0; /* * This function is used for verification purposes and thus should * not introduce any side-effects/mutations on the system's state. * * Regardless of whether metaslab_weight() thinks this metaslab * should be active or not, we want to ensure that the actual weight * (and therefore the value of ms_weight) would be the same if it * was to be recalculated at this point. * * In addition we set the nodirty flag so metaslab_weight() does * not dirty the metaslab for future TXGs (e.g. when trying to * force condensing to upgrade the metaslab spacemaps). */ msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active; VERIFY3U(max_segsize, ==, msp->ms_max_size); /* * If the weight type changed then there is no point in doing * verification. Revert fields to their original values. */ if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { msp->ms_fragmentation = frag; msp->ms_weight = weight; return; } VERIFY3U(msp->ms_fragmentation, ==, frag); VERIFY3U(msp->ms_weight, ==, weight); } /* * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from * this class that was used longest ago, and attempt to unload it. We don't * want to spend too much time in this loop to prevent performance * degradation, and we expect that most of the time this operation will * succeed. Between that and the normal unloading processing during txg sync, * we expect this to keep the metaslab memory usage under control. */ static void metaslab_potentially_evict(metaslab_class_t *mc) { #ifdef _KERNEL uint64_t allmem = arc_all_memory(); uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache); uint_t tries = 0; for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2; tries++) { unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < inuse * size) { VERIFY3P(mls, ==, multilist_sublist_lock_idx( &mc->mc_metaslab_txg_list, idx)); ASSERT3U(idx, ==, metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); if (!multilist_link_active(&msp->ms_class_txg_node)) { multilist_sublist_unlock(mls); break; } metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); /* * If the metaslab is currently loading there are two * cases. If it's the metaslab we're evicting, we * can't continue on or we'll panic when we attempt to * recursively lock the mutex. If it's another * metaslab that's loading, it can be safely skipped, * since we know it's very new and therefore not a * good eviction candidate. We check later once the * lock is held that the metaslab is fully loaded * before actually unloading it. */ if (msp->ms_loading) { msp = next_msp; inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); continue; } /* * We can't unload metaslabs with no spacemap because * they're not ready to be unloaded yet. We can't * unload metaslabs with outstanding allocations * because doing so could cause the metaslab's weight * to decrease while it's unloaded, which violates an * invariant that we use to prevent unnecessary * loading. We also don't unload metaslabs that are * currently active because they are high-weight * metaslabs that are likely to be used in the near * future. */ mutex_enter(&msp->ms_lock); if (msp->ms_allocator == -1 && msp->ms_sm != NULL && msp->ms_allocating_total == 0) { metaslab_unload(msp); } mutex_exit(&msp->ms_lock); msp = next_msp; inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); } } #else (void) mc, (void) zfs_metaslab_mem_limit; #endif } static int metaslab_load_impl(metaslab_t *msp) { int error = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loading); ASSERT(!msp->ms_condensing); /* * We temporarily drop the lock to unblock other operations while we * are reading the space map. Therefore, metaslab_sync() and * metaslab_sync_done() can run at the same time as we do. * * If we are using the log space maps, metaslab_sync() can't write to * the metaslab's space map while we are loading as we only write to * it when we are flushing the metaslab, and that can't happen while * we are loading it. * * If we are not using log space maps though, metaslab_sync() can * append to the space map while we are loading. Therefore we load * only entries that existed when we started the load. Additionally, * metaslab_sync_done() has to wait for the load to complete because * there are potential races like metaslab_load() loading parts of the * space map that are currently being appended by metaslab_sync(). If * we didn't, the ms_allocatable would have entries that * metaslab_sync_done() would try to re-add later. * * That's why before dropping the lock we remember the synced length * of the metaslab and read up to that point of the space map, * ignoring entries appended by metaslab_sync() that happen after we * drop the lock. */ uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); hrtime_t load_start = gethrtime(); metaslab_rt_arg_t *mrap; if (msp->ms_allocatable->rt_arg == NULL) { mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); } else { mrap = msp->ms_allocatable->rt_arg; msp->ms_allocatable->rt_ops = NULL; msp->ms_allocatable->rt_arg = NULL; } mrap->mra_bt = &msp->ms_allocatable_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); /* Now, populate the size-sorted tree. */ metaslab_rt_create(msp->ms_allocatable, mrap); msp->ms_allocatable->rt_ops = &metaslab_rt_ops; msp->ms_allocatable->rt_arg = mrap; struct mssa_arg arg = {0}; arg.rt = msp->ms_allocatable; arg.mra = mrap; zfs_range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, &arg); } else { /* * Add the size-sorted tree first, since we don't need to load * the metaslab from the spacemap. */ metaslab_rt_create(msp->ms_allocatable, mrap); msp->ms_allocatable->rt_ops = &metaslab_rt_ops; msp->ms_allocatable->rt_arg = mrap; /* * The space map has not been allocated yet, so treat * all the space in the metaslab as free and add it to the * ms_allocatable tree. */ zfs_range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); if (msp->ms_new) { /* * If the ms_sm doesn't exist, this means that this * metaslab hasn't gone through metaslab_sync() and * thus has never been dirtied. So we shouldn't * expect any unflushed allocs or frees from previous * TXGs. */ ASSERT(zfs_range_tree_is_empty( msp->ms_unflushed_allocs)); ASSERT(zfs_range_tree_is_empty( msp->ms_unflushed_frees)); } } /* * We need to grab the ms_sync_lock to prevent metaslab_sync() from * changing the ms_sm (or log_sm) and the metaslab's range trees * while we are about to use them and populate the ms_allocatable. * The ms_lock is insufficient for this because metaslab_sync() doesn't * hold the ms_lock while writing the ms_checkpointing tree to disk. */ mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); ASSERT(!msp->ms_condensing); ASSERT(!msp->ms_flushing); if (error != 0) { mutex_exit(&msp->ms_sync_lock); return (error); } ASSERT3P(msp->ms_group, !=, NULL); msp->ms_loaded = B_TRUE; /* * Apply all the unflushed changes to ms_allocatable right * away so any manipulations we do below have a clear view * of what is allocated and what is free. */ zfs_range_tree_walk(msp->ms_unflushed_allocs, zfs_range_tree_remove, msp->ms_allocatable); zfs_range_tree_walk(msp->ms_unflushed_frees, zfs_range_tree_add, msp->ms_allocatable); ASSERT3P(msp->ms_group, !=, NULL); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (spa_syncing_log_sm(spa) != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); /* * If we use a log space map we add all the segments * that are in ms_unflushed_frees so they are available * for allocation. * * ms_allocatable needs to contain all free segments * that are ready for allocations (thus not segments * from ms_freeing, ms_freed, and the ms_defer trees). * But if we grab the lock in this code path at a sync * pass later that 1, then it also contains the * segments of ms_freed (they were added to it earlier * in this path through ms_unflushed_frees). So we * need to remove all the segments that exist in * ms_freed from ms_allocatable as they will be added * later in metaslab_sync_done(). * * When there's no log space map, the ms_allocatable * correctly doesn't contain any segments that exist * in ms_freed [see ms_synced_length]. */ zfs_range_tree_walk(msp->ms_freed, zfs_range_tree_remove, msp->ms_allocatable); } /* * If we are not using the log space map, ms_allocatable * contains the segments that exist in the ms_defer trees * [see ms_synced_length]. Thus we need to remove them * from ms_allocatable as they will be added again in * metaslab_sync_done(). * * If we are using the log space map, ms_allocatable still * contains the segments that exist in the ms_defer trees. * Not because it read them through the ms_sm though. But * because these segments are part of ms_unflushed_frees * whose segments we add to ms_allocatable earlier in this * code path. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { zfs_range_tree_walk(msp->ms_defer[t], zfs_range_tree_remove, msp->ms_allocatable); } /* * Call metaslab_recalculate_weight_and_sort() now that the * metaslab is loaded so we get the metaslab's real weight. * * Unless this metaslab was created with older software and * has not yet been converted to use segment-based weight, we * expect the new weight to be better or equal to the weight * that the metaslab had while it was not loaded. This is * because the old weight does not take into account the * consolidation of adjacent segments between TXGs. [see * comment for ms_synchist and ms_deferhist[] for more info] */ uint64_t weight = msp->ms_weight; uint64_t max_size = msp->ms_max_size; metaslab_recalculate_weight_and_sort(msp); if (!WEIGHT_IS_SPACEBASED(weight)) ASSERT3U(weight, <=, msp->ms_weight); msp->ms_max_size = metaslab_largest_allocatable(msp); ASSERT3U(max_size, <=, msp->ms_max_size); hrtime_t load_end = gethrtime(); msp->ms_load_time = load_end; zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, smp_length %llu, " "unflushed_allocs %llu, unflushed_frees %llu, " "freed %llu, defer %llu + %llu, unloaded time %llu ms, " "loading_time %lld ms, ms_max_size %llu, " "max size error %lld, " "old_weight %llx, new_weight %llx", (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)space_map_length(msp->ms_sm), (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_allocs), (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_frees), (u_longlong_t)zfs_range_tree_space(msp->ms_freed), (u_longlong_t)zfs_range_tree_space(msp->ms_defer[0]), (u_longlong_t)zfs_range_tree_space(msp->ms_defer[1]), (longlong_t)((load_start - msp->ms_unload_time) / 1000000), (longlong_t)((load_end - load_start) / 1000000), (u_longlong_t)msp->ms_max_size, (u_longlong_t)msp->ms_max_size - max_size, (u_longlong_t)weight, (u_longlong_t)msp->ms_weight); metaslab_verify_space(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_sync_lock); return (0); } int metaslab_load(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * There may be another thread loading the same metaslab, if that's * the case just wait until the other thread is done and return. */ metaslab_load_wait(msp); if (msp->ms_loaded) return (0); VERIFY(!msp->ms_loading); ASSERT(!msp->ms_condensing); /* * We set the loading flag BEFORE potentially dropping the lock to * wait for an ongoing flush (see ms_flushing below). This way other * threads know that there is already a thread that is loading this * metaslab. */ msp->ms_loading = B_TRUE; /* * Wait for any in-progress flushing to finish as we drop the ms_lock * both here (during space_map_load()) and in metaslab_flush() (when * we flush our changes to the ms_sm). */ if (msp->ms_flushing) metaslab_flush_wait(msp); /* * In the possibility that we were waiting for the metaslab to be * flushed (where we temporarily dropped the ms_lock), ensure that * no one else loaded the metaslab somehow. */ ASSERT(!msp->ms_loaded); /* * If we're loading a metaslab in the normal class, consider evicting * another one to keep our memory usage under the limit defined by the * zfs_metaslab_mem_limit tunable. */ if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == msp->ms_group->mg_class) { metaslab_potentially_evict(msp->ms_group->mg_class); } int error = metaslab_load_impl(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; cv_broadcast(&msp->ms_load_cv); return (error); } void metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * This can happen if a metaslab is selected for eviction (in * metaslab_potentially_evict) and then unloaded during spa_sync (via * metaslab_class_evict_old). */ if (!msp->ms_loaded) return; zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; msp->ms_unload_time = gethrtime(); msp->ms_activation_weight = 0; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; if (msp->ms_group != NULL) { metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); multilist_sublist_unlock(mls); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, weight %llx, " "selected txg %llu (%llu s ago), alloc_txg %llu, " "loaded %llu ms ago, max_size %llu", (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_weight, (u_longlong_t)msp->ms_selected_txg, (u_longlong_t)(NSEC2SEC(msp->ms_unload_time) - msp->ms_selected_time), (u_longlong_t)msp->ms_alloc_txg, (u_longlong_t)(msp->ms_unload_time - msp->ms_load_time) / 1000 / 1000, (u_longlong_t)msp->ms_max_size); } /* * We explicitly recalculate the metaslab's weight based on its space * map (as it is now not loaded). We want unload metaslabs to always * have their weights calculated from the space map histograms, while * loaded ones have it calculated from their in-core range tree * [see metaslab_load()]. This way, the weight reflects the information * available in-core, whether it is loaded or not. * * If ms_group == NULL means that we came here from metaslab_fini(), * at which point it doesn't make sense for us to do the recalculation * and the sorting. */ if (msp->ms_group != NULL) metaslab_recalculate_weight_and_sort(msp); } /* * We want to optimize the memory use of the per-metaslab range * trees. To do this, we store the segments in the range trees in * units of sectors, zero-indexing from the start of the metaslab. If * the vdev_ms_shift - the vdev_ashift is less than 32, we can store * the ranges using two uint32_ts, rather than two uint64_ts. */ zfs_range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, uint64_t *start, uint64_t *shift) { if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && !zfs_metaslab_force_large_segs) { *shift = vdev->vdev_ashift; *start = msp->ms_start; return (ZFS_RANGE_SEG32); } else { *shift = 0; *start = 0; return (ZFS_RANGE_SEG64); } } void metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) { ASSERT(MUTEX_HELD(&msp->ms_lock)); metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); msp->ms_selected_txg = txg; msp->ms_selected_time = gethrestime_sec(); multilist_sublist_insert_tail(mls, msp); multilist_sublist_unlock(mls); } void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { vdev_space_update(vd, alloc_delta, defer_delta, space_delta); ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, vdev_deflated_space(vd, space_delta)); } int metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; metaslab_t *ms; int error; ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); multilist_link_init(&ms->ms_class_txg_node); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; ms->ms_allocator = -1; ms->ms_new = B_TRUE; vdev_ops_t *ops = vd->vdev_ops; if (ops->vdev_op_metaslab_init != NULL) ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. For * readonly pools there is no need to open the space map object. * * Note: * When called from vdev_expand(), we can't call into the DMU as * we are holding the spa_config_lock as a writer and we would * deadlock [see relevant comment in vdev_metaslab_init()]. in * that case, the object parameter is zero though, so we won't * call into the DMU. */ if (object != 0 && !(spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, ms->ms_size, vd->vdev_ashift); if (error != 0) { kmem_free(ms, sizeof (metaslab_t)); return (error); } ASSERT(ms->ms_sm != NULL); ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } uint64_t shift, start; zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_SIZE; t++) { ms->ms_allocating[t] = zfs_range_tree_create(NULL, type, NULL, start, shift); } ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift); ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL, start, shift); } ms->ms_checkpointing = zfs_range_tree_create(NULL, type, NULL, start, shift); ms->ms_unflushed_allocs = zfs_range_tree_create(NULL, type, NULL, start, shift); metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); mrap->mra_bt = &ms->ms_unflushed_frees_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops, type, mrap, start, shift); ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms, B_FALSE); /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. * If we're adding space to an existing pool, the new space * does not become available until after this txg has synced. * The metaslab's weight will also be initialized when we sync * out this txg. This ensures that we don't attempt to allocate * from it before we have initialized it completely. */ if (txg <= TXG_INITIAL) { metaslab_sync_done(ms, 0); metaslab_space_update(vd, mg->mg_class, metaslab_allocated_space(ms), 0, 0); } if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); } *msp = ms; return (0); } static void metaslab_fini_flush_data(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (metaslab_unflushed_txg(msp) == 0) { ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, NULL); return; } ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp), metaslab_unflushed_dirty(msp)); } uint64_t metaslab_unflushed_changes_memused(metaslab_t *ms) { return ((zfs_range_tree_numsegs(ms->ms_unflushed_allocs) + zfs_range_tree_numsegs(ms->ms_unflushed_frees)) * ms->ms_unflushed_allocs->rt_root.bt_elem_size); } void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; metaslab_fini_flush_data(msp); metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); /* * If this metaslab hasn't been through metaslab_sync_done() yet its * space hasn't been accounted for in its vdev and doesn't need to be * subtracted. */ if (!msp->ms_new) { metaslab_space_update(vd, mg->mg_class, -metaslab_allocated_space(msp), 0, -msp->ms_size); } space_map_close(msp->ms_sm); msp->ms_sm = NULL; metaslab_unload(msp); zfs_range_tree_destroy(msp->ms_allocatable); zfs_range_tree_destroy(msp->ms_freeing); zfs_range_tree_destroy(msp->ms_freed); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); zfs_range_tree_destroy(msp->ms_unflushed_allocs); zfs_range_tree_destroy(msp->ms_checkpointing); zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); zfs_range_tree_destroy(msp->ms_unflushed_frees); for (int t = 0; t < TXG_SIZE; t++) { zfs_range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { zfs_range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); for (int t = 0; t < TXG_SIZE; t++) ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); zfs_range_tree_destroy(msp->ms_trim); mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); cv_destroy(&msp->ms_flush_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } /* * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done * by calculating the space in each bucket of the spacemap histogram and * multiplying that by the fragmentation metric in this table. Doing * this for all buckets and dividing it by the total amount of free * space in this metaslab (i.e. the total free space in all buckets) gives * us the fragmentation metric. This means that a high fragmentation metric * equates to most of the free space being comprised of small segments. * Conversely, if the metric is low, then most of the free space is in * large segments. * * This table defines 0% fragmented space using 512M segments. Using this value, * we derive the rest of the table. This table originally went up to 16MB, but * with larger recordsizes, larger ashifts, and use of raidz3, it is possible * to have significantly larger allocations than were previously possible. * Since the fragmentation value is never stored on disk, it is possible to * change these calculations in the future. */ static const int zfs_frag_table[] = { 100, /* 512B */ 99, /* 1K */ 97, /* 2K */ 93, /* 4K */ 88, /* 8K */ 83, /* 16K */ 77, /* 32K */ 71, /* 64K */ 64, /* 128K */ 57, /* 256K */ 50, /* 512K */ 43, /* 1M */ 36, /* 2M */ 29, /* 4M */ 23, /* 8M */ 17, /* 16M */ 12, /* 32M */ 7, /* 64M */ 3, /* 128M */ 1, /* 256M */ 0, /* 512M */ }; #define FRAGMENTATION_TABLE_SIZE \ (sizeof (zfs_frag_table)/(sizeof (zfs_frag_table[0]))) /* * Calculate the metaslab's fragmentation metric and set ms_fragmentation. * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not * been upgraded and does not support this metric. Otherwise, the return * value should be in the range [0, 100]. */ static void metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t fragmentation = 0; uint64_t total = 0; boolean_t feature_enabled = spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM); if (!feature_enabled) { msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } /* * A null space map means that the entire metaslab is free * and thus is not fragmented. */ if (msp->ms_sm == NULL) { msp->ms_fragmentation = 0; return; } /* * If this metaslab's space map has not been upgraded, flag it * so that we upgrade next time we encounter it. */ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { uint64_t txg = spa_syncing_txg(spa); vdev_t *vd = msp->ms_group->mg_vd; /* * If we've reached the final dirty txg, then we must * be shutting down the pool. We don't want to dirty * any data past this point so skip setting the condense * flag. We can retry this action the next time the pool * is imported. We also skip marking this metaslab for * condensing if the caller has explicitly set nodirty. */ if (!nodirty && spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { msp->ms_condense_wanted = B_TRUE; vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); zfs_dbgmsg("txg %llu, requesting force condense: " "ms_id %llu, vdev_id %llu", (u_longlong_t)txg, (u_longlong_t)msp->ms_id, (u_longlong_t)vd->vdev_id); } msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { uint64_t space = 0; uint8_t shift = msp->ms_sm->sm_shift; int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, FRAGMENTATION_TABLE_SIZE - 1); if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) continue; space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); total += space; ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); fragmentation += space * zfs_frag_table[idx]; } if (total > 0) fragmentation /= total; ASSERT3U(fragmentation, <=, 100); msp->ms_fragmentation = fragmentation; } /* * Compute a weight -- a selection preference value -- for the given metaslab. * This is based on the amount of free space, the level of fragmentation, * the LBA range, and whether the metaslab is loaded. */ static uint64_t metaslab_space_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The baseline weight is the metaslab's free space. */ space = msp->ms_size - metaslab_allocated_space(msp); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { /* * Use the fragmentation information to inversely scale * down the baseline weight. We need to ensure that we * don't exclude this metaslab completely when it's 100% * fragmented. To avoid this we reduce the fragmented value * by 1. */ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; /* * If space < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. The fragmentation metric may have * decreased the space to something smaller than * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE * so that we can consume any remaining space. */ if (space > 0 && space < SPA_MINBLOCKSIZE) space = SPA_MINBLOCKSIZE; } weight = space; /* * Modern disks have uniform bit density and constant angular velocity. * Therefore, the outer recording zones are faster (higher bandwidth) * than the inner zones by the ratio of outer to inner track diameter, * which is typically around 2:1. We account for this by assigning * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; ASSERT(weight >= space && weight <= 2 * space); } /* * If this metaslab is one we're actively using, adjust its * weight to make it preferable to any inactive metaslab so * we'll polish it off. If the fragmentation on this metaslab * has exceed our threshold, then don't mark it active. */ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); } WEIGHT_SET_SPACEBASED(weight); return (weight); } /* * Return the weight of the specified metaslab, according to the segment-based * weighting algorithm. The metaslab must be loaded. This function can * be called within a sync pass since it relies only on the metaslab's * range tree which is always accurate when the metaslab is loaded. */ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp) { uint64_t weight = 0; uint32_t segments = 0; ASSERT(msp->ms_loaded); for (int i = ZFS_RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; i--) { uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; segments <<= 1; segments += msp->ms_allocatable->rt_histogram[i]; /* * The range tree provides more precision than the space map * and must be downgraded so that all values fit within the * space map's histogram. This allows us to compare loaded * vs. unloaded metaslabs to determine which metaslab is * considered "best". */ if (i > max_idx) continue; if (segments != 0) { WEIGHT_SET_COUNT(weight, segments); WEIGHT_SET_INDEX(weight, i); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Calculate the weight based on the on-disk histogram. Should be applied * only to unloaded metaslabs (i.e no incoming allocations) in-order to * give results consistent with the on-disk state */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; ASSERT(!msp->ms_loaded); ASSERT(sm != NULL); ASSERT3U(space_map_object(sm), !=, 0); ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * Create a joint histogram from all the segments that have made * it to the metaslab's space map histogram, that are not yet * available for allocation because they are still in the freeing * pipeline (e.g. freeing, freed, and defer trees). Then subtract * these segments from the space map's histogram to get a more * accurate weight. */ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) deferspace_histogram[i] += msp->ms_synchist[i]; for (int t = 0; t < TXG_DEFER_SIZE; t++) { for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { deferspace_histogram[i] += msp->ms_deferhist[t][i]; } } uint64_t weight = 0; for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { ASSERT3U(sm->sm_phys->smp_histogram[i], >=, deferspace_histogram[i]); uint64_t count = sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; if (count != 0) { WEIGHT_SET_COUNT(weight, count); WEIGHT_SET_INDEX(weight, i + sm->sm_shift); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Compute a segment-based weight for the specified metaslab. The weight * is determined by highest bucket in the histogram. The information * for the highest bucket is encoded into the weight value. */ static uint64_t metaslab_segment_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; uint64_t weight = 0; uint8_t shift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The metaslab is completely free. */ if (metaslab_allocated_space(msp) == 0) { int idx = highbit64(msp->ms_size) - 1; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; if (idx < max_idx) { WEIGHT_SET_COUNT(weight, 1ULL); WEIGHT_SET_INDEX(weight, idx); } else { WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); WEIGHT_SET_INDEX(weight, max_idx); } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); return (weight); } ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * If the metaslab is fully allocated then just make the weight 0. */ if (metaslab_allocated_space(msp) == msp->ms_size) return (0); /* * If the metaslab is already loaded, then use the range tree to * determine the weight. Otherwise, we rely on the space map information * to generate the weight. */ if (msp->ms_loaded) { weight = metaslab_weight_from_range_tree(msp); } else { weight = metaslab_weight_from_spacemap(msp); } /* * If the metaslab was active the last time we calculated its weight * then keep it active. We want to consume the entire region that * is associated with this weight. */ if (msp->ms_activation_weight != 0 && weight != 0) WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); return (weight); } /* * Determine if we should attempt to allocate from this metaslab. If the * metaslab is loaded, then we can determine if the desired allocation * can be satisfied by looking at the size of the maximum free segment * on that metaslab. Otherwise, we make our decision based on the metaslab's * weight. For segment-based weighting we can determine the maximum * allocation based on the index encoded in its value. For space-based * weights we rely on the entire weight (excluding the weight-type bit). */ static boolean_t metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { /* * This case will usually but not always get caught by the checks below; * metaslabs can be loaded by various means, including the trim and * initialize code. Once that happens, without this check they are * allocatable even before they finish their first txg sync. */ if (unlikely(msp->ms_new)) return (B_FALSE); /* * If the metaslab is loaded, ms_max_size is definitive and we can use * the fast check. If it's not, the ms_max_size is a lower bound (once * set), and we should use the fast check as long as we're not in * try_hard and it's been less than zfs_metaslab_max_size_cache_sec * seconds since the metaslab was unloaded. */ if (msp->ms_loaded || (msp->ms_max_size != 0 && !try_hard && gethrtime() < msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) return (msp->ms_max_size >= asize); boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { /* * The metaslab segment weight indicates segments in the * range [2^i, 2^(i+1)), where i is the index in the weight. * Since the asize might be in the middle of the range, we * should attempt the allocation if asize < 2^(i+1). */ should_allocate = (asize < 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); } else { should_allocate = (asize <= (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); } return (should_allocate); } static uint64_t metaslab_weight(metaslab_t *msp, boolean_t nodirty) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; uint64_t weight; ASSERT(MUTEX_HELD(&msp->ms_lock)); metaslab_set_fragmentation(msp, nodirty); /* * Update the maximum size. If the metaslab is loaded, this will * ensure that we get an accurate maximum size if newly freed space * has been added back into the free tree. If the metaslab is * unloaded, we check if there's a larger free segment in the * unflushed frees. This is a lower bound on the largest allocatable * segment size. Coalescing of adjacent entries may reveal larger * allocatable segments, but we aren't aware of those until loading * the space map into a range tree. */ if (msp->ms_loaded) { msp->ms_max_size = metaslab_largest_allocatable(msp); } else { msp->ms_max_size = MAX(msp->ms_max_size, metaslab_largest_unflushed_free(msp)); } /* * Segment-based weighting requires space map histogram support. */ if (zfs_metaslab_segment_weight_enabled && spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == sizeof (space_map_phys_t))) { weight = metaslab_segment_weight(msp); } else { weight = metaslab_space_weight(msp); } return (weight); } void metaslab_recalculate_weight_and_sort(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* note: we preserve the mask (e.g. indication of primary, etc..) */ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(msp->ms_group, msp, metaslab_weight(msp, B_FALSE) | was_active); } static int metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, int allocator, uint64_t activation_weight) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If we're activating for the claim code, we don't want to actually * set the metaslab up for a specific allocator. */ if (activation_weight == METASLAB_WEIGHT_CLAIM) { ASSERT0(msp->ms_activation_weight); msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(mg, msp, msp->ms_weight | activation_weight); return (0); } metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ? &mga->mga_primary : &mga->mga_secondary); mutex_enter(&mg->mg_lock); if (*mspp != NULL) { mutex_exit(&mg->mg_lock); return (EEXIST); } *mspp = msp; ASSERT3S(msp->ms_allocator, ==, -1); msp->ms_allocator = allocator; msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); ASSERT0(msp->ms_activation_weight); msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort_impl(mg, msp, msp->ms_weight | activation_weight); mutex_exit(&mg->mg_lock); return (0); } static int metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The current metaslab is already activated for us so there * is nothing to do. Already activated though, doesn't mean * that this metaslab is activated for our allocator nor our * requested activation weight. The metaslab could have started * as an active one for our allocator but changed allocators * while we were waiting to grab its ms_lock or we stole it * [see find_valid_metaslab()]. This means that there is a * possibility of passivating a metaslab of another allocator * or from a different activation mask, from this thread. */ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { ASSERT(msp->ms_loaded); return (0); } int error = metaslab_load(msp); if (error != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } /* * When entering metaslab_load() we may have dropped the * ms_lock because we were loading this metaslab, or we * were waiting for another thread to load it for us. In * that scenario, we recheck the weight of the metaslab * to see if it was activated by another thread. * * If the metaslab was activated for another allocator or * it was activated with a different activation weight (e.g. * we wanted to make it a primary but it was activated as * secondary) we return error (EBUSY). * * If the metaslab was activated for the same allocator * and requested activation mask, skip activating it. */ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { if (msp->ms_allocator != allocator) return (EBUSY); if ((msp->ms_weight & activation_weight) == 0) return (SET_ERROR(EBUSY)); EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), msp->ms_primary); return (0); } /* * If the metaslab has literally 0 space, it will have weight 0. In * that case, don't bother activating it. This can happen if the * metaslab had space during find_valid_metaslab, but another thread * loaded it and used all that space while we were waiting to grab the * lock. */ if (msp->ms_weight == 0) { ASSERT0(zfs_range_tree_space(msp->ms_allocatable)); return (SET_ERROR(ENOSPC)); } if ((error = metaslab_activate_allocator(msp->ms_group, msp, allocator, activation_weight)) != 0) { return (error); } ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); return (0); } static void metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { metaslab_group_sort(mg, msp, weight); return; } mutex_enter(&mg->mg_lock); ASSERT3P(msp->ms_group, ==, mg); ASSERT3S(0, <=, msp->ms_allocator); ASSERT3U(msp->ms_allocator, <, mg->mg_class->mc_spa->spa_alloc_count); metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator]; if (msp->ms_primary) { ASSERT3P(mga->mga_primary, ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); mga->mga_primary = NULL; } else { ASSERT3P(mga->mga_secondary, ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); mga->mga_secondary = NULL; } msp->ms_allocator = -1; metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE; /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || size >= SPA_MINBLOCKSIZE || zfs_range_tree_space(msp->ms_allocatable) == 0); ASSERT0(weight & METASLAB_ACTIVE_MASK); ASSERT(msp->ms_activation_weight != 0); msp->ms_activation_weight = 0; metaslab_passivate_allocator(msp->ms_group, msp, weight); ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); } /* * Segment-based metaslabs are activated once and remain active until * we either fail an allocation attempt (similar to space-based metaslabs) * or have exhausted the free space in zfs_metaslab_switch_threshold * buckets since the metaslab was activated. This function checks to see * if we've exhausted the zfs_metaslab_switch_threshold buckets in the * metaslab and passivates it proactively. This will allow us to select a * metaslab with a larger contiguous region, if any, remaining within this * metaslab group. If we're in sync pass > 1, then we continue using this * metaslab so that we don't dirty more block and cause more sync passes. */ static void metaslab_segment_may_passivate(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) return; /* * As long as a single largest free segment covers majorioty of free * space, don't consider the metaslab fragmented. It should allow * us to fill new unfragmented metaslabs full before switching. */ if (metaslab_largest_allocatable(msp) > zfs_range_tree_space(msp->ms_allocatable) * 15 / 16) return; /* * Since we are in the middle of a sync pass, the most accurate * information that is accessible to us is the in-core range tree * histogram; calculate the new weight based on that information. */ uint64_t weight = metaslab_weight_from_range_tree(msp); int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); int current_idx = WEIGHT_GET_INDEX(weight); if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) metaslab_passivate(msp, weight); } static void metaslab_preload(void *arg) { metaslab_t *msp = arg; metaslab_class_t *mc = msp->ms_group->mg_class; spa_t *spa = mc->mc_spa; fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); mutex_enter(&msp->ms_lock); (void) metaslab_load(msp); metaslab_set_selected_txg(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_lock); spl_fstrans_unmark(cookie); } static void metaslab_group_preload(metaslab_group_t *mg) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp; avl_tree_t *t = &mg->mg_metaslab_tree; int m = 0; if (spa_shutting_down(spa) || !metaslab_preload_enabled) return; mutex_enter(&mg->mg_lock); /* * Load the next potential metaslabs */ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { ASSERT3P(msp->ms_group, ==, mg); /* * We preload only the maximum number of metaslabs specified * by metaslab_preload_limit. If a metaslab is being forced * to condense then we preload it too. This will ensure * that force condensing happens in the next txg. */ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { continue; } VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload, msp, TQ_SLEEP | (m <= spa->spa_alloc_count ? TQ_FRONT : 0)) != TASKQID_INVALID); } mutex_exit(&mg->mg_lock); } /* * Determine if the space map's on-disk footprint is past our tolerance for * inefficiency. We would like to use the following criteria to make our * decision: * * 1. Do not condense if the size of the space map object would dramatically * increase as a result of writing out the free space range tree. * * 2. Condense if the on on-disk space map representation is at least * zfs_condense_pct/100 times the size of the optimal representation * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). * * 3. Do not condense if the on-disk size of the space map does not actually * decrease. * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. * Instead, we apply the heuristic described in the block comment for * zfs_metaslab_condense_block_threshold - we only condense if the space used * is greater than a threshold number of blocks. */ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); ASSERT(sm != NULL); ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); /* * We always condense metaslabs that are empty and metaslabs for * which a condense request has been made. */ if (zfs_range_tree_numsegs(msp->ms_allocatable) == 0 || msp->ms_condense_wanted) return (B_TRUE); uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); uint64_t object_size = space_map_length(sm); uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } /* * Condense the on-disk space map representation to its minimized form. * The minimized form consists of a small number of allocations followed * by the entries of the free range tree (ms_allocatable). The condensed * spacemap contains all the entries of previous TXGs (including those in * the pool-wide log spacemaps; thus this is effectively a superset of * metaslab_flush()), but this TXG's entries still need to be written. */ static void metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) { zfs_range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); ASSERT(msp->ms_sm != NULL); /* * In order to condense the space map, we need to change it so it * only describes which segments are currently allocated and free. * * All the current free space resides in the ms_allocatable, all * the ms_defer trees, and all the ms_allocating trees. We ignore * ms_freed because it is empty because we're in sync pass 1. We * ignore ms_freeing because these changes are not yet reflected * in the spacemap (they will be written later this txg). * * So to truncate the space map to represent all the entries of * previous TXGs we do the following: * * 1] We create a range tree (condense tree) that is 100% empty. * 2] We add to it all segments found in the ms_defer trees * as those segments are marked as free in the original space * map. We do the same with the ms_allocating trees for the same * reason. Adding these segments should be a relatively * inexpensive operation since we expect these trees to have a * small number of nodes. * 3] We vacate any unflushed allocs, since they are not frees we * need to add to the condense tree. Then we vacate any * unflushed frees as they should already be part of ms_allocatable. * 4] At this point, we would ideally like to add all segments * in the ms_allocatable tree from the condense tree. This way * we would write all the entries of the condense tree as the * condensed space map, which would only contain freed * segments with everything else assumed to be allocated. * * Doing so can be prohibitively expensive as ms_allocatable can * be large, and therefore computationally expensive to add to * the condense_tree. Instead we first sync out an entry marking * everything as allocated, then the condense_tree and then the * ms_allocatable, in the condensed space map. While this is not * optimal, it is typically close to optimal and more importantly * much cheaper to compute. * * 5] Finally, as both of the unflushed trees were written to our * new and condensed metaslab space map, we basically flushed * all the unflushed changes to disk, thus we call * metaslab_flush_update(). */ ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, " "spa %s, smp size %llu, segments %llu, forcing condense=%s", (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp, (u_longlong_t)msp->ms_group->mg_vd->vdev_id, spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm), (u_longlong_t)zfs_range_tree_numsegs(msp->ms_allocatable), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; zfs_range_seg_type_t type; uint64_t shift, start; type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, &start, &shift); condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { zfs_range_tree_walk(msp->ms_defer[t], zfs_range_tree_add, condense_tree); } for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { zfs_range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], zfs_range_tree_add, condense_tree); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); /* * We're about to drop the metaslab's lock thus allowing other * consumers to change it's content. Set the metaslab's ms_condensing * flag to ensure that allocations on this metaslab do not occur * while we're in the middle of committing it to disk. This is only * critical for ms_allocatable as all other range trees use per TXG * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); uint64_t object = space_map_object(msp->ms_sm); space_map_truncate(sm, spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); /* * space_map_truncate() may have reallocated the spacemap object. * If so, update the vdev_ms_array. */ if (space_map_object(msp->ms_sm) != object) { object = space_map_object(msp->ms_sm); dmu_write(spa->spa_meta_objset, msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &object, tx); } /* * Note: * When the log space map feature is enabled, each space map will * always have ALLOCS followed by FREES for each sync pass. This is * typically true even when the log space map feature is disabled, * except from the case where a metaslab goes through metaslab_sync() * and gets condensed. In that case the metaslab's space map will have * ALLOCS followed by FREES (due to condensing) followed by ALLOCS * followed by FREES (due to space_map_write() in metaslab_sync()) for * sync pass 1. */ zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL, start, shift); zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); zfs_range_tree_vacate(condense_tree, NULL, NULL); zfs_range_tree_destroy(condense_tree); zfs_range_tree_vacate(tmp_tree, NULL, NULL); zfs_range_tree_destroy(tmp_tree); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; metaslab_flush_update(msp, tx); } static void metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); mutex_enter(&spa->spa_flushed_ms_lock); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); metaslab_set_unflushed_dirty(msp, B_TRUE); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_increment_current_mscount(spa); spa_log_summary_add_flushed_metaslab(spa, B_TRUE); } void metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); /* update metaslab's position in our flushing tree */ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); metaslab_set_unflushed_dirty(msp, dirty); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); /* update metaslab counts of spa_log_sm_t nodes */ spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); spa_log_sm_increment_current_mscount(spa); /* update log space map summary */ spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg, ms_prev_flushed_dirty); spa_log_summary_add_flushed_metaslab(spa, dirty); /* cleanup obsolete logs if any */ spa_cleanup_old_sm_logs(spa, tx); } /* * Called when the metaslab has been flushed (its own spacemap now reflects * all the contents of the pool-wide spacemap log). Updates the metaslab's * metadata and any pool-wide related log space map data (e.g. summary, * obsolete logs, etc..) to reflect that. */ static void metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(spa_sync_pass(spa), ==, 1); /* * Just because a metaslab got flushed, that doesn't mean that * it will pass through metaslab_sync_done(). Thus, make sure to * update ms_synced_length here in case it doesn't. */ msp->ms_synced_length = space_map_length(msp->ms_sm); /* * We may end up here from metaslab_condense() without the * feature being active. In that case this is a no-op. */ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) || metaslab_unflushed_txg(msp) == 0) return; metaslab_unflushed_bump(msp, tx, B_FALSE); } boolean_t metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); /* * There is nothing wrong with flushing the same metaslab twice, as * this codepath should work on that case. However, the current * flushing scheme makes sure to avoid this situation as we would be * making all these calls without having anything meaningful to write * to disk. We assert this behavior here. */ ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); /* * We can not flush while loading, because then we would * not load the ms_unflushed_{allocs,frees}. */ if (msp->ms_loading) return (B_FALSE); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); /* * Metaslab condensing is effectively flushing. Therefore if the * metaslab can be condensed we can just condense it instead of * flushing it. * * Note that metaslab_condense() does call metaslab_flush_update() * so we can just return immediately after condensing. We also * don't need to care about setting ms_flushing or broadcasting * ms_flush_cv, even if we temporarily drop the ms_lock in * metaslab_condense(), as the metaslab is already loaded. */ if (msp->ms_loaded && metaslab_should_condense(msp)) { metaslab_group_t *mg = msp->ms_group; /* * For all histogram operations below refer to the * comments of metaslab_sync() where we follow a * similar procedure. */ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); metaslab_condense(msp, tx); space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); /* * Since we recreated the histogram (and potentially * the ms_sm too while condensing) ensure that the * weight is updated too because we are not guaranteed * that this metaslab is dirty and will go through * metaslab_sync_done(). */ metaslab_recalculate_weight_and_sort(msp); return (B_TRUE); } msp->ms_flushing = B_TRUE; uint64_t sm_len_before = space_map_length(msp->ms_sm); mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); uint64_t sm_len_after = space_map_length(msp->ms_sm); if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx), spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)zfs_range_tree_space( msp->ms_unflushed_allocs), (u_longlong_t)zfs_range_tree_space( msp->ms_unflushed_frees), (u_longlong_t)(sm_len_after - sm_len_before)); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); metaslab_flush_update(msp, tx); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); msp->ms_flushing = B_FALSE; cv_broadcast(&msp->ms_flush_cv); return (B_TRUE); } /* * Write a metaslab to disk in the context of the specified transaction group. */ void metaslab_sync(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); zfs_range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); /* * This metaslab has just been added so there's no work to do now. */ if (msp->ms_new) { ASSERT0(zfs_range_tree_space(alloctree)); ASSERT0(zfs_range_tree_space(msp->ms_freeing)); ASSERT0(zfs_range_tree_space(msp->ms_freed)); ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); ASSERT0(zfs_range_tree_space(msp->ms_trim)); return; } /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being * forced to condense, it's loaded and we're not beyond the final * dirty txg, we need to let it through. Not condensing beyond the * final dirty txg prevents an issue where metaslabs that need to be * condensed but were loaded for other reasons could cause a panic * here. By only checking the txg in that branch of the conditional, * we preserve the utility of the VERIFY statements in all other * cases. */ if (zfs_range_tree_is_empty(alloctree) && zfs_range_tree_is_empty(msp->ms_freeing) && zfs_range_tree_is_empty(msp->ms_checkpointing) && !(msp->ms_loaded && msp->ms_condense_wanted && txg <= spa_final_dirty_txg(spa))) return; VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); /* * The only state that can actually be changing concurrently * with metaslab_sync() is the metaslab's ms_allocatable. No * other thread can be modifying this txg's alloc, freeing, * freed, or space_map_phys_t. We drop ms_lock whenever we * could call into the DMU, because the DMU can call down to * us (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state * concurrently, and it is locked out by the ms_sync_lock. * Note that the ms_lock is insufficient for this, because it * is dropped by space_map_write(). */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); /* * Generate a log space map if one doesn't exist already. */ spa_generate_syncing_log_sm(spa, tx); if (msp->ms_sm == NULL) { uint64_t new_object = space_map_alloc(mos, spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); VERIFY3U(new_object, !=, 0); dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &new_object, tx); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); ASSERT(msp->ms_sm != NULL); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); ASSERT0(metaslab_allocated_space(msp)); } if (!zfs_range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, zfs_vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * We save the space map object as an entry in vdev_top_zap * so it can be retrieved when the pool is reopened after an * export or through zdb. */ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (new_object), 1, &new_object, tx)); } mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * Note: metaslab_condense() clears the space map's histogram. * Therefore we must verify and remove this histogram before * condensing. */ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); if (spa->spa_sync_pass == 1 && msp->ms_loaded && metaslab_should_condense(msp)) metaslab_condense(msp, tx); /* * We'll be going to disk to sync our space accounting, thus we * drop the ms_lock during that time so allocations coming from * open-context (ZIL) for future TXGs do not block. */ mutex_exit(&msp->ms_lock); space_map_t *log_sm = spa_syncing_log_sm(spa); if (log_sm != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); if (metaslab_unflushed_txg(msp) == 0) metaslab_unflushed_add(msp, tx); else if (!metaslab_unflushed_dirty(msp)) metaslab_unflushed_bump(msp, tx, B_TRUE); space_map_write(log_sm, alloctree, SM_ALLOC, vd->vdev_id, tx); space_map_write(log_sm, msp->ms_freeing, SM_FREE, vd->vdev_id, tx); mutex_enter(&msp->ms_lock); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); zfs_range_tree_remove_xor_add(alloctree, msp->ms_unflushed_frees, msp->ms_unflushed_allocs); zfs_range_tree_remove_xor_add(msp->ms_freeing, msp->ms_unflushed_allocs, msp->ms_unflushed_frees); spa->spa_unflushed_stats.sus_memused += metaslab_unflushed_changes_memused(msp); } else { ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); } msp->ms_allocated_space += zfs_range_tree_space(alloctree); ASSERT3U(msp->ms_allocated_space, >=, zfs_range_tree_space(msp->ms_freeing)); msp->ms_allocated_space -= zfs_range_tree_space(msp->ms_freeing); if (!zfs_range_tree_is_empty(msp->ms_checkpointing)) { ASSERT(spa_has_checkpoint(spa)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the * ms_lock while writing to the checkpoint space map, for the * same reason mentioned above. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); spa->spa_checkpoint_info.sci_dspace += zfs_range_tree_space(msp->ms_checkpointing); vd->vdev_stat.vs_checkpoint_space += zfs_range_tree_space(msp->ms_checkpointing); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, -space_map_allocated(vd->vdev_checkpoint_sm)); zfs_range_tree_vacate(msp->ms_checkpointing, NULL, NULL); } if (msp->ms_loaded) { /* * When the space map is loaded, we have an accurate * histogram in the range tree. This gives us an opportunity * to bring the space map's histogram up-to-date so we clear * it first before updating it. */ space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); /* * Since we've cleared the histogram we need to add back * any free space that has already been processed, plus * any deferred space. This allows the on-disk histogram * to accurately reflect all free space even if some space * is not yet available for allocation (i.e. deferred). */ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); /* * Add back any deferred free space that has not been * added back into the in-core free tree yet. This will * ensure that we don't end up with a space map histogram * that is completely empty unless the metaslab is fully * allocated. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } } /* * Always add the free space from this sync pass to the space * map histogram. We want to make sure that the on-disk histogram * accounts for all free space. If the space map is not loaded, * then we will lose some accuracy but will correct it the next * time we load the space map. */ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); /* * For sync pass 1, we avoid traversing this txg's free range tree * and instead will just swap the pointers for freeing and freed. * We can safely do this since the freed_tree is guaranteed to be * empty on the initial pass. * * Keep in mind that even if we are currently using a log spacemap * we want current frees to end up in the ms_allocatable (but not * get appended to the ms_sm) so their ranges can be reused as usual. */ if (spa_sync_pass(spa) == 1) { zfs_range_tree_swap(&msp->ms_freeing, &msp->ms_freed); ASSERT0(msp->ms_allocated_this_txg); } else { zfs_range_tree_vacate(msp->ms_freeing, zfs_range_tree_add, msp->ms_freed); } msp->ms_allocated_this_txg += zfs_range_tree_space(alloctree); zfs_range_tree_vacate(alloctree, NULL, NULL); ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(zfs_range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) & TXG_MASK])); ASSERT0(zfs_range_tree_space(msp->ms_freeing)); ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); /* * Verify that the space map object ID has been recorded in the * vdev_ms_array. */ uint64_t object; VERIFY0(dmu_read(mos, vd->vdev_ms_array, msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); VERIFY3U(object, ==, space_map_object(msp->ms_sm)); mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } static void metaslab_evict(metaslab_t *msp, uint64_t txg) { if (!msp->ms_loaded || msp->ms_disabled != 0) return; for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(zfs_range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } if (msp->ms_allocator != -1) metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); if (!metaslab_debug_unload) metaslab_unload(msp); } /* * Called after a transaction group has completely synced to mark * all of the metaslab's free space as usable. */ void metaslab_sync_done(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; zfs_range_tree_t **defer_tree; int64_t alloc_delta, defer_delta; boolean_t defer_allowed = B_TRUE; ASSERT(!vd->vdev_ishole); mutex_enter(&msp->ms_lock); if (msp->ms_new) { /* this is a new metaslab, add its capacity to the vdev */ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); /* there should be no allocations nor frees at this point */ VERIFY0(msp->ms_allocated_this_txg); VERIFY0(zfs_range_tree_space(msp->ms_freed)); } ASSERT0(zfs_range_tree_space(msp->ms_freeing)); ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || vd->vdev_rz_expanding) { defer_allowed = B_FALSE; } defer_delta = 0; alloc_delta = msp->ms_allocated_this_txg - zfs_range_tree_space(msp->ms_freed); if (defer_allowed) { defer_delta = zfs_range_tree_space(msp->ms_freed) - zfs_range_tree_space(*defer_tree); } else { defer_delta -= zfs_range_tree_space(*defer_tree); } metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0); if (spa_syncing_log_sm(spa) == NULL) { /* * If there's a metaslab_load() in progress and we don't have * a log space map, it means that we probably wrote to the * metaslab's space map. If this is the case, we need to * make sure that we wait for the load to complete so that we * have a consistent view at the in-core side of the metaslab. */ metaslab_load_wait(msp); } else { ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); } /* * When auto-trimming is enabled, free ranges which are added to * ms_allocatable are also be added to ms_trim. The ms_trim tree is * periodically consumed by the vdev_autotrim_thread() which issues * trims for all ranges and then vacates the tree. The ms_trim tree * can be discarded at any time with the sole consequence of recent * frees not being trimmed. */ if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { zfs_range_tree_walk(*defer_tree, zfs_range_tree_add, msp->ms_trim); if (!defer_allowed) { zfs_range_tree_walk(msp->ms_freed, zfs_range_tree_add, msp->ms_trim); } } else { zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); } /* * Move the frees from the defer_tree back to the free * range tree (if it's loaded). Swap the freed_tree and * the defer_tree -- this is safe to do because we've * just emptied out the defer_tree. */ zfs_range_tree_vacate(*defer_tree, msp->ms_loaded ? zfs_range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { zfs_range_tree_swap(&msp->ms_freed, defer_tree); } else { zfs_range_tree_vacate(msp->ms_freed, msp->ms_loaded ? zfs_range_tree_add : NULL, msp->ms_allocatable); } msp->ms_synced_length = space_map_length(msp->ms_sm); msp->ms_deferspace += defer_delta; ASSERT3S(msp->ms_deferspace, >=, 0); ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); if (msp->ms_deferspace != 0) { /* * Keep syncing this metaslab until all deferred frees * are back in circulation. */ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } metaslab_aux_histograms_update_done(msp, defer_allowed); if (msp->ms_new) { msp->ms_new = B_FALSE; mutex_enter(&mg->mg_lock); mg->mg_ms_ready++; mutex_exit(&mg->mg_lock); } /* * Re-sort metaslab within its group now that we've adjusted * its allocatable space. */ metaslab_recalculate_weight_and_sort(msp); ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(zfs_range_tree_space(msp->ms_freeing)); ASSERT0(zfs_range_tree_space(msp->ms_freed)); ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); msp->ms_allocating_total -= msp->ms_allocated_this_txg; msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); } void metaslab_sync_reassess(metaslab_group_t *mg) { spa_t *spa = mg->mg_class->mc_spa; spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); mg->mg_fragmentation = metaslab_group_fragmentation(mg); metaslab_group_alloc_update(mg); /* * Preload the next potential metaslabs but only on active * metaslab groups. We can get into a state where the metaslab * is no longer active since we dirty metaslabs as we remove a * a device, thus potentially making the metaslab group eligible * for preloading. */ if (mg->mg_activation_count > 0) { metaslab_group_preload(mg); } spa_config_exit(spa, SCL_ALLOC, FTAG); } /* * When writing a ditto block (i.e. more than one DVA for a given BP) on * the same vdev as an existing DVA of this BP, then try to allocate it * on a different metaslab than existing DVAs (i.e. a unique metaslab). */ static boolean_t metaslab_is_unique(metaslab_t *msp, dva_t *dva) { uint64_t dva_ms_id; if (DVA_GET_ASIZE(dva) == 0) return (B_TRUE); if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (B_TRUE); dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; return (msp->ms_id != dva_ms_id); } /* * ========================================================================== * Metaslab allocation tracing facility * ========================================================================== */ /* * Add an allocation trace element to the allocation tracing list. */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, int allocator) { metaslab_alloc_trace_t *mat; if (!metaslab_trace_enabled) return; /* * When the tracing list reaches its maximum we remove * the second element in the list before adding a new one. * By removing the second element we preserve the original * entry as a clue to what allocations steps have already been * performed. */ if (zal->zal_size == metaslab_trace_max_entries) { metaslab_alloc_trace_t *mat_next; #ifdef ZFS_DEBUG panic("too many entries in allocation list"); #endif METASLABSTAT_BUMP(metaslabstat_trace_over_limit); zal->zal_size--; mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); list_remove(&zal->zal_list, mat_next); kmem_cache_free(metaslab_alloc_trace_cache, mat_next); } mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); list_link_init(&mat->mat_list_node); mat->mat_mg = mg; mat->mat_msp = msp; mat->mat_size = psize; mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; /* * The list is part of the zio so locking is not required. Only * a single thread will perform allocations for a given zio. */ list_insert_tail(&zal->zal_list, mat); zal->zal_size++; ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); } +void +metaslab_trace_move(zio_alloc_list_t *old, zio_alloc_list_t *new) +{ + ASSERT0(new->zal_size); + list_move_tail(&new->zal_list, &old->zal_list); + new->zal_size = old->zal_size; + list_destroy(&old->zal_list); +} + void metaslab_trace_init(zio_alloc_list_t *zal) { list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), offsetof(metaslab_alloc_trace_t, mat_list_node)); zal->zal_size = 0; } void metaslab_trace_fini(zio_alloc_list_t *zal) { metaslab_alloc_trace_t *mat; while ((mat = list_remove_head(&zal->zal_list)) != NULL) kmem_cache_free(metaslab_alloc_trace_cache, mat); list_destroy(&zal->zal_list); zal->zal_size = 0; } /* * ========================================================================== * Metaslab block operations * ========================================================================== */ static void metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, int allocator, int flags, uint64_t psize, const void *tag) { - if (!(flags & METASLAB_ASYNC_ALLOC)) + if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; (void) zfs_refcount_add_many(&mga->mga_queue_depth, psize, tag); } +void +metaslab_group_alloc_increment_all(spa_t *spa, blkptr_t *bp, int allocator, + int flags, uint64_t psize, const void *tag) +{ + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { + uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[d]); + metaslab_group_alloc_increment(spa, vdev, allocator, flags, + psize, tag); + } +} + void metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, int allocator, int flags, uint64_t psize, const void *tag) { - if (!(flags & METASLAB_ASYNC_ALLOC)) + if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; (void) zfs_refcount_remove_many(&mga->mga_queue_depth, psize, tag); } static uint64_t -metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) +metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t txg, uint64_t *actual_size) { uint64_t start; zfs_range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); VERIFY0(msp->ms_new); - start = mc->mc_ops->msop_alloc(msp, size); + start = mc->mc_ops->msop_alloc(msp, size, max_size, actual_size); if (start != -1ULL) { + size = *actual_size; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(zfs_range_tree_space(rt) - size, <=, msp->ms_size); zfs_range_tree_remove(rt, start, size); zfs_range_tree_clear(msp->ms_trim, start, size); if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); msp->ms_allocating_total += size; /* Track the last successful allocation */ msp->ms_alloc_txg = txg; metaslab_verify_space(msp, txg); } /* * Now that we've attempted the allocation we need to update the * metaslab's maximum block size since it may have changed. */ msp->ms_max_size = metaslab_largest_allocatable(msp); return (start); } /* * Find the metaslab with the highest weight that is less than what we've * already tried. In the common case, this means that we will examine each * metaslab at most once. Note that concurrent callers could reorder metaslabs * by activation/passivation once we have dropped the mg_lock. If a metaslab is * activated by another thread, and we fail to allocate from the metaslab we * have selected, we may not try the newly-activated metaslab, and instead * activate another metaslab. This is not optimal, but generally does not cause * any problems (a possible exception being if every metaslab is completely full * except for the newly-activated metaslab which we fail to examine). */ static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, uint64_t asize, int allocator, boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; metaslab_t *msp = avl_find(t, search, &idx); if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); uint_t tries = 0; for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; if (!try_hard && tries > zfs_metaslab_find_max_tries) { METASLABSTAT_BUMP(metaslabstat_too_many_tries); return (NULL); } tries++; if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; } /* * If the selected metaslab is condensing or disabled, or * hasn't gone through a metaslab_sync_done(), then skip it. */ if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new) continue; *was_active = msp->ms_allocator != -1; /* * If we're activating as primary, this is our first allocation * from this disk, so we don't need to check how close we are. * If the metaslab under consideration was already active, * we're getting desperate enough to steal another allocator's * metaslab, so we still don't care about distances. */ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) break; if (!try_hard) { for (i = 0; i < d; i++) { if (!metaslab_is_unique(msp, &dva[i])) break; /* try another metaslab */ } if (i == d) break; } } if (msp != NULL) { search->ms_weight = msp->ms_weight; search->ms_start = msp->ms_start + 1; search->ms_allocator = msp->ms_allocator; search->ms_primary = msp->ms_primary; } return (msp); } static void metaslab_active_mask_verify(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) return; if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(!msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY3S(msp->ms_allocator, ==, -1); return; } } static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, dva_t *dva, int d, int allocator, - boolean_t try_hard) + uint64_t asize, uint64_t max_asize, uint64_t txg, + dva_t *dva, int d, int allocator, boolean_t try_hard, + uint64_t *actual_asize) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; for (int i = 0; i < d; i++) { if (activation_weight == METASLAB_WEIGHT_PRIMARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_CLAIM; break; } } /* * If we don't have enough metaslabs active, we just use the 0th slot. */ if (allocator >= mg->mg_ms_ready / 3) allocator = 0; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; /* * At the end of the metaslab tree are the already-active metaslabs, * first the primaries, then the secondaries. When we resume searching * through the tree, we need to consider ms_allocator and ms_primary so * we start in the location right after where we left off, and don't * accidentally loop forever considering the same metaslabs. */ search->ms_allocator = -1; search->ms_primary = B_TRUE; for (;;) { boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); if (activation_weight == METASLAB_WEIGHT_PRIMARY && mga->mga_primary != NULL) { msp = mga->mga_primary; /* * Even though we don't hold the ms_lock for the * primary metaslab, those fields should not * change while we hold the mg_lock. Thus it is * safe to make assertions on them. */ ASSERT(msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && mga->mga_secondary != NULL) { msp = mga->mga_secondary; /* * See comment above about the similar assertions * for the primary metaslab. */ ASSERT(!msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, asize, allocator, try_hard, zal, search, &was_active); } mutex_exit(&mg->mg_lock); if (msp == NULL) break; mutex_enter(&msp->ms_lock); metaslab_active_mask_verify(msp); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. */ #if 0 DTRACE_PROBE3(ms__activation__attempt, metaslab_t *, msp, uint64_t, activation_weight, boolean_t, was_active); #endif /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. We check the * active status first to see if we need to set_selected_txg * a new metaslab. */ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { ASSERT3S(msp->ms_allocator, ==, -1); mutex_exit(&msp->ms_lock); continue; } /* * If the metaslab was activated for another allocator * while we were waiting in the ms_lock above, or it's * a primary and we're seeking a secondary (or vice versa), * we go back and select a new metaslab. */ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && (msp->ms_allocator != -1) && (msp->ms_allocator != allocator || ((activation_weight == METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { ASSERT(msp->ms_loaded); ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || msp->ms_allocator != -1); mutex_exit(&msp->ms_lock); continue; } /* * This metaslab was used for claiming regions allocated * by the ZIL during pool import. Once these regions are * claimed we don't need to keep the CLAIM bit set * anymore. Passivate this metaslab to zero its activation * mask. */ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && activation_weight != METASLAB_WEIGHT_CLAIM) { ASSERT(msp->ms_loaded); ASSERT3S(msp->ms_allocator, ==, -1); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } metaslab_set_selected_txg(msp, txg); int activation_error = metaslab_activate(msp, allocator, activation_weight); metaslab_active_mask_verify(msp); /* * If the metaslab was activated by another thread for * another allocator or activation_weight (EBUSY), or it * failed because another metaslab was assigned as primary * for this allocator (EEXIST) we continue using this * metaslab for our allocation, rather than going on to a * worse metaslab (we waited for that metaslab to be loaded * after all). * * If the activation failed due to an I/O error or ENOSPC we * skip to the next metaslab. */ boolean_t activated; if (activation_error == 0) { activated = B_TRUE; } else if (activation_error == EBUSY || activation_error == EEXIST) { activated = B_FALSE; } else { mutex_exit(&msp->ms_lock); continue; } ASSERT(msp->ms_loaded); /* * Now that we have the lock, recheck to see if we should * continue to use this metaslab for this allocation. The * the metaslab is now loaded so metaslab_should_allocate() * can accurately determine if the allocation attempt should * proceed. */ if (!metaslab_should_allocate(msp, asize, try_hard)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); goto next; } /* * If this metaslab is currently condensing then pick again * as we can't manipulate this metaslab until it's committed * to disk. If this metaslab is being initialized, we shouldn't * allocate from it since the allocated region might be * overwritten after allocation. */ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } else if (msp->ms_disabled > 0) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_DISABLED, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } - offset = metaslab_block_alloc(msp, asize, txg); - metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); + offset = metaslab_block_alloc(msp, asize, max_asize, txg, + actual_asize); if (offset != -1ULL) { + metaslab_trace_add(zal, mg, msp, *actual_asize, d, + offset, allocator); /* Proactively passivate the metaslab, if needed */ if (activated) metaslab_segment_may_passivate(msp); mutex_exit(&msp->ms_lock); break; } + metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); next: ASSERT(msp->ms_loaded); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. */ #if 0 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, uint64_t, asize); #endif /* * We were unable to allocate from this metaslab so determine * a new weight for this metaslab. Now that we have loaded * the metaslab we can provide a better hint to the metaslab * selector. * * For space-based metaslabs, we use the maximum block size. * This information is only available when the metaslab * is loaded and is more accurate than the generic free * space weight that was calculated by metaslab_weight(). * This information allows us to quickly compare the maximum * available allocation in the metaslab to the allocation * size being requested. * * For segment-based metaslabs, determine the new weight * based on the highest bucket in the range tree. We * explicitly use the loaded segment weight (i.e. the range * tree histogram) since it contains the space that is * currently available for allocation and is accurate * even within a sync pass. */ uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { weight = metaslab_largest_allocatable(msp); WEIGHT_SET_SPACEBASED(weight); } else { weight = metaslab_weight_from_range_tree(msp); } if (activated) { metaslab_passivate(msp, weight); } else { /* * For the case where we use the metaslab that is * active for another allocator we want to make * sure that we retain the activation mask. * * Note that we could attempt to use something like * metaslab_recalculate_weight_and_sort() that * retains the activation mask here. That function * uses metaslab_weight() to set the weight though * which is not as accurate as the calculations * above. */ weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(mg, msp, weight); } metaslab_active_mask_verify(msp); /* * We have just failed an allocation attempt, check * that metaslab_should_allocate() agrees. Otherwise, * we may end up in an infinite loop retrying the same * metaslab. */ ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); mutex_exit(&msp->ms_lock); } kmem_free(search, sizeof (*search)); if (offset == -1ULL) { metaslab_trace_add(zal, mg, NULL, asize, d, TRACE_GROUP_FAILURE, allocator); if (asize <= vdev_get_min_alloc(mg->mg_vd)) { /* * This metaslab group was unable to allocate * the minimum block size so it must be out of * space. Notify the allocation throttle to * skip allocation attempts to this group until * more space becomes available. */ mg->mg_no_free_space = B_TRUE; } } return (offset); } static boolean_t metaslab_group_allocatable(spa_t *spa, metaslab_group_t *mg, uint64_t psize, int d, int flags, boolean_t try_hard, zio_alloc_list_t *zal, int allocator) { metaslab_class_t *mc = mg->mg_class; vdev_t *vd = mg->mg_vd; boolean_t allocatable; /* * Don't allocate from faulted devices. */ if (try_hard) spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); allocatable = vdev_allocatable(vd); if (try_hard) spa_config_exit(spa, SCL_ZIO, FTAG); if (!allocatable) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_NOT_ALLOCATABLE, allocator); return (B_FALSE); } if (!try_hard) { /* * Avoid vdevs with too little space or too fragmented. */ if (!GANG_ALLOCATION(flags) && (mg->mg_no_free_space || (!mg->mg_allocatable && mc->mc_alloc_groups > 0))) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_NOT_ALLOCATABLE, allocator); return (B_FALSE); } /* * Avoid writing single-copy data to an unhealthy, * non-redundant vdev. */ if (d == 0 && vd->vdev_state < VDEV_STATE_HEALTHY && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_VDEV_ERROR, allocator); return (B_FALSE); } } return (B_TRUE); } -/* - * Allocate a block for the specified i/o. - */ -int -metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, - zio_alloc_list_t *zal, int allocator) +static int +metaslab_alloc_dva_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + uint64_t max_psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, + int flags, zio_alloc_list_t *zal, int allocator, uint64_t *actual_psize) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_group_t *mg = NULL, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; ASSERT(!DVA_IS_VALID(&dva[d])); /* * For testing, make some blocks above a certain size be gang blocks. * This will result in more split blocks when using device removal, * and a large number of split blocks coupled with ztest-induced * damage can result in extremely long reconstruction times. This * will also test spilling from special to normal. */ if (psize >= metaslab_force_ganging && metaslab_force_ganging_pct > 0 && (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); } + if (max_psize > psize && max_psize >= metaslab_force_ganging && + metaslab_force_ganging_pct > 0 && + (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { + max_psize = MAX((psize + max_psize) / 2, + metaslab_force_ganging); + } + ASSERT3U(psize, <=, max_psize); /* * Start at the rotor and loop through all mgs until we find something. * Note that there's no locking on mca_rotor or mca_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * * If we are doing ditto or log blocks, try to spread them across * consecutive vdevs. If we're forced to reuse a vdev before we've * allocated all of our ditto blocks, then try and spread them out on * that vdev as much as possible. If it turns out to not be possible, * gradually lower our standards until anything becomes acceptable. * Also, allocating on consecutive vdevs (as opposed to random vdevs) * gives us hope of containing our fault domains to something we're * able to reason about. Otherwise, any two top-level vdev failures * will guarantee the loss of data. With consecutive allocation, * only two adjacent top-level vdev failures will result in data loss. * * If we are doing gang blocks (hintdva is non-NULL), try to keep * ourselves on the same vdev as our gang block header. It makes our * fault domains something tractable. */ if (hintdva && DVA_IS_VALID(&hintdva[d])) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); mg = vdev_get_mg(vd, mc); } if (mg == NULL && d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vdev_get_mg(vd, mc)->mg_next; } if (mg == NULL || mg->mg_class != mc || mg->mg_activation_count <= 0) { ASSERT(mca->mca_rotor != NULL); mg = mca->mca_rotor; } rotor = mg; top: do { ASSERT(mg->mg_activation_count == 1); ASSERT(mg->mg_class == mc); if (!metaslab_group_allocatable(spa, mg, psize, d, flags, try_hard, zal, allocator)) goto next; vd = mg->mg_vd; uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); - ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - dva, d, allocator, try_hard); + ASSERT0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); + uint64_t max_asize = vdev_psize_to_asize_txg(vd, max_psize, + txg); + ASSERT0(P2PHASE(max_asize, 1ULL << vd->vdev_ashift)); + uint64_t offset = metaslab_group_alloc(mg, zal, asize, + max_asize, txg, dva, d, allocator, try_hard, + &asize); if (offset != -1ULL) { + if (actual_psize) + *actual_psize = vdev_asize_to_psize_txg(vd, + asize, txg); metaslab_class_rotate(mg, allocator, psize, B_TRUE); DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_OFFSET(&dva[d], offset); DVA_SET_GANG(&dva[d], ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); DVA_SET_ASIZE(&dva[d], asize); return (0); } next: metaslab_class_rotate(mg, allocator, psize, B_FALSE); } while ((mg = mg->mg_next) != rotor); /* * If we haven't tried hard, perhaps do so now. */ if (!try_hard && (zfs_metaslab_try_hard_before_gang || GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 || psize <= spa->spa_min_alloc)) { METASLABSTAT_BUMP(metaslabstat_try_hard); try_hard = B_TRUE; goto top; } memset(&dva[d], 0, sizeof (dva_t)); metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); } +/* + * Allocate a block for the specified i/o. + */ +int +metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, + zio_alloc_list_t *zal, int allocator) +{ + return (metaslab_alloc_dva_range(spa, mc, psize, psize, dva, d, hintdva, + txg, flags, zal, allocator, NULL)); +} + void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, boolean_t checkpoint) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; int m = offset >> vd->vdev_ms_shift; ASSERT(vdev_is_concrete(vd)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); VERIFY3U(m, <, vd->vdev_ms_count); msp = vd->vdev_ms[m]; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); metaslab_check_free_impl(vd, offset, asize); mutex_enter(&msp->ms_lock); if (zfs_range_tree_is_empty(msp->ms_freeing) && zfs_range_tree_is_empty(msp->ms_checkpointing)) { vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); } if (checkpoint) { ASSERT(spa_has_checkpoint(spa)); zfs_range_tree_add(msp->ms_checkpointing, offset, asize); } else { zfs_range_tree_add(msp->ms_freeing, offset, asize); } mutex_exit(&msp->ms_lock); } void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner_offset; boolean_t *checkpoint = arg; ASSERT3P(checkpoint, !=, NULL); if (vd->vdev_ops->vdev_op_remap != NULL) vdev_indirect_mark_obsolete(vd, offset, size); else metaslab_free_impl(vd, offset, size, *checkpoint); } static void metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, boolean_t checkpoint) { spa_t *spa = vd->vdev_spa; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) return; if (spa->spa_vdev_removal != NULL && spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && vdev_is_concrete(vd)) { /* * Note: we check if the vdev is concrete because when * we complete the removal, we first change the vdev to be * an indirect vdev (in open context), and then (in syncing * context) clear spa_vdev_removal. */ free_from_removing_vdev(vd, offset, size); } else if (vd->vdev_ops->vdev_op_remap != NULL) { vdev_indirect_mark_obsolete(vd, offset, size); vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_free_impl_cb, &checkpoint); } else { metaslab_free_concrete(vd, offset, size, checkpoint); } } typedef struct remap_blkptr_cb_arg { blkptr_t *rbca_bp; spa_remap_cb_t rbca_cb; vdev_t *rbca_remap_vd; uint64_t rbca_remap_offset; void *rbca_cb_arg; } remap_blkptr_cb_arg_t; static void remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { remap_blkptr_cb_arg_t *rbca = arg; blkptr_t *bp = rbca->rbca_bp; /* We can not remap split blocks. */ if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) return; ASSERT0(inner_offset); if (rbca->rbca_cb != NULL) { /* * At this point we know that we are not handling split * blocks and we invoke the callback on the previous * vdev which must be indirect. */ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); /* set up remap_blkptr_cb_arg for the next call */ rbca->rbca_remap_vd = vd; rbca->rbca_remap_offset = offset; } /* * The phys birth time is that of dva[0]. This ensures that we know * when each dva was written, so that resilver can determine which * blocks need to be scrubbed (i.e. those written during the time * the vdev was offline). It also ensures that the key used in * the ARC hash table is unique (i.e. dva[0] + phys_birth). If * we didn't change the phys_birth, a lookup in the ARC for a * remapped BP could find the data that was previously stored at * this vdev + offset. */ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(&bp->blk_dva[0])); vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; uint64_t physical_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); BP_SET_PHYSICAL_BIRTH(bp, physical_birth); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); } /* * If the block pointer contains any indirect DVAs, modify them to refer to * concrete DVAs. Note that this will sometimes not be possible, leaving * the indirect DVA in place. This happens if the indirect DVA spans multiple * segments in the mapping (i.e. it is a "split block"). * * If the BP was remapped, calls the callback on the original dva (note the * callback can be called multiple times if the original indirect DVA refers * to another indirect DVA, etc). * * Returns TRUE if the BP was remapped. */ boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) { remap_blkptr_cb_arg_t rbca; if (!zfs_remap_blkptr_enable) return (B_FALSE); if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) return (B_FALSE); /* * Dedup BP's can not be remapped, because ddt_phys_select() depends * on DVA[0] being the same in the BP as in the DDT (dedup table). */ if (BP_GET_DEDUP(bp)) return (B_FALSE); /* * Gang blocks can not be remapped, because * zio_checksum_gang_verifier() depends on the DVA[0] that's in * the BP used to read the gang block header (GBH) being the same * as the DVA[0] that we allocated for the GBH. */ if (BP_IS_GANG(bp)) return (B_FALSE); /* * Embedded BP's have no DVA to remap. */ if (BP_GET_NDVAS(bp) < 1) return (B_FALSE); /* * Cloned blocks can not be remapped since BRT depends on specific * vdev id and offset in the DVA[0] for its reference counting. */ if (!BP_IS_METADATA(bp) && brt_maybe_exists(spa, bp)) return (B_FALSE); /* * Note: we only remap dva[0]. If we remapped other dvas, we * would no longer know what their phys birth txg is. */ dva_t *dva = &bp->blk_dva[0]; uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd->vdev_ops->vdev_op_remap == NULL) return (B_FALSE); rbca.rbca_bp = bp; rbca.rbca_cb = callback; rbca.rbca_remap_vd = vd; rbca.rbca_remap_offset = offset; rbca.rbca_cb_arg = arg; /* * remap_blkptr_cb() will be called in order for each level of * indirection, until a concrete vdev is reached or a split block is * encountered. old_vd and old_offset are updated within the callback * as we go from the one indirect vdev to the next one (either concrete * or indirect again) in that order. */ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); /* Check if the DVA wasn't remapped because it is a split block */ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) return (B_FALSE); return (B_TRUE); } /* * Undo the allocation of a DVA which happened in the given transaction group. */ void metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { metaslab_t *msp; vdev_t *vd; uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (txg > spa_freeze_txg(spa)) return; if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) || (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu", (u_longlong_t)vdev, (u_longlong_t)offset, (u_longlong_t)size); return; } ASSERT(!vd->vdev_removing); ASSERT(vdev_is_concrete(vd)); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); if (DVA_GET_GANG(dva)) size = vdev_gang_header_asize(vd); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); zfs_range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); msp->ms_allocating_total -= size; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); zfs_range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } /* * Free the block represented by the given DVA. */ void metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, vdev); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (DVA_GET_GANG(dva)) { size = vdev_gang_header_asize(vd); } metaslab_free_impl(vd, offset, size, checkpoint); } /* * Reserve some allocation slots. The reservation system must be called * before we call into the allocator. If there aren't any available slots * then the I/O will be throttled until an I/O completes and its slots are * freed up. The function returns true if it was successful in placing * the reservation. */ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, boolean_t must, boolean_t *more) { metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; ASSERT(mc->mc_alloc_throttle_enabled); if (mc->mc_alloc_io_size < zio->io_size) { mc->mc_alloc_io_size = zio->io_size; metaslab_class_balance(mc, B_FALSE); } if (must || mca->mca_reserved <= mc->mc_alloc_max) { /* * The potential race between compare and add is covered by the * allocator lock in most cases, or irrelevant due to must set. * But even if we assume some other non-existing scenario, the * worst that can happen is few more I/Os get to allocation * earlier, that is not a problem. */ int64_t delta = slots * zio->io_size; *more = (atomic_add_64_nv(&mca->mca_reserved, delta) <= mc->mc_alloc_max); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; return (B_TRUE); } *more = B_FALSE; return (B_FALSE); } boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) { metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; ASSERT(mc->mc_alloc_throttle_enabled); int64_t delta = slots * zio->io_size; return (atomic_add_64_nv(&mca->mca_reserved, -delta) <= mc->mc_alloc_max); } static int metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; int error = 0; if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) return (SET_ERROR(ENXIO)); ASSERT3P(vd->vdev_ms, !=, NULL); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) { error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); if (error == EBUSY) { ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); error = 0; } } if (error == 0 && !zfs_range_tree_contains(msp->ms_allocatable, offset, size)) error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } VERIFY(!msp->ms_condensing); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) - size, <=, msp->ms_size); zfs_range_tree_remove(msp->ms_allocatable, offset, size); zfs_range_tree_clear(msp->ms_trim, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */ metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (!multilist_link_active(&msp->ms_class_txg_node)) { msp->ms_selected_txg = txg; multilist_sublist_insert_head(mls, msp); } multilist_sublist_unlock(mls); if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], offset, size); msp->ms_allocating_total += size; } mutex_exit(&msp->ms_lock); return (0); } typedef struct metaslab_claim_cb_arg_t { uint64_t mcca_txg; int mcca_error; } metaslab_claim_cb_arg_t; static void metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner_offset; metaslab_claim_cb_arg_t *mcca_arg = arg; if (mcca_arg->mcca_error == 0) { mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, size, mcca_arg->mcca_txg); } } int metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { if (vd->vdev_ops->vdev_op_remap != NULL) { metaslab_claim_cb_arg_t arg; /* * Only zdb(8) can claim on indirect vdevs. This is used * to detect leaks of mapped space (that are not accounted * for in the obsolete counts, spacemap, or bpobj). */ ASSERT(!spa_writeable(vd->vdev_spa)); arg.mcca_error = 0; arg.mcca_txg = txg; vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_claim_impl_cb, &arg); if (arg.mcca_error == 0) { arg.mcca_error = metaslab_claim_concrete(vd, offset, size, txg); } return (arg.mcca_error); } else { return (metaslab_claim_concrete(vd, offset, size, txg)); } } /* * Intent log support: upon opening the pool after a crash, notify the SPA * of blocks that the intent log has allocated for immediate write, but * which are still considered free by the SPA because the last transaction * group didn't commit yet. */ static int metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { return (SET_ERROR(ENXIO)); } ASSERT(DVA_IS_VALID(dva)); if (DVA_GET_GANG(dva)) size = vdev_gang_header_asize(vd); return (metaslab_claim_impl(vd, offset, size, txg)); } int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, int allocator, const void *tag) +{ + return (metaslab_alloc_range(spa, mc, psize, psize, bp, ndvas, txg, + hintbp, flags, zal, allocator, tag, NULL)); +} + +int +metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + uint64_t max_psize, blkptr_t *bp, int ndvas, uint64_t txg, + blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, int allocator, + const void *tag, uint64_t *actual_psize) { dva_t *dva = bp->blk_dva; dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; int error = 0; ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); if (mc->mc_allocator[allocator].mca_rotor == NULL) { /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); return (SET_ERROR(ENOSPC)); } ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, !=, NULL); + uint64_t cur_psize = 0; + for (int d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags, zal, allocator); + error = metaslab_alloc_dva_range(spa, mc, psize, max_psize, + dva, d, hintdva, txg, flags, zal, allocator, + actual_psize ? &cur_psize : NULL); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), allocator, flags, psize, tag); memset(&dva[d], 0, sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); } else { /* * Update the metaslab group's queue depth * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), allocator, flags, psize, tag); + if (actual_psize) + max_psize = MIN(cur_psize, max_psize); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); + if (actual_psize) + *actual_psize = max_psize; spa_config_exit(spa, SCL_ALLOC, FTAG); BP_SET_BIRTH(bp, txg, 0); return (0); } void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that * the blocks that we free that are part of the checkpoint won't be * reused until the checkpoint is discarded or we revert to it. * * The checkpoint flag is passed down the metaslab_free code path * and is set whenever we want to add a block to the checkpoint's * accounting. That is, we "checkpoint" blocks that existed at the * time the checkpoint was created and are therefore referenced by * the checkpointed uberblock. * * Note that, we don't checkpoint any blocks if the current * syncing txg <= spa_checkpoint_txg. We want these frees to sync * normally as they will be referenced by the checkpointed uberblock. */ boolean_t checkpoint = B_FALSE; if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint * there is no way it was created in the current txg. */ ASSERT(!now); ASSERT3U(spa_syncing_txg(spa), ==, txg); checkpoint = B_TRUE; } spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { if (now) { metaslab_unalloc_dva(spa, &dva[d], txg); } else { ASSERT3U(txg, ==, spa_syncing_txg(spa)); metaslab_free_dva(spa, &dva[d], checkpoint); } } spa_config_exit(spa, SCL_FREE, FTAG); } int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); int error = 0; ASSERT(!BP_IS_HOLE(bp)); if (txg != 0) { /* * First do a dry run to make sure all DVAs are claimable, * so we don't have to unwind from partial failures below. */ if ((error = metaslab_claim(spa, bp, 0)) != 0) return (error); } spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { error = metaslab_claim_dva(spa, &dva[d], txg); if (error != 0) break; } spa_config_exit(spa, SCL_ALLOC, FTAG); ASSERT(error == 0 || txg == 0); return (error); } static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner, (void) arg; if (vd->vdev_ops == &vdev_indirect_ops) return; metaslab_check_free_impl(vd, offset, size); } static void metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) { metaslab_t *msp; spa_t *spa __maybe_unused = vd->vdev_spa; if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; if (vd->vdev_ops->vdev_op_remap != NULL) { vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_check_free_impl_cb, NULL); return; } ASSERT(vdev_is_concrete(vd)); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if (msp->ms_loaded) { zfs_range_tree_verify_not_present(msp->ms_allocatable, offset, size); } /* * Check all segments that currently exist in the freeing pipeline. * * It would intuitively make sense to also check the current allocating * tree since metaslab_unalloc_dva() exists for extents that are * allocated and freed in the same sync pass within the same txg. * Unfortunately there are places (e.g. the ZIL) where we allocate a * segment but then we free part of it within the same txg * [see zil_sync()]. Thus, we don't call zfs_range_tree_verify() in the * current allocating tree. */ zfs_range_tree_verify_not_present(msp->ms_freeing, offset, size); zfs_range_tree_verify_not_present(msp->ms_checkpointing, offset, size); zfs_range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) zfs_range_tree_verify_not_present(msp->ms_defer[j], offset, size); zfs_range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } void metaslab_check_free(spa_t *spa, const blkptr_t *bp) { if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); if (DVA_GET_GANG(&bp->blk_dva[i])) size = vdev_gang_header_asize(vd); ASSERT3P(vd, !=, NULL); metaslab_check_free_impl(vd, offset, size); } spa_config_exit(spa, SCL_VDEV, FTAG); } static void metaslab_group_disable_wait(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); while (mg->mg_disabled_updating) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } } static void metaslab_group_disabled_increment(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); ASSERT(mg->mg_disabled_updating); while (mg->mg_ms_disabled >= max_disabled_ms) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } mg->mg_ms_disabled++; ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); } /* * Mark the metaslab as disabled to prevent any allocations on this metaslab. * We must also track how many metaslabs are currently disabled within a * metaslab group and limit them to prevent allocation failures from * occurring because all metaslabs are disabled. */ void metaslab_disable(metaslab_t *msp) { ASSERT(!MUTEX_HELD(&msp->ms_lock)); metaslab_group_t *mg = msp->ms_group; mutex_enter(&mg->mg_ms_disabled_lock); /* * To keep an accurate count of how many threads have disabled * a specific metaslab group, we only allow one thread to mark * the metaslab group at a time. This ensures that the value of * ms_disabled will be accurate when we decide to mark a metaslab * group as disabled. To do this we force all other threads * to wait till the metaslab's mg_disabled_updating flag is no * longer set. */ metaslab_group_disable_wait(mg); mg->mg_disabled_updating = B_TRUE; if (msp->ms_disabled == 0) { metaslab_group_disabled_increment(mg); } mutex_enter(&msp->ms_lock); msp->ms_disabled++; mutex_exit(&msp->ms_lock); mg->mg_disabled_updating = B_FALSE; cv_broadcast(&mg->mg_ms_disabled_cv); mutex_exit(&mg->mg_ms_disabled_lock); } void metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; /* * Wait for the outstanding IO to be synced to prevent newly * allocated blocks from being overwritten. This used by * initialize and TRIM which are modifying unallocated space. */ if (sync) txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&mg->mg_ms_disabled_lock); mutex_enter(&msp->ms_lock); if (--msp->ms_disabled == 0) { mg->mg_ms_disabled--; cv_broadcast(&mg->mg_ms_disabled_cv); if (unload) metaslab_unload(msp); } mutex_exit(&msp->ms_lock); mutex_exit(&mg->mg_ms_disabled_lock); } void metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) { ms->ms_unflushed_dirty = dirty; } static void metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) { vdev_t *vd = ms->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); metaslab_unflushed_phys_t entry = { .msp_unflushed_txg = metaslab_unflushed_txg(ms), }; uint64_t entry_size = sizeof (entry); uint64_t entry_offset = ms->ms_id * entry_size; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) { object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object, tx)); } else { VERIFY0(err); } dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, &entry, tx); } void metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) { ms->ms_unflushed_txg = txg; metaslab_update_ondisk_flush_data(ms, tx); } boolean_t metaslab_unflushed_dirty(metaslab_t *ms) { return (ms->ms_unflushed_dirty); } uint64_t metaslab_unflushed_txg(metaslab_t *ms) { return (ms->ms_unflushed_txg); } ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW, "Allocation granularity (a.k.a. stripe size)"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW, "Load all metaslabs when pool is first opened"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, "Prevent metaslabs from being unloaded"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, "Preload potential metaslabs during reassessment"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW, "Max number of metaslabs per group to preload"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW, "Delay in txgs after metaslab was last used before unloading"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW, "Delay in milliseconds after metaslab was last used before unloading"); ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be free to make it " "eligible for allocation"); ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be considered eligible " "for allocations unless all metaslab groups within the metaslab class " "have also crossed this threshold"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW, "Use the fragmentation metric to prefer less fragmented metaslabs"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT, ZMOD_RW, "Fragmentation for metaslab to allow allocation"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW, "Prefer metaslabs with lower LBAs"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW, "Enable space-based metaslab group biasing"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, perf_bias, INT, ZMOD_RW, "Enable performance-based metaslab group biasing"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, ZMOD_RW, "Enable segment-based metaslab selection"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, "Blocks larger than this size are sometimes forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW, "Percentage of large blocks that will be forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, "Max distance (bytes) to search forward before using size tree"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, "When looking in size tree, use largest segment instead of exact fit"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64, ZMOD_RW, "How long to trust the cached max chunk size of a metaslab"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW, "Percentage of memory that can be used to store metaslab range trees"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, ZMOD_RW, "Try hard to allocate before ganging"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW, "Normally only consider this many of the best metaslabs in each vdev"); ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator, param_set_active_allocator, param_get_charp, ZMOD_RW, "SPA active allocator"); diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index 9cd0cfbcf8c2..aa2902d0b84e 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -1,358 +1,359 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2021 by Delphix. All rights reserved. */ #include #include #ifdef ZFS_DEBUG /* * Reference count tracking is disabled by default. It's memory requirements * are reasonable, however as implemented it consumes a significant amount of * cpu time. Until its performance is improved it should be manually enabled. */ int reference_tracking_enable = B_FALSE; static uint_t reference_history = 3; /* tunable */ static kmem_cache_t *reference_cache; void zfs_refcount_init(void) { reference_cache = kmem_cache_create("reference_cache", sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zfs_refcount_fini(void) { kmem_cache_destroy(reference_cache); } static int zfs_refcount_compare(const void *x1, const void *x2) { const reference_t *r1 = (const reference_t *)x1; const reference_t *r2 = (const reference_t *)x2; int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder); int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number); int cmp = cmp1 ? cmp1 : cmp2; return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2)); } void zfs_refcount_create(zfs_refcount_t *rc) { mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t), offsetof(reference_t, ref_link.a)); list_create(&rc->rc_removed, sizeof (reference_t), offsetof(reference_t, ref_link.l)); rc->rc_count = 0; rc->rc_removed_count = 0; rc->rc_tracked = reference_tracking_enable; } void zfs_refcount_create_tracked(zfs_refcount_t *rc) { zfs_refcount_create(rc); rc->rc_tracked = B_TRUE; } void zfs_refcount_create_untracked(zfs_refcount_t *rc) { zfs_refcount_create(rc); rc->rc_tracked = B_FALSE; } void zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) { reference_t *ref; void *cookie = NULL; ASSERT3U(rc->rc_count, ==, number); while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL) kmem_cache_free(reference_cache, ref); avl_destroy(&rc->rc_tree); while ((ref = list_remove_head(&rc->rc_removed))) kmem_cache_free(reference_cache, ref); list_destroy(&rc->rc_removed); mutex_destroy(&rc->rc_mtx); } void zfs_refcount_destroy(zfs_refcount_t *rc) { zfs_refcount_destroy_many(rc, 0); } int zfs_refcount_is_zero(zfs_refcount_t *rc) { return (zfs_refcount_count(rc) == 0); } int64_t zfs_refcount_count(zfs_refcount_t *rc) { return (atomic_load_64(&rc->rc_count)); } int64_t zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { reference_t *ref; int64_t count; if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, number); ASSERT3U(count, >=, number); return (count); } ref = kmem_cache_alloc(reference_cache, KM_SLEEP); ref->ref_holder = holder; ref->ref_number = number; ref->ref_search = B_FALSE; mutex_enter(&rc->rc_mtx); avl_add(&rc->rc_tree, ref); rc->rc_count += number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); return (count); } int64_t zfs_refcount_add(zfs_refcount_t *rc, const void *holder) { return (zfs_refcount_add_many(rc, 1, holder)); } void zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder) { if (likely(!rc->rc_tracked)) (void) zfs_refcount_add_many(rc, number, holder); else for (; number > 0; number--) (void) zfs_refcount_add(rc, holder); } int64_t zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { reference_t *ref, s; int64_t count; if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, -number); ASSERT3S(count, >=, 0); return (count); } s.ref_holder = holder; s.ref_number = number; s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); ASSERT3U(rc->rc_count, >=, number); ref = avl_find(&rc->rc_tree, &s, NULL); if (unlikely(ref == NULL)) { - panic("No such hold %p on refcount %llx", holder, + PANIC("No such hold %llx on refcount %llx", + (u_longlong_t)(uintptr_t)holder, (u_longlong_t)(uintptr_t)rc); return (-1); } avl_remove(&rc->rc_tree, ref); if (reference_history > 0) { list_insert_head(&rc->rc_removed, ref); if (rc->rc_removed_count >= reference_history) { ref = list_remove_tail(&rc->rc_removed); kmem_cache_free(reference_cache, ref); } else { rc->rc_removed_count++; } } else { kmem_cache_free(reference_cache, ref); } rc->rc_count -= number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); return (count); } int64_t zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) { return (zfs_refcount_remove_many(rc, 1, holder)); } void zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) { if (likely(!rc->rc_tracked)) (void) zfs_refcount_remove_many(rc, number, holder); else for (; number > 0; number--) (void) zfs_refcount_remove(rc, holder); } void zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) { avl_tree_t tree; list_t removed; reference_t *ref; void *cookie = NULL; uint64_t count; uint_t removed_count; avl_create(&tree, zfs_refcount_compare, sizeof (reference_t), offsetof(reference_t, ref_link.a)); list_create(&removed, sizeof (reference_t), offsetof(reference_t, ref_link.l)); mutex_enter(&src->rc_mtx); count = src->rc_count; removed_count = src->rc_removed_count; src->rc_count = 0; src->rc_removed_count = 0; avl_swap(&tree, &src->rc_tree); list_move_tail(&removed, &src->rc_removed); mutex_exit(&src->rc_mtx); mutex_enter(&dst->rc_mtx); dst->rc_count += count; dst->rc_removed_count += removed_count; if (avl_is_empty(&dst->rc_tree)) avl_swap(&dst->rc_tree, &tree); else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL) avl_add(&dst->rc_tree, ref); list_move_tail(&dst->rc_removed, &removed); mutex_exit(&dst->rc_mtx); avl_destroy(&tree); list_destroy(&removed); } void zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, const void *current_holder, const void *new_holder) { reference_t *ref, s; if (likely(!rc->rc_tracked)) return; s.ref_holder = current_holder; s.ref_number = number; s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); ref = avl_find(&rc->rc_tree, &s, NULL); ASSERT(ref); ref->ref_holder = new_holder; avl_update(&rc->rc_tree, ref); mutex_exit(&rc->rc_mtx); } void zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder, const void *new_holder) { return (zfs_refcount_transfer_ownership_many(rc, 1, current_holder, new_holder)); } /* * If tracking is enabled, return true if a reference exists that matches * the "holder" tag. If tracking is disabled, then return true if a reference * might be held. */ boolean_t zfs_refcount_held(zfs_refcount_t *rc, const void *holder) { reference_t *ref, s; avl_index_t idx; boolean_t res; if (likely(!rc->rc_tracked)) return (zfs_refcount_count(rc) > 0); s.ref_holder = holder; s.ref_number = 0; s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); ref = avl_find(&rc->rc_tree, &s, &idx); if (likely(ref == NULL)) ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); res = ref && ref->ref_holder == holder; mutex_exit(&rc->rc_mtx); return (res); } /* * If tracking is enabled, return true if a reference does not exist that * matches the "holder" tag. If tracking is disabled, always return true * since the reference might not be held. */ boolean_t zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) { reference_t *ref, s; avl_index_t idx; boolean_t res; if (likely(!rc->rc_tracked)) return (B_TRUE); mutex_enter(&rc->rc_mtx); s.ref_holder = holder; s.ref_number = 0; s.ref_search = B_TRUE; ref = avl_find(&rc->rc_tree, &s, &idx); if (likely(ref == NULL)) ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); res = ref == NULL || ref->ref_holder != holder; mutex_exit(&rc->rc_mtx); return (res); } EXPORT_SYMBOL(zfs_refcount_create); EXPORT_SYMBOL(zfs_refcount_destroy); EXPORT_SYMBOL(zfs_refcount_is_zero); EXPORT_SYMBOL(zfs_refcount_count); EXPORT_SYMBOL(zfs_refcount_add); EXPORT_SYMBOL(zfs_refcount_remove); EXPORT_SYMBOL(zfs_refcount_held); ZFS_MODULE_PARAM(zfs, , reference_tracking_enable, INT, ZMOD_RW, "Track reference holders to refcount_t objects"); ZFS_MODULE_PARAM(zfs, , reference_history, UINT, ZMOD_RW, "Maximum reference holders being tracked"); #endif /* ZFS_DEBUG */ diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 9ac9a9fe608c..4fab60336078 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1,6609 +1,6627 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright (c) 2021, Klara Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are * part of the spa_embedded_log_class. The metaslab with the most free space * in each vdev is selected for this purpose when the pool is opened (or a * vdev is added). See vdev_metaslab_init(). * * Log blocks can be allocated from the following locations. Each one is tried * in order until the allocation succeeds: * 1. dedicated log vdevs, aka "slog" (spa_log_class) * 2. embedded slog metaslabs (spa_embedded_log_class) * 3. other metaslabs in normal vdevs (spa_normal_class) * * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer * than this number of metaslabs in the vdev. This ensures that we don't set * aside an unreasonable amount of space for the ZIL. If set to less than * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %llu: %s", indent, "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Virtual device management. */ static vdev_ops_t *const vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, *const *opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { if (mc == spa_embedded_log_class(vd->vdev_spa) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else return (vd->vdev_mg); } void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { (void) vd, (void) remain_rs; physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } +uint64_t +vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg) +{ + ASSERT0(asize % (1ULL << vd->vdev_top->vdev_ashift)); + uint64_t csize, psize = asize; + for (int c = 0; c < vd->vdev_children; c++) { + csize = vdev_asize_to_psize_txg(vd->vdev_child[c], asize, txg); + psize = MIN(psize, csize); + } + + return (psize); +} + /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } return (asize); } uint64_t vdev_default_min_asize(vdev_t *vd) { return (vd->vdev_min_asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, uint64_t)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } /* * Get the minimal allocation size for the top-level vdev. */ uint64_t vdev_get_min_alloc(vdev_t *vd) { uint64_t min_alloc = 1ULL << vd->vdev_ashift; if (vd->vdev_ops->vdev_op_min_alloc != NULL) min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); return (min_alloc); } /* * Get the parity level for a top-level vdev. */ uint64_t vdev_get_nparity(vdev_t *vd) { uint64_t nparity = 0; if (vd->vdev_ops->vdev_op_nparity != NULL) nparity = vd->vdev_ops->vdev_op_nparity(vd); return (nparity); } static int vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t objid; int err; if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (EINVAL); } err = zap_lookup(mos, objid, vdev_prop_to_name(prop), sizeof (uint64_t), 1, value); if (err == ENOENT) *value = vdev_prop_default_numeric(prop); return (err); } /* * Get the number of data disks for a top-level vdev. */ uint64_t vdev_get_ndisks(vdev_t *vd) { uint64_t ndisks = 1; if (vd->vdev_ops->vdev_op_ndisks != NULL) ndisks = vd->vdev_ops->vdev_op_ndisks(vd); return (ndisks); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { memcpy(newchild, pvd->vdev_child, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_load_guid(); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_dio_verify_rl, &zfs_dio_write_verify_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); /* * Default Thresholds for tuning ZED */ vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N); vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; const char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; const char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { const char *bias; /* * If creating a top-level vdev, check for allocation * classes input. */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } /* spa_vdev_add() expects feature to be enabled */ if (ops == &vdev_draid_ops && spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } } /* * Initialize the vdev specific data. This is done before calling * vdev_alloc_common() since it may fail and this simplifies the * error reporting and cleanup code paths. */ void *tsd = NULL; if (ops->vdev_op_init != NULL) { rc = ops->vdev_op_init(spa, nv, &tsd); if (rc != 0) { return (rc); } } vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_tsd = tsd; vd->vdev_islog = islog; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0) vd->vdev_path = spa_strdup(tmp); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f). */ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0) vd->vdev_devid = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0) vd->vdev_physpath = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &tmp) == 0) vd->vdev_enc_sysfs_path = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0) vd->vdev_fru = spa_strdup(tmp); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; vic = &vd->vdev_indirect_config; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. Ignore pool ashift for vdev * attach case. */ if (alloctype != VDEV_ALLOC_ATTACH) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); } else { vd->vdev_attaching = B_TRUE; } /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); if (vd->vdev_ops == &vdev_root_ops && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, &vd->vdev_root_zap); } /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, &vd->vdev_noalloc); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); vd->vdev_rz_expanding = nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, &vd->vdev_rebuild_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { const char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); if (vd->vdev_ops->vdev_op_fini != NULL) vd->vdev_ops->vdev_op_fini(vd); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. */ vdev_queue_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { zfs_range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); zfs_range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } zfs_range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_autotrim_kick_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); cv_destroy(&vd->vdev_rebuild_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); zfs_ratelimit_fini(&vd->vdev_dio_verify_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); if (tvd->vdev_log_mg) ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; if (tvd->vdev_log_mg != NULL) tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); tvd->vdev_noalloc = svd->vdev_noalloc; tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; zfs_range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_noalloc = 0; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. There is no need to * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } /* * Choose GCD for spa_gcd_alloc. */ static uint64_t vdev_gcd(uint64_t a, uint64_t b) { while (b != 0) { uint64_t t = b; b = a % b; a = t; } return (a); } /* * Set spa_min_alloc and spa_gcd_alloc. */ static void vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; if (spa->spa_gcd_alloc == INT_MAX) { spa->spa_gcd_alloc = min_alloc; } else { spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } } void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd); if (!vd->vdev_islog) { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd); } /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundary * setting had to wait until now. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } } } void vdev_update_nonallocating_space(vdev_t *vd, boolean_t add) { spa_t *spa = vd->vdev_spa; if (vd->vdev_mg->mg_class != spa_normal_class(spa)) return; uint64_t raw_space = metaslab_group_get_space(vd->vdev_mg); uint64_t dspace = spa_deflate(spa) ? vdev_deflated_space(vd, raw_space) : raw_space; if (add) { spa->spa_nonallocating_dspace += dspace; } else { ASSERT3U(spa->spa_nonallocating_dspace, >=, dspace); spa->spa_nonallocating_dspace -= dspace; } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab * group. */ if (vd->vdev_mg->mg_class == spa_normal_class(spa) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; uint64_t smallest = UINT64_MAX; /* * Note, we only search the new metaslabs, because the old * (pre-existing) ones may be active (e.g. have non-empty * range_tree's), and we don't move them to the new * metaslab_t. */ for (uint64_t m = oldc; m < newc; m++) { uint64_t alloc = space_map_allocated(vd->vdev_ms[m]->ms_sm); if (alloc < smallest) { slog_msid = m; smallest = alloc; } } metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; /* * The metaslab was marked as dirty at the end of * metaslab_init(). Remove it from the dirty list so that we * can uninitialize and reinitialize it to the new class. */ if (txg != 0) { (void) txg_list_remove_this(&vd->vdev_ms_list, slog_ms, txg); } uint64_t sm_obj = space_map_object(slog_ms->ms_sm); metaslab_fini(slog_ms); VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, &vd->vdev_ms[slog_msid])); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is marked as non-allocating then don't * activate the metaslabs since we want to ensure that * no allocations are performed on this device. */ if (vd->vdev_noalloc) { /* track non-allocating vdev space */ vdev_update_nonallocating_space(vd, B_TRUE); } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); if (vd->vdev_log_mg != NULL) { ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); if (vd->vdev_log_mg != NULL) ASSERT0(vd->vdev_log_mg->mg_histogram[i]); } } ASSERT0(vd->vdev_ms_count); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); /* * If this probe was initiated from zio pipeline, then * change the state in a spa_async_request. Probes that * were initiated from a vdev_open can change the state * as part of the open call. */ if (vps->vps_zio_done_probe) { vd->vdev_fault_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); } } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_load_child(void *arg) { vdev_t *vd = arg; vd->vdev_load_error = vdev_load(vd); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Returns B_TRUE if the passed child should be opened. */ static boolean_t vdev_default_open_children_func(vdev_t *vd) { (void) vd; return (B_TRUE); } /* * Open the requested child vdevs. If any of the leaf vdevs are using * a ZFS volume then do the opens in a single thread. This avoids a * deadlock when the current thread is holding the spa_namespace_lock. */ static void vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { int children = vd->vdev_children; taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE) continue; if (tq == NULL || vdev_uses_zvols(vd)) { cvd->vdev_open_error = vdev_open(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_open_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) taskq_wait(tq); for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; vd->vdev_nonrot &= cvd->vdev_nonrot; } if (tq != NULL) taskq_destroy(tq); } /* * Open all child vdevs. */ void vdev_open_children(vdev_t *vd) { vdev_open_children_impl(vd, vdev_default_open_children_func); } /* * Conditionally open a subset of child vdevs. */ void vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) { vdev_open_children_impl(vd, open_func); } /* * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE * changed, this algorithm can not change, otherwise it would inconsistently * account for existing bp's. We also hard-code txg 0 for the same reason * since expanded RAIDZ vdevs can use a different asize for different birth * txg's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> SPA_MINBLOCKSHIFT); } } /* * Choose the best of two ashifts, preferring one between logical ashift * (absolute minimum) and administrator defined maximum, otherwise take * the biggest of the two. */ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) { if (a > logical && a <= zfs_vdev_max_auto_ashift) { if (b <= logical || b > zfs_vdev_max_auto_ashift) return (a); else return (MAX(a, b)); } else if (b <= logical || b > zfs_vdev_max_auto_ashift) return (MAX(a, b)); return (b); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ static void vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); if (vd->vdev_ashift < vd->vdev_physical_ashift && vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * If the logical and physical ashifts are the same, then * we ensure that the top-level vdev's ashift is not smaller * than our minimum ashift value. For the unusual case * where logical ashift > physical ashift, we can't cap * the calculated ashift based on max ashift as that * would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, vd->vdev_ashift); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_fault_wanted = B_FALSE; vd->vdev_remove_wanted = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* Keep the device in removed state if unplugged */ if (error == ENOENT && vd->vdev_removed) { vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); return (error); } /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy * or damaged: either way it's not safe for use, bail out of the open. */ if (osize > max_osize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_OPEN_FAILED); return (SET_ERROR(ENXIO)); } /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } /* * We can always set the logical/physical ashift members since * their values are only used to calculate the vdev_ashift when * the device is first added to the config. These values should * not be used for anything else since they may change whenever * the device is reopened and we don't store them in the label. */ vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; /* * If the vdev_ashift was not overridden at creation time * (0) or the override value is impossible for the device, * then set it the logical ashift and optimize the ashift. */ if (vd->vdev_ashift < vd->vdev_logical_ashift) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (SET_ERROR(EDOM)); } if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Make sure the alignment required hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the minimum allocation size. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } static void vdev_validate_child(void *arg) { vdev_t *vd = arg; vd->vdev_validate_thread = curthread; vd->vdev_validate_error = vdev_validate(vd); vd->vdev_validate_thread = NULL; } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; int children = vd->vdev_children; if (vdev_validate_skip) return (0); if (children > 0) { tq = taskq_create("vdev_validate", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } for (uint64_t c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { vdev_validate_child(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < children; c++) { int error = vd->vdev_child[c]->vdev_validate_error; if (error != 0) return (SET_ERROR(EBADF)); } /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. */ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid) { if (svd != NULL && *dvd != NULL) { if (strcmp(svd, *dvd) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed " "from '%s' to '%s'", (u_longlong_t)guid, prefix, *dvd, svd); spa_strfree(*dvd); *dvd = spa_strdup(svd); } } else if (svd != NULL) { *dvd = spa_strdup(svd); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)guid, *dvd); } } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { char *old, *new; vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path, dvd->vdev_guid); vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid, dvd->vdev_guid); vdev_update_path("vdev_physpath", svd->vdev_physpath, &dvd->vdev_physpath, dvd->vdev_guid); /* * Our enclosure sysfs path may have changed between imports */ old = dvd->vdev_enc_sysfs_path; new = svd->vdev_enc_sysfs_path; if ((old != NULL && new == NULL) || (old == NULL && new != NULL) || ((old != NULL && new != NULL) && strcmp(new, old) != 0)) { zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path " "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, old, new); if (dvd->vdev_enc_sysfs_path) spa_strfree(dvd->vdev_enc_sysfs_path); if (svd->vdev_enc_sysfs_path) { dvd->vdev_enc_sysfs_path = spa_strdup( svd->vdev_enc_sysfs_path); } else { dvd->vdev_enc_sysfs_path = NULL; } } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(vd != NULL); ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. */ if (l2arc_vdev_present(vd)) { l2arc_rebuild_vdev(vd, B_TRUE); } else { l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); } /* * Recheck if resilver is still needed and cancel any * scheduled resilver if resilver is unneeded. */ if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && spa->spa_async_tasks & SPA_ASYNC_RESILVER) { mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; mutex_exit(&spa->spa_async_lock); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : SET_ERROR(ENXIO)); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { zfs_range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!zfs_range_tree_contains(rt, txg, size)) zfs_range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { zfs_range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * This isn't a problem but it can result in devices being tried * which are known to not have the data. In which case, the import * is relying on the checksum to ensure that we get the right data. * Note that while importing we are only reading the MOS, which is * always checksummed. */ mutex_enter(&vd->vdev_dtl_lock); if (!zfs_range_tree_is_empty(rt)) dirty = zfs_range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { zfs_range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = zfs_range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. */ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { (void) dva, (void) psize; /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); } /* * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, phys_birth)); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (zfs_range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (zfs_range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ static void vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess_impl(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done, faulting); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. */ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. We * only want to excise regions on vdevs that were available * during the entire duration of this scan. */ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!zfs_range_tree_is_empty( vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } zfs_range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], zfs_range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) zfs_range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); zfs_range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); /* * For the faulting case, treat members of a replacing vdev * as if they are not available. It's more likely than not that * a vdev in a replacing vdev could encounter read errors so * treat it as not being able to contribute. */ if (!vdev_readable(vd) || (faulting && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) { zfs_range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); } else { zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], zfs_range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); } /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ if (txg != 0 && zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && zfs_range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); } else { mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) { /* leaf vdevs only */ continue; } if (t == DTL_PARTIAL) { /* i.e. non-zero */ minref = 1; } else if (vdev_get_nparity(vd) != 0) { /* RAIDZ, DRAID */ minref = vdev_get_nparity(vd) + 1; } else { /* any kind of mirror */ minref = vd->vdev_children; } space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { raidz_dtl_reassessed(vd); } } void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { return (vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done, rebuild_done, B_FALSE)); } /* * Iterate over all the vdevs except spare, and post kobj events */ void vdev_post_kobj_evt(vdev_t *vd) { if (vd->vdev_ops->vdev_op_kobj_evt_post && vd->vdev_kobj_flag == B_FALSE) { vd->vdev_kobj_flag = B_TRUE; vd->vdev_ops->vdev_op_kobj_evt_post(vd); } for (int c = 0; c < vd->vdev_children; c++) vdev_post_kobj_evt(vd->vdev_child[c]); } /* * Iterate over all the vdevs except spare, and clear kobj events */ void vdev_clear_kobj_evt(vdev_t *vd) { vd->vdev_kobj_flag = B_FALSE; for (int c = 0; c < vd->vdev_children; c++) vdev_clear_kobj_evt(vd->vdev_child[c]); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; zfs_range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); /* * If the dtl cannot be sync'd there is no need to open it. */ if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps) return (0); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); zfs_range_tree_walk(rt, zfs_range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } zfs_range_tree_vacate(rt, NULL, NULL); zfs_range_tree_destroy(rt); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); vd->vdev_root_zap = vdev_create_link_zap(vd, tx); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; zfs_range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; zfs_range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); zfs_range_tree_vacate(rtsync, NULL, NULL); zfs_range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be * - offlined * - detached * - removed * - faulted * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; boolean_t faulting = vd->vdev_state == VDEV_STATE_FAULTED; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int children = vd->vdev_children; int error = 0; taskq_t *tq = NULL; /* * It's only worthwhile to use the taskq for the root vdev, because the * slow part is metaslab_init, and that only happens for top-level * vdevs. */ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { tq = taskq_create("vdev_load", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { cvd->vdev_load_error = vdev_load(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_load_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < vd->vdev_children; c++) { int error = vd->vdev_child[c]->vdev_load_error; if (error != 0) return (error); } vdev_set_deflate_ratio(vd); if (vd->vdev_ops == &vdev_raidz_ops) { error = vdev_raidz_load(vd); if (error != 0) return (error); } /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); return (error); } } if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; uint64_t failfast; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), 1, &failfast); if (error == 0) { vd->vdev_failfast = failfast & 1; } else if (error == ENOENT) { vd->vdev_failfast = vdev_prop_default_numeric( VDEV_PROP_FAILFAST); } else { vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); } } /* * Load any rebuild state from the top-level vdev zap. */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { error = vdev_rebuild_load(vd); if (error && error != ENOTSUP) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " "failed [error=%d]", error); return (error); } } if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) { uint64_t zapobj; if (vd->vdev_top_zap != 0) zapobj = vd->vdev_top_zap; else zapobj = vd->vdev_leaf_zap; error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N, &vd->vdev_checksum_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T, &vd->vdev_checksum_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_N, &vd->vdev_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_T, &vd->vdev_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, &vd->vdev_slow_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, &vd->vdev_slow_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } error = vdev_metaslab_init(vd, 0); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since the checkpoint_sm contains free entries * exclusively we can use space_map_allocated() to * indicate the cumulative checkpointed space that * has been freed. */ vd->vdev_stat.vs_checkpoint_space = -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } static void vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(vd->vdev_spa); if (vd->vdev_top_zap == 0) return; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) return; VERIFY0(err); VERIFY0(dmu_object_free(mos, object, tx)); VERIFY0(zap_remove(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) { metaslab_sync_reassess(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_sync_reassess(vd->vdev_log_mg); } } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (zfs_range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } +uint64_t +vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, uint64_t txg) +{ + return (vd->vdev_ops->vdev_op_asize_to_psize(vd, asize, txg)); +} /* * Return the amount of space that should be (or was) allocated for the given * psize (compressed block size) in the given TXG. Note that for expanded * RAIDZ vdevs, the size allocated for older BP's may be larger. See - * vdev_raidz_asize(). + * vdev_raidz_psize_to_asize(). */ uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) { - return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); + return (vd->vdev_ops->vdev_op_psize_to_asize(vd, psize, txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. * * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_remove_wanted(spa_t *spa, uint64_t guid) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); /* * If the vdev is already removed, or expanding which can trigger * repartition add/remove events, then don't do anything. */ if (vd->vdev_removed || vd->vdev_expanding) return (spa_vdev_state_exit(spa, NULL, 0)); /* * Confirm the vdev has been removed, otherwise don't do anything. */ if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); vd->vdev_remove_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_REMOVE_BY_USER); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. */ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); /* * Asynchronously detach spare vdev if resilver or * rebuild is not required */ if (vd->vdev_unspare && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && !vdev_rebuild_active(tvd)) spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); } return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect or removed vdev. */ if (!vdev_is_concrete(vd) || vd->vdev_removed) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; /* Clear recent error events cache (i.e. duplicate events tracking) */ zfs_ereport_clear(spa, vd); } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { /* * Exclude the dRAID spare when aggregating to avoid double counting * the ops and bytes. These IOs are counted by the physical leaves. */ if (cvd->vdev_ops == &vdev_draid_spare_ops) return; for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { (void) cvd; int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { memcpy(vs, &vd->vdev_stat, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN_TYPED( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift, uint64_t); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; if (vd->vdev_physical_ashift <= ASHIFT_MAX) vs->vs_physical_ashift = vd->vdev_physical_ashift; else vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { /* * The vdev fragmentation rating doesn't take into * account the embedded slog metaslab (vdev_log_mg). * Since it's only one metaslab, it would have a tiny * impact on the overall fragmentation. */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } vs->vs_noalloc = MAX(vd->vdev_noalloc, tvd ? tvd->vdev_noalloc : 0); } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; /* Suppress ASAN false positive */ #ifdef __SANITIZE_ADDRESS__ vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; #else vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; #endif zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). To avoid * double counting repaired bytes the virtual dRAID * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); } vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_FLUSH. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_FLUSH; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to categorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM, * ZIO_PRIORITY_REBUILD. */ if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { (void) defer_delta; int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, save_state); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. * */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_spa->spa_raidz_expand == NULL || vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; VERIFY3U(pvd->vdev_children, >, 1); vdev_remove_child(pvd, vd); vdev_compact_children(pvd); ASSERT3P(pvd->vdev_child, !=, NULL); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, const char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %u active IOs", vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_defer_resilver(vdev_t *vd) { ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* * Clears the resilver deferred flag on all leaf devs under vd. Returns * B_TRUE if we have devices that need to be resilvered and are available to * accept resilver I/Os. */ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } boolean_t vdev_xlate_is_empty(zfs_range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } /* * Translate a logical range to the first contiguous physical range for the * specified vdev_t. This function is initially called with a leaf vdev and * will walk each parent vdev until it reaches a top-level vdev. Once the * top-level is reached the physical range is initialized and the recursive * function begins to unwind. As it unwinds it calls the parent's vdev * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs); } else { /* * We've reached the top-level vdev, initialize the physical * range to the logical range and set an empty remaining * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; remain_rs->rs_start = logical_rs->rs_start; remain_rs->rs_end = logical_rs->rs_start; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical and any remaining components by calling * the vdev specific translate function. */ zfs_range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } void vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { zfs_range_seg64_t iter_rs = *logical_rs; zfs_range_seg64_t physical_rs; zfs_range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); /* * With raidz and dRAID, it's possible that the logical range * does not live on this leaf vdev. Only when there is a non- * zero physical size call the provided function. */ if (!vdev_xlate_is_empty(&physical_rs)) func(arg, &physical_rs); iter_rs = remain_rs; } } static char * vdev_name(vdev_t *vd, char *buf, int buflen) { if (vd->vdev_path == NULL) { if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { strlcpy(buf, vd->vdev_spa->spa_name, buflen); } else if (!vd->vdev_ops->vdev_op_leaf) { snprintf(buf, buflen, "%s-%llu", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id); } } else { strlcpy(buf, vd->vdev_path, buflen); } return (buf); } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. */ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. */ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } /* * Add a (source=src, propname=propval) list to an nvlist. */ static void vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval, uint64_t intval, zprop_source_t src) { nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } static void vdev_props_set_sync(void *arg, dmu_tx_t *tx) { vdev_t *vd; nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; uint64_t objid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS); vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); /* this vdev could get removed while waiting for this sync task */ if (vd == NULL) return; /* * Set vdev property values in the vdev props mos object. */ if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { panic("unexpected vdev type"); } mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { uint64_t intval; const char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; switch (prop = vdev_name_to_prop(propname)) { case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { strval = fnvpair_value_string(elem); if (strlen(strval) == 0) { /* remove the property if value == "" */ (void) zap_remove(mos, objid, propname, tx); } else { VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); } spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } break; default: /* normalize the property name */ propname = vdev_prop_to_name(prop); proptype = vdev_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(vdev_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, objid, propname, sizeof (uint64_t), 1, &intval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%lld", (u_longlong_t)vdev_guid, nvpair_name(elem), (longlong_t)intval); } else { panic("invalid vdev property type %u", nvpair_type(elem)); } } } mutex_exit(&spa->spa_props_lock); } int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; nvpair_t *elem = NULL; uint64_t vdev_guid; nvlist_t *nvprops; int error = 0; ASSERT(vd != NULL); /* Check that vdev has a zap we can use */ if (vd->vdev_root_zap == 0 && vd->vdev_top_zap == 0 && vd->vdev_leaf_zap == 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, &nvprops) != 0) return (SET_ERROR(EINVAL)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) return (SET_ERROR(EINVAL)); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { const char *propname = nvpair_name(elem); vdev_prop_t prop = vdev_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) { error = EINVAL; goto end; } if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) { error = EROFS; goto end; } /* Special Processing */ switch (prop) { case VDEV_PROP_PATH: if (vd->vdev_path == NULL) { error = EROFS; break; } if (nvpair_value_string(elem, &strval) != 0) { error = EINVAL; break; } /* New path must start with /dev/ */ if (strncmp(strval, "/dev/", 5)) { error = EINVAL; break; } error = spa_vdev_setpath(spa, vdev_guid, strval); break; case VDEV_PROP_ALLOCATING: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } if (intval != vd->vdev_noalloc) break; if (intval == 0) error = spa_vdev_noalloc(spa, vdev_guid); else error = spa_vdev_alloc(spa, vdev_guid); break; case VDEV_PROP_FAILFAST: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_failfast = intval & 1; break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_n = intval; break; case VDEV_PROP_CHECKSUM_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_t = intval; break; case VDEV_PROP_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_n = intval; break; case VDEV_PROP_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_t = intval; break; case VDEV_PROP_SLOW_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_n = intval; break; case VDEV_PROP_SLOW_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_t = intval; break; default: /* Most processing is done in vdev_props_set_sync */ break; } end: if (error != 0) { intval = error; vdev_prop_add_list(outnvl, propname, strval, intval, 0); return (error); } } return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync, innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } int vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int err = 0; uint64_t objid; uint64_t vdev_guid; nvpair_t *elem = NULL; nvlist_t *nvprops = NULL; uint64_t intval = 0; char *strval = NULL; const char *propname = NULL; vdev_prop_t prop; ASSERT(vd != NULL); ASSERT(mos != NULL); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (SET_ERROR(EINVAL)); } ASSERT(objid != 0); mutex_enter(&spa->spa_props_lock); if (nvprops != NULL) { char namebuf[64] = { 0 }; while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { intval = 0; strval = NULL; propname = nvpair_name(elem); prop = vdev_name_to_prop(propname); zprop_source_t src = ZPROP_SRC_DEFAULT; uint64_t integer_size, num_integers; switch (prop) { /* Special Read-only Properties */ case VDEV_PROP_NAME: strval = vdev_name(vd, namebuf, sizeof (namebuf)); if (strval == NULL) continue; vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_CAPACITY: /* percent used */ intval = (vd->vdev_stat.vs_dspace == 0) ? 0 : (vd->vdev_stat.vs_alloc * 100 / vd->vdev_stat.vs_dspace); vdev_prop_add_list(outnvl, propname, NULL, intval, ZPROP_SRC_NONE); continue; case VDEV_PROP_STATE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_state, ZPROP_SRC_NONE); continue; case VDEV_PROP_GUID: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_guid, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_asize, ZPROP_SRC_NONE); continue; case VDEV_PROP_PSIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_psize, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASHIFT: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_ashift, ZPROP_SRC_NONE); continue; case VDEV_PROP_SIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); continue; case VDEV_PROP_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace - vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_ALLOCATED: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_EXPANDSZ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRAGMENTATION: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_fragmentation, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARITY: vdev_prop_add_list(outnvl, propname, NULL, vdev_get_nparity(vd), ZPROP_SRC_NONE); continue; case VDEV_PROP_PATH: if (vd->vdev_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_DEVID: if (vd->vdev_devid == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_devid, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PHYS_PATH: if (vd->vdev_physpath == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_physpath, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_ENC_PATH: if (vd->vdev_enc_sysfs_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRU: if (vd->vdev_fru == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_fru, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARENT: if (vd->vdev_parent != NULL) { strval = vdev_name(vd->vdev_parent, namebuf, sizeof (namebuf)); vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); } continue; case VDEV_PROP_CHILDREN: if (vd->vdev_children > 0) strval = kmem_zalloc(ZAP_MAXVALUELEN, KM_SLEEP); for (uint64_t i = 0; i < vd->vdev_children; i++) { const char *vname; vname = vdev_name(vd->vdev_child[i], namebuf, sizeof (namebuf)); if (vname == NULL) vname = "(unknown)"; if (strlen(strval) > 0) strlcat(strval, ",", ZAP_MAXVALUELEN); strlcat(strval, vname, ZAP_MAXVALUELEN); } if (strval != NULL) { vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); kmem_free(strval, ZAP_MAXVALUELEN); } continue; case VDEV_PROP_NUMCHILDREN: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_children, ZPROP_SRC_NONE); continue; case VDEV_PROP_READ_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_read_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_WRITE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_write_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_CHECKSUM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_checksum_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_INITIALIZE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_initialize_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_TRIM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_trim_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_SLOW_IOS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_slow_ios, ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_REMOVING: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; case VDEV_PROP_RAIDZ_EXPANDING: /* Only expose this for raidz */ if (vd->vdev_ops == &vdev_raidz_ops) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_rz_expanding, ZPROP_SRC_NONE); } continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_has_trim, ZPROP_SRC_NONE); } continue; /* Numeric Properites */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ if (vd->vdev_mg == NULL && vd->vdev_top != NULL) { src = ZPROP_SRC_NONE; intval = ZPROP_BOOLEAN_NA; } else { err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; } vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; case VDEV_PROP_FAILFAST: src = ZPROP_SRC_LOCAL; strval = NULL; err = zap_lookup(mos, objid, nvpair_name(elem), sizeof (uint64_t), 1, &intval); if (err == ENOENT) { intval = vdev_prop_default_numeric( prop); err = 0; } else if (err) { break; } if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; vdev_prop_add_list(outnvl, propname, strval, intval, src); break; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: case VDEV_PROP_SLOW_IO_N: case VDEV_PROP_SLOW_IO_T: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; /* Text Properties */ case VDEV_PROP_COMMENT: /* Exists in the ZAP below */ /* FALLTHRU */ case VDEV_PROP_USERPROP: /* User Properites */ src = ZPROP_SRC_LOCAL; err = zap_length(mos, objid, nvpair_name(elem), &integer_size, &num_integers); if (err) break; switch (integer_size) { case 8: /* User properties cannot be integers */ err = EINVAL; break; case 1: /* string property */ strval = kmem_alloc(num_integers, KM_SLEEP); err = zap_lookup(mos, objid, nvpair_name(elem), 1, num_integers, strval); if (err) { kmem_free(strval, num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, num_integers); break; } break; default: err = ENOENT; break; } if (err) break; } } else { /* * Get all properties from the MOS vdev property object. */ zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, mos, objid); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { intval = 0; strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; propname = za->za_name; switch (za->za_integer_length) { case 8: /* We do not allow integer user properties */ /* This is likely an internal value */ break; case 1: /* string property */ strval = kmem_alloc(za->za_num_integers, KM_SLEEP); err = zap_lookup(mos, objid, za->za_name, 1, za->za_num_integers, strval); if (err) { kmem_free(strval, za->za_num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, za->za_num_integers); break; default: break; } } zap_cursor_fini(&zc); zap_attribute_free(za); } mutex_exit(&spa->spa_props_lock); if (err && err != ENOENT) { return (err); } return (0); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, "Default lower limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, "Default upper limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, "Rate limit hung IO (deadman) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, "Rate Direct I/O write verify events to this many per second"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, "Direct I/O writes will perform for checksum verification before " "commiting write"); ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, raidz_impl, param_set_raidz_impl, param_get_raidz_impl, ZMOD_RW, "RAIDZ implementation"); diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index d39a05458fe7..e0fafd0da2d9 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1,2822 +1,2825 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_draid_io_verify() */ #endif /* * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is * comprised of multiple raidz redundancy groups which are spread over the * dRAID children. To ensure an even distribution, and avoid hot spots, a * permutation mapping is applied to the order of the dRAID children. * This mixing effectively distributes the parity columns evenly over all * of the disks in the dRAID. * * This is beneficial because it means when resilvering all of the disks * can participate thereby increasing the available IOPs and bandwidth. * Furthermore, by reserving a small fraction of each child's total capacity * virtual distributed spare disks can be created. These spares similarly * benefit from the performance gains of spanning all of the children. The * consequence of which is that resilvering to a distributed spare can * substantially reduce the time required to restore full parity to pool * with a failed disks. * * === dRAID group layout === * * First, let's define a "row" in the configuration to be a 16M chunk from * each physical drive at the same offset. This is the minimum allowable * size since it must be possible to store a full 16M block when there is * only a single data column. Next, we define a "group" to be a set of * sequential disks containing both the parity and data columns. We allow * groups to span multiple rows in order to align any group size to any * number of physical drives. Finally, a "slice" is comprised of the rows * which contain the target number of groups. The permutation mappings * are applied in a round robin fashion to each slice. * * Given D+P drives in a group (including parity drives) and C-S physical * drives (not including the spare drives), we can distribute the groups * across R rows without remainder by selecting the least common multiple * of D+P and C-S as the number of groups; i.e. ngroups = LCM(D+P, C-S). * * In the example below, there are C=14 physical drives in the configuration * with S=2 drives worth of spare capacity. Each group has a width of 9 * which includes D=8 data and P=1 parity drive. There are 4 groups and * 3 rows per slice. Each group has a size of 144M (16M * 9) and a slice * size is 576M (144M * 4). When allocating from a dRAID each group is * filled before moving on to the next as show in slice0 below. * * data disks (8 data + 1 parity) spares (2) * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * ^ | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0 * | +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * | | group 0 | group 1..| | * | +-----------------------------------+-----------+-------| * | | 0 1 2 3 4 5 6 7 8 | 36 37 38| | r * | | 9 10 11 12 13 14 15 16 17| 45 46 47| | o * | | 18 19 20 21 22 23 24 25 26| 54 55 56| | w * | 27 28 29 30 31 32 33 34 35| 63 64 65| | 0 * s +-----------------------+-----------------------+-------+ * l | ..group 1 | group 2.. | | * i +-----------------------+-----------------------+-------+ * c | 39 40 41 42 43 44| 72 73 74 75 76 77| | r * e | 48 49 50 51 52 53| 81 82 83 84 85 86| | o * 0 | 57 58 59 60 61 62| 90 91 92 93 94 95| | w * | 66 67 68 69 70 71| 99 100 101 102 103 104| | 1 * | +-----------+-----------+-----------------------+-------+ * | |..group 2 | group 3 | | * | +-----------+-----------+-----------------------+-------+ * | | 78 79 80|108 109 110 111 112 113 114 115 116| | r * | | 87 88 89|117 118 119 120 121 122 123 124 125| | o * | | 96 97 98|126 127 128 129 130 131 132 133 134| | w * v |105 106 107|135 136 137 138 139 140 141 142 143| | 2 * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1 * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * l | group 4 | group 5..| | row 3 * i +-----------------------+-----------+-----------+-------| * c | ..group 5 | group 6.. | | row 4 * e +-----------+-----------+-----------------------+-------+ * 1 |..group 6 | group 7 | | row 5 * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2 * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * l | group 8 | group 9..| | row 6 * i +-----------------------------------------------+-------| * c | ..group 9 | group 10.. | | row 7 * e +-----------------------+-----------------------+-------+ * 2 |..group 10 | group 11 | | row 8 * +-----------+-----------------------------------+-------+ * * This layout has several advantages over requiring that each row contain * a whole number of groups. * * 1. The group count is not a relevant parameter when defining a dRAID * layout. Only the group width is needed, and *all* groups will have * the desired size. * * 2. All possible group widths (<= physical disk count) can be supported. * * 3. The logic within vdev_draid.c is simplified when the group width is * the same for all groups (although some of the logic around computing * permutation numbers and drive offsets is more complicated). * * N.B. The following array describes all valid dRAID permutation maps. * Each row is used to generate a permutation map for a different number * of children from a unique seed. The seeds were generated and carefully * evaluated by the 'draid' utility in order to provide balanced mappings. * In addition to the seed a checksum of the in-memory mapping is stored * for verification. * * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed, * with a given permutation map) is the ratio of the amounts of I/O that will * be sent to the least and most busy disks when resilvering. The average * imbalance ratio (of a given number of disks and permutation map) is the * average of the ratios of all possible single and double disk failures. * * In order to achieve a low imbalance ratio the number of permutations in * the mapping must be significantly larger than the number of children. * For dRAID the number of permutations has been limited to 512 to minimize * the map size. This does result in a gradually increasing imbalance ratio * as seen in the table below. Increasing the number of permutations for * larger child counts would reduce the imbalance ratio. However, in practice * when there are a large number of children each child is responsible for * fewer total IOs so it's less of a concern. * * Note these values are hard coded and must never be changed. Existing * pools depend on the same mapping always being generated in order to * read and write from the correct locations. Any change would make * existing pools completely inaccessible. */ static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = { { 2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d }, /* 1.000 */ { 3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 }, /* 1.000 */ { 4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 }, /* 1.000 */ { 5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 }, /* 1.010 */ { 6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 }, /* 1.031 */ { 7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee }, /* 1.043 */ { 8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 }, /* 1.059 */ { 9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 }, /* 1.056 */ { 10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 }, /* 1.072 */ { 11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c }, /* 1.083 */ { 12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e }, /* 1.097 */ { 13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 }, /* 1.100 */ { 14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 }, /* 1.121 */ { 15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 }, /* 1.103 */ { 16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 }, /* 1.111 */ { 17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe }, /* 1.133 */ { 18, 256, 0xaa549746b1cbb81c, 0x00000026f02494c9 }, /* 1.131 */ { 19, 256, 0x892e343f2f31d690, 0x00000029eb392835 }, /* 1.130 */ { 20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c }, /* 1.141 */ { 21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 }, /* 1.139 */ { 22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 }, /* 1.150 */ { 23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f }, /* 1.174 */ { 24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 }, /* 1.168 */ { 25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 }, /* 1.180 */ { 26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba }, /* 1.226 */ { 27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 }, /* 1.228 */ { 28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c }, /* 1.217 */ { 29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c }, /* 1.239 */ { 30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 }, /* 1.238 */ { 31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f }, /* 1.273 */ { 32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 }, /* 1.191 */ { 33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 }, /* 1.199 */ { 34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 }, /* 1.195 */ { 35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 }, /* 1.201 */ { 36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef }, /* 1.194 */ { 37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 }, /* 1.237 */ { 38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 }, /* 1.242 */ { 39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd }, /* 1.231 */ { 40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 }, /* 1.233 */ { 41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 }, /* 1.271 */ { 42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 }, /* 1.263 */ { 43, 512, 0xbaa5125faa781854, 0x000001c76789e278 }, /* 1.270 */ { 44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb }, /* 1.281 */ { 45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 }, /* 1.282 */ { 46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b }, /* 1.286 */ { 47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 }, /* 1.329 */ { 48, 512, 0xbecd9c2571312c5d, 0x000002320fe2872b }, /* 1.286 */ { 49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 }, /* 1.322 */ { 50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 }, /* 1.335 */ { 51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 }, /* 1.305 */ { 52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf }, /* 1.330 */ { 53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 }, /* 1.365 */ { 54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 }, /* 1.334 */ { 55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 }, /* 1.364 */ { 56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e }, /* 1.374 */ { 57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 }, /* 1.363 */ { 58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 }, /* 1.401 */ { 59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c }, /* 1.392 */ { 60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 }, /* 1.360 */ { 61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd }, /* 1.396 */ { 62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c }, /* 1.453 */ { 63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 }, /* 1.437 */ { 64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 }, /* 1.402 */ { 65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 }, /* 1.459 */ { 66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 }, /* 1.423 */ { 67, 512, 0x910b9714f698a877, 0x00000451ea65d5db }, /* 1.447 */ { 68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 }, /* 1.450 */ { 69, 512, 0x836d4968fbaa3706, 0x000004954068a380 }, /* 1.455 */ { 70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d }, /* 1.463 */ { 71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 }, /* 1.463 */ { 72, 512, 0x42763a680d5bed8e, 0x000005084275c680 }, /* 1.452 */ { 73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab }, /* 1.498 */ { 74, 512, 0x9fa08548b1621a44, 0x0000054708019247 }, /* 1.526 */ { 75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 }, /* 1.491 */ { 76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 }, /* 1.470 */ { 77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 }, /* 1.527 */ { 78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 }, /* 1.509 */ { 79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e }, /* 1.569 */ { 80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c }, /* 1.555 */ { 81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 }, /* 1.509 */ { 82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 }, /* 1.596 */ { 83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e }, /* 1.568 */ { 84, 512, 0xba02545069ddc6dc, 0x000006d19861364f }, /* 1.541 */ { 85, 512, 0x447c73192c35073e, 0x000006fce315ce35 }, /* 1.623 */ { 86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b }, /* 1.620 */ { 87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 }, /* 1.597 */ { 88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b }, /* 1.575 */ { 89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc }, /* 1.627 */ { 90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb }, /* 1.596 */ { 91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 }, /* 1.622 */ { 92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e }, /* 1.695 */ { 93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c }, /* 1.605 */ { 94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc }, /* 1.625 */ { 95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 }, /* 1.687 */ { 96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a }, /* 1.621 */ { 97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 }, /* 1.699 */ { 98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b }, /* 1.688 */ { 99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce }, /* 1.642 */ { 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc }, /* 1.683 */ { 101, 512, 0xf7d4dd8445b46765, 0x000009e5dc542259 }, /* 1.755 */ { 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 }, /* 1.692 */ { 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 }, /* 1.747 */ { 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 }, /* 1.751 */ { 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 }, /* 1.751 */ { 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f }, /* 1.726 */ { 107, 512, 0xca09c50ccee2d054, 0x00000b1c359c047d }, /* 1.788 */ { 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 }, /* 1.740 */ { 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 }, /* 1.780 */ { 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 }, /* 1.836 */ { 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 }, /* 1.778 */ { 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 }, /* 1.831 */ { 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df }, /* 1.825 */ { 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 }, /* 1.826 */ { 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 }, /* 1.843 */ { 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d }, /* 1.826 */ { 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b }, /* 1.803 */ { 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 }, /* 1.857 */ { 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 }, /* 1.877 */ { 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 }, /* 1.849 */ { 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d }, /* 1.867 */ { 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 }, /* 1.978 */ { 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d }, /* 1.947 */ { 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea }, /* 1.865 */ { 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f }, /* 1.881 */ { 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b }, /* 1.882 */ { 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e }, /* 1.867 */ { 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e }, /* 1.972 */ { 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 }, /* 1.896 */ { 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d }, /* 1.965 */ { 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 }, /* 1.963 */ { 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 }, /* 1.925 */ { 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 }, /* 1.862 */ { 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 }, /* 2.042 */ { 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 }, /* 1.935 */ { 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 }, /* 2.005 */ { 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c }, /* 2.041 */ { 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 }, /* 1.997 */ { 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 }, /* 1.996 */ { 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d }, /* 2.053 */ { 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a }, /* 1.971 */ { 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 }, /* 2.018 */ { 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd }, /* 1.961 */ { 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 }, /* 2.046 */ { 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb }, /* 1.968 */ { 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 }, /* 2.143 */ { 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 }, /* 2.064 */ { 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 }, /* 2.023 */ { 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c }, /* 2.136 */ { 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 }, /* 2.063 */ { 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 }, /* 1.974 */ { 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 }, /* 2.210 */ { 153, 512, 0xf7fd345307c2480e, 0x000016e251f28b6a }, /* 2.006 */ { 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 }, /* 2.193 */ { 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 }, /* 2.163 */ { 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc }, /* 2.046 */ { 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 }, /* 2.084 */ { 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 }, /* 2.264 */ { 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 }, /* 2.074 */ { 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 }, /* 2.282 */ { 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf }, /* 2.148 */ { 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 }, /* 2.355 */ { 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 }, /* 2.164 */ { 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a }, /* 2.393 */ { 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 }, /* 2.178 */ { 166, 512, 0x896967bf495c34d2, 0x00001afb8319b9fc }, /* 2.334 */ { 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b }, /* 2.266 */ { 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 }, /* 2.304 */ { 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d }, /* 2.218 */ { 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff }, /* 2.377 */ { 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 }, /* 2.155 */ { 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 }, /* 2.404 */ { 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 }, /* 2.205 */ { 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d }, /* 2.359 */ { 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 }, /* 2.158 */ { 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b }, /* 2.614 */ { 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc }, /* 2.239 */ { 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc }, /* 2.493 */ { 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c }, /* 2.327 */ { 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 }, /* 2.231 */ { 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c }, /* 2.237 */ { 182, 512, 0xe6035defea48f933, 0x00002038e3346658 }, /* 2.691 */ { 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e }, /* 2.170 */ { 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 }, /* 2.600 */ { 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc }, /* 2.391 */ { 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 }, /* 2.677 */ { 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c }, /* 2.410 */ { 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 }, /* 2.776 */ { 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 }, /* 2.266 */ { 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 }, /* 2.717 */ { 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c }, /* 2.474 */ { 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 }, /* 2.673 */ { 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 }, /* 2.420 */ { 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 }, /* 2.898 */ { 195, 512, 0x08d903b4daedc2e0, 0x0000250d1e15886c }, /* 2.363 */ { 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e }, /* 2.747 */ { 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 }, /* 2.531 */ { 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 }, /* 2.707 */ { 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 }, /* 2.315 */ { 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf }, /* 3.012 */ { 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 }, /* 2.378 */ { 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 }, /* 2.969 */ { 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d }, /* 2.594 */ { 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd }, /* 2.763 */ { 205, 512, 0xd47b160a1b1022c8, 0x00002903e8b52411 }, /* 2.457 */ { 206, 512, 0xc02fc96684715a16, 0x0000297515608601 }, /* 3.057 */ { 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 }, /* 2.590 */ { 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b }, /* 3.047 */ { 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 }, /* 2.676 */ { 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 }, /* 2.993 */ { 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 }, /* 2.457 */ { 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 }, /* 3.182 */ { 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 }, /* 2.563 */ { 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 }, /* 3.025 */ { 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f }, /* 2.730 */ { 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 }, /* 3.036 */ { 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 }, /* 2.722 */ { 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 }, /* 3.356 */ { 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 }, /* 2.697 */ { 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 }, /* 2.979 */ { 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 }, /* 2.858 */ { 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e }, /* 3.258 */ { 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 }, /* 2.693 */ { 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 }, /* 3.259 */ { 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c }, /* 2.733 */ { 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 }, /* 3.235 */ { 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 }, /* 2.983 */ { 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e }, /* 3.308 */ { 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 }, /* 2.715 */ { 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f }, /* 3.540 */ { 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 }, /* 2.779 */ { 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c }, /* 3.084 */ { 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc }, /* 2.987 */ { 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae }, /* 3.341 */ { 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 }, /* 2.793 */ { 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 }, /* 3.518 */ { 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 }, /* 2.962 */ { 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 }, /* 3.196 */ { 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 }, /* 2.914 */ { 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 }, /* 3.408 */ { 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 }, /* 2.903 */ { 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 }, /* 3.778 */ { 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c }, /* 3.026 */ { 244, 512, 0xc740263f0301efa8, 0x00003a147146512d }, /* 3.347 */ { 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d }, /* 3.212 */ { 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 }, /* 3.482 */ { 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 }, /* 3.146 */ { 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f }, /* 3.626 */ { 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 }, /* 2.952 */ { 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e }, /* 3.463 */ { 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 }, /* 3.131 */ { 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c }, /* 3.538 */ { 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac }, /* 2.974 */ { 254, 512, 0xc4674129750499de, 0x00003e99e86d3e95 }, /* 3.843 */ { 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 }, /* 3.088 */ }; /* * Verify the map is valid. Each device index must appear exactly * once in every row, and the permutation array checksum must match. */ static int verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms, uint64_t checksum) { int countssz = sizeof (uint16_t) * children; uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP); for (int i = 0; i < nperms; i++) { for (int j = 0; j < children; j++) { uint8_t val = perms[(i * children) + j]; if (val >= children || counts[val] != i) { kmem_free(counts, countssz); return (EINVAL); } counts[val]++; } } if (checksum != 0) { int permssz = sizeof (uint8_t) * children * nperms; zio_cksum_t cksum; fletcher_4_native_varsize(perms, permssz, &cksum); if (checksum != cksum.zc_word[0]) { kmem_free(counts, countssz); return (ECKSUM); } } kmem_free(counts, countssz); return (0); } /* * Generate the permutation array for the draid_map_t. These maps control * the placement of all data in a dRAID. Therefore it's critical that the * seed always generates the same mapping. We provide our own pseudo-random * number generator for this purpose. */ int vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) { VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN); VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN); VERIFY3U(map->dm_seed, !=, 0); VERIFY3U(map->dm_nperms, !=, 0); VERIFY3P(map->dm_perms, ==, NULL); #ifdef _KERNEL /* * The kernel code always provides both a map_seed and checksum. * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide * a zero checksum when generating new candidate maps. */ VERIFY3U(map->dm_checksum, !=, 0); #endif uint64_t children = map->dm_children; uint64_t nperms = map->dm_nperms; int rowsz = sizeof (uint8_t) * children; int permssz = rowsz * nperms; uint8_t *perms; /* Allocate the permutation array */ perms = vmem_alloc(permssz, KM_SLEEP); /* Setup an initial row with a known pattern */ uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP); for (int i = 0; i < children; i++) initial_row[i] = i; uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed }; uint8_t *current_row, *previous_row = initial_row; /* * Perform a Fisher-Yates shuffle of each row using the previous * row as the starting point. An initial_row with known pattern * is used as the input for the first row. */ for (int i = 0; i < nperms; i++) { current_row = &perms[i * children]; memcpy(current_row, previous_row, rowsz); for (int j = children - 1; j > 0; j--) { uint64_t k = vdev_draid_rand(draid_seed) % (j + 1); uint8_t val = current_row[j]; current_row[j] = current_row[k]; current_row[k] = val; } previous_row = current_row; } kmem_free(initial_row, rowsz); int error = verify_perms(perms, children, nperms, map->dm_checksum); if (error) { vmem_free(perms, permssz); return (error); } *permsp = perms; return (0); } /* * Lookup the fixed draid_map_t for the requested number of children. */ int vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) { for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) { if (draid_maps[i].dm_children == children) { *mapp = &draid_maps[i]; return (0); } } return (ENOENT); } /* * Lookup the permutation array and iteration id for the provided offset. */ static void vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex, uint8_t **base, uint64_t *iter) { uint64_t ncols = vdc->vdc_children; uint64_t poff = pindex % (vdc->vdc_nperms * ncols); *base = vdc->vdc_perms + (poff / ncols) * ncols; *iter = poff % ncols; } static inline uint64_t vdev_draid_permute_id(vdev_draid_config_t *vdc, uint8_t *base, uint64_t iter, uint64_t index) { return ((base[index] + iter) % vdc->vdc_children); } /* * Return the asize which is the psize rounded up to a full group width. * i.e. vdev_draid_psize_to_asize(). */ static uint64_t -vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg) +vdev_draid_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { (void) txg; vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_ashift; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); uint64_t rows = ((psize - 1) / (vdc->vdc_ndata << ashift)) + 1; uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift; ASSERT3U(asize, !=, 0); ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0); return (asize); } /* * Deflate the asize to the psize, this includes stripping parity. */ uint64_t -vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize) +vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg) { + (void) txg; vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT0(asize % vdc->vdc_groupwidth); return ((asize / vdc->vdc_groupwidth) * vdc->vdc_ndata); } /* * Convert a logical offset to the corresponding group number. */ static uint64_t vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); return (offset / vdc->vdc_groupsz); } /* * Convert a group number to the logical starting offset for that group. */ static uint64_t vdev_draid_group_to_offset(vdev_t *vd, uint64_t group) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); return (group * vdc->vdc_groupsz); } /* * Full stripe writes. When writing, all columns (D+P) are required. Parity * is calculated over all the columns, including empty zero filled sectors, * and each is written to disk. While only the data columns are needed for * a normal read, all of the columns are required for reconstruction when * performing a sequential resilver. * * For "big columns" it's sufficient to map the correct range of the zio ABD. * Partial columns require allocating a gang ABD in order to zero fill the * empty sectors. When the column is empty a zero filled sector must be * mapped. In all cases the data ABDs must be the same size as the parity * ABDs (e.g. rc->rc_size == parity_size). */ static void vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) { uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; uint64_t parity_size = rr->rr_col[0].rc_size; uint64_t abd_off = abd_offset; ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(parity_size, ==, abd_get_size(rr->rr_col[0].rc_abd)); for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size == 0) { /* empty data column (small write), add a skip sector */ ASSERT3U(skip_size, ==, parity_size); rc->rc_abd = abd_get_zeros(skip_size); } else if (rc->rc_size == parity_size) { /* this is a "big column" */ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, abd_off, rc->rc_size); } else { /* short data column, add a skip sector */ ASSERT3U(rc->rc_size + skip_size, ==, parity_size); rc->rc_abd = abd_alloc_gang(); abd_gang_add(rc->rc_abd, abd_get_offset_size( zio->io_abd, abd_off, rc->rc_size), B_TRUE); abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size), B_TRUE); } ASSERT3U(abd_get_size(rc->rc_abd), ==, parity_size); abd_off += rc->rc_size; rc->rc_size = parity_size; } IMPLY(abd_offset != 0, abd_off == zio->io_size); } /* * Scrub/resilver reads. In order to store the contents of the skip sectors * an additional ABD is allocated. The columns are handled in the same way * as a full stripe write except instead of using the zero ABD the newly * allocated skip ABD is used to back the skip sectors. In all cases the * data ABD must be the same size as the parity ABDs. */ static void vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) { uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; uint64_t parity_size = rr->rr_col[0].rc_size; uint64_t abd_off = abd_offset; uint64_t skip_off = 0; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); ASSERT3P(rr->rr_abd_empty, ==, NULL); if (rr->rr_nempty > 0) { rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, B_FALSE); } for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size == 0) { /* empty data column (small read), add a skip sector */ ASSERT3U(skip_size, ==, parity_size); ASSERT3U(rr->rr_nempty, !=, 0); rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty, skip_off, skip_size); skip_off += skip_size; } else if (rc->rc_size == parity_size) { /* this is a "big column" */ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, abd_off, rc->rc_size); } else { /* short data column, add a skip sector */ ASSERT3U(rc->rc_size + skip_size, ==, parity_size); ASSERT3U(rr->rr_nempty, !=, 0); rc->rc_abd = abd_alloc_gang(); abd_gang_add(rc->rc_abd, abd_get_offset_size( zio->io_abd, abd_off, rc->rc_size), B_TRUE); abd_gang_add(rc->rc_abd, abd_get_offset_size( rr->rr_abd_empty, skip_off, skip_size), B_TRUE); skip_off += skip_size; } uint64_t abd_size = abd_get_size(rc->rc_abd); ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd)); /* * Increase rc_size so the skip ABD is included in subsequent * parity calculations. */ abd_off += rc->rc_size; rc->rc_size = abd_size; } IMPLY(abd_offset != 0, abd_off == zio->io_size); ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size); } /* * Normal reads. In this common case only the columns containing data * are read in to the zio ABDs. Neither the parity columns or empty skip * sectors are read unless the checksum fails verification. In which case * vdev_raidz_read_all() will call vdev_draid_map_alloc_empty() to expand * the raid map in order to allow reconstruction using the parity data and * skip sectors. */ static void vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) { uint64_t abd_off = abd_offset; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size > 0) { rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, abd_off, rc->rc_size); abd_off += rc->rc_size; } } IMPLY(abd_offset != 0, abd_off == zio->io_size); } /* * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key * difference is that an ABD is allocated to back skip sectors so they may * be read in to memory, verified, and repaired if needed. */ void vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) { uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; uint64_t parity_size = rr->rr_col[0].rc_size; uint64_t skip_off = 0; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); ASSERT3P(rr->rr_abd_empty, ==, NULL); if (rr->rr_nempty > 0) { rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, B_FALSE); } for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size == 0) { /* empty data column (small read), add a skip sector */ ASSERT3U(skip_size, ==, parity_size); ASSERT3U(rr->rr_nempty, !=, 0); ASSERT3P(rc->rc_abd, ==, NULL); rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty, skip_off, skip_size); skip_off += skip_size; } else if (rc->rc_size == parity_size) { /* this is a "big column", nothing to add */ ASSERT3P(rc->rc_abd, !=, NULL); } else { /* * short data column, add a skip sector and clear * rc_tried to force the entire column to be re-read * thereby including the missing skip sector data * which is needed for reconstruction. */ ASSERT3U(rc->rc_size + skip_size, ==, parity_size); ASSERT3U(rr->rr_nempty, !=, 0); ASSERT3P(rc->rc_abd, !=, NULL); ASSERT(!abd_is_gang(rc->rc_abd)); abd_t *read_abd = rc->rc_abd; rc->rc_abd = abd_alloc_gang(); abd_gang_add(rc->rc_abd, read_abd, B_TRUE); abd_gang_add(rc->rc_abd, abd_get_offset_size( rr->rr_abd_empty, skip_off, skip_size), B_TRUE); skip_off += skip_size; rc->rc_tried = 0; } /* * Increase rc_size so the empty ABD is included in subsequent * parity calculations. */ rc->rc_size = parity_size; } ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size); } /* * Verify that all empty sectors are zero filled before using them to * calculate parity. Otherwise, silent corruption in an empty sector will * result in bad parity being generated. That bad parity will then be * considered authoritative and overwrite the good parity on disk. This * is possible because the checksum is only calculated over the data, * thus it cannot be used to detect damage in empty sectors. */ int vdev_draid_map_verify_empty(zio_t *zio, raidz_row_t *rr) { uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; uint64_t parity_size = rr->rr_col[0].rc_size; uint64_t skip_off = parity_size - skip_size; uint64_t empty_off = 0; int ret = 0; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); ASSERT3P(rr->rr_abd_empty, !=, NULL); ASSERT3U(rr->rr_bigcols, >, 0); void *zero_buf = kmem_zalloc(skip_size, KM_SLEEP); for (int c = rr->rr_bigcols; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT3P(rc->rc_abd, !=, NULL); ASSERT3U(rc->rc_size, ==, parity_size); if (abd_cmp_buf_off(rc->rc_abd, zero_buf, skip_off, skip_size) != 0) { vdev_raidz_checksum_error(zio, rc, rc->rc_abd); abd_zero_off(rc->rc_abd, skip_off, skip_size); rc->rc_error = SET_ERROR(ECKSUM); ret++; } empty_off += skip_size; } ASSERT3U(empty_off, ==, abd_get_size(rr->rr_abd_empty)); kmem_free(zero_buf, skip_size); return (ret); } /* * Given a logical address within a dRAID configuration, return the physical * address on the first drive in the group that this address maps to * (at position 'start' in permutation number 'perm'). */ static uint64_t vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset, uint64_t *perm, uint64_t *start) { vdev_draid_config_t *vdc = vd->vdev_tsd; /* b is the dRAID (parent) sector offset. */ uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t b_offset = logical_offset >> ashift; /* * The height of a row in units of the vdev's minimum sector size. * This is the amount of data written to each disk of each group * in a given permutation. */ uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift; /* * We cycle through a disk permutation every groupsz * ngroups chunk * of address space. Note that ngroups * groupsz must be a multiple * of the number of data drives (ndisks) in order to guarantee * alignment. So, for example, if our row height is 16MB, our group * size is 10, and there are 13 data drives in the draid, then ngroups * will be 13, we will change permutation every 2.08GB and each * disk will have 160MB of data per chunk. */ uint64_t groupwidth = vdc->vdc_groupwidth; uint64_t ngroups = vdc->vdc_ngroups; uint64_t ndisks = vdc->vdc_ndisks; /* * groupstart is where the group this IO will land in "starts" in * the permutation array. */ uint64_t group = logical_offset / vdc->vdc_groupsz; uint64_t groupstart = (group * groupwidth) % ndisks; ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart); *start = groupstart; /* b_offset is the sector offset within a group chunk */ b_offset = b_offset % (rowheight_sectors * groupwidth); ASSERT0(b_offset % groupwidth); /* * Find the starting byte offset on each child vdev: * - within a permutation there are ngroups groups spread over the * rows, where each row covers a slice portion of the disk * - each permutation has (groupwidth * ngroups) / ndisks rows * - so each permutation covers rows * slice portion of the disk * - so we need to find the row where this IO group target begins */ *perm = group / ngroups; uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) + (((group % ngroups) * groupwidth) / ndisks); return (((rowheight_sectors * row) + (b_offset / groupwidth)) << ashift); } static uint64_t vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, uint64_t abd_offset, uint64_t abd_size) { vdev_t *vd = zio->io_vd; vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t io_size = abd_size; - uint64_t io_asize = vdev_draid_asize(vd, io_size, 0); + uint64_t io_asize = vdev_draid_psize_to_asize(vd, io_size, 0); uint64_t group = vdev_draid_offset_to_group(vd, io_offset); uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); /* * Limit the io_size to the space remaining in the group. A second * row in the raidz_map_t is created for the remainder. */ if (io_offset + io_asize > start_offset) { io_size = vdev_draid_asize_to_psize(vd, - start_offset - io_offset); + start_offset - io_offset, 0); } /* * At most a block may span the logical end of one group and the start * of the next group. Therefore, at the end of a group the io_size must * span the group width evenly and the remainder must be aligned to the * start of the next group. */ IMPLY(abd_offset == 0 && io_size < zio->io_size, (io_asize >> ashift) % vdc->vdc_groupwidth == 0); IMPLY(abd_offset != 0, vdev_draid_group_to_offset(vd, group) == io_offset); /* Lookup starting byte offset on each child vdev */ uint64_t groupstart, perm; uint64_t physical_offset = vdev_draid_logical_to_physical(vd, io_offset, &perm, &groupstart); /* * If there is less than groupwidth drives available after the group * start, the group is going to wrap onto the next row. 'wrap' is the * group disk number that starts on the next row. */ uint64_t ndisks = vdc->vdc_ndisks; uint64_t groupwidth = vdc->vdc_groupwidth; uint64_t wrap = groupwidth; if (groupstart + groupwidth > ndisks) wrap = ndisks - groupstart; /* The io size in units of the vdev's minimum sector size. */ const uint64_t psize = io_size >> ashift; /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. */ uint64_t q = psize / vdc->vdc_ndata; /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ uint64_t r = psize - q * vdc->vdc_ndata; /* The number of "big columns" - those which contain remainder data. */ uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity); ASSERT3U(bc, <, groupwidth); /* The total number of data and parity sectors for this I/O. */ uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 0 : 1))); ASSERT3U(vdc->vdc_nparity, >, 0); raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth, zio); rr->rr_bigcols = bc; rr->rr_firstdatacol = vdc->vdc_nparity; #ifdef ZFS_DEBUG rr->rr_offset = io_offset; rr->rr_size = io_size; #endif *rrp = rr; uint8_t *base; uint64_t iter, asize = 0; vdev_draid_get_perm(vdc, perm, &base, &iter); for (uint64_t i = 0; i < groupwidth; i++) { raidz_col_t *rc = &rr->rr_col[i]; uint64_t c = (groupstart + i) % ndisks; /* increment the offset if we wrap to the next row */ if (i == wrap) physical_offset += VDEV_DRAID_ROWHEIGHT; rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); rc->rc_offset = physical_offset; if (q == 0 && i >= bc) rc->rc_size = 0; else if (i < bc) rc->rc_size = (q + 1) << ashift; else rc->rc_size = q << ashift; asize += rc->rc_size; } ASSERT3U(asize, ==, tot << ashift); rr->rr_nempty = roundup(tot, groupwidth) - tot; IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc); /* Allocate buffers for the parity columns */ for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) { raidz_col_t *rc = &rr->rr_col[c]; rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); } /* * Map buffers for data columns and allocate/map buffers for skip * sectors. There are three distinct cases for dRAID which are * required to support sequential rebuild. */ if (zio->io_type == ZIO_TYPE_WRITE) { vdev_draid_map_alloc_write(zio, abd_offset, rr); } else if ((rr->rr_nempty > 0) && (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { vdev_draid_map_alloc_scrub(zio, abd_offset, rr); } else { ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); vdev_draid_map_alloc_read(zio, abd_offset, rr); } return (io_size); } /* * Allocate the raidz mapping to be applied to the dRAID I/O. The parity * calculations for dRAID are identical to raidz however there are a few * differences in the layout. * * - dRAID always allocates a full stripe width. Any extra sectors due * this padding are zero filled and written to disk. They will be read * back during a scrub or repair operation since they are included in * the parity calculation. This property enables sequential resilvering. * * - When the block at the logical offset spans redundancy groups then two * rows are allocated in the raidz_map_t. One row resides at the end of * the first group and the other at the start of the following group. */ static raidz_map_t * vdev_draid_map_alloc(zio_t *zio) { raidz_row_t *rr[2]; uint64_t abd_offset = 0; uint64_t abd_size = zio->io_size; uint64_t io_offset = zio->io_offset; uint64_t size; int nrows = 1; size = vdev_draid_map_alloc_row(zio, &rr[0], io_offset, abd_offset, abd_size); if (size < abd_size) { vdev_t *vd = zio->io_vd; - io_offset += vdev_draid_asize(vd, size, 0); + io_offset += vdev_draid_psize_to_asize(vd, size, 0); abd_offset += size; abd_size -= size; nrows++; ASSERT3U(io_offset, ==, vdev_draid_group_to_offset( vd, vdev_draid_offset_to_group(vd, io_offset))); ASSERT3U(abd_offset, <, zio->io_size); ASSERT3U(abd_size, !=, 0); size = vdev_draid_map_alloc_row(zio, &rr[1], io_offset, abd_offset, abd_size); VERIFY3U(size, ==, abd_size); } raidz_map_t *rm; rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]), KM_SLEEP); rm->rm_ops = vdev_raidz_math_get_ops(); rm->rm_nrows = nrows; rm->rm_row[0] = rr[0]; if (nrows == 2) rm->rm_row[1] = rr[1]; return (rm); } /* * Given an offset into a dRAID return the next group width aligned offset * which can be used to start an allocation. */ static uint64_t vdev_draid_get_astart(vdev_t *vd, const uint64_t start) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); return (roundup(start, vdc->vdc_groupwidth << vd->vdev_ashift)); } /* * Allocatable space for dRAID is (children - nspares) * sizeof(smallest child) * rounded down to the last full slice. So each child must provide at least * 1 / (children - nspares) of its asize. */ static uint64_t vdev_draid_min_asize(vdev_t *vd) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); return (VDEV_DRAID_REFLOW_RESERVE + (vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks)); } /* * When using dRAID the minimum allocation size is determined by the number * of data disks in the redundancy group. Full stripes are always used. */ static uint64_t vdev_draid_min_alloc(vdev_t *vd) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); return (vdc->vdc_ndata << vd->vdev_ashift); } /* * Returns true if the txg range does not exist on any leaf vdev. * * A dRAID spare does not fit into the DTL model. While it has child vdevs * there is no redundancy among them, and the effective child vdev is * determined by offset. Essentially we do a vdev_dtl_reassess() on the * fly by replacing a dRAID spare with the child vdev under the offset. * Note that it is a recursive process because the child vdev can be * another dRAID spare and so on. */ boolean_t vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg, uint64_t size) { if (vd->vdev_ops == &vdev_spare_ops || vd->vdev_ops == &vdev_replacing_ops) { /* * Check all of the readable children, if any child * contains the txg range the data it is not missing. */ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (!vdev_readable(cvd)) continue; if (!vdev_draid_missing(cvd, physical_offset, txg, size)) return (B_FALSE); } return (B_TRUE); } if (vd->vdev_ops == &vdev_draid_spare_ops) { /* * When sequentially resilvering we don't have a proper * txg range so instead we must presume all txgs are * missing on this vdev until the resilver completes. */ if (vd->vdev_rebuild_txg != 0) return (B_TRUE); /* * DTL_MISSING is set for all prior txgs when a resilver * is started in spa_vdev_attach(). */ if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) return (B_TRUE); /* * Consult the DTL on the relevant vdev. Either a vdev * leaf or spare/replace mirror child may be returned so * we must recursively call vdev_draid_missing_impl(). */ vd = vdev_draid_spare_get_child(vd, physical_offset); if (vd == NULL) return (B_TRUE); return (vdev_draid_missing(vd, physical_offset, txg, size)); } return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); } /* * Returns true if the txg is only partially replicated on the leaf vdevs. */ static boolean_t vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg, uint64_t size) { if (vd->vdev_ops == &vdev_spare_ops || vd->vdev_ops == &vdev_replacing_ops) { /* * Check all of the readable children, if any child is * missing the txg range then it is partially replicated. */ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (!vdev_readable(cvd)) continue; if (vdev_draid_partial(cvd, physical_offset, txg, size)) return (B_TRUE); } return (B_FALSE); } if (vd->vdev_ops == &vdev_draid_spare_ops) { /* * When sequentially resilvering we don't have a proper * txg range so instead we must presume all txgs are * missing on this vdev until the resilver completes. */ if (vd->vdev_rebuild_txg != 0) return (B_TRUE); /* * DTL_MISSING is set for all prior txgs when a resilver * is started in spa_vdev_attach(). */ if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) return (B_TRUE); /* * Consult the DTL on the relevant vdev. Either a vdev * leaf or spare/replace mirror child may be returned so * we must recursively call vdev_draid_missing_impl(). */ vd = vdev_draid_spare_get_child(vd, physical_offset); if (vd == NULL) return (B_TRUE); return (vdev_draid_partial(vd, physical_offset, txg, size)); } return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); } /* * Determine if the vdev is readable at the given offset. */ boolean_t vdev_draid_readable(vdev_t *vd, uint64_t physical_offset) { if (vd->vdev_ops == &vdev_draid_spare_ops) { vd = vdev_draid_spare_get_child(vd, physical_offset); if (vd == NULL) return (B_FALSE); } if (vd->vdev_ops == &vdev_spare_ops || vd->vdev_ops == &vdev_replacing_ops) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (!vdev_readable(cvd)) continue; if (vdev_draid_readable(cvd, physical_offset)) return (B_TRUE); } return (B_FALSE); } return (vdev_readable(vd)); } /* * Returns the first distributed spare found under the provided vdev tree. */ static vdev_t * vdev_draid_find_spare(vdev_t *vd) { if (vd->vdev_ops == &vdev_draid_spare_ops) return (vd); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *svd = vdev_draid_find_spare(vd->vdev_child[c]); if (svd != NULL) return (svd); } return (NULL); } /* * Returns B_TRUE if the passed in vdev is currently "faulted". * Faulted, in this context, means that the vdev represents a * replacing or sparing vdev tree. */ static boolean_t vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset) { if (vd->vdev_ops == &vdev_draid_spare_ops) { vd = vdev_draid_spare_get_child(vd, physical_offset); if (vd == NULL) return (B_FALSE); /* * After resolving the distributed spare to a leaf vdev * check the parent to determine if it's "faulted". */ vd = vd->vdev_parent; } return (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); } /* * Determine if the dRAID block at the logical offset is degraded. * Used by sequential resilver. */ static boolean_t vdev_draid_group_degraded(vdev_t *vd, uint64_t offset) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset); uint64_t groupstart, perm; uint64_t physical_offset = vdev_draid_logical_to_physical(vd, offset, &perm, &groupstart); uint8_t *base; uint64_t iter; vdev_draid_get_perm(vdc, perm, &base, &iter); for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { uint64_t c = (groupstart + i) % vdc->vdc_ndisks; uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c); vdev_t *cvd = vd->vdev_child[cid]; /* Group contains a faulted vdev. */ if (vdev_draid_faulted(cvd, physical_offset)) return (B_TRUE); /* * Always check groups with active distributed spares * because any vdev failure in the pool will affect them. */ if (vdev_draid_find_spare(cvd) != NULL) return (B_TRUE); } return (B_FALSE); } /* * Determine if the txg is missing. Used by healing resilver. */ static boolean_t vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg, uint64_t size) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset); uint64_t groupstart, perm; uint64_t physical_offset = vdev_draid_logical_to_physical(vd, offset, &perm, &groupstart); uint8_t *base; uint64_t iter; vdev_draid_get_perm(vdc, perm, &base, &iter); for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { uint64_t c = (groupstart + i) % vdc->vdc_ndisks; uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c); vdev_t *cvd = vd->vdev_child[cid]; /* Transaction group is known to be partially replicated. */ if (vdev_draid_partial(cvd, physical_offset, txg, size)) return (B_TRUE); /* * Always check groups with active distributed spares * because any vdev failure in the pool will affect them. */ if (vdev_draid_find_spare(cvd) != NULL) return (B_TRUE); } return (B_FALSE); } /* * Find the smallest child asize and largest sector size to calculate the * available capacity. Distributed spares are ignored since their capacity * is also based of the minimum child size in the top-level dRAID. */ static void vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, uint64_t *logical_ashiftp, uint64_t *physical_ashiftp) { uint64_t logical_ashift = 0, physical_ashift = 0; uint64_t asize = 0, max_asize = 0; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_ops == &vdev_draid_spare_ops) continue; asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); } for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_ops == &vdev_draid_spare_ops) continue; physical_ashift = vdev_best_ashift(logical_ashift, physical_ashift, cvd->vdev_physical_ashift); } *asizep = asize; *max_asizep = max_asize; *logical_ashiftp = logical_ashift; *physical_ashiftp = physical_ashift; } /* * Open spare vdevs. */ static boolean_t vdev_draid_open_spares(vdev_t *vd) { return (vd->vdev_ops == &vdev_draid_spare_ops || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); } /* * Open all children, excluding spares. */ static boolean_t vdev_draid_open_children(vdev_t *vd) { return (!vdev_draid_open_spares(vd)); } /* * Open a top-level dRAID vdev. */ static int vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t nparity = vdc->vdc_nparity; int open_errors = 0; if (nparity > VDEV_DRAID_MAXPARITY || vd->vdev_children < nparity + 1) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } /* * First open the normal children then the distributed spares. This * ordering is important to ensure the distributed spares calculate * the correct psize in the event that the dRAID vdevs were expanded. */ vdev_open_children_subset(vd, vdev_draid_open_children); vdev_open_children_subset(vd, vdev_draid_open_spares); /* Verify enough of the children are available to continue. */ for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_open_error != 0) { if ((++open_errors) > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (SET_ERROR(ENXIO)); } } } /* * Allocatable capacity is the sum of the space on all children less * the number of distributed spares rounded down to last full row * and then to the last full group. An additional 32MB of scratch * space is reserved at the end of each child for use by the dRAID * expansion feature. */ uint64_t child_asize, child_max_asize; vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize, logical_ashift, physical_ashift); /* * Should be unreachable since the minimum child size is 64MB, but * we want to make sure an underflow absolutely cannot occur here. */ if (child_asize < VDEV_DRAID_REFLOW_RESERVE || child_max_asize < VDEV_DRAID_REFLOW_RESERVE) { return (SET_ERROR(ENXIO)); } child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) / VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT; child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) / VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT; *asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * vdc->vdc_groupsz); *max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * vdc->vdc_groupsz); return (0); } /* * Close a top-level dRAID vdev. */ static void vdev_draid_close(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c] != NULL) vdev_close(vd->vdev_child[c]); } } /* * Return the maximum asize for a rebuild zio in the provided range * given the following constraints. A dRAID chunks may not: * * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or * - Span dRAID redundancy groups. */ static uint64_t vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, uint64_t max_segment) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); uint64_t ashift = vd->vdev_ashift; uint64_t ndata = vdc->vdc_ndata; uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift), SPA_MAXBLOCKSIZE); ASSERT3U(vdev_draid_get_astart(vd, start), ==, start); ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0); /* Chunks must evenly span all data columns in the group. */ psize = (((psize >> ashift) / ndata) * ndata) << ashift; uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize)); /* Reduce the chunk size to the group space remaining. */ uint64_t group = vdev_draid_offset_to_group(vd, start); uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start; chunk_size = MIN(chunk_size, left); ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0); ASSERT3U(vdev_draid_offset_to_group(vd, start), ==, vdev_draid_offset_to_group(vd, start + chunk_size - 1)); return (chunk_size); } /* * Align the start of the metaslab to the group width and slightly reduce * its size to a multiple of the group width. Since full stripe writes are * required by dRAID this space is unallocable. Furthermore, aligning the * metaslab start is important for vdev initialize and TRIM which both operate * on metaslab boundaries which vdev_xlate() expects to be aligned. */ static void vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); uint64_t sz = vdc->vdc_groupwidth << vd->vdev_ashift; uint64_t astart = vdev_draid_get_astart(vd, *ms_start); uint64_t asize = ((*ms_size - (astart - *ms_start)) / sz) * sz; *ms_start = astart; *ms_size = asize; ASSERT0(*ms_start % sz); ASSERT0(*ms_size % sz); } /* * Add virtual dRAID spares to the list of valid spares. In order to accomplish * this the existing array must be freed and reallocated with the additional * entries. */ int vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, uint64_t next_vdev_id) { uint64_t draid_nspares = 0; uint64_t ndraid = 0; int error; for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_t *cvd = vd->vdev_child[i]; if (cvd->vdev_ops == &vdev_draid_ops) { vdev_draid_config_t *vdc = cvd->vdev_tsd; draid_nspares += vdc->vdc_nspares; ndraid++; } } if (draid_nspares == 0) { *ndraidp = ndraid; return (0); } nvlist_t **old_spares, **new_spares; uint_t old_nspares; error = nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &old_spares, &old_nspares); if (error) old_nspares = 0; /* Allocate memory and copy of the existing spares. */ new_spares = kmem_alloc(sizeof (nvlist_t *) * (draid_nspares + old_nspares), KM_SLEEP); for (uint_t i = 0; i < old_nspares; i++) new_spares[i] = fnvlist_dup(old_spares[i]); /* Add new distributed spares to ZPOOL_CONFIG_SPARES. */ uint64_t n = old_nspares; for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) { vdev_t *cvd = vd->vdev_child[vdev_id]; char path[64]; if (cvd->vdev_ops != &vdev_draid_ops) continue; vdev_draid_config_t *vdc = cvd->vdev_tsd; uint64_t nspares = vdc->vdc_nspares; uint64_t nparity = vdc->vdc_nparity; for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) { memset(path, 0, sizeof (path)); (void) snprintf(path, sizeof (path) - 1, "%s%llu-%llu-%llu", VDEV_TYPE_DRAID, (u_longlong_t)nparity, (u_longlong_t)next_vdev_id + vdev_id, (u_longlong_t)spare_id); nvlist_t *spare = fnvlist_alloc(); fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path); fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DRAID_SPARE); fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID, cvd->vdev_guid); fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID, spare_id); fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0); fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1); fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1); fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT, cvd->vdev_ashift); new_spares[n] = spare; n++; } } if (n > 0) { (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES); fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, (const nvlist_t **)new_spares, n); } for (int i = 0; i < n; i++) nvlist_free(new_spares[i]); kmem_free(new_spares, sizeof (*new_spares) * n); *ndraidp = ndraid; return (0); } /* * Determine if any portion of the provided block resides on a child vdev * with a dirty DTL and therefore needs to be resilvered. */ static boolean_t vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t asize = vdev_draid_asize(vd, psize, 0); + uint64_t asize = vdev_draid_psize_to_asize(vd, psize, 0); if (phys_birth == TXG_UNKNOWN) { /* * Sequential resilver. There is no meaningful phys_birth * for this block, we can only determine if block resides * in a degraded group in which case it must be resilvered. */ ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==, vdev_draid_offset_to_group(vd, offset + asize - 1)); return (vdev_draid_group_degraded(vd, offset)); } else { /* * Healing resilver. TXGs not in DTL_PARTIAL are intact, * as are blocks in non-degraded groups. */ if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) return (B_FALSE); if (vdev_draid_group_missing(vd, offset, phys_birth, 1)) return (B_TRUE); /* The block may span groups in which case check both. */ if (vdev_draid_offset_to_group(vd, offset) != vdev_draid_offset_to_group(vd, offset + asize - 1)) { if (vdev_draid_group_missing(vd, offset + asize, phys_birth, 1)) return (B_TRUE); } return (B_FALSE); } } static boolean_t vdev_draid_rebuilding(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) return (B_TRUE); for (int i = 0; i < vd->vdev_children; i++) { if (vdev_draid_rebuilding(vd->vdev_child[i])) { return (B_TRUE); } } return (B_FALSE); } static void vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) { #ifdef ZFS_DEBUG zfs_range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_draid_asize(vd, rr->rr_size, 0); + vdev_draid_psize_to_asize(vd, rr->rr_size, 0); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end); #endif } /* * For write operations: * 1. Generate the parity data * 2. Create child zio write operations to each column's vdev, for both * data and parity. A gang ABD is allocated by vdev_draid_map_alloc() * if a skip sector needs to be added to a column. */ static void vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; vdev_raidz_generate_parity_row(rm, rr); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; /* * Empty columns are zero filled and included in the parity * calculation and therefore must be written. */ ASSERT3U(rc->rc_size, !=, 0); /* Verify physical to logical translation */ vdev_draid_io_verify(vd, rr, c); zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } } /* * For read operations: * 1. The vdev_draid_map_alloc() function will create a minimal raidz * mapping for the read based on the zio->io_flags. There are two * possible mappings either 1) a normal read, or 2) a scrub/resilver. * 2. Create the zio read operations. This will include all parity * columns and skip sectors for a scrub/resilver. */ static void vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; /* Sequential rebuild must do IO at redundancy group boundary. */ IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0); /* * Iterate over the columns in reverse order so that we hit the parity * last. Any errors along the way will force us to read the parity. * For scrub/resilver IOs which verify skip sectors, a gang ABD will * have been allocated to store them and rc->rc_size is increased. */ for (int c = rr->rr_cols - 1; c >= 0; c--) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (!vdev_draid_readable(cvd, rc->rc_offset)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ENXIO); rc->rc_tried = 1; rc->rc_skipped = 1; continue; } if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; continue; } /* * Empty columns may be read during vdev_draid_io_done(). * Only skip them after the readable and missing checks * verify they are available. */ if (rc->rc_size == 0) { rc->rc_skipped = 1; continue; } if (zio->io_flags & ZIO_FLAG_RESILVER) { vdev_t *svd; /* * Sequential rebuilds need to always consider the data * on the child being rebuilt to be stale. This is * important when all columns are available to aid * known reconstruction in identifing which columns * contain incorrect data. * * Furthermore, all repairs need to be constrained to * the devices being rebuilt because without a checksum * we cannot verify the data is actually correct and * performing an incorrect repair could result in * locking in damage and making the data unrecoverable. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { if (vdev_draid_rebuilding(cvd)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; rc->rc_allow_repair = 1; continue; } else { rc->rc_allow_repair = 0; } } else { rc->rc_allow_repair = 1; } /* * If this child is a distributed spare then the * offset might reside on the vdev being replaced. * In which case this data must be written to the * new device. Failure to do so would result in * checksum errors when the old device is detached * and the pool is scrubbed. */ if ((svd = vdev_draid_find_spare(cvd)) != NULL) { svd = vdev_draid_spare_get_child(svd, rc->rc_offset); if (svd && (svd->vdev_ops == &vdev_spare_ops || svd->vdev_ops == &vdev_replacing_ops)) { rc->rc_force_repair = 1; if (vdev_draid_rebuilding(svd)) rc->rc_allow_repair = 1; } } /* * Always issue a repair IO to this child when its * a spare or replacing vdev with an active rebuild. */ if ((cvd->vdev_ops == &vdev_spare_ops || cvd->vdev_ops == &vdev_replacing_ops) && vdev_draid_rebuilding(cvd)) { rc->rc_force_repair = 1; rc->rc_allow_repair = 1; } } } /* * Either a parity or data column is missing this means a repair * may be attempted by vdev_draid_io_done(). Expand the raid map * to read in empty columns which are needed along with the parity * during reconstruction. */ if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) && rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) { vdev_draid_map_alloc_empty(zio, rr); } for (int c = rr->rr_cols - 1; c >= 0; c--) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (rc->rc_error || rc->rc_size == 0) continue; if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } } } /* * Start an IO operation to a dRAID vdev. */ static void vdev_draid_io_start(zio_t *zio) { vdev_t *vd __maybe_unused = zio->io_vd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset)); raidz_map_t *rm = vdev_draid_map_alloc(zio); zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_draid_io_start_write(zio, rm->rm_row[i]); } } else { ASSERT(zio->io_type == ZIO_TYPE_READ); for (int i = 0; i < rm->rm_nrows; i++) { vdev_draid_io_start_read(zio, rm->rm_row[i]); } } zio_execute(zio); } /* * Complete an IO operation on a dRAID vdev. The raidz logic can be applied * to dRAID since the layout is fully described by the raidz_map_t. */ static void vdev_draid_io_done(zio_t *zio) { vdev_raidz_io_done(zio); } static void vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT(vd->vdev_ops == &vdev_draid_ops); if (faulted > vdc->vdc_nparity) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } static void vdev_draid_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_draid_ops); vdev_draid_config_t *vdc = raidvd->vdev_tsd; uint64_t ashift = raidvd->vdev_top->vdev_ashift; /* Make sure the offsets are block-aligned */ ASSERT0(logical_rs->rs_start % (1 << ashift)); ASSERT0(logical_rs->rs_end % (1 << ashift)); uint64_t logical_start = logical_rs->rs_start; uint64_t logical_end = logical_rs->rs_end; /* * Unaligned ranges must be skipped. All metaslabs are correctly * aligned so this should not happen, but this case is handled in * case it's needed by future callers. */ uint64_t astart = vdev_draid_get_astart(raidvd, logical_start); if (astart != logical_start) { physical_rs->rs_start = logical_start; physical_rs->rs_end = logical_start; remain_rs->rs_start = MIN(astart, logical_end); remain_rs->rs_end = logical_end; return; } /* * Unlike with mirrors and raidz a dRAID logical range can map * to multiple non-contiguous physical ranges. This is handled by * limiting the size of the logical range to a single group and * setting the remain argument such that it describes the remaining * unmapped logical range. This is stricter than absolutely * necessary but helps simplify the logic below. */ uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start); uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1); if (logical_end > nextstart) logical_end = nextstart; /* Find the starting offset for each vdev in the group */ uint64_t perm, groupstart; uint64_t start = vdev_draid_logical_to_physical(raidvd, logical_start, &perm, &groupstart); uint64_t end = start; uint8_t *base; uint64_t iter, id; vdev_draid_get_perm(vdc, perm, &base, &iter); /* * Check if the passed child falls within the group. If it does * update the start and end to reflect the physical range. * Otherwise, leave them unmodified which will result in an empty * (zero-length) physical range being returned. */ for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { uint64_t c = (groupstart + i) % vdc->vdc_ndisks; if (c == 0 && i != 0) { /* the group wrapped, increment the start */ start += VDEV_DRAID_ROWHEIGHT; end = start; } id = vdev_draid_permute_id(vdc, base, iter, c); if (id == cvd->vdev_id) { uint64_t b_size = (logical_end >> ashift) - (logical_start >> ashift); ASSERT3U(b_size, >, 0); end = start + ((((b_size - 1) / vdc->vdc_groupwidth) + 1) << ashift); break; } } physical_rs->rs_start = start; physical_rs->rs_end = end; /* * Only top-level vdevs are allowed to set remain_rs because * when .vdev_op_xlate() is called for their children the full * logical range is not provided by vdev_xlate(). */ remain_rs->rs_start = logical_end; remain_rs->rs_end = logical_rs->rs_end; ASSERT3U(physical_rs->rs_start, <=, logical_start); ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, logical_end - logical_start); } /* * Add dRAID specific fields to the config nvlist. */ static void vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv) { ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); vdev_draid_config_t *vdc = vd->vdev_tsd; fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups); } /* * Initialize private dRAID specific fields from the nvlist. */ static int vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) { (void) spa; uint64_t ndata, nparity, nspares, ngroups; int error; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata)) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) || nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { return (SET_ERROR(EINVAL)); } uint_t children; nvlist_t **child; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0 || children == 0 || children > VDEV_DRAID_MAX_CHILDREN) { return (SET_ERROR(EINVAL)); } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) || nspares > 100 || nspares > (children - (ndata + nparity))) { return (SET_ERROR(EINVAL)); } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) || ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) { return (SET_ERROR(EINVAL)); } /* * Validate the minimum number of children exist per group for the * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4). */ if (children < (ndata + nparity + nspares)) return (SET_ERROR(EINVAL)); /* * Create the dRAID configuration using the pool nvlist configuration * and the fixed mapping for the correct number of children. */ vdev_draid_config_t *vdc; const draid_map_t *map; error = vdev_draid_lookup_map(children, &map); if (error) return (SET_ERROR(EINVAL)); vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP); vdc->vdc_ndata = ndata; vdc->vdc_nparity = nparity; vdc->vdc_nspares = nspares; vdc->vdc_children = children; vdc->vdc_ngroups = ngroups; vdc->vdc_nperms = map->dm_nperms; error = vdev_draid_generate_perms(map, &vdc->vdc_perms); if (error) { kmem_free(vdc, sizeof (*vdc)); return (SET_ERROR(EINVAL)); } /* * Derived constants. */ vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity; vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares; vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT; vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) / vdc->vdc_ndisks; ASSERT3U(vdc->vdc_groupwidth, >=, 2); ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks); ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT); ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT); ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0); ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) % vdc->vdc_ndisks, ==, 0); *tsd = vdc; return (0); } static void vdev_draid_fini(vdev_t *vd) { vdev_draid_config_t *vdc = vd->vdev_tsd; vmem_free(vdc->vdc_perms, sizeof (uint8_t) * vdc->vdc_children * vdc->vdc_nperms); kmem_free(vdc, sizeof (*vdc)); } static uint64_t vdev_draid_nparity(vdev_t *vd) { vdev_draid_config_t *vdc = vd->vdev_tsd; return (vdc->vdc_nparity); } static uint64_t vdev_draid_ndisks(vdev_t *vd) { vdev_draid_config_t *vdc = vd->vdev_tsd; return (vdc->vdc_ndisks); } vdev_ops_t vdev_draid_ops = { .vdev_op_init = vdev_draid_init, .vdev_op_fini = vdev_draid_fini, .vdev_op_open = vdev_draid_open, .vdev_op_close = vdev_draid_close, - .vdev_op_asize = vdev_draid_asize, + .vdev_op_psize_to_asize = vdev_draid_psize_to_asize, + .vdev_op_asize_to_psize = vdev_draid_asize_to_psize, .vdev_op_min_asize = vdev_draid_min_asize, .vdev_op_min_alloc = vdev_draid_min_alloc, .vdev_op_io_start = vdev_draid_io_start, .vdev_op_io_done = vdev_draid_io_done, .vdev_op_state_change = vdev_draid_state_change, .vdev_op_need_resilver = vdev_draid_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_draid_xlate, .vdev_op_rebuild_asize = vdev_draid_rebuild_asize, .vdev_op_metaslab_init = vdev_draid_metaslab_init, .vdev_op_config_generate = vdev_draid_config_generate, .vdev_op_nparity = vdev_draid_nparity, .vdev_op_ndisks = vdev_draid_ndisks, .vdev_op_type = VDEV_TYPE_DRAID, .vdev_op_leaf = B_FALSE, }; /* * A dRAID distributed spare is a virtual leaf vdev which is included in the * parent dRAID configuration. The last N columns of the dRAID permutation * table are used to determine on which dRAID children a specific offset * should be written. These spare leaf vdevs can only be used to replace * faulted children in the same dRAID configuration. */ /* * Distributed spare state. All fields are set when the distributed spare is * first opened and are immutable. */ typedef struct { vdev_t *vds_draid_vdev; /* top-level parent dRAID vdev */ uint64_t vds_top_guid; /* top-level parent dRAID guid */ uint64_t vds_spare_id; /* spare id (0 - vdc->vdc_nspares-1) */ } vdev_draid_spare_t; /* * Returns the parent dRAID vdev to which the distributed spare belongs. * This may be safely called even when the vdev is not open. */ vdev_t * vdev_draid_spare_get_parent(vdev_t *vd) { vdev_draid_spare_t *vds = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); if (vds->vds_draid_vdev != NULL) return (vds->vds_draid_vdev); return (vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev, vds->vds_top_guid)); } /* * A dRAID space is active when it's the child of a vdev using the * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops. */ static boolean_t vdev_draid_spare_is_active(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; if (pvd != NULL && (pvd->vdev_ops == &vdev_spare_ops || pvd->vdev_ops == &vdev_replacing_ops || pvd->vdev_ops == &vdev_draid_ops)) { return (B_TRUE); } else { return (B_FALSE); } } /* * Given a dRAID distribute spare vdev, returns the physical child vdev * on which the provided offset resides. This may involve recursing through * multiple layers of distributed spares. Note that offset is relative to * this vdev. */ vdev_t * vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset) { vdev_draid_spare_t *vds = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); /* The vdev is closed */ if (vds->vds_draid_vdev == NULL) return (NULL); vdev_t *tvd = vds->vds_draid_vdev; vdev_draid_config_t *vdc = tvd->vdev_tsd; ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops); ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares); uint8_t *base; uint64_t iter; uint64_t perm = physical_offset / vdc->vdc_devslicesz; vdev_draid_get_perm(vdc, perm, &base, &iter); uint64_t cid = vdev_draid_permute_id(vdc, base, iter, (tvd->vdev_children - 1) - vds->vds_spare_id); vdev_t *cvd = tvd->vdev_child[cid]; if (cvd->vdev_ops == &vdev_draid_spare_ops) return (vdev_draid_spare_get_child(cvd, physical_offset)); return (cvd); } static void vdev_draid_spare_close(vdev_t *vd) { vdev_draid_spare_t *vds = vd->vdev_tsd; vds->vds_draid_vdev = NULL; } /* * Opening a dRAID spare device is done by looking up the associated dRAID * top-level vdev guid from the spare configuration. */ static int vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_draid_spare_t *vds = vd->vdev_tsd; vdev_t *rvd = vd->vdev_spa->spa_root_vdev; uint64_t asize, max_asize; vdev_t *tvd = vdev_lookup_by_guid(rvd, vds->vds_top_guid); if (tvd == NULL) { /* * When spa_vdev_add() is labeling new spares the * associated dRAID is not attached to the root vdev * nor does this spare have a parent. Simulate a valid * device in order to allow the label to be initialized * and the distributed spare added to the configuration. */ if (vd->vdev_parent == NULL) { *psize = *max_psize = SPA_MINDEVSIZE; *logical_ashift = *physical_ashift = ASHIFT_MIN; return (0); } return (SET_ERROR(EINVAL)); } vdev_draid_config_t *vdc = tvd->vdev_tsd; if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL) return (SET_ERROR(EINVAL)); if (vds->vds_spare_id >= vdc->vdc_nspares) return (SET_ERROR(EINVAL)); /* * Neither tvd->vdev_asize or tvd->vdev_max_asize can be used here * because the caller may be vdev_draid_open() in which case the * values are stale as they haven't yet been updated by vdev_open(). * To avoid this always recalculate the dRAID asize and max_asize. */ vdev_draid_calculate_asize(tvd, &asize, &max_asize, logical_ashift, physical_ashift); *psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; *max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; vds->vds_draid_vdev = tvd; return (0); } /* * Completed distributed spare IO. Store the result in the parent zio * as if it had performed the operation itself. Only the first error is * preserved if there are multiple errors. */ static void vdev_draid_spare_child_done(zio_t *zio) { zio_t *pio = zio->io_private; /* * IOs are issued to non-writable vdevs in order to keep their * DTLs accurate. However, we don't want to propagate the * error in to the distributed spare's DTL. When resilvering * vdev_draid_need_resilver() will consult the relevant DTL * to determine if the data is missing and must be repaired. */ if (!vdev_writeable(zio->io_vd)) return; if (pio->io_error == 0) pio->io_error = zio->io_error; } /* * Returns a valid label nvlist for the distributed spare vdev. This is * used to bypass the IO pipeline to avoid the complexity of constructing * a complete label with valid checksum to return when read. */ nvlist_t * vdev_draid_read_config_spare(vdev_t *vd) { spa_t *spa = vd->vdev_spa; spa_aux_vdev_t *sav = &spa->spa_spares; uint64_t guid = vd->vdev_guid; nvlist_t *nv = fnvlist_alloc(); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa)); fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE, vdev_draid_spare_is_active(vd) ? POOL_STATE_ACTIVE : POOL_STATE_SPARE); /* Set the vdev guid based on the vdev list in sav_count. */ for (int i = 0; i < sav->sav_count; i++) { if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops && strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) { guid = sav->sav_vdevs[i]->vdev_guid; break; } } fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid); return (nv); } /* * Handle any flush requested of the distributed spare. All children must be * flushed. */ static int vdev_draid_spare_flush(zio_t *zio) { vdev_t *vd = zio->io_vd; int error = 0; for (int c = 0; c < vd->vdev_children; c++) { zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[c], zio->io_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_draid_spare_child_done, zio)); } return (error); } /* * Initiate an IO to the distributed spare. For normal IOs this entails using * the zio->io_offset and permutation table to calculate which child dRAID vdev * is responsible for the data. Then passing along the zio to that child to * perform the actual IO. The label ranges are not stored on disk and require * some special handling which is described below. */ static void vdev_draid_spare_io_start(zio_t *zio) { vdev_t *cvd = NULL, *vd = zio->io_vd; vdev_draid_spare_t *vds = vd->vdev_tsd; uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. * Nothing to be done here but return failure. */ if (vds == NULL) { zio->io_error = ENXIO; zio_interrupt(zio); return; } switch (zio->io_type) { case ZIO_TYPE_FLUSH: zio->io_error = vdev_draid_spare_flush(zio); break; case ZIO_TYPE_WRITE: if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { /* * Accept probe IOs and config writers to simulate the * existence of an on disk label. vdev_label_sync(), * vdev_uberblock_sync() and vdev_copy_uberblocks() * skip the distributed spares. This only leaves * vdev_label_init() which is allowed to succeed to * avoid adding special cases the function. */ if (zio->io_flags & ZIO_FLAG_PROBE || zio->io_flags & ZIO_FLAG_CONFIG_WRITER) { zio->io_error = 0; } else { zio->io_error = SET_ERROR(EIO); } } else { cvd = vdev_draid_spare_get_child(vd, offset); if (cvd == NULL) { zio->io_error = SET_ERROR(ENXIO); } else { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_draid_spare_child_done, zio)); } } break; case ZIO_TYPE_READ: if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { /* * Accept probe IOs to simulate the existence of a * label. vdev_label_read_config() bypasses the * pipeline to read the label configuration and * vdev_uberblock_load() skips distributed spares * when attempting to locate the best uberblock. */ if (zio->io_flags & ZIO_FLAG_PROBE) { zio->io_error = 0; } else { zio->io_error = SET_ERROR(EIO); } } else { cvd = vdev_draid_spare_get_child(vd, offset); if (cvd == NULL || !vdev_readable(cvd)) { zio->io_error = SET_ERROR(ENXIO); } else { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_draid_spare_child_done, zio)); } } break; case ZIO_TYPE_TRIM: /* The vdev label ranges are never trimmed */ ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)); cvd = vdev_draid_spare_get_child(vd, offset); if (cvd == NULL || !cvd->vdev_has_trim) { zio->io_error = SET_ERROR(ENXIO); } else { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_draid_spare_child_done, zio)); } break; default: zio->io_error = SET_ERROR(ENOTSUP); break; } zio_execute(zio); } static void vdev_draid_spare_io_done(zio_t *zio) { (void) zio; } /* * Lookup the full spare config in spa->spa_spares.sav_config and * return the top_guid and spare_id for the named spare. */ static int vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp, uint64_t *spare_idp) { nvlist_t **spares; uint_t nspares; int error; if ((spa->spa_spares.sav_config == NULL) || (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)) { return (SET_ERROR(ENOENT)); } const char *spare_name; error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name); if (error != 0) return (SET_ERROR(EINVAL)); for (int i = 0; i < nspares; i++) { nvlist_t *spare = spares[i]; uint64_t top_guid, spare_id; const char *type, *path; /* Skip non-distributed spares */ error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type); if (error != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0) continue; /* Skip spares with the wrong name */ error = nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &path); if (error != 0 || strcmp(path, spare_name) != 0) continue; /* Found the matching spare */ error = nvlist_lookup_uint64(spare, ZPOOL_CONFIG_TOP_GUID, &top_guid); if (error == 0) { error = nvlist_lookup_uint64(spare, ZPOOL_CONFIG_SPARE_ID, &spare_id); } if (error != 0) { return (SET_ERROR(EINVAL)); } else { *top_guidp = top_guid; *spare_idp = spare_id; return (0); } } return (SET_ERROR(ENOENT)); } /* * Initialize private dRAID spare specific fields from the nvlist. */ static int vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd) { vdev_draid_spare_t *vds; uint64_t top_guid = 0; uint64_t spare_id; /* * In the normal case check the list of spares stored in the spa * to lookup the top_guid and spare_id for provided spare config. * When creating a new pool or adding vdevs the spare list is not * yet populated and the values are provided in the passed config. */ if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID, &spare_id) != 0) return (SET_ERROR(EINVAL)); } vds = kmem_alloc(sizeof (vdev_draid_spare_t), KM_SLEEP); vds->vds_draid_vdev = NULL; vds->vds_top_guid = top_guid; vds->vds_spare_id = spare_id; *tsd = vds; return (0); } static void vdev_draid_spare_fini(vdev_t *vd) { kmem_free(vd->vdev_tsd, sizeof (vdev_draid_spare_t)); } static void vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv) { vdev_draid_spare_t *vds = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vds->vds_top_guid); fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, vds->vds_spare_id); } vdev_ops_t vdev_draid_spare_ops = { .vdev_op_init = vdev_draid_spare_init, .vdev_op_fini = vdev_draid_spare_fini, .vdev_op_open = vdev_draid_spare_open, .vdev_op_close = vdev_draid_spare_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_draid_spare_io_start, .vdev_op_io_done = vdev_draid_spare_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = vdev_draid_spare_config_generate, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DRAID_SPARE, .vdev_op_leaf = B_TRUE, }; diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index a2cb6f9b9ef9..f457669bc809 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -1,371 +1,372 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2025, Klara, Inc. */ #include #include #include #include #include #include #include #include #include /* * Virtual device vector for files. */ static taskq_t *vdev_file_taskq; /* * By default, the logical/physical ashift for file vdevs is set to * SPA_MINBLOCKSHIFT (9). This allows all file vdevs to use 512B (1 << 9) * blocksizes. Users may opt to change one or both of these for testing * or performance reasons. Care should be taken as these values will * impact the vdev_ashift setting which can only be set at vdev creation * time. */ static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; void vdev_file_init(void) { vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16), minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC); VERIFY(vdev_file_taskq); } void vdev_file_fini(void) { taskq_destroy(vdev_file_taskq); } static void vdev_file_hold(vdev_t *vd) { ASSERT3P(vd->vdev_path, !=, NULL); } static void vdev_file_rele(vdev_t *vd) { ASSERT3P(vd->vdev_path, !=, NULL); } static mode_t vdev_file_open_mode(spa_mode_t spa_mode) { mode_t mode = 0; if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { mode = O_RDWR; } else if (spa_mode & SPA_MODE_READ) { mode = O_RDONLY; } else if (spa_mode & SPA_MODE_WRITE) { mode = O_WRONLY; } return (mode | O_LARGEFILE); } static int vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_file_t *vf; zfs_file_t *fp; zfs_file_attr_t zfa; int error; /* * Rotational optimizations only make sense on block devices. */ vd->vdev_nonrot = B_TRUE; /* * Allow TRIM on file based vdevs. This may not always be supported, * since it depends on your kernel version and underlying filesystem * type but it is always safe to attempt. */ vd->vdev_has_trim = B_TRUE; /* * Disable secure TRIM on file based vdevs. There is no way to * request this behavior from the underlying filesystem. */ vd->vdev_has_securetrim = B_FALSE; /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if (vd->vdev_tsd != NULL) { ASSERT(vd->vdev_reopening); vf = vd->vdev_tsd; goto skip_open; } vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); /* * We always open the files from the root of the global zone, even if * we're in a local zone. If the user has gotten to this point, the * administrator has already decided that the pool should be available * to local zone users, so the underlying devices should be as well. */ ASSERT3P(vd->vdev_path, !=, NULL); ASSERT3S(vd->vdev_path[0], ==, '/'); error = zfs_file_open(vd->vdev_path, vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } vf->vf_file = fp; #ifdef _KERNEL /* * Make sure it's a regular file. */ if (zfs_file_getattr(fp, &zfa)) { return (SET_ERROR(ENODEV)); } if (!S_ISREG(zfa.zfa_mode)) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (SET_ERROR(ENODEV)); } #endif skip_open: error = zfs_file_getattr(vf->vf_file, &zfa); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } *max_psize = *psize = zfa.zfa_size; *logical_ashift = vdev_file_logical_ashift; *physical_ashift = vdev_file_physical_ashift; return (0); } static void vdev_file_close(vdev_t *vd) { vdev_file_t *vf = vd->vdev_tsd; if (vd->vdev_reopening || vf == NULL) return; if (vf->vf_file != NULL) { (void) zfs_file_close(vf->vf_file); } vd->vdev_delayed_close = B_FALSE; kmem_free(vf, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; } static void vdev_file_io_strategy(void *arg) { zio_t *zio = (zio_t *)arg; vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; void *buf; ssize_t resid; loff_t off; ssize_t size; int err; off = zio->io_offset; size = zio->io_size; resid = 0; ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); if (zio->io_type == ZIO_TYPE_READ) { buf = abd_borrow_buf(zio->io_abd, zio->io_size); err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); abd_return_buf_copy(zio->io_abd, buf, size); } else { buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); abd_return_buf(zio->io_abd, buf, size); } zio->io_error = err; if (resid != 0 && zio->io_error == 0) zio->io_error = SET_ERROR(ENOSPC); zio_delay_interrupt(zio); } static void vdev_file_io_fsync(void *arg) { zio_t *zio = (zio_t *)arg; vdev_file_t *vf = zio->io_vd->vdev_tsd; zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC); zio_interrupt(zio); } static void vdev_file_io_deallocate(void *arg) { zio_t *zio = (zio_t *)arg; vdev_file_t *vf = zio->io_vd->vdev_tsd; zio->io_error = zfs_file_deallocate(vf->vf_file, zio->io_offset, zio->io_size); zio_interrupt(zio); } static void vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio->io_type == ZIO_TYPE_FLUSH) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } if (zfs_nocacheflush) { zio_interrupt(zio); return; } VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_fsync, zio, TQ_SLEEP), !=, TASKQID_INVALID); return; } if (zio->io_type == ZIO_TYPE_TRIM) { ASSERT3U(zio->io_size, !=, 0); VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_deallocate, zio, TQ_SLEEP), !=, TASKQID_INVALID); return; } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); zio->io_target_timestamp = zio_handle_io_delay(zio); VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, TQ_SLEEP), !=, TASKQID_INVALID); } static void vdev_file_io_done(zio_t *zio) { (void) zio; } vdev_ops_t vdev_file_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = vdev_file_hold, .vdev_op_rele = vdev_file_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; /* * From userland we access disks just like files. */ #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = vdev_file_hold, .vdev_op_rele = vdev_file_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; #endif ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW, "Logical ashift for file-based devices"); ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW, "Physical ashift for file-based devices"); diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 30d7340f7f4b..b58b87d1fcc7 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1,1924 +1,1925 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2014, 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * An indirect vdev corresponds to a vdev that has been removed. Since * we cannot rewrite block pointers of snapshots, etc., we keep a * mapping from old location on the removed device to the new location * on another device in the pool and use this mapping whenever we need * to access the DVA. Unfortunately, this mapping did not respect * logical block boundaries when it was first created, and so a DVA on * this indirect vdev may be "split" into multiple sections that each * map to a different location. As a consequence, not all DVAs can be * translated to an equivalent new DVA. Instead we must provide a * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * * - I/Os to this vdev use the callback to determine where the * data is now located, and issue child I/Os for each segment's new * location. * * - frees and claims to this vdev use the callback to free or claim * each mapped segment. (Note that we don't actually need to claim * log blocks on indirect vdevs, because we don't allocate to * removing vdevs. However, zdb uses zio_claim() for its leak * detection.) */ /* * "Big theory statement" for how we mark blocks obsolete. * * When a block on an indirect vdev is freed or remapped, a section of * that vdev's mapping may no longer be referenced (aka "obsolete"). We * keep track of how much of each mapping entry is obsolete. When * an entry becomes completely obsolete, we can remove it, thus reducing * the memory used by the mapping. The complete picture of obsolescence * is given by the following data structures, described below: * - the entry-specific obsolete count * - the vdev-specific obsolete spacemap * - the pool-specific obsolete bpobj * * == On disk data structures used == * * We track the obsolete space for the pool using several objects. Each * of these objects is created on demand and freed when no longer * needed, and is assumed to be empty if it does not exist. * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. * * - Each vic_mapping_object (associated with an indirect vdev) can * have a vimp_counts_object. This is an array of uint32_t's * with the same number of entries as the vic_mapping_object. When * the mapping is condensed, entries from the vic_obsolete_sm_object * (see below) are folded into the counts. Therefore, each * obsolete_counts entry tells us the number of bytes in the * corresponding mapping entry that were not referenced when the * mapping was last condensed. * * - Each indirect or removing vdev can have a vic_obsolete_sm_object. * This is a space map containing an alloc entry for every DVA that * has been obsoleted since the last time this indirect vdev was * condensed. We use this object in order to improve performance * when marking a DVA as obsolete. Instead of modifying an arbitrary * offset of the vimp_counts_object, we only need to append an entry * to the end of this object. When a DVA becomes obsolete, it is * added to the obsolete space map. This happens when the DVA is * freed, remapped and not referenced by a snapshot, or the last * snapshot referencing it is destroyed. * * - Each dataset can have a ds_remap_deadlist object. This is a * deadlist object containing all blocks that were remapped in this * dataset but referenced in a previous snapshot. Blocks can *only* * appear on this list if they were remapped (dsl_dataset_block_remapped); * blocks that were killed in a head dataset are put on the normal * ds_deadlist and marked obsolete when they are freed. * * - The pool can have a dp_obsolete_bpobj. This is a list of blocks * in the pool that need to be marked obsolete. When a snapshot is * destroyed, we move some of the ds_remap_deadlist to the obsolete * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then * asynchronously process the obsolete bpobj, moving its entries to * the specific vdevs' obsolete space maps. * * == Summary of how we mark blocks as obsolete == * * - When freeing a block: if any DVA is on an indirect vdev, append to * vic_obsolete_sm_object. * - When remapping a block, add dva to ds_remap_deadlist (if prev snap * references; otherwise append to vic_obsolete_sm_object). * - When freeing a snapshot: move parts of ds_remap_deadlist to * dp_obsolete_bpobj (same algorithm as ds_deadlist). * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to * individual vdev's vic_obsolete_sm_object. */ /* * "Big theory statement" for how we condense indirect vdevs. * * Condensing an indirect vdev's mapping is the process of determining * the precise counts of obsolete space for each mapping entry (by * integrating the obsolete spacemap into the obsolete counts) and * writing out a new mapping that contains only referenced entries. * * We condense a vdev when we expect the mapping to shrink (see * vdev_indirect_should_condense()), but only perform one condense at a * time to limit the memory usage. In addition, we use a separate * open-context thread (spa_condense_indirect_thread) to incrementally * create the new mapping object in a way that minimizes the impact on * the rest of the system. * * == Generating a new mapping == * * To generate a new mapping, we follow these steps: * * 1. Save the old obsolete space map and create a new mapping object * (see spa_condense_indirect_start_sync()). This initializes the * spa_condensing_indirect_phys with the "previous obsolete space map", * which is now read only. Newly obsolete DVAs will be added to a * new (initially empty) obsolete space map, and will not be * considered as part of this condense operation. * * 2. Construct in memory the precise counts of obsolete space for each * mapping entry, by incorporating the obsolete space map into the * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) * * 3. Iterate through each mapping entry, writing to the new mapping any * entries that are not completely obsolete (i.e. which don't have * obsolete count == mapping length). (See * spa_condense_indirect_generate_new_mapping().) * * 4. Destroy the old mapping object and switch over to the new one * (spa_condense_indirect_complete_sync). * * == Restarting from failure == * * To restart the condense when we import/open the pool, we must start * at the 2nd step above: reconstruct the precise counts in memory, * based on the space map + counts. Then in the 3rd step, we start * iterating where we left off: at vimp_max_offset of the new mapping * object. */ static int zfs_condense_indirect_vdevs_enable = B_TRUE; /* * Condense if at least this percent of the bytes in the mapping is * obsolete. With the default of 25%, the amount of space mapped * will be reduced to 1% of its original size after at most 16 * condenses. Higher values will condense less often (causing less * i/o); lower values will reduce the mapping size more quickly. */ static uint_t zfs_condense_indirect_obsolete_pct = 25; /* * Condense if the obsolete space map takes up more than this amount of * space on disk (logically). This limits the amount of disk space * consumed by the obsolete space map; the default of 1GB is small enough * that we typically don't mind "wasting" it. */ static uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; /* * Don't bother condensing if the mapping uses less than this amount of * memory. The default of 128KB is considered a "trivial" amount of * memory and not worth reducing. */ static uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; /* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a condense (which might otherwise * complete too quickly). If used to reduce the performance impact of * condensing in production, a maximum value of 1 should be sufficient. */ static uint_t zfs_condense_indirect_commit_entry_delay_ms = 0; /* * If an indirect split block contains more than this many possible unique * combinations when being reconstructed, consider it too computationally * expensive to check them all. Instead, try at most 100 randomly-selected * combinations each time the block is accessed. This allows all segment * copies to participate fairly in the reconstruction when all combinations * cannot be checked and prevents repeated use of one bad copy. */ uint_t zfs_reconstruct_indirect_combinations_max = 4096; /* * Enable to simulate damaged segments and validate reconstruction. This * is intentionally not exposed as a module parameter. */ unsigned long zfs_reconstruct_indirect_damage_fraction = 0; /* * The indirect_child_t represents the vdev that we will read from, when we * need to read all copies of the data (e.g. for scrub or reconstruction). * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, * ic_vdev is a child of the mirror. */ typedef struct indirect_child { abd_t *ic_data; vdev_t *ic_vdev; /* * ic_duplicate is NULL when the ic_data contents are unique, when it * is determined to be a duplicate it references the primary child. */ struct indirect_child *ic_duplicate; list_node_t ic_node; /* node on is_unique_child */ int ic_error; /* set when a child does not contain the data */ } indirect_child_t; /* * The indirect_split_t represents one mapped segment of an i/o to the * indirect vdev. For non-split (contiguously-mapped) blocks, there will be * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. * For split blocks, there will be several of these. */ typedef struct indirect_split { list_node_t is_node; /* link on iv_splits */ /* * is_split_offset is the offset into the i/o. * This is the sum of the previous splits' is_size's. */ uint64_t is_split_offset; vdev_t *is_vdev; /* top-level vdev */ uint64_t is_target_offset; /* offset on is_vdev */ uint64_t is_size; int is_children; /* number of entries in is_child[] */ int is_unique_children; /* number of entries in is_unique_child */ list_t is_unique_child; /* * is_good_child is the child that we are currently using to * attempt reconstruction. */ indirect_child_t *is_good_child; indirect_child_t is_child[]; } indirect_split_t; /* * The indirect_vsd_t is associated with each i/o to the indirect vdev. * It is the "Vdev-Specific Data" in the zio_t's io_vsd. */ typedef struct indirect_vsd { boolean_t iv_split_block; boolean_t iv_reconstruct; uint64_t iv_unique_combinations; uint64_t iv_attempts; uint64_t iv_attempts_max; list_t iv_splits; /* list of indirect_split_t's */ } indirect_vsd_t; static void vdev_indirect_map_free(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; while ((is = list_remove_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } indirect_child_t *ic; while ((ic = list_remove_head(&is->is_unique_child)) != NULL) ; list_destroy(&is->is_unique_child); kmem_free(is, offsetof(indirect_split_t, is_child[is->is_children])); } kmem_free(iv, sizeof (*iv)); } static const zio_vsd_ops_t vdev_indirect_vsd_ops = { .vsd_free = vdev_indirect_map_free, }; /* * Mark the given offset and size as being obsolete. */ void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(size > 0); VERIFY(vdev_indirect_mapping_entry_for_offset( vd->vdev_indirect_mapping, offset) != NULL); if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { mutex_enter(&vd->vdev_obsolete_lock); zfs_range_tree_add(vd->vdev_obsolete_segments, offset, size); mutex_exit(&vd->vdev_obsolete_lock); vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); } } /* * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This * wrapper is provided because the DMU does not know about vdev_t's and * cannot directly call vdev_indirect_mark_obsolete. */ void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, uint64_t size, dmu_tx_t *tx) { vdev_t *vd = vdev_lookup_top(spa, vdev_id); ASSERT(dmu_tx_is_syncing(tx)); /* The DMU can only remap indirect vdevs. */ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); vdev_indirect_mark_obsolete(vd, offset, size); } static spa_condensing_indirect_t * spa_condensing_indirect_create(spa_t *spa) { spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); objset_t *mos = spa->spa_meta_objset; for (int i = 0; i < TXG_SIZE; i++) { list_create(&sci->sci_new_mapping_entries[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); } sci->sci_new_mapping = vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); return (sci); } static void spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) { for (int i = 0; i < TXG_SIZE; i++) list_destroy(&sci->sci_new_mapping_entries[i]); if (sci->sci_new_mapping != NULL) vdev_indirect_mapping_close(sci->sci_new_mapping); kmem_free(sci, sizeof (*sci)); } boolean_t vdev_indirect_should_condense(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); if (!zfs_condense_indirect_vdevs_enable) return (B_FALSE); /* * We can only condense one indirect vdev at a time. */ if (spa->spa_condensing_indirect != NULL) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); /* * The mapping object size must not change while we are * condensing, so we can only condense indirect vdevs * (not vdevs that are still in the middle of being removed). */ if (vd->vdev_ops != &vdev_indirect_ops) return (B_FALSE); /* * If nothing new has been marked obsolete, there is no * point in condensing. */ uint64_t obsolete_sm_obj __maybe_unused; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); if (vd->vdev_obsolete_sm == NULL) { ASSERT0(obsolete_sm_obj); return (B_FALSE); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm)); uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); uint64_t mapping_size = vdev_indirect_mapping_size(vim); uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); ASSERT3U(bytes_obsolete, <=, bytes_mapped); /* * If a high percentage of the bytes that are mapped have become * obsolete, condense (unless the mapping is already small enough). * This has a good chance of reducing the amount of memory used * by the mapping. */ if (bytes_obsolete * 100 / bytes_mapped >= zfs_condense_indirect_obsolete_pct && mapping_size > zfs_condense_min_mapping_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete " "spacemap covers %d%% of %lluMB mapping", (u_longlong_t)vd->vdev_id, (int)(bytes_obsolete * 100 / bytes_mapped), (u_longlong_t)bytes_mapped / 1024 / 1024); return (B_TRUE); } /* * If the obsolete space map takes up too much space on disk, * condense in order to free up this disk space. */ if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete sm " "length %lluMB >= max size %lluMB", (u_longlong_t)vd->vdev_id, (u_longlong_t)obsolete_sm_size / 1024 / 1024, (u_longlong_t)zfs_condense_max_obsolete_bytes / 1024 / 1024); return (B_TRUE); } return (B_FALSE); } /* * This sync task completes (finishes) a condense, deleting the old * mapping and replacing it with the new one. */ static void spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_meta_objset; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); uint64_t new_count = vdev_indirect_mapping_num_entries(sci->sci_new_mapping); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3P(sci, ==, spa->spa_condensing_indirect); for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } ASSERT(vic->vic_mapping_object != 0); ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); /* * Reset vdev_indirect_mapping to refer to the new object. */ rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = sci->sci_new_mapping; rw_exit(&vd->vdev_indirect_rwlock); sci->sci_new_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = scip->scip_next_mapping_object; scip->scip_next_mapping_object = 0; space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); scip->scip_prev_obsolete_sm_object = 0; scip->scip_vdev = 0; VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, tx)); spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " "new mapping object %llu has %llu entries " "(was %llu entries)", (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx), (u_longlong_t)vic->vic_mapping_object, (u_longlong_t)new_count, (u_longlong_t)old_count); vdev_config_dirty(spa->spa_root_vdev); } /* * This sync task appends entries to the new mapping object. */ static void spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa; ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(sci, ==, spa->spa_condensing_indirect); vdev_indirect_mapping_add_entries(sci->sci_new_mapping, &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); } /* * Open-context function to add one entry to the new mapping. The new * entry will be remembered and written from syncing context. */ static void spa_condense_indirect_commit_entry(spa_t *spa, vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) { spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; /* * If we are the first entry committed this txg, kick off the sync * task to write to the MOS on our behalf. */ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { dsl_sync_task_nowait(dmu_tx_pool(tx), spa_condense_indirect_commit_sync, sci, tx); } vdev_indirect_mapping_entry_t *vime = kmem_alloc(sizeof (*vime), KM_SLEEP); vime->vime_mapping = *vimep; vime->vime_obsolete_count = count; list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); dmu_tx_commit(tx); } static void spa_condense_indirect_generate_new_mapping(vdev_t *vd, uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr) { spa_t *spa = vd->vdev_spa; uint64_t mapi = start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_num_entries = vdev_indirect_mapping_num_entries(old_mapping); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); zfs_dbgmsg("starting condense of vdev %llu from index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); while (mapi < old_num_entries) { if (zthr_iscancelled(zthr)) { zfs_dbgmsg("pausing condense of vdev %llu " "at index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); break; } vdev_indirect_mapping_entry_phys_t *entry = &old_mapping->vim_entries[mapi]; uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); ASSERT3U(obsolete_counts[mapi], <=, entry_size); if (obsolete_counts[mapi] < entry_size) { spa_condense_indirect_commit_entry(spa, entry, obsolete_counts[mapi]); /* * This delay may be requested for testing, debugging, * or performance reasons. */ hrtime_t now = gethrtime(); hrtime_t sleep_until = now + MSEC2NSEC( zfs_condense_indirect_commit_entry_delay_ms); zfs_sleep_until(sleep_until); } mapi++; } } static boolean_t spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) { (void) zthr; spa_t *spa = arg; return (spa->spa_condensing_indirect != NULL); } static void spa_condense_indirect_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *vd; ASSERT3P(spa->spa_condensing_indirect, !=, NULL); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint32_t *counts; uint64_t start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; space_map_t *prev_obsolete_sm = NULL; ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); for (int i = 0; i < TXG_SIZE; i++) { /* * The list must start out empty in order for the * _commit_sync() sync task to be properly registered * on the first call to _commit_entry(); so it's wise * to double check and ensure we actually are starting * with empty lists. */ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); if (prev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, counts, prev_obsolete_sm); } space_map_close(prev_obsolete_sm); /* * Generate new mapping. Determine what index to continue from * based on the max offset that we've already written in the * new mapping. */ uint64_t max_offset = vdev_indirect_mapping_max_offset(sci->sci_new_mapping); if (max_offset == 0) { /* We haven't written anything to the new mapping yet. */ start_index = 0; } else { /* * Pick up from where we left off. _entry_for_offset() * returns a pointer into the vim_entries array. If * max_offset is greater than any of the mappings * contained in the table NULL will be returned and * that indicates we've exhausted our iteration of the * old_mapping. */ vdev_indirect_mapping_entry_phys_t *entry = vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, max_offset); if (entry == NULL) { /* * We've already written the whole new mapping. * This special value will cause us to skip the * generate_new_mapping step and just do the sync * task to complete the condense. */ start_index = UINT64_MAX; } else { start_index = entry - old_mapping->vim_entries; ASSERT3U(start_index, <, vdev_indirect_mapping_num_entries(old_mapping)); } } spa_condense_indirect_generate_new_mapping(vd, counts, start_index, zthr); vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); /* * If the zthr has received a cancellation signal while running * in generate_new_mapping() or at any point after that, then bail * early. We don't want to complete the condense if the spa is * shutting down. */ if (zthr_iscancelled(zthr)) return; VERIFY0(dsl_sync_task(spa_name(spa), NULL, spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* * Sync task to begin the condensing process. */ void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; ASSERT0(scip->scip_next_mapping_object); ASSERT0(scip->scip_prev_obsolete_sm_object); ASSERT0(scip->scip_vdev); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); uint64_t obsolete_sm_obj; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); ASSERT3U(obsolete_sm_obj, !=, 0); scip->scip_vdev = vd->vdev_id; scip->scip_next_mapping_object = vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; /* * We don't need to allocate a new space map object, since * vdev_indirect_sync_obsolete will allocate one when needed. */ space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (*scip) / sizeof (uint64_t), scip, tx)); ASSERT3P(spa->spa_condensing_indirect, ==, NULL); spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " "posm=%llu nm=%llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx), (u_longlong_t)scip->scip_prev_obsolete_sm_object, (u_longlong_t)scip->scip_next_mapping_object); zthr_wakeup(spa->spa_condense_zthr); } /* * Sync to the given vdev's obsolete space map any segments that are no longer * referenced as of the given txg. * * If the obsolete space map doesn't exist yet, create and open it. */ void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; ASSERT3U(vic->vic_mapping_object, !=, 0); ASSERT(zfs_range_tree_space(vd->vdev_obsolete_segments) > 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object == 0) { obsolete_sm_object = space_map_alloc(spa->spa_meta_objset, zfs_vdev_standard_sm_blksz, tx); ASSERT(vd->vdev_top_zap != 0); VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); ASSERT3U(obsolete_sm_object, !=, 0); spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(space_map_open(&vd->vdev_obsolete_sm, spa->spa_meta_objset, obsolete_sm_object, 0, vd->vdev_asize, 0)); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); space_map_write(vd->vdev_obsolete_sm, vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); zfs_range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } int spa_condense_init(spa_t *spa) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), &spa->spa_condensing_indirect_phys); if (error == 0) { if (spa_writeable(spa)) { spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); } return (0); } else if (error == ENOENT) { return (0); } else { return (error); } } void spa_condense_fini(spa_t *spa) { if (spa->spa_condensing_indirect != NULL) { spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; } } void spa_start_indirect_condensing_thread(spa_t *spa) { ASSERT3P(spa->spa_condense_zthr, ==, NULL); spa->spa_condense_zthr = zthr_create("z_indirect_condense", spa_condense_indirect_thread_check, spa_condense_indirect_thread, spa, minclsyspri); } /* * Gets the obsolete spacemap object from the vdev's ZAP. On success sm_obj * will contain either the obsolete spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } /* * Gets the obsolete count are precise spacemap object from the vdev's ZAP. * On success are_precise will be set to reflect if the counts are precise. * All other errors are returned to the caller. */ int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *are_precise = B_FALSE; return (0); } uint64_t val = 0; int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); if (error == 0) { *are_precise = (val != 0); } else if (error == ENOENT) { *are_precise = B_FALSE; error = 0; } return (error); } static void vdev_indirect_close(vdev_t *vd) { (void) vd; } static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { *psize = *max_psize = vd->vdev_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; *logical_ashift = vd->vdev_ashift; *physical_ashift = vd->vdev_physical_ashift; return (0); } typedef struct remap_segment { vdev_t *rs_vd; uint64_t rs_offset; uint64_t rs_asize; uint64_t rs_split_offset; list_node_t rs_node; } remap_segment_t; static remap_segment_t * rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) { remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); rs->rs_vd = vd; rs->rs_offset = offset; rs->rs_asize = asize; rs->rs_split_offset = split_offset; return (rs); } /* * Given an indirect vdev and an extent on that vdev, it duplicates the * physical entries of the indirect mapping that correspond to the extent * to a new array and returns a pointer to it. In addition, copied_entries * is populated with the number of mapping entries that were duplicated. * * Note that the function assumes that the caller holds vdev_indirect_rwlock. * This ensures that the mapping won't change due to condensing as we * copy over its contents. * * Finally, since we are doing an allocation, it is up to the caller to * free the array allocated in this function. */ static vdev_indirect_mapping_entry_phys_t * vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t *copied_entries) { vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t entries = 0; ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); vdev_indirect_mapping_entry_phys_t *first_mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); ASSERT3P(first_mapping, !=, NULL); vdev_indirect_mapping_entry_phys_t *m = first_mapping; while (asize > 0) { uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(asize, size - inner_offset); offset += inner_size; asize -= inner_size; entries++; m++; } size_t copy_length = entries * sizeof (*first_mapping); duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); memcpy(duplicate_mappings, first_mapping, copy_length); *copied_entries = entries; return (duplicate_mappings); } /* * Goes through the relevant indirect mappings until it hits a concrete vdev * and issues the callback. On the way to the concrete vdev, if any other * indirect vdevs are encountered, then the callback will also be called on * each of those indirect vdevs. For example, if the segment is mapped to * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is * mapped to segment B on concrete vdev 2, then the callback will be called on * both vdev 1 and vdev 2. * * While the callback passed to vdev_indirect_remap() is called on every vdev * the function encounters, certain callbacks only care about concrete vdevs. * These types of callbacks should return immediately and explicitly when they * are called on an indirect vdev. * * Because there is a possibility that a DVA section in the indirect device * has been split into multiple sections in our mapping, we keep track * of the relevant contiguous segments of the new location (remap_segment_t) * in a stack. This way we can call the callback for each of the new sections * created by a single section of the indirect device. Note though, that in * this scenario the callbacks in each split block won't occur in-order in * terms of offset, so callers should not make any assumptions about that. * * For callbacks that don't handle split blocks and immediately return when * they encounter them (as is the case for remap_blkptr_cb), the caller can * assume that its callback will be applied from the first indirect vdev * encountered to the last one and then the concrete vdev, in that order. */ static void vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) { list_t stack; spa_t *spa = vd->vdev_spa; list_create(&stack, sizeof (remap_segment_t), offsetof(remap_segment_t, rs_node)); for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); rs != NULL; rs = list_remove_head(&stack)) { vdev_t *v = rs->rs_vd; uint64_t num_entries = 0; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); ASSERT(rs->rs_asize > 0); /* * Note: As this function can be called from open context * (e.g. zio_read()), we need the following rwlock to * prevent the mapping from being changed by condensing. * * So we grab the lock and we make a copy of the entries * that are relevant to the extent that we are working on. * Once that is done, we drop the lock and iterate over * our copy of the mapping. Once we are done with the with * the remap segment and we free it, we also free our copy * of the indirect mapping entries that are relevant to it. * * This way we don't need to wait until the function is * finished with a segment, to condense it. In addition, we * don't need a recursive rwlock for the case that a call to * vdev_indirect_remap() needs to call itself (through the * codepath of its callback) for the same vdev in the middle * of its execution. */ rw_enter(&v->vdev_indirect_rwlock, RW_READER); ASSERT3P(v->vdev_indirect_mapping, !=, NULL); vdev_indirect_mapping_entry_phys_t *mapping = vdev_indirect_mapping_duplicate_adjacent_entries(v, rs->rs_offset, rs->rs_asize, &num_entries); ASSERT3P(mapping, !=, NULL); ASSERT3U(num_entries, >, 0); rw_exit(&v->vdev_indirect_rwlock); for (uint64_t i = 0; i < num_entries; i++) { /* * Note: the vdev_indirect_mapping can not change * while we are running. It only changes while the * removal is in progress, and then only from syncing * context. While a removal is in progress, this * function is only called for frees, which also only * happen from syncing context. */ vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; ASSERT3P(m, !=, NULL); ASSERT3U(rs->rs_asize, >, 0); uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); ASSERT3U(rs->rs_offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(rs->rs_offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); ASSERT3U(dst_vdev, !=, v->vdev_id); uint64_t inner_offset = rs->rs_offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(rs->rs_asize, size - inner_offset); vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); ASSERT3P(dst_v, !=, NULL); if (dst_v->vdev_ops == &vdev_indirect_ops) { list_insert_head(&stack, rs_alloc(dst_v, dst_offset + inner_offset, inner_size, rs->rs_split_offset)); } if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { /* * Note: This clause exists only solely for * testing purposes. We use it to ensure that * split blocks work and that the callbacks * using them yield the same result if issued * in reverse order. */ uint64_t inner_half = inner_size / 2; func(rs->rs_split_offset + inner_half, dst_v, dst_offset + inner_offset + inner_half, inner_half, arg); func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_half, arg); } else { func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_size, arg); } rs->rs_offset += inner_size; rs->rs_asize -= inner_size; rs->rs_split_offset += inner_size; } VERIFY0(rs->rs_asize); kmem_free(mapping, num_entries * sizeof (*mapping)); kmem_free(rs, sizeof (remap_segment_t)); } list_destroy(&stack); } static void vdev_indirect_child_io_done(zio_t *zio) { zio_t *pio = zio->io_private; mutex_enter(&pio->io_lock); pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); abd_free(zio->io_abd); } /* * This is a callback for vdev_indirect_remap() which allocates an * indirect_split_t for each split segment and adds it to iv_splits. */ static void vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { zio_t *zio = arg; indirect_vsd_t *iv = zio->io_vsd; ASSERT3P(vd, !=, NULL); if (vd->vdev_ops == &vdev_indirect_ops) return; int n = 1; if (vd->vdev_ops == &vdev_mirror_ops) n = vd->vdev_children; indirect_split_t *is = kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); is->is_children = n; is->is_size = size; is->is_split_offset = split_offset; is->is_target_offset = offset; is->is_vdev = vd; list_create(&is->is_unique_child, sizeof (indirect_child_t), offsetof(indirect_child_t, ic_node)); /* * Note that we only consider multiple copies of the data for * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even * though they use the same ops as mirror, because there's only one * "good" copy under the replacing/spare. */ if (vd->vdev_ops == &vdev_mirror_ops) { for (int i = 0; i < n; i++) { is->is_child[i].ic_vdev = vd->vdev_child[i]; list_link_init(&is->is_child[i].ic_node); } } else { is->is_child[0].ic_vdev = vd; } list_insert_tail(&iv->iv_splits, is); } static void vdev_indirect_read_split_done(zio_t *zio) { indirect_child_t *ic = zio->io_private; if (zio->io_error != 0) { /* * Clear ic_data to indicate that we do not have data for this * child. */ abd_free(ic->ic_data); ic->ic_data = NULL; } } /* * Issue reads for all copies (mirror children) of all splits. */ static void vdev_indirect_read_all(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (!vdev_readable(ic->ic_vdev)) continue; /* * If a child is missing the data, set ic_error. Used * in vdev_indirect_repair(). We perform the read * nevertheless which provides the opportunity to * reconstruct the split block if at all possible. */ if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING, zio->io_txg, 1)) ic->ic_error = SET_ERROR(ESTALE); ic->ic_data = abd_alloc_sametype(zio->io_abd, is->is_size); ic->ic_duplicate = NULL; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, ic->ic_data, is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_read_split_done, ic)); } } iv->iv_reconstruct = B_TRUE; } static void vdev_indirect_io_start(zio_t *zio) { spa_t *spa __maybe_unused = zio->io_spa; indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); list_create(&iv->iv_splits, sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); zio->io_vsd = iv; zio->io_vsd_ops = &vdev_indirect_vsd_ops; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (zio->io_type != ZIO_TYPE_READ) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); /* * Note: this code can handle other kinds of writes, * but we don't expect them. */ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); } vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, vdev_indirect_gather_splits, zio); indirect_split_t *first = list_head(&iv->iv_splits); ASSERT3P(first, !=, NULL); if (first->is_size == zio->io_size) { /* * This is not a split block; we are pointing to the entire * data, which will checksum the same as the original data. * Pass the BP down so that the child i/o can verify the * checksum, and try a different location if available * (e.g. on a mirror). * * While this special case could be handled the same as the * general (split block) case, doing it this way ensures * that the vast majority of blocks on indirect vdevs * (which are not split) are handled identically to blocks * on non-indirect vdevs. This allows us to be less strict * about performance in the general (but rare) case. */ ASSERT0(first->is_split_offset); ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, first->is_vdev, first->is_target_offset, abd_get_offset(zio->io_abd, 0), zio->io_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } else { iv->iv_split_block = B_TRUE; if (zio->io_type == ZIO_TYPE_READ && zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { /* * Read all copies. Note that for simplicity, * we don't bother consulting the DTL in the * resilver case. */ vdev_indirect_read_all(zio); } else { /* * If this is a read zio, we read one copy of each * split segment, from the top-level vdev. Since * we don't know the checksum of each split * individually, the child zio can't ensure that * we get the right data. E.g. if it's a mirror, * it will just read from a random (healthy) leaf * vdev. We have to verify the checksum in * vdev_indirect_io_done(). * * For write zios, the vdev code will ensure we write * to all children. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { zio_nowait(zio_vdev_child_io(zio, NULL, is->is_vdev, is->is_target_offset, abd_get_offset_size(zio->io_abd, is->is_split_offset, is->is_size), is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } } } zio_execute(zio); } /* * Report a checksum error for a child. */ static void vdev_indirect_checksum_error(zio_t *zio, indirect_split_t *is, indirect_child_t *ic) { vdev_t *vd = ic->ic_vdev; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); zio_bad_cksum_t zbc = { 0 }; abd_t *bad_abd = ic->ic_data; abd_t *good_abd = is->is_good_child->ic_data; (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc); } /* * Issue repair i/os for any incorrect copies. We do this by comparing * each split segment's correct data (is_good_child's ic_data) with each * other copy of the data. If they differ, then we overwrite the bad data * with the good copy. The DTL is checked in vdev_indirect_read_all() and * if a vdev is missing a copy of the data we set ic_error and the read is * performed. This provides the opportunity to reconstruct the split block * if at all possible. ic_error is checked here and if set it suppresses * incrementing the checksum counter. Aside from this DTLs are not checked, * which simplifies this code and also issues the optimal number of writes * (based on which copies actually read bad data, as opposed to which we * think might be wrong). For the same reason, we always use * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). */ static void vdev_indirect_repair(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (!spa_writeable(zio->io_spa)) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; if (ic->ic_duplicate == is->is_good_child) continue; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, is->is_good_child->ic_data, is->is_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, NULL, NULL)); /* * If ic_error is set the current child does not have * a copy of the data, so suppress incrementing the * checksum counter. */ if (ic->ic_error == ESTALE) continue; vdev_indirect_checksum_error(zio, is, ic); } } } /* * Report checksum errors on all children that we read from. */ static void vdev_indirect_all_checksum_errors(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data == NULL) continue; vdev_t *vd = ic->ic_vdev; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, NULL, NULL, NULL); } } } /* * Copy data from all the splits to a main zio then validate the checksum. * If then checksum is successfully validated return success. */ static int vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio) { zio_bad_cksum_t zbc; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { ASSERT3P(is->is_good_child->ic_data, !=, NULL); ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); abd_copy_off(zio->io_abd, is->is_good_child->ic_data, is->is_split_offset, 0, is->is_size); } return (zio_checksum_error(zio, &zbc)); } /* * There are relatively few possible combinations making it feasible to * deterministically check them all. We do this by setting the good_child * to the next unique split version. If we reach the end of the list then * "carry over" to the next unique split version (like counting in base * is_unique_children, but each digit can have a different base). */ static int vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio) { boolean_t more = B_TRUE; iv->iv_attempts = 0; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) is->is_good_child = list_head(&is->is_unique_child); while (more == B_TRUE) { iv->iv_attempts++; more = B_FALSE; if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_good_child = list_next(&is->is_unique_child, is->is_good_child); if (is->is_good_child != NULL) { more = B_TRUE; break; } is->is_good_child = list_head(&is->is_unique_child); } } ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations); return (SET_ERROR(ECKSUM)); } /* * There are too many combinations to try all of them in a reasonable amount * of time. So try a fixed number of random combinations from the unique * split versions, after which we'll consider the block unrecoverable. */ static int vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio) { iv->iv_attempts = 0; while (iv->iv_attempts < iv->iv_attempts_max) { iv->iv_attempts++; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic = list_head(&is->is_unique_child); int children = is->is_unique_children; for (int i = random_in_range(children); i > 0; i--) ic = list_next(&is->is_unique_child, ic); ASSERT3P(ic, !=, NULL); is->is_good_child = ic; } if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); } return (SET_ERROR(ECKSUM)); } /* * This is a validation function for reconstruction. It randomly selects * a good combination, if one can be found, and then it intentionally * damages all other segment copes by zeroing them. This forces the * reconstruction algorithm to locate the one remaining known good copy. */ static int vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) { int error; /* Presume all the copies are unique for initial selection. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (ic->ic_data != NULL) { is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic); } } if (list_is_empty(&is->is_unique_child)) { error = SET_ERROR(EIO); goto out; } } /* * Set each is_good_child to a randomly-selected child which * is known to contain validated data. */ error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error) goto out; /* * Damage all but the known good copy by zeroing it. This will * result in two or less unique copies per indirect_child_t. * Both may need to be checked in order to reconstruct the block. * Set iv->iv_attempts_max such that all unique combinations will * enumerated, but limit the damage to at most 12 indirect splits. */ iv->iv_attempts_max = 1; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; abd_zero(ic->ic_data, abd_get_size(ic->ic_data)); } iv->iv_attempts_max *= 2; if (iv->iv_attempts_max >= (1ULL << 12)) { iv->iv_attempts_max = UINT64_MAX; break; } } out: /* Empty the unique children lists so they can be reconstructed. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic; while ((ic = list_remove_head(&is->is_unique_child)) != NULL) ; is->is_unique_children = 0; } return (error); } /* * This function is called when we have read all copies of the data and need * to try to find a combination of copies that gives us the right checksum. * * If we pointed to any mirror vdevs, this effectively does the job of the * mirror. The mirror vdev code can't do its own job because we don't know * the checksum of each split segment individually. * * We have to try every unique combination of copies of split segments, until * we find one that checksums correctly. Duplicate segment copies are first * identified and latter skipped during reconstruction. This optimization * reduces the search space and ensures that of the remaining combinations * at most one is correct. * * When the total number of combinations is small they can all be checked. * For example, if we have 3 segments in the split, and each points to a * 2-way mirror with unique copies, we will have the following pieces of data: * * | mirror child * split | [0] [1] * ======|===================== * A | data_A_0 data_A_1 * B | data_B_0 data_B_1 * C | data_C_0 data_C_1 * * We will try the following (mirror children)^(number of splits) (2^3=8) * combinations, which is similar to bitwise-little-endian counting in * binary. In general each "digit" corresponds to a split segment, and the * base of each digit is is_children, which can be different for each * digit. * * "low bit" "high bit" * v v * data_A_0 data_B_0 data_C_0 * data_A_1 data_B_0 data_C_0 * data_A_0 data_B_1 data_C_0 * data_A_1 data_B_1 data_C_0 * data_A_0 data_B_0 data_C_1 * data_A_1 data_B_0 data_C_1 * data_A_0 data_B_1 data_C_1 * data_A_1 data_B_1 data_C_1 * * Note that the split segments may be on the same or different top-level * vdevs. In either case, we may need to try lots of combinations (see * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror * has small silent errors on all of its children, we can still reconstruct * the correct data, as long as those errors are at sufficiently-separated * offsets (specifically, separated by the largest block size - default of * 128KB, but up to 16MB). */ static void vdev_indirect_reconstruct_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; boolean_t known_good = B_FALSE; int error; iv->iv_unique_combinations = 1; iv->iv_attempts_max = UINT64_MAX; if (zfs_reconstruct_indirect_combinations_max > 0) iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max; /* * If nonzero, every 1/x blocks will be damaged, in order to validate * reconstruction when there are split segments with damaged copies. * Known_good will be TRUE when reconstruction is known to be possible. */ if (zfs_reconstruct_indirect_damage_fraction != 0 && random_in_range(zfs_reconstruct_indirect_damage_fraction) == 0) known_good = (vdev_indirect_splits_damage(iv, zio) == 0); /* * Determine the unique children for a split segment and add them * to the is_unique_child list. By restricting reconstruction * to these children, only unique combinations will be considered. * This can vastly reduce the search space when there are a large * number of indirect splits. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic_i = &is->is_child[i]; if (ic_i->ic_data == NULL || ic_i->ic_duplicate != NULL) continue; for (int j = i + 1; j < is->is_children; j++) { indirect_child_t *ic_j = &is->is_child[j]; if (ic_j->ic_data == NULL || ic_j->ic_duplicate != NULL) continue; if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0) ic_j->ic_duplicate = ic_i; } is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic_i); } /* Reconstruction is impossible, no valid children */ EQUIV(list_is_empty(&is->is_unique_child), is->is_unique_children == 0); if (list_is_empty(&is->is_unique_child)) { zio->io_error = EIO; vdev_indirect_all_checksum_errors(zio); zio_checksum_verified(zio); return; } iv->iv_unique_combinations *= is->is_unique_children; } if (iv->iv_unique_combinations <= iv->iv_attempts_max) error = vdev_indirect_splits_enumerate_all(iv, zio); else error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error != 0) { /* All attempted combinations failed. */ ASSERT3B(known_good, ==, B_FALSE); zio->io_error = error; vdev_indirect_all_checksum_errors(zio); } else { /* * The checksum has been successfully validated. Issue * repair I/Os to any copies of splits which don't match * the validated version. */ ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio)); vdev_indirect_repair(zio); zio_checksum_verified(zio); } } static void vdev_indirect_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (iv->iv_reconstruct) { /* * We have read all copies of the data (e.g. from mirrors), * either because this was a scrub/resilver, or because the * one-copy read didn't checksum correctly. */ vdev_indirect_reconstruct_io_done(zio); return; } if (!iv->iv_split_block) { /* * This was not a split block, so we passed the BP down, * and the checksum was handled by the (one) child zio. */ return; } zio_bad_cksum_t zbc; int ret = zio_checksum_error(zio, &zbc); /* * Any Direct I/O read that has a checksum error must be treated as * suspicious as the contents of the buffer could be getting * manipulated while the I/O is taking place. The checksum verify error * will be reported to the top-level VDEV. */ if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { zio->io_error = ret; zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); ret = 0; } if (ret == 0) { zio_checksum_verified(zio); return; } /* * The checksum didn't match. Read all copies of all splits, and * then we will try to reconstruct. The next time * vdev_indirect_io_done() is called, iv_reconstruct will be set. */ vdev_indirect_read_all(zio); zio_vdev_io_redone(zio); } vdev_ops_t vdev_indirect_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_indirect_open, .vdev_op_close = vdev_indirect_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_indirect_io_start, .vdev_op_io_done = vdev_indirect_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = vdev_indirect_remap, .vdev_op_xlate = NULL, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* leaf vdev */ }; EXPORT_SYMBOL(spa_condense_fini); EXPORT_SYMBOL(spa_start_indirect_condensing_thread); EXPORT_SYMBOL(spa_condense_indirect_start_sync); EXPORT_SYMBOL(spa_condense_init); EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete); EXPORT_SYMBOL(vdev_indirect_mark_obsolete); EXPORT_SYMBOL(vdev_indirect_should_condense); EXPORT_SYMBOL(vdev_indirect_sync_obsolete); EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); EXPORT_SYMBOL(vdev_obsolete_sm_object); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW, "Whether to attempt condensing indirect vdev mappings"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, UINT, ZMOD_RW, "Minimum obsolete percent of bytes in the mapping " "to attempt condensing"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, U64, ZMOD_RW, "Don't bother condensing if the mapping uses less than this amount of " "memory"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, U64, ZMOD_RW, "Minimum size obsolete spacemap to attempt condensing"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, UINT, ZMOD_RW, "Used by tests to ensure certain actions happen in the middle of a " "condense. A maximum value of 1 should be sufficient."); ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, UINT, ZMOD_RW, "Maximum number of combinations when reconstructing split segments"); diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 9f3dfce01799..a6aee9437066 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -1,1060 +1,1063 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include /* * Vdev mirror kstats */ static kstat_t *mirror_ksp = NULL; typedef struct mirror_stats { kstat_named_t vdev_mirror_stat_rotating_linear; kstat_named_t vdev_mirror_stat_rotating_offset; kstat_named_t vdev_mirror_stat_rotating_seek; kstat_named_t vdev_mirror_stat_non_rotating_linear; kstat_named_t vdev_mirror_stat_non_rotating_seek; kstat_named_t vdev_mirror_stat_preferred_found; kstat_named_t vdev_mirror_stat_preferred_not_found; } mirror_stats_t; static mirror_stats_t mirror_stats = { /* New I/O follows directly the last I/O */ { "rotating_linear", KSTAT_DATA_UINT64 }, /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */ { "rotating_offset", KSTAT_DATA_UINT64 }, /* New I/O requires random seek */ { "rotating_seek", KSTAT_DATA_UINT64 }, /* New I/O follows directly the last I/O (nonrot) */ { "non_rotating_linear", KSTAT_DATA_UINT64 }, /* New I/O requires random seek (nonrot) */ { "non_rotating_seek", KSTAT_DATA_UINT64 }, /* Preferred child vdev found */ { "preferred_found", KSTAT_DATA_UINT64 }, /* Preferred child vdev not found or equal load */ { "preferred_not_found", KSTAT_DATA_UINT64 }, }; #define MIRROR_STAT(stat) (mirror_stats.stat.value.ui64) #define MIRROR_INCR(stat, val) atomic_add_64(&MIRROR_STAT(stat), val) #define MIRROR_BUMP(stat) MIRROR_INCR(stat, 1) void vdev_mirror_stat_init(void) { mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats", "misc", KSTAT_TYPE_NAMED, sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (mirror_ksp != NULL) { mirror_ksp->ks_data = &mirror_stats; kstat_install(mirror_ksp); } } void vdev_mirror_stat_fini(void) { if (mirror_ksp != NULL) { kstat_delete(mirror_ksp); mirror_ksp = NULL; } } /* * Virtual device vector for mirroring. */ typedef struct mirror_child { vdev_t *mc_vd; abd_t *mc_abd; uint64_t mc_offset; int mc_error; int mc_load; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; uint8_t mc_rebuilding; } mirror_child_t; typedef struct mirror_map { int *mm_preferred; int mm_preferred_cnt; int mm_children; boolean_t mm_resilvering; boolean_t mm_rebuilding; boolean_t mm_root; mirror_child_t mm_child[]; } mirror_map_t; static const int vdev_mirror_shift = 21; /* * The load configuration settings below are tuned by default for * the case where all devices are of the same rotational type. * * If there is a mixture of rotating and non-rotating media, setting * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results * as it will direct more reads to the non-rotating vdevs which are more likely * to have a higher performance. */ /* Rotating media load calculation configuration. */ static int zfs_vdev_mirror_rotating_inc = 0; static int zfs_vdev_mirror_rotating_seek_inc = 5; static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024; /* Non-rotating media load calculation configuration. */ static int zfs_vdev_mirror_non_rotating_inc = 0; static int zfs_vdev_mirror_non_rotating_seek_inc = 1; static inline size_t vdev_mirror_map_size(int children) { return (offsetof(mirror_map_t, mm_child[children]) + sizeof (int) * children); } static inline mirror_map_t * vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root) { mirror_map_t *mm; mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP); mm->mm_children = children; mm->mm_resilvering = resilvering; mm->mm_root = root; mm->mm_preferred = (int *)((uintptr_t)mm + offsetof(mirror_map_t, mm_child[children])); return (mm); } static void vdev_mirror_map_free(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } static const zio_vsd_ops_t vdev_mirror_vsd_ops = { .vsd_free = vdev_mirror_map_free, }; static int vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) { uint64_t last_offset; int64_t offset_diff; int load; /* All DVAs have equal weight at the root. */ if (mm->mm_root) return (INT_MAX); /* * We don't return INT_MAX if the device is resilvering i.e. * vdev_resilver_txg != 0 as when tested performance was slightly * worse overall when resilvering with compared to without. */ /* Fix zio_offset for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) zio_offset += VDEV_LABEL_START_SIZE; /* Standard load based on pending queue length. */ load = vdev_queue_length(vd); last_offset = vdev_queue_last_offset(vd); if (vd->vdev_nonrot) { /* Non-rotating media. */ if (last_offset == zio_offset) { MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear); return (load + zfs_vdev_mirror_non_rotating_inc); } /* * Apply a seek penalty even for non-rotating devices as * sequential I/O's can be aggregated into fewer operations on * the device, thus avoiding unnecessary per-command overhead * and boosting performance. */ MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek); return (load + zfs_vdev_mirror_non_rotating_seek_inc); } /* Rotating media I/O's which directly follow the last I/O. */ if (last_offset == zio_offset) { MIRROR_BUMP(vdev_mirror_stat_rotating_linear); return (load + zfs_vdev_mirror_rotating_inc); } /* * Apply half the seek increment to I/O's within seek offset * of the last I/O issued to this vdev as they should incur less * of a seek increment. */ offset_diff = (int64_t)(last_offset - zio_offset); if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) { MIRROR_BUMP(vdev_mirror_stat_rotating_offset); return (load + (zfs_vdev_mirror_rotating_seek_inc / 2)); } /* Apply the full seek increment to all other I/O's. */ MIRROR_BUMP(vdev_mirror_stat_rotating_seek); return (load + zfs_vdev_mirror_rotating_seek_inc); } static boolean_t vdev_mirror_rebuilding(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) return (B_TRUE); for (int i = 0; i < vd->vdev_children; i++) { if (vdev_mirror_rebuilding(vd->vdev_child[i])) { return (B_TRUE); } } return (B_FALSE); } /* * Avoid inlining the function to keep vdev_mirror_io_start(), which * is this functions only caller, as small as possible on the stack. */ noinline static mirror_map_t * vdev_mirror_map_init(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; int c; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dva_t dva_copy[SPA_DVAS_PER_BP]; /* * The sequential scrub code sorts and issues all DVAs * of a bp separately. Each of these IOs includes all * original DVA copies so that repairs can be performed * in the event of an error, but we only actually want * to check the first DVA since the others will be * checked by their respective sorted IOs. Only if we * hit an error will we try all DVAs upon retrying. * * Note: This check is safe even if the user switches * from a legacy scrub to a sequential one in the middle * of processing, since scn_is_sorted isn't updated until * all outstanding IOs from the previous scrub pass * complete. */ if ((zio->io_flags & ZIO_FLAG_SCRUB) && !(zio->io_flags & ZIO_FLAG_IO_RETRY) && dsl_scan_scrubbing(spa->spa_dsl_pool) && scn->scn_is_sorted) { c = 1; } else { c = BP_GET_NDVAS(zio->io_bp); } /* * If the pool cannot be written to, then infer that some * DVAs might be invalid or point to vdevs that do not exist. * We skip them. */ if (!spa_writeable(spa)) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); int j = 0; for (int i = 0; i < c; i++) { if (zfs_dva_valid(spa, &dva[i], zio->io_bp)) dva_copy[j++] = dva[i]; } if (j == 0) { zio->io_vsd = NULL; zio->io_error = ENXIO; return (NULL); } if (j < c) { dva = dva_copy; c = j; } } mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); mc->mc_offset = DVA_GET_OFFSET(&dva[c]); if (mc->mc_vd == NULL) { kmem_free(mm, vdev_mirror_map_size( mm->mm_children)); zio->io_vsd = NULL; zio->io_error = ENXIO; return (NULL); } } } else { /* * If we are resilvering, then we should handle scrub reads * differently; we shouldn't issue them to the resilvering * device because it might not have those blocks. * * We are resilvering iff: * 1) We are a replacing vdev (ie our name is "replacing-1" or * "spare-1" or something like that), and * 2) The pool is currently being resilvered. * * We cannot simply check vd->vdev_resilver_txg, because it's * not set in this path. * * Nor can we just check our vdev_ops; there are cases (such as * when a user types "zpool replace pool odev spare_dev" and * spare_dev is in the spare list, or when a spare device is * automatically used to replace a DEGRADED device) when * resilvering is complete but both the original vdev and the * spare vdev remain in the pool. That behavior is intentional. * It helps implement the policy that a spare should be * automatically removed from the pool after the user replaces * the device that originally failed. * * If a spa load is in progress, then spa_dsl_pool may be * uninitialized. But we shouldn't be resilvering during a spa * load anyway. */ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops) && spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE && dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool); mm = vdev_mirror_map_alloc(vd->vdev_children, replacing, B_FALSE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; if (vdev_mirror_rebuilding(mc->mc_vd)) mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE; } } return (mm); } static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { int numerrors = 0; int lasterror = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } vdev_open_children(vd); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error) { lasterror = cvd->vdev_open_error; numerrors++; continue; } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); } for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error) continue; *physical_ashift = vdev_best_ashift(*logical_ashift, *physical_ashift, cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { if (vdev_children_are_offline(vd)) vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE; else vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } return (0); } static void vdev_mirror_close(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } static void vdev_mirror_child_done(zio_t *zio) { mirror_child_t *mc = zio->io_private; mc->mc_error = zio->io_error; mc->mc_tried = 1; mc->mc_skipped = 0; } /* * Check the other, lower-index DVAs to see if they're on the same * vdev as the child we picked. If they are, use them since they * are likely to have been allocated from the primary metaslab in * use at the time, and hence are more likely to have locality with * single-copy data. */ static int vdev_mirror_dva_select(zio_t *zio, int p) { dva_t *dva = zio->io_bp->blk_dva; mirror_map_t *mm = zio->io_vsd; int preferred; int c; preferred = mm->mm_preferred[p]; for (p--; p >= 0; p--) { c = mm->mm_preferred[p]; if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred])) preferred = c; } return (preferred); } static int vdev_mirror_preferred_child_randomize(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; int p; if (mm->mm_root) { p = random_in_range(mm->mm_preferred_cnt); return (vdev_mirror_dva_select(zio, p)); } /* * To ensure we don't always favour the first matching vdev, * which could lead to wear leveling issues on SSD's, we * use the I/O offset as a pseudo random seed into the vdevs * which have the lowest load. */ p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt; return (mm->mm_preferred[p]); } static boolean_t vdev_mirror_child_readable(mirror_child_t *mc) { vdev_t *vd = mc->mc_vd; if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) return (vdev_draid_readable(vd, mc->mc_offset)); else return (vdev_readable(vd)); } static boolean_t vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size) { vdev_t *vd = mc->mc_vd; if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) return (vdev_draid_missing(vd, mc->mc_offset, txg, size)); else return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); } /* * Try to find a vdev whose DTL doesn't contain the block we want to read * preferring vdevs based on determined load. If we can't, try the read on * any vdev we haven't already tried. * * Distributed spares are an exception to the above load rule. They are * always preferred in order to detect gaps in the distributed spare which * are created when another disk in the dRAID fails. In order to restore * redundancy those gaps must be read to trigger the required repair IO. */ static int vdev_mirror_child_select(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; uint64_t txg = zio->io_txg; int c, lowest_load; ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg); lowest_load = INT_MAX; mm->mm_preferred_cnt = 0; for (c = 0; c < mm->mm_children; c++) { mirror_child_t *mc; mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; if (mc->mc_vd == NULL || !vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } if (vdev_mirror_child_missing(mc, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; continue; } if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) { mm->mm_preferred[0] = c; mm->mm_preferred_cnt = 1; break; } mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); if (mc->mc_load > lowest_load) continue; if (mc->mc_load < lowest_load) { lowest_load = mc->mc_load; mm->mm_preferred_cnt = 0; } mm->mm_preferred[mm->mm_preferred_cnt] = c; mm->mm_preferred_cnt++; } if (mm->mm_preferred_cnt == 1) { MIRROR_BUMP(vdev_mirror_stat_preferred_found); return (mm->mm_preferred[0]); } if (mm->mm_preferred_cnt > 1) { MIRROR_BUMP(vdev_mirror_stat_preferred_not_found); return (vdev_mirror_preferred_child_randomize(zio)); } /* * Every device is either missing or has this txg in its DTL. * Look for any child we haven't already tried before giving up. */ for (c = 0; c < mm->mm_children; c++) { if (!mm->mm_child[c].mc_tried) return (c); } /* * Every child failed. There's no place left to look. */ return (-1); } static void vdev_mirror_io_start(zio_t *zio) { mirror_map_t *mm; mirror_child_t *mc; int c, children; mm = vdev_mirror_map_init(zio); zio->io_vsd = mm; zio->io_vsd_ops = &vdev_mirror_vsd_ops; if (mm == NULL) { ASSERT(!spa_trust_config(zio->io_spa)); ASSERT(zio->io_type == ZIO_TYPE_READ); zio_execute(zio); return; } if (zio->io_type == ZIO_TYPE_READ) { if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) { /* * For scrubbing reads we need to issue reads to all * children. One child can reuse parent buffer, but * for others we have to allocate separate ones to * verify checksums if io_bp is non-NULL, or compare * them in vdev_mirror_io_done() otherwise. */ boolean_t first = B_TRUE; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; /* Don't issue ZIOs to offline children */ if (!vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; mc->mc_skipped = 1; continue; } mc->mc_abd = first ? zio->io_abd : abd_alloc_sametype(zio->io_abd, zio->io_size); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, mc->mc_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); first = B_FALSE; } zio_execute(zio); return; } /* * For normal reads just pick one child. */ c = vdev_mirror_child_select(zio); children = (c >= 0); } else { ASSERT(zio->io_type == ZIO_TYPE_WRITE); /* * Writes go to all children. */ c = 0; children = mm->mm_children; } while (children--) { mc = &mm->mm_child[c]; c++; /* * When sequentially resilvering only issue write repair * IOs to the vdev which is being rebuilt since performance * is limited by the slowest child. This is an issue for * faster replacement devices such as distributed spares. */ if ((zio->io_priority == ZIO_PRIORITY_REBUILD) && (zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_rebuilding && !mc->mc_rebuilding) { continue; } zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); } zio_execute(zio); } static int vdev_mirror_worst_error(mirror_map_t *mm) { int error[2] = { 0, 0 }; for (int c = 0; c < mm->mm_children; c++) { mirror_child_t *mc = &mm->mm_child[c]; int s = mc->mc_speculative; error[s] = zio_worst_error(error[s], mc->mc_error); } return (error[0] ? error[0] : error[1]); } static void vdev_mirror_io_done(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; mirror_child_t *mc; int c; int good_copies = 0; int unexpected_errors = 0; int last_good_copy = -1; if (mm == NULL) return; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; if (mc->mc_error) { if (!mc->mc_skipped) unexpected_errors++; } else if (mc->mc_tried) { last_good_copy = c; good_copies++; } } if (zio->io_type == ZIO_TYPE_WRITE) { /* * XXX -- for now, treat partial writes as success. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ if (good_copies != mm->mm_children) { /* * Always require at least one good copy. * * For ditto blocks (io_vd == NULL), require * all copies to be good. * * XXX -- for replacing vdevs, there's no great answer. * If the old device is really dead, we may not even * be able to access it -- so we only want to * require good writes to the new device. But if * the new device turns out to be flaky, we want * to be able to detach it -- which requires all * writes to the old device to have succeeded. */ if (good_copies == 0 || zio->io_vd == NULL) zio->io_error = vdev_mirror_worst_error(mm); } return; } ASSERT(zio->io_type == ZIO_TYPE_READ); /* * Any Direct I/O read that has a checksum error must be treated as * suspicious as the contents of the buffer could be getting * manipulated while the I/O is taking place. The checksum verify error * will be reported to the top-level Mirror VDEV. * * There will be no attampt at reading any additional data copies. If * the buffer is still being manipulated while attempting to read from * another child, there exists a possibly that the checksum could be * verified as valid. However, the buffer contents could again get * manipulated after verifying the checksum. This would lead to bad data * being written out during self healing. */ if ((zio->io_flags & ZIO_FLAG_DIO_READ) && (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { zio_dio_chksum_verify_error_report(zio); zio->io_error = vdev_mirror_worst_error(mm); ASSERT3U(zio->io_error, ==, ECKSUM); return; } /* * If we don't have a good copy yet, keep trying other children. */ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { ASSERT(c >= 0 && c < mm->mm_children); mc = &mm->mm_child[c]; zio_vdev_io_redone(zio); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, ZIO_TYPE_READ, zio->io_priority, 0, vdev_mirror_child_done, mc)); return; } if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) { abd_t *best_abd = NULL; if (last_good_copy >= 0) best_abd = mm->mm_child[last_good_copy].mc_abd; /* * If we're scrubbing but don't have a BP available (because * this vdev is under a raidz or draid vdev) then the best we * can do is compare all of the copies read. If they're not * identical then return a checksum error and the most likely * correct data. The raidz code will issue a repair I/O if * possible. */ if (zio->io_bp == NULL) { ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops || zio->io_vd->vdev_ops == &vdev_spare_ops); abd_t *pref_abd = NULL; for (c = 0; c < last_good_copy; c++) { mc = &mm->mm_child[c]; if (mc->mc_error || !mc->mc_tried) continue; if (abd_cmp(mc->mc_abd, best_abd) != 0) zio->io_error = SET_ERROR(ECKSUM); /* * The distributed spare is always prefered * by vdev_mirror_child_select() so it's * considered to be the best candidate. */ if (pref_abd == NULL && mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) pref_abd = mc->mc_abd; /* * In the absence of a preferred copy, use * the parent pointer to avoid a memory copy. */ if (mc->mc_abd == zio->io_abd) best_abd = mc->mc_abd; } if (pref_abd) best_abd = pref_abd; } else { /* * If we have a BP available, then checksums are * already verified and we just need a buffer * with valid data, preferring parent one to * avoid a memory copy. */ for (c = 0; c < last_good_copy; c++) { mc = &mm->mm_child[c]; if (mc->mc_error || !mc->mc_tried) continue; if (mc->mc_abd == zio->io_abd) { best_abd = mc->mc_abd; break; } } } if (best_abd && best_abd != zio->io_abd) abd_copy(zio->io_abd, best_abd, zio->io_size); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; if (mc->mc_abd != zio->io_abd) abd_free(mc->mc_abd); mc->mc_abd = NULL; } } if (good_copies == 0) { zio->io_error = vdev_mirror_worst_error(mm); ASSERT(zio->io_error != 0); } if (good_copies && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER) || ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) { /* * Use the good data we have in hand to repair damaged children. */ for (c = 0; c < mm->mm_children; c++) { /* * Don't rewrite known good children. * Not only is it unnecessary, it could * actually be harmful: if the system lost * power while rewriting the only good copy, * there would be no good copies left! */ mc = &mm->mm_child[c]; if (mc->mc_error == 0) { vdev_ops_t *ops = mc->mc_vd->vdev_ops; if (mc->mc_tried) continue; /* * We didn't try this child. We need to * repair it if: * 1. it's a scrub (in which case we have * tried everything that was healthy) * - or - * 2. it's an indirect or distributed spare * vdev (in which case it could point to any * other vdev, which might have a bad DTL) * - or - * 3. the DTL indicates that this data is * missing from this vdev */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && ops != &vdev_indirect_ops && ops != &vdev_draid_spare_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; mc->mc_error = SET_ERROR(ESTALE); } zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, zio->io_priority == ZIO_PRIORITY_REBUILD ? ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } static void vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) { if (faulted == vd->vdev_children) { if (vdev_children_are_offline(vd)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE, VDEV_AUX_CHILDREN_OFFLINE); } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); } } else if (degraded + faulted != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } } /* * Return the maximum asize for a rebuild zio in the provided range. */ static uint64_t vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, uint64_t max_segment) { (void) start; uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); return (MIN(asize, vdev_psize_to_asize(vd, psize))); } vdev_ops_t vdev_mirror_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_replacing_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_spare_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW, "Rotating media load increment for non-seeking I/Os"); ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW, "Rotating media load increment for seeking I/Os"); ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW, "Offset in bytes from the last I/O which triggers " "a reduced rotating media seek increment"); ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os"); ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, ZMOD_RW, "Non-rotating media load increment for seeking I/Os"); diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index 89786a1dfd96..c62faef2d05c 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -1,131 +1,133 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ /* * The 'missing' vdev is a special vdev type used only during import. It * signifies a placeholder in the root vdev for some vdev that we know is * missing. We pass it down to the kernel to allow the rest of the * configuration to parsed and an attempt made to open all available devices. * Because its GUID is always 0, we know that the guid sum will mismatch and we * won't be able to open the pool anyway. */ #include #include #include #include #include static int vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift, uint64_t *pshift) { /* * Really this should just fail. But then the root vdev will be in the * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we * will fail the GUID sum check before ever trying to open the pool. */ (void) vd; *psize = 0; *max_psize = 0; *ashift = 0; *pshift = 0; return (0); } static void vdev_missing_close(vdev_t *vd) { (void) vd; } static void vdev_missing_io_start(zio_t *zio) { zio->io_error = SET_ERROR(ENOTSUP); zio_execute(zio); } static void vdev_missing_io_done(zio_t *zio) { (void) zio; } vdev_ops_t vdev_missing_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; vdev_ops_t vdev_hole_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index b62dc6b0b91c..62d9c9909bd1 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1,5124 +1,5152 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ #endif /* * Virtual device vector for RAID-Z. * * This vdev supports single, double, and triple parity. For single parity, * we use a simple XOR of all the data columns. For double or triple parity, * we use a special case of Reed-Solomon coding. This extends the * technique described in "The mathematics of RAID-6" by H. Peter Anvin by * drawing on the system described in "A Tutorial on Reed-Solomon Coding for * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the * former is also based. The latter is designed to provide higher performance * for writes. * * Note that the Plank paper claimed to support arbitrary N+M, but was then * amended six years later identifying a critical flaw that invalidates its * claims. Nevertheless, the technique can be adapted to work for up to * triple parity. For additional parity, the amendment "Note: Correction to * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding * is viable, but the additional complexity means that write performance will * suffer. * * All of the methods above operate on a Galois field, defined over the * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements * can be expressed with a single byte. Briefly, the operations on the * field are defined as follows: * * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B * o multiplication of A by 2 is defined by the following bitwise expression: * * (A * 2)_7 = A_6 * (A * 2)_6 = A_5 * (A * 2)_5 = A_4 * (A * 2)_4 = A_3 + A_7 * (A * 2)_3 = A_2 + A_7 * (A * 2)_2 = A_1 + A_7 * (A * 2)_1 = A_0 * (A * 2)_0 = A_7 * * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). * As an aside, this multiplication is derived from the error correcting * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. * * Observe that any number in the field (except for 0) can be expressed as a * power of 2 -- a generator for the field. We store a table of the powers of * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather * than field addition). The inverse of a field element A (A^-1) is therefore * A ^ (255 - 1) = A^254. * * The up-to-three parity columns, P, Q, R over several data columns, * D_0, ... D_n-1, can be expressed by field operations: * * P = D_0 + D_1 + ... + D_n-2 + D_n-1 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial * XOR operation, and 2 and 4 can be computed quickly and generate linearly- * independent coefficients. (There are no additional coefficients that have * this property which is why the uncorrected Plank method breaks down.) * * See the reconstruction code below for how P, Q and R can used individually * or in concert to recover missing data columns. */ #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 #define VDEV_RAIDZ_R 2 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) /* * We provide a mechanism to perform the field multiplication operation on a * 64-bit value all at once rather than a byte at a time. This works by * creating a mask from the top bit in each byte and using that to * conditionally apply the XOR of 0x1d. */ #define VDEV_RAIDZ_64MUL_2(x, mask) \ { \ (mask) = (x) & 0x8080808080808080ULL; \ (mask) = ((mask) << 1) - ((mask) >> 7); \ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ } #define VDEV_RAIDZ_64MUL_4(x, mask) \ { \ VDEV_RAIDZ_64MUL_2((x), mask); \ VDEV_RAIDZ_64MUL_2((x), mask); \ } /* * Big Theory Statement for how a RAIDZ VDEV is expanded * * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs * that have been previously expanded can be expanded again. * * The RAIDZ VDEV must be healthy (must be able to write to all the drives in * the VDEV) when an expansion starts. And the expansion will pause if any * disk in the VDEV fails, and resume once the VDEV is healthy again. All other * operations on the pool can continue while an expansion is in progress (e.g. * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim, * and zpool initialize which can't be run during an expansion. Following a * reboot or export/import, the expansion resumes where it left off. * * == Reflowing the Data == * * The expansion involves reflowing (copying) the data from the current set * of disks to spread it across the new set which now has one more disk. This * reflow operation is similar to reflowing text when the column width of a * text editor window is expanded. The text doesn’t change but the location of * the text changes to accommodate the new width. An example reflow result for * a 4-wide RAIDZ1 to a 5-wide is shown below. * * Reflow End State * Each letter indicates a parity group (logical stripe) * * Before expansion After Expansion * D1 D2 D3 D4 D1 D2 D3 D4 D5 * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | A | A | A | A | | A | A | A | A | B | * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | B | B | C | C | | B | C | C | C | C | * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | C | C | D | D | | D | D | E | E | E | * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | E | E | E | E | --> | E | F | F | G | G | * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | F | F | G | G | | G | G | H | H | H | * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | G | G | H | H | | H | I | I | J | J | * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | H | H | I | I | | J | J | | | K | * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| * +------+------+------+------+ +------+------+------+------+------+ * * This reflow approach has several advantages. There is no need to read or * modify the block pointers or recompute any block checksums. The reflow * doesn’t need to know where the parity sectors reside. We can read and write * data sequentially and the copy can occur in a background thread in open * context. The design also allows for fast discovery of what data to copy. * * The VDEV metaslabs are processed, one at a time, to copy the block data to * have it flow across all the disks. The metaslab is disabled for allocations * during the copy. As an optimization, we only copy the allocated data which * can be determined by looking at the metaslab range tree. During the copy we * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still * need to be able to survive losing parity count disks). This means we * cannot overwrite data during the reflow that would be needed if a disk is * lost. * * After the reflow completes, all newly-written blocks will have the new * layout, i.e., they will have the parity to data ratio implied by the new * number of disks in the RAIDZ group. Even though the reflow copies all of * the allocated space (data and parity), it is only rearranged, not changed. * * This act of reflowing the data has a few implications about blocks * that were written before the reflow completes: * * - Old blocks will still use the same amount of space (i.e., they will have * the parity to data ratio implied by the old number of disks in the RAIDZ * group). * - Reading old blocks will be slightly slower than before the reflow, for * two reasons. First, we will have to read from all disks in the RAIDZ * VDEV, rather than being able to skip the children that contain only * parity of this block (because the data of a single block is now spread * out across all the disks). Second, in most cases there will be an extra * bcopy, needed to rearrange the data back to its original layout in memory. * * == Scratch Area == * * As we copy the block data, we can only progress to the point that writes * will not overlap with blocks whose progress has not yet been recorded on * disk. Since partially-copied rows are always read from the old location, * we need to stop one row before the sector-wise overlap, to prevent any * row-wise overlap. For example, in the diagram above, when we reflow sector * B6 it will overwite the original location for B5. * * To get around this, a scratch space is used so that we can start copying * without risking data loss by overlapping the row. As an added benefit, it * improves performance at the beginning of the reflow, but that small perf * boost wouldn't be worth the complexity on its own. * * Ideally we want to copy at least 2 * (new_width)^2 so that we have a * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice * the widths will likely be single digits so we can get a substantial chuck * size using only a few MB of scratch per disk. * * The scratch area is persisted to disk which holds a large amount of reflowed * state. We can always read the partially written stripes when a disk fails or * the copy is interrupted (crash) during the initial copying phase and also * get past a small chunk size restriction. At a minimum, the scratch space * must be large enough to get us to the point that one row does not overlap * itself when moved (i.e new_width^2). But going larger is even better. We * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels * as our scratch space to handle overwriting the initial part of the VDEV. * * 0 256K 512K 4M * +------+------+-----------------------+----------------------------- * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... * | L0 | L1 | Reserved | (Metaslabs) * +------+------+-----------------------+------------------------------- * Scratch Area * * == Reflow Progress Updates == * After the initial scratch-based reflow, the expansion process works * similarly to device removal. We create a new open context thread which * reflows the data, and periodically kicks off sync tasks to update logical * state. In this case, state is the committed progress (offset of next data * to copy). We need to persist the completed offset on disk, so that if we * crash we know which format each VDEV offset is in. * * == Time Dependent Geometry == * * In non-expanded RAIDZ, blocks are read from disk in a column by column * fashion. For a multi-row block, the second sector is in the first column * not in the second column. This allows us to issue full reads for each * column directly into the request buffer. The block data is thus laid out * sequentially in a column-by-column fashion. * * For example, in the before expansion diagram above, one logical block might * be sectors G19-H26. The parity is in G19,H23; and the data is in * G20,H24,G21,H25,G22,H26. * * After a block is reflowed, the sectors that were all in the original column * data can now reside in different columns. When reading from an expanded * VDEV, we need to know the logical stripe width for each block so we can * reconstitute the block’s data after the reads are completed. Likewise, * when we perform the combinatorial reconstruction we need to know the * original width so we can retry combinations from the past layouts. * * Time dependent geometry is what we call having blocks with different layouts * (stripe widths) in the same VDEV. This time-dependent geometry uses the * block’s birth time (+ the time expansion ended) to establish the correct * width for a given block. After an expansion completes, we record the time * for blocks written with a particular width (geometry). * * == On Disk Format Changes == * * New pool feature flag, 'raidz_expansion' whose reference count is the number * of RAIDZ VDEVs that have been expanded. * * The blocks on expanded RAIDZ VDEV can have different logical stripe widths. * * Since the uberblock can point to arbitrary blocks, which might be on the * expanding RAIDZ, and might or might not have been expanded. We need to know * which way a block is laid out before reading it. This info is the next * offset that needs to be reflowed and we persist that in the uberblock, in * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. * After the expansion is complete, we then use the raidz_expand_txgs array * (see below) to determine how to read a block and the ub_raidz_reflow_info * field no longer required. * * The uberblock's ub_raidz_reflow_info field also holds the scratch space * state (i.e., active or not) which is also required before reading a block * during the initial phase of reflowing the data. * * The top-level RAIDZ VDEV has two new entries in the nvlist: * * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here * and used after the expansion is complete to * determine how to read a raidz block * 'raidz_expanding' boolean: present during reflow and removed after completion * used during a spa import to resume an unfinished * expansion * * And finally the VDEVs top zap adds the following informational entries: * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED */ /* * For testing only: pause the raidz expansion after reflowing this amount. * (accessed by ZTS and ztest) */ #ifdef _KERNEL static #endif /* _KERNEL */ unsigned long raidz_expand_max_reflow_bytes = 0; /* * For testing only: pause the raidz expansion at a certain point. */ uint_t raidz_expand_pause_point = 0; /* * Maximum amount of copy io's outstanding at once. */ #ifdef _ILP32 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE; #else static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; #endif /* * Apply raidz map abds aggregation if the number of rows in the map is equal * or greater than the value below. */ static unsigned long raidz_io_aggregate_rows = 4; /* * Automatically start a pool scrub when a RAIDZ expansion completes in * order to verify the checksums of all blocks which have been copied * during the expansion. Automatic scrubbing is enabled by default and * is strongly recommended. */ static int zfs_scrub_after_expand = 1; static void vdev_raidz_row_free(raidz_row_t *rr) { for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size != 0) abd_free(rc->rc_abd); if (rc->rc_orig_data != NULL) abd_free(rc->rc_orig_data); } if (rr->rr_abd_empty != NULL) abd_free(rr->rr_abd_empty); kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); } void vdev_raidz_map_free(raidz_map_t *rm) { for (int i = 0; i < rm->rm_nrows; i++) vdev_raidz_row_free(rm->rm_row[i]); if (rm->rm_nphys_cols) { for (int i = 0; i < rm->rm_nphys_cols; i++) { if (rm->rm_phys_col[i].rc_abd != NULL) abd_free(rm->rm_phys_col[i].rc_abd); } kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * rm->rm_nphys_cols); } ASSERT3P(rm->rm_lr, ==, NULL); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } static void vdev_raidz_map_free_vsd(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; vdev_raidz_map_free(rm); } static int vdev_raidz_reflow_compare(const void *x1, const void *x2) { const reflow_node_t *l = x1; const reflow_node_t *r = x2; return (TREE_CMP(l->re_txg, r->re_txg)); } const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, }; raidz_row_t * vdev_raidz_row_alloc(int cols, zio_t *zio) { raidz_row_t *rr = kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); rr->rr_cols = cols; rr->rr_scols = cols; for (int c = 0; c < cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; rc->rc_shadow_devidx = INT_MAX; rc->rc_shadow_offset = UINT64_MAX; /* * We can not allow self healing to take place for Direct I/O * reads. There is nothing that stops the buffer contents from * being manipulated while the I/O is in flight. It is possible * that the checksum could be verified on the buffer and then * the contents of that buffer are manipulated afterwards. This * could lead to bad data being written out during self * healing. */ if (!(zio->io_flags & ZIO_FLAG_DIO_READ)) rc->rc_allow_repair = 1; } return (rr); } static void vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) { int c; int nwrapped = 0; uint64_t off = 0; raidz_row_t *rr = rm->rm_row[0]; ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(rm->rm_nrows, ==, 1); /* * Pad any parity columns with additional space to account for skip * sectors. */ if (rm->rm_skipstart < rr->rr_firstdatacol) { ASSERT0(rm->rm_skipstart); nwrapped = rm->rm_nskip; } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { nwrapped = (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; } /* * Optional single skip sectors (rc_size == 0) will be handled in * vdev_raidz_io_start_write(). */ int skipped = rr->rr_scols - rr->rr_cols; /* Allocate buffers for the parity columns */ for (c = 0; c < rr->rr_firstdatacol; c++) { raidz_col_t *rc = &rr->rr_col[c]; /* * Parity columns will pad out a linear ABD to account for * the skip sector. A linear ABD is used here because * parity calculations use the ABD buffer directly to calculate * parity. This avoids doing a memcpy back to the ABD after the * parity has been calculated. By issuing the parity column * with the skip sector we can reduce contention on the child * VDEV queue locks (vq_lock). */ if (c < nwrapped) { rc->rc_abd = abd_alloc_linear( rc->rc_size + (1ULL << ashift), B_FALSE); abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); skipped++; } else { rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); } } for (off = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, off, rc->rc_size); /* * Generate I/O for skip sectors to improve aggregation * continuity. We will use gang ABD's to reduce contention * on the child VDEV queue locks (vq_lock) by issuing * a single I/O that contains the data and skip sector. * * It is important to make sure that rc_size is not updated * even though we are adding a skip sector to the ABD. When * calculating the parity in vdev_raidz_generate_parity_row() * the rc_size is used to iterate through the ABD's. We can * not have zero'd out skip sectors used for calculating * parity for raidz, because those same sectors are not used * during reconstruction. */ if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { rc->rc_abd = abd_alloc_gang(); abd_gang_add(rc->rc_abd, abd, B_TRUE); abd_gang_add(rc->rc_abd, abd_get_zeros(1ULL << ashift), B_TRUE); skipped++; } else { rc->rc_abd = abd; } off += rc->rc_size; } ASSERT3U(off, ==, zio->io_size); ASSERT3S(skipped, ==, rm->rm_nskip); } static void vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) { int c; raidz_row_t *rr = rm->rm_row[0]; ASSERT3U(rm->rm_nrows, ==, 1); /* Allocate buffers for the parity columns */ for (c = 0; c < rr->rr_firstdatacol; c++) rr->rr_col[c].rc_abd = abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); for (uint64_t off = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, off, rc->rc_size); off += rc->rc_size; } } /* * Divides the IO evenly across all child vdevs; usually, dcols is * the number of children in the target vdev. * * Avoid inlining the function to keep vdev_raidz_io_start(), which * is this functions only caller, as small as possible on the stack. */ noinline raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t nparity) { raidz_row_t *rr; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = zio->io_size >> ashift; /* The first column for this stripe. */ uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << ashift; uint64_t acols, scols; raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); rm->rm_nrows = 1; /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. */ uint64_t q = s / (dcols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ uint64_t r = s - q * (dcols - nparity); /* The number of "big columns" - those which contain remainder data. */ uint64_t bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* * acols: The columns that will be accessed. * scols: The columns that will be accessed or skipped. */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. */ acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { acols = dcols; scols = dcols; } ASSERT3U(acols, <=, scols); rr = vdev_raidz_row_alloc(scols, zio); rm->rm_row[0] = rr; rr->rr_cols = acols; rr->rr_bigcols = bc; rr->rr_firstdatacol = nparity; #ifdef ZFS_DEBUG rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif uint64_t asize = 0; for (uint64_t c = 0; c < scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; uint64_t col = f + c; uint64_t coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } rc->rc_devidx = col; rc->rc_offset = coff; if (c >= acols) rc->rc_size = 0; else if (c < bc) rc->rc_size = (q + 1) << ashift; else rc->rc_size = q << ashift; asize += rc->rc_size; } ASSERT3U(asize, ==, tot << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; rm->rm_skipstart = bc; /* * If all data stored spans all columns, there's a danger that parity * will always be on the same device and, since parity isn't read * during normal operation, that device's I/O bandwidth won't be * used effectively. We therefore switch the parity every 1MB. * * ... at least that was, ostensibly, the theory. As a practical * matter unless we juggle the parity between all devices evenly, we * won't see any benefit. Further, occasional writes that aren't a * multiple of the LCM of the number of children and the minimum * stripe width are sufficient to avoid pessimal behavior. * Unfortunately, this decision created an implicit on-disk format * requirement that we need to support for all eternity, but only * for single-parity RAID-Z. * * If we intend to skip a sector in the zeroth column for padding * we must make sure to note this swap. We will never intend to * skip the first column since at least one data and one parity * column must appear in each row. */ ASSERT(rr->rr_cols >= 2); ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { uint64_t devidx = rr->rr_col[0].rc_devidx; o = rr->rr_col[0].rc_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } if (zio->io_type == ZIO_TYPE_WRITE) { vdev_raidz_map_alloc_write(zio, rm, ashift); } else { vdev_raidz_map_alloc_read(zio, rm); } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); return (rm); } /* * Everything before reflow_offset_synced should have been moved to the new * location (read and write completed). However, this may not yet be reflected * in the on-disk format (e.g. raidz_reflow_sync() has been called but the * uberblock has not yet been written). If reflow is not in progress, * reflow_offset_synced should be UINT64_MAX. For each row, if the row is * entirely before reflow_offset_synced, it will come from the new location. * Otherwise this row will come from the old location. Therefore, rows that * straddle the reflow_offset_synced will come from the old location. * * For writes, reflow_offset_next is the next offset to copy. If a sector has * been copied, but not yet reflected in the on-disk progress * (reflow_offset_synced), it will also be written to the new (already copied) * offset. */ noinline raidz_map_t * vdev_raidz_map_alloc_expanded(zio_t *zio, uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, uint64_t nparity, uint64_t reflow_offset_synced, uint64_t reflow_offset_next, boolean_t use_scratch) { abd_t *abd = zio->io_abd; uint64_t offset = zio->io_offset; uint64_t size = zio->io_size; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = size >> ashift; /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. * AKA "full rows" */ uint64_t q = s / (logical_cols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ uint64_t r = s - q * (logical_cols - nparity); /* The number of "big columns" - those which contain remainder data. */ uint64_t bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* How many rows contain data (not skip) */ uint64_t rows = howmany(tot, logical_cols); int cols = MIN(tot, logical_cols); raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), KM_SLEEP); rm->rm_nrows = rows; rm->rm_nskip = roundup(tot, nparity + 1) - tot; rm->rm_skipstart = bc; uint64_t asize = 0; for (uint64_t row = 0; row < rows; row++) { boolean_t row_use_scratch = B_FALSE; raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio); rm->rm_row[row] = rr; /* The starting RAIDZ (parent) vdev sector of the row. */ uint64_t b = (offset >> ashift) + row * logical_cols; /* * If we are in the middle of a reflow, and the copying has * not yet completed for any part of this row, then use the * old location of this row. Note that reflow_offset_synced * reflects the i/o that's been completed, because it's * updated by a synctask, after zio_wait(spa_txg_zio[]). * This is sufficient for our check, even if that progress * has not yet been recorded to disk (reflected in * spa_ubsync). Also note that we consider the last row to * be "full width" (`cols`-wide rather than `bc`-wide) for * this calculation. This causes a tiny bit of unnecessary * double-writes but is safe and simpler to calculate. */ int row_phys_cols = physical_cols; if (b + cols > reflow_offset_synced >> ashift) row_phys_cols--; else if (use_scratch) row_use_scratch = B_TRUE; /* starting child of this row */ uint64_t child_id = b % row_phys_cols; /* The starting byte offset on each child vdev. */ uint64_t child_offset = (b / row_phys_cols) << ashift; /* * Note, rr_cols is the entire width of the block, even * if this row is shorter. This is needed because parity * generation (for Q and R) needs to know the entire width, * because it treats the short row as though it was * full-width (and the "phantom" sectors were zero-filled). * * Another approach to this would be to set cols shorter * (to just the number of columns that we might do i/o to) * and have another mechanism to tell the parity generation * about the "entire width". Reconstruction (at least * vdev_raidz_reconstruct_general()) would also need to * know about the "entire width". */ rr->rr_firstdatacol = nparity; #ifdef ZFS_DEBUG /* * note: rr_size is PSIZE, not ASIZE */ rr->rr_offset = b << ashift; rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; #endif for (int c = 0; c < rr->rr_cols; c++, child_id++) { if (child_id >= row_phys_cols) { child_id -= row_phys_cols; child_offset += 1ULL << ashift; } raidz_col_t *rc = &rr->rr_col[c]; rc->rc_devidx = child_id; rc->rc_offset = child_offset; /* * Get this from the scratch space if appropriate. * This only happens if we crashed in the middle of * raidz_reflow_scratch_sync() (while it's running, * the rangelock prevents us from doing concurrent * io), and even then only during zpool import or * when the pool is imported readonly. */ if (row_use_scratch) rc->rc_offset -= VDEV_BOOT_SIZE; uint64_t dc = c - rr->rr_firstdatacol; if (c < rr->rr_firstdatacol) { rc->rc_size = 1ULL << ashift; /* * Parity sectors' rc_abd's are set below * after determining if this is an aggregation. */ } else if (row == rows - 1 && bc != 0 && c >= bc) { /* * Past the end of the block (even including * skip sectors). This sector is part of the * map so that we have full rows for p/q parity * generation. */ rc->rc_size = 0; rc->rc_abd = NULL; } else { /* "data column" (col excluding parity) */ uint64_t off; if (c < bc || r == 0) { off = dc * rows + row; } else { off = r * rows + (dc - r) * (rows - 1) + row; } rc->rc_size = 1ULL << ashift; rc->rc_abd = abd_get_offset_struct( &rc->rc_abdstruct, abd, off << ashift, rc->rc_size); } if (rc->rc_size == 0) continue; /* * If any part of this row is in both old and new * locations, the primary location is the old * location. If this sector was already copied to the * new location, we need to also write to the new, * "shadow" location. * * Note, `row_phys_cols != physical_cols` indicates * that the primary location is the old location. * `b+c < reflow_offset_next` indicates that the copy * to the new location has been initiated. We know * that the copy has completed because we have the * rangelock, which is held exclusively while the * copy is in progress. */ if (row_use_scratch || (row_phys_cols != physical_cols && b + c < reflow_offset_next >> ashift)) { rc->rc_shadow_devidx = (b + c) % physical_cols; rc->rc_shadow_offset = ((b + c) / physical_cols) << ashift; if (row_use_scratch) rc->rc_shadow_offset -= VDEV_BOOT_SIZE; } asize += rc->rc_size; } /* * See comment in vdev_raidz_map_alloc() */ if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && (offset & (1ULL << 20))) { ASSERT(rr->rr_cols >= 2); ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); int devidx0 = rr->rr_col[0].rc_devidx; uint64_t offset0 = rr->rr_col[0].rc_offset; int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; uint64_t shadow_offset0 = rr->rr_col[0].rc_shadow_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[0].rc_shadow_devidx = rr->rr_col[1].rc_shadow_devidx; rr->rr_col[0].rc_shadow_offset = rr->rr_col[1].rc_shadow_offset; rr->rr_col[1].rc_devidx = devidx0; rr->rr_col[1].rc_offset = offset0; rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; rr->rr_col[1].rc_shadow_offset = shadow_offset0; } } ASSERT3U(asize, ==, tot << ashift); /* * Determine if the block is contiguous, in which case we can use * an aggregation. */ if (rows >= raidz_io_aggregate_rows) { rm->rm_nphys_cols = physical_cols; rm->rm_phys_col = kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, KM_SLEEP); /* * Determine the aggregate io's offset and size, and check * that the io is contiguous. */ for (int i = 0; i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; raidz_col_t *prc = &rm->rm_phys_col[rc->rc_devidx]; if (rc->rc_size == 0) continue; if (prc->rc_size == 0) { ASSERT0(prc->rc_offset); prc->rc_offset = rc->rc_offset; } else if (prc->rc_offset + prc->rc_size != rc->rc_offset) { /* * This block is not contiguous and * therefore can't be aggregated. * This is expected to be rare, so * the cost of allocating and then * freeing rm_phys_col is not * significant. */ kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * rm->rm_nphys_cols); rm->rm_phys_col = NULL; rm->rm_nphys_cols = 0; break; } prc->rc_size += rc->rc_size; } } } if (rm->rm_phys_col != NULL) { /* * Allocate aggregate ABD's. */ for (int i = 0; i < rm->rm_nphys_cols; i++) { raidz_col_t *prc = &rm->rm_phys_col[i]; prc->rc_devidx = i; if (prc->rc_size == 0) continue; prc->rc_abd = abd_alloc_linear(rm->rm_phys_col[i].rc_size, B_FALSE); } /* * Point the parity abd's into the aggregate abd's. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_firstdatacol; c++) { raidz_col_t *rc = &rr->rr_col[c]; raidz_col_t *prc = &rm->rm_phys_col[rc->rc_devidx]; rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, prc->rc_abd, rc->rc_offset - prc->rc_offset, rc->rc_size); } } } else { /* * Allocate new abd's for the parity sectors. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_firstdatacol; c++) { raidz_col_t *rc = &rr->rr_col[c]; rc->rc_abd = abd_alloc_linear(rc->rc_size, B_TRUE); } } } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); return (rm); } struct pqr_struct { uint64_t *p; uint64_t *q; uint64_t *r; }; static int vdev_raidz_p_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; int cnt = size / sizeof (src[0]); ASSERT(pqr->p && !pqr->q && !pqr->r); for (int i = 0; i < cnt; i++, src++, pqr->p++) *pqr->p ^= *src; return (0); } static int vdev_raidz_pq_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && !pqr->r); for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; } return (0); } static int vdev_raidz_pqr_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && pqr->r); for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; VDEV_RAIDZ_64MUL_4(*pqr->r, mask); *pqr->r ^= *src; } return (0); } static void vdev_raidz_generate_parity_p(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { abd_t *src = rr->rr_col[c].rc_abd; if (c == rr->rr_firstdatacol) { abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_p_func, &pqr); } } } static void vdev_raidz_generate_parity_pq(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == rr->rr_col[VDEV_RAIDZ_Q].rc_size); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { abd_t *src = rr->rr_col[c].rc_abd; uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); (void) memcpy(q, p, rr->rr_col[c].rc_size); for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } } else { struct pqr_struct pqr = { p, q, NULL }; ASSERT(ccnt <= pcnt); (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pq_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ uint64_t mask; for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); } } } } static void vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == rr->rr_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == rr->rr_col[VDEV_RAIDZ_R].rc_size); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { abd_t *src = rr->rr_col[c].rc_abd; uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); (void) memcpy(q, p, rr->rr_col[c].rc_size); (void) memcpy(r, p, rr->rr_col[c].rc_size); for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; r[i] = 0; } } else { struct pqr_struct pqr = { p, q, r }; ASSERT(ccnt <= pcnt); (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pqr_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ uint64_t mask; for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); VDEV_RAIDZ_64MUL_4(r[i], mask); } } } } /* * Generate RAID parity in the first virtual columns according to the number of * parity columns available. */ void vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { if (rr->rr_cols == 0) { /* * We are handling this block one row at a time (because * this block has a different logical vs physical width, * due to RAIDZ expansion), and this is a pad-only row, * which has no parity. */ return; } /* Generate using the new math implementation */ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) return; switch (rr->rr_firstdatacol) { case 1: vdev_raidz_generate_parity_p(rr); break; case 2: vdev_raidz_generate_parity_pq(rr); break; case 3: vdev_raidz_generate_parity_pqr(rr); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); } } void vdev_raidz_generate_parity(raidz_map_t *rm) { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_generate_parity_row(rm, rr); } } static int vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) { (void) private; uint64_t *dst = dbuf; uint64_t *src = sbuf; int cnt = size / sizeof (src[0]); for (int i = 0; i < cnt; i++) { dst[i] ^= src[i]; } return (0); } static int vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, void *private) { (void) private; uint64_t *dst = dbuf; uint64_t *src = sbuf; uint64_t mask; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++, src++) { VDEV_RAIDZ_64MUL_2(*dst, mask); *dst ^= *src; } return (0); } static int vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) { (void) private; uint64_t *dst = buf; uint64_t mask; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++) { /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ VDEV_RAIDZ_64MUL_2(*dst, mask); } return (0); } struct reconst_q_struct { uint64_t *q; int exp; }; static int vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) { struct reconst_q_struct *rq = private; uint64_t *dst = buf; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++, rq->q++) { int j; uint8_t *b; *dst ^= *rq->q; for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { *b = vdev_raidz_exp2(*b, rq->exp); } } return (0); } struct reconst_pq_struct { uint8_t *p; uint8_t *q; uint8_t *pxy; uint8_t *qxy; int aexp; int bexp; }; static int vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) { struct reconst_pq_struct *rpq = private; uint8_t *xd = xbuf; uint8_t *yd = ybuf; for (int i = 0; i < size; i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); *yd = *rpq->p ^ *rpq->pxy ^ *xd; } return (0); } static int vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) { struct reconst_pq_struct *rpq = private; uint8_t *xd = xbuf; for (int i = 0; i < size; i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { /* same operation as vdev_raidz_reconst_pq_func() on xd */ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); } return (0); } static void vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; abd_t *dst, *src; if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); ASSERT3U(ntgts, ==, 1); ASSERT3U(x, >=, rr->rr_firstdatacol); ASSERT3U(x, <, rr->rr_cols); ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; dst = rr->rr_col[x].rc_abd; abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { uint64_t size = MIN(rr->rr_col[x].rc_size, rr->rr_col[c].rc_size); src = rr->rr_col[c].rc_abd; if (c == x) continue; (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_p_func, NULL); } } static void vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; int c, exp; abd_t *dst, *src; if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); ASSERT(ntgts == 1); ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, rr->rr_col[c].rc_size); src = rr->rr_col[c].rc_abd; dst = rr->rr_col[x].rc_abd; if (c == rr->rr_firstdatacol) { abd_copy(dst, src, size); if (rr->rr_col[x].rc_size > size) { abd_zero_off(dst, size, rr->rr_col[x].rc_size - size); } } else { ASSERT3U(size, <=, rr->rr_col[x].rc_size); (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, size, rr->rr_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); } } src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; dst = rr->rr_col[x].rc_abd; exp = 255 - (rr->rr_cols - 1 - x); struct reconst_q_struct rq = { abd_to_buf(src), exp }; (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); } static void vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; abd_t *pdata, *qdata; uint64_t xsize, ysize; int x = tgts[0]; int y = tgts[1]; abd_t *xd, *yd; if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rr->rr_firstdatacol); ASSERT(y < rr->rr_cols); ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as * though columns x and y were full of zeros -- Pxy and Qxy. We want to * reuse the parity generation mechanism without trashing the actual * parity so we make those columns appear to be full of zeros by * setting their lengths to zero. */ pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; xsize = rr->rr_col[x].rc_size; ysize = rr->rr_col[y].rc_size; rr->rr_col[VDEV_RAIDZ_P].rc_abd = abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); rr->rr_col[VDEV_RAIDZ_Q].rc_abd = abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); rr->rr_col[x].rc_size = 0; rr->rr_col[y].rc_size = 0; vdev_raidz_generate_parity_pq(rr); rr->rr_col[x].rc_size = xsize; rr->rr_col[y].rc_size = ysize; p = abd_to_buf(pdata); q = abd_to_buf(qdata); pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); xd = rr->rr_col[x].rc_abd; yd = rr->rr_col[y].rc_abd; /* * We now have: * Pxy = P + D_x + D_y * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y * * We can then solve for D_x: * D_x = A * (P + Pxy) + B * (Q + Qxy) * where * A = 2^(x - y) * (2^(x - y) + 1)^-1 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 * * With D_x in hand, we can easily solve for D_y: * D_y = P + Pxy + D_x */ a = vdev_raidz_pow2[255 + x - y]; b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; ASSERT3U(xsize, >=, ysize); struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; (void) abd_iterate_func2(xd, yd, 0, 0, ysize, vdev_raidz_reconst_pq_func, &rpq); (void) abd_iterate_func(xd, ysize, xsize - ysize, vdev_raidz_reconst_pq_tail_func, &rpq); abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; } /* * In the general case of reconstruction, we must solve the system of linear * equations defined by the coefficients used to generate parity as well as * the contents of the data and parity disks. This can be expressed with * vectors for the original data (D) and the actual data (d) and parity (p) * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): * * __ __ __ __ * | | __ __ | p_0 | * | V | | D_0 | | p_m-1 | * | | x | : | = | d_0 | * | I | | D_n-1 | | : | * | | ~~ ~~ | d_n-1 | * ~~ ~~ ~~ ~~ * * I is simply a square identity matrix of size n, and V is a vandermonde * matrix defined by the coefficients we chose for the various parity columns * (1, 2, 4). Note that these values were chosen both for simplicity, speedy * computation as well as linear separability. * * __ __ __ __ * | 1 .. 1 1 1 | | p_0 | * | 2^n-1 .. 4 2 1 | __ __ | : | * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | * | 1 .. 0 0 0 | | D_1 | | d_0 | * | 0 .. 0 0 0 | x | D_2 | = | d_1 | * | : : : : | | : | | d_2 | * | 0 .. 1 0 0 | | D_n-1 | | : | * | 0 .. 0 1 0 | ~~ ~~ | : | * | 0 .. 0 0 1 | | d_n-1 | * ~~ ~~ ~~ ~~ * * Note that I, V, d, and p are known. To compute D, we must invert the * matrix and use the known data and parity values to reconstruct the unknown * data values. We begin by removing the rows in V|I and d|p that correspond * to failed or missing columns; we then make V|I square (n x n) and d|p * sized n by removing rows corresponding to unused parity from the bottom up * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' * using Gauss-Jordan elimination. In the example below we use m=3 parity * columns, n=8 data columns, with errors in d_1, d_2, and p_1: * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks * | 19 205 116 29 64 16 4 1 | / / * | 1 0 0 0 0 0 0 0 | / / * | 0 1 0 0 0 0 0 0 | <--' / * (V|I) = | 0 0 1 0 0 0 0 0 | <---' * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | * | 19 205 116 29 64 16 4 1 | * | 1 0 0 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 | * (V|I)' = | 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We * have carefully chosen the seed values 1, 2, and 4 to ensure that this * matrix is not singular. * __ __ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 0 0 1 0 0 0 0 0 | * | 167 100 5 41 159 169 217 208 | * | 166 100 4 40 158 168 216 209 | * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values * of the missing data. * * As is apparent from the example above, the only non-trivial rows in the * inverse matrix correspond to the data disks that we're trying to * reconstruct. Indeed, those are the only rows we need as the others would * only be useful for reconstructing data known or assumed to be valid. For * that reason, we only build the coefficients in the rows that correspond to * targeted columns. */ static void vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); /* * Fill in the missing rows of interest. */ for (i = 0; i < nmap; i++) { ASSERT3S(0, <=, map[i]); ASSERT3S(map[i], <=, 2); pow = map[i] * n; if (pow > 255) pow -= 255; ASSERT(pow <= 255); for (j = 0; j < n; j++) { pow -= map[i]; if (pow < 0) pow += 255; rows[i][j] = vdev_raidz_pow2[pow]; } } } static void vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; uint8_t log; /* * Assert that the first nmissing entries from the array of used * columns correspond to parity columns and that subsequent entries * correspond to data columns. */ for (i = 0; i < nmissing; i++) { ASSERT3S(used[i], <, rr->rr_firstdatacol); } for (; i < n; i++) { ASSERT3S(used[i], >=, rr->rr_firstdatacol); } /* * First initialize the storage where we'll compute the inverse rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { invrows[i][j] = (i == j) ? 1 : 0; } } /* * Subtract all trivial rows from the rows of consequence. */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { ASSERT3U(used[j], >=, rr->rr_firstdatacol); jj = used[j] - rr->rr_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; } } /* * For each of the rows of interest, we must normalize it and subtract * a multiple of it from the other rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < missing[i]; j++) { ASSERT0(rows[i][j]); } ASSERT3U(rows[i][missing[i]], !=, 0); /* * Compute the inverse of the first element and multiply each * element in the row by that value. */ log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; for (j = 0; j < n; j++) { rows[i][j] = vdev_raidz_exp2(rows[i][j], log); invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); } for (ii = 0; ii < nmissing; ii++) { if (i == ii) continue; ASSERT3U(rows[ii][missing[i]], !=, 0); log = vdev_raidz_log2[rows[ii][missing[i]]]; for (j = 0; j < n; j++) { rows[ii][j] ^= vdev_raidz_exp2(rows[i][j], log); invrows[ii][j] ^= vdev_raidz_exp2(invrows[i][j], log); } } } /* * Verify that the data that is left in the rows are properly part of * an identity matrix. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { if (j == missing[i]) { ASSERT3U(rows[i][j], ==, 1); } else { ASSERT0(rows[i][j]); } } } } static void vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; uint8_t *src; uint64_t ccount; uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; uint8_t log = 0; uint8_t val; int ll; uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; psize = sizeof (invlog[0][0]) * n * nmissing; p = kmem_alloc(psize, KM_SLEEP); for (pp = p, i = 0; i < nmissing; i++) { invlog[i] = pp; pp += n; } for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { ASSERT3U(invrows[i][j], !=, 0); invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; } } for (i = 0; i < n; i++) { c = used[i]; ASSERT3U(c, <, rr->rr_cols); ccount = rr->rr_col[c].rc_size; ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); if (ccount == 0) continue; src = abd_to_buf(rr->rr_col[c].rc_abd); for (j = 0; j < nmissing; j++) { cc = missing[j] + rr->rr_firstdatacol; ASSERT3U(cc, >=, rr->rr_firstdatacol); ASSERT3U(cc, <, rr->rr_cols); ASSERT3U(cc, !=, c); dcount[j] = rr->rr_col[cc].rc_size; if (dcount[j] != 0) dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); } for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; for (cc = 0; cc < nmissing; cc++) { if (x >= dcount[cc]) continue; if (*src == 0) { val = 0; } else { if ((ll = log + invlog[cc][i]) >= 255) ll -= 255; val = vdev_raidz_pow2[ll]; } if (i == 0) dst[cc][x] = val; else dst[cc][x] ^= val; } } } kmem_free(p, psize); } static void vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int i, c, t, tt; unsigned int n; unsigned int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; abd_t **bufs = NULL; if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate * temporary linear ABDs if any non-linear ABDs are found. */ for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { ASSERT(rr->rr_col[i].rc_abd != NULL); if (!abd_is_linear(rr->rr_col[i].rc_abd)) { bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE); for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *col = &rr->rr_col[c]; bufs[c] = col->rc_abd; if (bufs[c] != NULL) { col->rc_abd = abd_alloc_linear( col->rc_size, B_TRUE); abd_copy(col->rc_abd, bufs[c], col->rc_size); } } break; } } n = rr->rr_cols - rr->rr_firstdatacol; /* * Figure out which data columns are missing. */ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { if (tgts[t] >= rr->rr_firstdatacol) { missing_rows[nmissing_rows++] = tgts[t] - rr->rr_firstdatacol; } } /* * Figure out which parity columns to use to help generate the missing * data columns. */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); ASSERT(c < rr->rr_firstdatacol); /* * Skip any targeted parity columns. */ if (c == tgts[tt]) { tt++; continue; } parity_map[i] = c; i++; } psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * nmissing_rows * n + sizeof (used[0]) * n; p = kmem_alloc(psize, KM_SLEEP); for (pp = p, i = 0; i < nmissing_rows; i++) { rows[i] = pp; pp += n; invrows[i] = pp; pp += n; } used = pp; for (i = 0; i < nmissing_rows; i++) { used[i] = parity_map[i]; } for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { if (tt < nmissing_rows && c == missing_rows[tt] + rr->rr_firstdatacol) { tt++; continue; } ASSERT3S(i, <, n); used[i] = c; i++; } /* * Initialize the interesting rows of the matrix. */ vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. */ vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); /* * copy back from temporary linear abds and free them */ if (bufs) { for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *col = &rr->rr_col[c]; if (bufs[c] != NULL) { abd_copy(bufs[c], col->rc_abd, col->rc_size); abd_free(col->rc_abd); } col->rc_abd = bufs[c]; } kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); } } static void vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; int i, c, ret; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, (int)rr->rr_missingparity); } nbadparity = rr->rr_firstdatacol; nbaddata = rr->rr_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rr->rr_cols; c++) { if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " "offset=%llx error=%u)", rr, c, (int)rr->rr_col[c].rc_devidx, (long long)rr->rr_col[c].rc_offset, (int)rr->rr_col[c].rc_error); } if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; } else if (rr->rr_col[c].rc_error != 0) { tgts[ntgts++] = c; } else if (c >= rr->rr_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; nbadparity--; } } ASSERT(ntgts >= nt); ASSERT(nbaddata >= 0); ASSERT(nbaddata + nbadparity == ntgts); dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) return; /* * See if we can use any of our optimized reconstruction routines. */ switch (nbaddata) { case 1: if (parity_valid[VDEV_RAIDZ_P]) { vdev_raidz_reconstruct_p(rr, dt, 1); return; } ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_Q]) { vdev_raidz_reconstruct_q(rr, dt, 1); return; } ASSERT(rr->rr_firstdatacol > 2); break; case 2: ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && parity_valid[VDEV_RAIDZ_Q]) { vdev_raidz_reconstruct_pq(rr, dt, 2); return; } ASSERT(rr->rr_firstdatacol > 2); break; } vdev_raidz_reconstruct_general(rr, tgts, ntgts); } static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t nparity = vdrz->vd_nparity; int c; int lasterror = 0; int numerrors = 0; ASSERT(nparity > 0); if (nparity > VDEV_RAIDZ_MAXPARITY || vd->vdev_children < nparity + 1) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; numerrors++; continue; } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); } for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) continue; *physical_ashift = vdev_best_ashift(*logical_ashift, *physical_ashift, cvd->vdev_physical_ashift); } if (vd->vdev_rz_expanding) { *asize *= vd->vdev_children - 1; *max_asize *= vd->vdev_children - 1; vd->vdev_min_asize = *asize; } else { *asize *= vd->vdev_children; *max_asize *= vd->vdev_children; } if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } return (0); } static void vdev_raidz_close(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c] != NULL) vdev_close(vd->vdev_child[c]); } } /* * Return the logical width to use, given the txg in which the allocation * happened. Note that BP_GET_BIRTH() is usually the txg in which the * BP was allocated. Remapped BP's (that were relocated due to device * removal, see remap_blkptr_cb()), will have a more recent physical birth * which reflects when the BP was relocated, but we can ignore these because * they can't be on RAIDZ (device removal doesn't support RAIDZ). */ static uint64_t vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) { reflow_node_t lookup = { .re_txg = txg, }; avl_index_t where; uint64_t width; mutex_enter(&vdrz->vd_expand_lock); reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); if (re != NULL) { width = re->re_logical_width; } else { re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); if (re != NULL) width = re->re_logical_width; else width = vdrz->vd_original_width; } mutex_exit(&vdrz->vd_expand_lock); return (width); } +/* + * This code converts an asize into the largest psize that can safely be written + * to an allocation of that size for this vdev. + * + * Note that this function will not take into account the effect of gang + * headers, which also modify the ASIZE of the DVAs. It is purely a reverse of + * the psize_to_asize function. + */ +static uint64_t +vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + uint64_t psize; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t cols = vdrz->vd_original_width; + uint64_t nparity = vdrz->vd_nparity; + + cols = vdev_raidz_get_logical_width(vdrz, txg); + + ASSERT0(asize % (1 << ashift)); + + psize = (asize >> ashift); + psize -= nparity * DIV_ROUND_UP(psize, cols); + psize <<= ashift; + + return (asize); +} /* * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated * more space due to the lower data-to-parity ratio. In this case it's * important to pass in the correct txg. Note that vdev_gang_header_asize() * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, * regardless of txg. This is assured because for a single data sector, we * allocate P+1 sectors regardless of width ("cols", which is at least P+1). */ static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) +vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; cols = vdev_raidz_get_logical_width(vdrz, txg); asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; #ifdef ZFS_DEBUG uint64_t asize_new = ((psize - 1) >> ashift) + 1; uint64_t ncols_new = vdrz->vd_physical_width; asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / (ncols_new - nparity)); asize_new = roundup(asize_new, nparity + 1) << ashift; VERIFY3U(asize_new, <=, asize); #endif return (asize); } /* * The allocatable space for a raidz vdev is N * sizeof(smallest child) * so each child must provide at least 1/Nth of its asize. */ static uint64_t vdev_raidz_min_asize(vdev_t *vd) { return ((vd->vdev_min_asize + vd->vdev_children - 1) / vd->vdev_children); } void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; ASSERT3P(rc->rc_abd, !=, NULL); rc->rc_error = zio->io_error; rc->rc_tried = 1; rc->rc_skipped = 0; } static void vdev_raidz_shadow_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; rc->rc_shadow_error = zio->io_error; } static void vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) { (void) rm; #ifdef ZFS_DEBUG zfs_range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(zio->io_vd, rr->rr_size, + vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size, BP_GET_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); if (vdev_xlate_is_empty(&physical_rs)) { /* * If we are in the middle of expansion, the * physical->logical mapping is changing so vdev_xlate() * can't give us a reliable answer. */ return; } ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* * It would be nice to assert that rs_end is equal * to rc_offset + rc_size but there might be an * optional I/O at the end that is not accounted in * rc_size. */ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } #endif } static void vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; vdev_raidz_generate_parity_row(rm, rr); for (int c = 0; c < rr->rr_scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; /* Verify physical to logical translation */ vdev_raidz_io_verify(zio, rm, rr, c); if (rc->rc_size == 0) continue; ASSERT3U(rc->rc_offset + rc->rc_size, <, cvd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3P(rc->rc_abd, !=, NULL); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, abd_get_size(rc->rc_abd), zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); if (rc->rc_shadow_devidx != INT_MAX) { vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; ASSERT3U( rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, cvd2->vdev_psize - VDEV_LABEL_END_SIZE); zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, rc->rc_shadow_offset, rc->rc_abd, abd_get_size(rc->rc_abd), zio->io_type, zio->io_priority, 0, vdev_raidz_shadow_child_done, rc)); } } } /* * Generate optional I/Os for skip sectors to improve aggregation contiguity. * This only works for vdev_raidz_map_alloc() (not _expanded()). */ static void raidz_start_skip_writes(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t ashift = vd->vdev_top->vdev_ashift; raidz_map_t *rm = zio->io_vsd; ASSERT3U(rm->rm_nrows, ==, 1); raidz_row_t *rr = rm->rm_row[0]; for (int c = 0; c < rr->rr_scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (rc->rc_size != 0) continue; ASSERT3P(rc->rc_abd, ==, NULL); ASSERT3U(rc->rc_offset, <, cvd->vdev_psize - VDEV_LABEL_END_SIZE); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, NULL, 1ULL << ashift, zio->io_type, zio->io_priority, ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } } static void vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. */ for (int c = rr->rr_cols - 1; c >= 0; c--) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size == 0) continue; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (!vdev_readable(cvd)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ENXIO); rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; continue; } if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } } } static void vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) { vdev_t *vd = zio->io_vd; for (int i = 0; i < rm->rm_nphys_cols; i++) { raidz_col_t *prc = &rm->rm_phys_col[i]; if (prc->rc_size == 0) continue; ASSERT3U(prc->rc_devidx, ==, i); vdev_t *cvd = vd->vdev_child[i]; if (!vdev_readable(cvd)) { prc->rc_error = SET_ERROR(ENXIO); prc->rc_tried = 1; /* don't even try */ prc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { prc->rc_error = SET_ERROR(ESTALE); prc->rc_skipped = 1; continue; } zio_nowait(zio_vdev_child_io(zio, NULL, cvd, prc->rc_offset, prc->rc_abd, prc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, prc)); } } static void vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) { /* * If there are multiple rows, we will be hitting * all disks, so go ahead and read the parity so * that we are reading in decent size chunks. */ boolean_t forceparity = rm->rm_nrows > 1; if (rm->rm_phys_col) { vdev_raidz_io_start_read_phys_cols(zio, rm); } else { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_start_read_row(zio, rr, forceparity); } } } /* * Start an IO operation on a RAIDZ VDev * * Outline: * - For write operations: * 1. Generate the parity data * 2. Create child zio write operations to each column's vdev, for both * data and parity. * 3. If the column skips any sectors for padding, create optional dummy * write zio children for those areas to improve aggregation continuity. * - For read operations: * 1. Create child zio read operations to each data column's vdev to read * the range of data required for zio. * 2. If this is a scrub or resilver operation, or if any of the data * vdevs have had errors, then create zio read operations to the parity * columns' VDevs as well. */ static void vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_raidz_t *vdrz = vd->vdev_tsd; raidz_map_t *rm; uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, BP_GET_BIRTH(zio->io_bp)); if (logical_width != vdrz->vd_physical_width) { zfs_locked_range_t *lr = NULL; uint64_t synced_offset = UINT64_MAX; uint64_t next_offset = UINT64_MAX; boolean_t use_scratch = B_FALSE; /* * Note: when the expansion is completing, we set * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) * in a later txg than when we last update spa_ubsync's state * (see the end of spa_raidz_expand_thread()). Therefore we * may see vre_state!=SCANNING before * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected * on disk, but the copying progress has been synced to disk * (and reflected in spa_ubsync). In this case it's fine to * treat the expansion as completed, since if we crash there's * no additional copying to do. */ if (vdrz->vn_vre.vre_state == DSS_SCANNING) { ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, &vdrz->vn_vre); lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, zio->io_offset, zio->io_size, RL_READER); use_scratch = (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == RRSS_SCRATCH_VALID); synced_offset = RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); next_offset = vdrz->vn_vre.vre_offset; /* * If we haven't resumed expanding since importing the * pool, vre_offset won't have been set yet. In * this case the next offset to be copied is the same * as what was synced. */ if (next_offset == UINT64_MAX) { next_offset = synced_offset; } } if (use_scratch) { zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" "%lld next_offset=%lld use_scratch=%u", zio, zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", (long long)zio->io_offset, (long long)synced_offset, (long long)next_offset, use_scratch); } rm = vdev_raidz_map_alloc_expanded(zio, tvd->vdev_ashift, vdrz->vd_physical_width, logical_width, vdrz->vd_nparity, synced_offset, next_offset, use_scratch); rm->rm_lr = lr; } else { rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, logical_width, vdrz->vd_nparity); } rm->rm_original_width = vdrz->vd_original_width; zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_start_write(zio, rm->rm_row[i]); } if (logical_width == vdrz->vd_physical_width) { raidz_start_skip_writes(zio); } } else { ASSERT(zio->io_type == ZIO_TYPE_READ); vdev_raidz_io_start_read(zio, rm); } zio_execute(zio); } /* * Report a checksum error for a child of a RAID-Z device. */ void vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && zio->io_priority != ZIO_PRIORITY_REBUILD) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); (void) zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data, &zbc); } } /* * We keep track of whether or not there were any injected errors, so that * any ereports we generate can note it. */ static int raidz_checksum_verify(zio_t *zio) { zio_bad_cksum_t zbc = {0}; raidz_map_t *rm = zio->io_vsd; int ret = zio_checksum_error(zio, &zbc); /* * Any Direct I/O read that has a checksum error must be treated as * suspicious as the contents of the buffer could be getting * manipulated while the I/O is taking place. The checksum verify error * will be reported to the top-level RAIDZ VDEV. */ if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { zio->io_error = ret; zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); zio_checksum_verified(zio); return (0); } if (ret != 0 && zbc.zbc_injected != 0) rm->rm_ecksuminjected = 1; return (ret); } /* * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. Return the * number of such failures. */ static int raidz_parity_verify(zio_t *zio, raidz_row_t *rr) { abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; blkptr_t *bp = zio->io_bp; enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); if (checksum == ZIO_CHECKSUM_NOPARITY) return (ret); for (c = 0; c < rr->rr_firstdatacol; c++) { rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = rc->rc_abd; ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); } /* * Verify any empty sectors are zero filled to ensure the parity * is calculated correctly even if these non-data sectors are damaged. */ if (rr->rr_nempty && rr->rr_abd_empty != NULL) ret += vdev_draid_map_verify_empty(zio, rr); /* * Regenerates parity even for !tried||rc_error!=0 columns. This * isn't harmful but it does have the side effect of fixing stuff * we didn't realize was necessary (i.e. even if we return 0). */ vdev_raidz_generate_parity_row(rm, rr); for (c = 0; c < rr->rr_firstdatacol; c++) { rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { zfs_dbgmsg("found error on col=%u devidx=%u off %llx", c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; } abd_free(orig[c]); } return (ret); } static int vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; for (int c = 0; c < rr->rr_cols; c++) { error = zio_worst_error(error, rr->rr_col[c].rc_error); error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); } return (error); } static void vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error) { if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; if (!rc->rc_skipped) unexpected_errors++; } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } if (rc->rc_force_repair) unexpected_errors++; } /* * If we read more parity disks than were used for * reconstruction, confirm that the other parity disks produced * correct data. * * Note that we also regenerate parity when resilvering so we * can write it out to failed devices later. */ if (parity_errors + parity_untried < rr->rr_firstdatacol - data_errors || (zio->io_flags & ZIO_FLAG_RESILVER)) { int n = raidz_parity_verify(zio, rr); unexpected_errors += n; } if (zio->io_error == 0 && spa_writeable(zio->io_spa) && (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. */ for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *vd = zio->io_vd; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (!rc->rc_allow_repair) { continue; } else if (!rc->rc_force_repair && (rc->rc_error == 0 || rc->rc_size == 0)) { continue; } /* * We do not allow self healing for Direct I/O reads. * See comment in vdev_raid_row_alloc(). */ ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " "offset=%llx", zio, c, rc->rc_devidx, (long long)rc->rc_offset); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, zio->io_priority == ZIO_PRIORITY_REBUILD ? ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } /* * Scrub or resilver i/o's: overwrite any shadow locations with the * good data. This ensures that if we've already copied this sector, * it will be corrected if it was damaged. This writes more than is * necessary, but since expansion is paused during scrub/resilver, at * most a single row will have a shadow location. */ if (zio->io_error == 0 && spa_writeable(zio->io_spa) && (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *vd = zio->io_vd; if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) continue; vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; /* * Note: We don't want to update the repair stats * because that would incorrectly indicate that there * was bad data to repair, which we aren't sure about. * By clearing the SCAN_THREAD flag, we prevent this * from happening, despite having the REPAIR flag set. * We need to set SELF_HEAL so that this i/o can't be * bypassed by zio_vdev_io_start(). */ zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, NULL, NULL); cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; zio_nowait(cio); } } } static void raidz_restore_orig_data(raidz_map_t *rm) { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_need_orig_restore) { abd_copy(rc->rc_abd, rc->rc_orig_data, rc->rc_size); rc->rc_need_orig_restore = B_FALSE; } } } } /* * During raidz_reconstruct() for expanded VDEV, we need special consideration * failure simulations. See note in raidz_reconstruct() on simulating failure * of a pre-expansion device. * * Treating logical child i as failed, return TRUE if the given column should * be treated as failed. The idea of logical children allows us to imagine * that a disk silently failed before a RAIDZ expansion (reads from this disk * succeed but return the wrong data). Since the expansion doesn't verify * checksums, the incorrect data will be moved to new locations spread among * the children (going diagonally across them). * * Higher "logical child failures" (values of `i`) indicate these * "pre-expansion failures". The first physical_width values imagine that a * current child failed; the next physical_width-1 values imagine that a * child failed before the most recent expansion; the next physical_width-2 * values imagine a child failed in the expansion before that, etc. */ static boolean_t raidz_simulate_failure(int physical_width, int original_width, int ashift, int i, raidz_col_t *rc) { uint64_t sector_id = physical_width * (rc->rc_offset >> ashift) + rc->rc_devidx; for (int w = physical_width; w >= original_width; w--) { if (i < w) { return (sector_id % w == i); } else { i -= w; } } ASSERT(!"invalid logical child id"); return (B_FALSE); } /* * returns EINVAL if reconstruction of the block will not be possible * returns ECKSUM if this specific reconstruction failed * returns 0 on successful reconstruction */ static int raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) { raidz_map_t *rm = zio->io_vsd; int physical_width = zio->io_vd->vdev_children; int original_width = (rm->rm_original_width != 0) ? rm->rm_original_width : physical_width; int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; if (dbgmsg) { zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); } /* Reconstruct each row */ for (int r = 0; r < rm->rm_nrows; r++) { raidz_row_t *rr = rm->rm_row[r]; int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ int t = 0; int dead = 0; int dead_data = 0; if (dbgmsg) zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT0(rc->rc_need_orig_restore); if (rc->rc_error != 0) { dead++; if (c >= nparity) dead_data++; continue; } if (rc->rc_size == 0) continue; for (int lt = 0; lt < ntgts; lt++) { if (raidz_simulate_failure(physical_width, original_width, zio->io_vd->vdev_top->vdev_ashift, ltgts[lt], rc)) { if (rc->rc_orig_data == NULL) { rc->rc_orig_data = abd_alloc_linear( rc->rc_size, B_TRUE); abd_copy(rc->rc_orig_data, rc->rc_abd, rc->rc_size); } rc->rc_need_orig_restore = B_TRUE; dead++; if (c >= nparity) dead_data++; /* * Note: simulating failure of a * pre-expansion device can hit more * than one column, in which case we * might try to simulate more failures * than can be reconstructed, which is * also more than the size of my_tgts. * This check prevents accessing past * the end of my_tgts. The "dead > * nparity" check below will fail this * reconstruction attempt. */ if (t < VDEV_RAIDZ_MAXPARITY) { my_tgts[t++] = c; if (dbgmsg) { zfs_dbgmsg("simulating " "failure of col %u " "devidx %u", c, (int)rc->rc_devidx); } } break; } } } if (dead > nparity) { /* reconstruction not possible */ if (dbgmsg) { zfs_dbgmsg("reconstruction not possible; " "too many failures"); } raidz_restore_orig_data(rm); return (EINVAL); } if (dead_data > 0) vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); } /* Check for success */ if (raidz_checksum_verify(zio) == 0) { if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) return (0); /* Reconstruction succeeded - report errors */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_need_orig_restore) { /* * Note: if this is a parity column, * we don't really know if it's wrong. * We need to let * vdev_raidz_io_done_verified() check * it, and if we set rc_error, it will * think that it is a "known" error * that doesn't need to be checked * or corrected. */ if (rc->rc_error == 0 && c >= rr->rr_firstdatacol) { vdev_raidz_checksum_error(zio, rc, rc->rc_orig_data); rc->rc_error = SET_ERROR(ECKSUM); } rc->rc_need_orig_restore = B_FALSE; } } vdev_raidz_io_done_verified(zio, rr); } zio_checksum_verified(zio); if (dbgmsg) { zfs_dbgmsg("reconstruction successful " "(checksum verified)"); } return (0); } /* Reconstruction failed - restore original data */ raidz_restore_orig_data(rm); if (dbgmsg) { zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " "failed", zio); } return (ECKSUM); } /* * Iterate over all combinations of N bad vdevs and attempt a reconstruction. * Note that the algorithm below is non-optimal because it doesn't take into * account how reconstruction is actually performed. For example, with * triple-parity RAID-Z the reconstruction procedure is the same if column 4 * is targeted as invalid as if columns 1 and 4 are targeted since in both * cases we'd only use parity information in column 0. * * The order that we find the various possible combinations of failed * disks is dictated by these rules: * - Examine each "slot" (the "i" in tgts[i]) * - Try to increment this slot (tgts[i] += 1) * - if we can't increment because it runs into the next slot, * reset our slot to the minimum, and examine the next slot * * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose * 3 columns to reconstruct), we will generate the following sequence: * * STATE ACTION * 0 1 2 special case: skip since these are all parity * 0 1 3 first slot: reset to 0; middle slot: increment to 2 * 0 2 3 first slot: increment to 1 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 * 0 1 4 first: reset to 0; middle: increment to 2 * 0 2 4 first: increment to 1 * 1 2 4 first: reset to 0; middle: increment to 3 * 0 3 4 first: increment to 1 * 1 3 4 first: increment to 2 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 * 0 1 5 first: reset to 0; middle: increment to 2 * 0 2 5 first: increment to 1 * 1 2 5 first: reset to 0; middle: increment to 3 * 0 3 5 first: increment to 1 * 1 3 5 first: increment to 2 * 2 3 5 first: reset to 0; middle: increment to 4 * 0 4 5 first: increment to 1 * 1 4 5 first: increment to 2 * 2 4 5 first: increment to 3 * 3 4 5 done * * This strategy works for dRAID but is less efficient when there are a large * number of child vdevs and therefore permutations to check. Furthermore, * since the raidz_map_t rows likely do not overlap, reconstruction would be * possible as long as there are no more than nparity data errors per row. * These additional permutations are not currently checked but could be as * a future improvement. * * Returns 0 on success, ECKSUM on failure. */ static int vdev_raidz_combrec(zio_t *zio) { int nparity = vdev_get_nparity(zio->io_vd); raidz_map_t *rm = zio->io_vsd; int physical_width = zio->io_vd->vdev_children; int original_width = (rm->rm_original_width != 0) ? rm->rm_original_width : physical_width; for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; int total_errors = 0; for (int c = 0; c < rr->rr_cols; c++) { if (rr->rr_col[c].rc_error) total_errors++; } if (total_errors > nparity) return (vdev_raidz_worst_error(rr)); } for (int num_failures = 1; num_failures <= nparity; num_failures++) { int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *ltgts = &tstore[1]; /* value is logical child ID */ /* * Determine number of logical children, n. See comment * above raidz_simulate_failure(). */ int n = 0; for (int w = physical_width; w >= original_width; w--) { n += w; } ASSERT3U(num_failures, <=, nparity); ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); /* Handle corner cases in combrec logic */ ltgts[-1] = -1; for (int i = 0; i < num_failures; i++) { ltgts[i] = i; } ltgts[num_failures] = n; for (;;) { int err = raidz_reconstruct(zio, ltgts, num_failures, nparity); if (err == EINVAL) { /* * Reconstruction not possible with this # * failures; try more failures. */ break; } else if (err == 0) return (0); /* Compute next targets to try */ for (int t = 0; ; t++) { ASSERT3U(t, <, num_failures); ltgts[t]++; if (ltgts[t] == n) { /* try more failures */ ASSERT3U(t, ==, num_failures - 1); if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { zfs_dbgmsg("reconstruction " "failed for num_failures=" "%u; tried all " "combinations", num_failures); } break; } ASSERT3U(ltgts[t], <, n); ASSERT3U(ltgts[t], <=, ltgts[t + 1]); /* * If that spot is available, we're done here. * Try the next combination. */ if (ltgts[t] != ltgts[t + 1]) break; // found next combination /* * Otherwise, reset this tgt to the minimum, * and move on to the next tgt. */ ltgts[t] = ltgts[t - 1] + 1; ASSERT3U(ltgts[t], ==, t); } /* Increase the number of failures and keep trying. */ if (ltgts[num_failures - 1] == n) break; } } if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) zfs_dbgmsg("reconstruction failed for all num_failures"); return (ECKSUM); } void vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) { for (uint64_t row = 0; row < rm->rm_nrows; row++) { raidz_row_t *rr = rm->rm_row[row]; vdev_raidz_reconstruct_row(rm, rr, t, nt); } } /* * Complete a write IO operation on a RAIDZ VDev * * Outline: * 1. Check for errors on the child IOs. * 2. Return, setting an error code if too few child VDevs were written * to reconstruct the data later. Note that partial writes are * considered successful if they can be reconstructed at all. */ static void vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) { int normal_errors = 0; int shadow_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error != 0) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ normal_errors++; } if (rc->rc_shadow_error != 0) { ASSERT(rc->rc_shadow_error != ECKSUM); shadow_errors++; } } /* * Treat partial writes as a success. If we couldn't write enough * columns to reconstruct the data, the I/O failed. Otherwise, good * enough. Note that in the case of a shadow write (during raidz * expansion), depending on if we crash, either the normal (old) or * shadow (new) location may become the "real" version of the block, * so both locations must have sufficient redundancy. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ if (normal_errors > rr->rr_firstdatacol || shadow_errors > rr->rr_firstdatacol) { zio->io_error = zio_worst_error(zio->io_error, vdev_raidz_worst_error(rr)); } } static void vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr) { int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; /* * If scrubbing and a replacing/sparing child vdev determined * that not all of its children have an identical copy of the * data, then clear the error so the column is treated like * any other read and force a repair to correct the damage. */ if (rc->rc_error == ECKSUM) { ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); vdev_raidz_checksum_error(zio, rc, rc->rc_abd); rc->rc_force_repair = 1; rc->rc_error = 0; } if (rc->rc_error) { if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; total_errors++; } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } } /* * If there were data errors and the number of errors we saw was * correctable -- less than or equal to the number of parity disks read * -- reconstruct based on the missing data. */ if (data_errors != 0 && total_errors <= rr->rr_firstdatacol - parity_untried) { /* * We either attempt to read all the parity columns or * none of them. If we didn't try to read parity, we * wouldn't be here in the correctable case. There must * also have been fewer parity errors than parity * columns or, again, we wouldn't be in this code path. */ ASSERT(parity_untried == 0); ASSERT(parity_errors < rr->rr_firstdatacol); /* * Identify the data columns that reported an error. */ int n = 0; int tgts[VDEV_RAIDZ_MAXPARITY]; for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error != 0) { ASSERT(n < VDEV_RAIDZ_MAXPARITY); tgts[n++] = c; } } ASSERT(rr->rr_firstdatacol >= n); vdev_raidz_reconstruct_row(rm, rr, tgts, n); } } /* * Return the number of reads issued. */ static int vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; int nread = 0; rr->rr_missingdata = 0; rr->rr_missingparity = 0; /* * If this rows contains empty sectors which are not required * for a normal read then allocate an ABD for them now so they * may be read, verified, and any needed repairs performed. */ if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) vdev_draid_map_alloc_empty(zio, rr); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_tried || rc->rc_size == 0) continue; zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); nread++; } return (nread); } /* * We're here because either there were too many errors to even attempt * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() * failed. In either case, there is enough bad data to prevent reconstruction. * Start checksum ereports for all children which haven't failed. */ static void vdev_raidz_io_done_unrecoverable(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; if (rc->rc_error != 0) continue; zio_bad_cksum_t zbc; zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; mutex_enter(&cvd->vdev_stat_lock); cvd->vdev_stat.vs_checksum_errors++; mutex_exit(&cvd->vdev_stat_lock); (void) zfs_ereport_start_checksum(zio->io_spa, cvd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, &zbc); } } } void vdev_raidz_io_done(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; ASSERT(zio->io_bp != NULL); if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { if (rm->rm_phys_col) { /* * This is an aggregated read. Copy the data and status * from the aggregate abd's to the individual rows. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_tried || rc->rc_size == 0) continue; raidz_col_t *prc = &rm->rm_phys_col[rc->rc_devidx]; rc->rc_error = prc->rc_error; rc->rc_tried = prc->rc_tried; rc->rc_skipped = prc->rc_skipped; if (c >= rr->rr_firstdatacol) { /* * Note: this is slightly faster * than using abd_copy_off(). */ char *physbuf = abd_to_buf( prc->rc_abd); void *physloc = physbuf + rc->rc_offset - prc->rc_offset; abd_copy_from_buf(rc->rc_abd, physloc, rc->rc_size); } } } } for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, rm, rr); } if (raidz_checksum_verify(zio) == 0) { if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) goto done; for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); } zio_checksum_verified(zio); } else { /* * A sequential resilver has no checksum which makes * combinatoral reconstruction impossible. This code * path is unreachable since raidz_checksum_verify() * has no checksum to verify and must succeed. */ ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); /* * This isn't a typical situation -- either we got a * read error or a child silently returned bad data. * Read every block so we can try again with as much * data and parity as we can track down. If we've * already been through once before, all children will * be marked as tried so we'll proceed to combinatorial * reconstruction. */ int nread = 0; for (int i = 0; i < rm->rm_nrows; i++) { nread += vdev_raidz_read_all(zio, rm->rm_row[i]); } if (nread != 0) { /* * Normally our stage is VDEV_IO_DONE, but if * we've already called redone(), it will have * changed to VDEV_IO_START, in which case we * don't want to call redone() again. */ if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) zio_vdev_io_redone(zio); return; } /* * It would be too expensive to try every possible * combination of failed sectors in every row, so * instead we try every combination of failed current or * past physical disk. This means that if the incorrect * sectors were all on Nparity disks at any point in the * past, we will find the correct data. The only known * case where this is less durable than a non-expanded * RAIDZ, is if we have a silent failure during * expansion. In that case, one block could be * partially in the old format and partially in the * new format, so we'd lost some sectors from the old * format and some from the new format. * * e.g. logical_width=4 physical_width=6 * the 15 (6+5+4) possible failed disks are: * width=6 child=0 * width=6 child=1 * width=6 child=2 * width=6 child=3 * width=6 child=4 * width=6 child=5 * width=5 child=0 * width=5 child=1 * width=5 child=2 * width=5 child=3 * width=5 child=4 * width=4 child=0 * width=4 child=1 * width=4 child=2 * width=4 child=3 * And we will try every combination of Nparity of these * failing. * * As a first pass, we can generate every combo, * and try reconstructing, ignoring any known * failures. If any row has too many known + simulated * failures, then we bail on reconstructing with this * number of simulated failures. As an improvement, * we could detect the number of whole known failures * (i.e. we have known failures on these disks for * every row; the disks never succeeded), and * subtract that from the max # failures to simulate. * We could go even further like the current * combrec code, but that doesn't seem like it * gains us very much. If we simulate a failure * that is also a known failure, that's fine. */ zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { vdev_raidz_io_done_unrecoverable(zio); } } } done: if (rm->rm_lr != NULL) { zfs_rangelock_exit(rm->rm_lr); rm->rm_lr = NULL; } } static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { vdev_raidz_t *vdrz = vd->vdev_tsd; if (faulted > vdrz->vd_nparity) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } /* * Determine if any portion of the provided block resides on a child vdev * with a dirty DTL and therefore needs to be resilvered. The function * assumes that at least one DTL is dirty which implies that full stripe * width blocks must be resilvered. */ static boolean_t vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_raidz_t *vdrz = vd->vdev_tsd; /* * If we're in the middle of a RAIDZ expansion, this block may be in * the old and/or new location. For simplicity, always resilver it. */ if (vdrz->vn_vre.vre_state == DSS_SCANNING) return (B_TRUE); uint64_t dcols = vd->vdev_children; uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = DVA_GET_OFFSET(dva) >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = ((psize - 1) >> ashift) + 1; /* The first column for this stripe. */ uint64_t f = b % dcols; /* Unreachable by sequential resilver. */ ASSERT3U(phys_birth, !=, TXG_UNKNOWN); if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) return (B_FALSE); if (s + nparity >= dcols) return (B_TRUE); for (uint64_t c = 0; c < s + nparity; c++) { uint64_t devidx = (f + c) % dcols; vdev_t *cvd = vd->vdev_child[devidx]; /* * dsl_scan_need_resilver() already checked vd with * vdev_dtl_contains(). So here just check cvd with * vdev_dtl_empty(), cheaper and a good approximation. */ if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) return (B_TRUE); } return (B_FALSE); } static void vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { (void) remain_rs; vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); vdev_raidz_t *vdrz = raidvd->vdev_tsd; if (vdrz->vn_vre.vre_state == DSS_SCANNING) { /* * We're in the middle of expansion, in which case the * translation is in flux. Any answer we give may be wrong * by the time we return, so it isn't safe for the caller to * act on it. Therefore we say that this range isn't present * on any children. The only consumers of this are "zpool * initialize" and trimming, both of which are "best effort" * anyway. */ physical_rs->rs_start = physical_rs->rs_end = 0; remain_rs->rs_start = remain_rs->rs_end = 0; return; } uint64_t width = vdrz->vd_physical_width; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; /* make sure the offsets are block-aligned */ ASSERT0(logical_rs->rs_start % (1 << ashift)); ASSERT0(logical_rs->rs_end % (1 << ashift)); uint64_t b_start = logical_rs->rs_start >> ashift; uint64_t b_end = logical_rs->rs_end >> ashift; uint64_t start_row = 0; if (b_start > tgt_col) /* avoid underflow */ start_row = ((b_start - tgt_col - 1) / width) + 1; uint64_t end_row = 0; if (b_end > tgt_col) end_row = ((b_end - tgt_col - 1) / width) + 1; physical_rs->rs_start = start_row << ashift; physical_rs->rs_end = end_row << ashift; ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, logical_rs->rs_end - logical_rs->rs_start); } static void raidz_reflow_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = arg; int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; vdev_raidz_expand_t *vre = spa->spa_raidz_expand; /* * Ensure there are no i/os to the range that is being committed. */ uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); mutex_enter(&vre->vre_lock); uint64_t new_offset = MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); /* * We should not have committed anything that failed. */ VERIFY3U(vre->vre_failed_offset, >=, old_offset); mutex_exit(&vre->vre_lock); zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, old_offset, new_offset - old_offset, RL_WRITER); /* * Update the uberblock that will be written when this txg completes. */ RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); vre->vre_offset_pertxg[txgoff] = 0; zfs_rangelock_exit(lr); mutex_enter(&vre->vre_lock); vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; vre->vre_bytes_copied_pertxg[txgoff] = 0; mutex_exit(&vre->vre_lock); vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); VERIFY0(zap_update(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); } static void raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = arg; vdev_raidz_expand_t *vre = spa->spa_raidz_expand; vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); vdev_raidz_t *vdrz = raidvd->vdev_tsd; for (int i = 0; i < TXG_SIZE; i++) VERIFY0(vre->vre_offset_pertxg[i]); reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; re->re_logical_width = vdrz->vd_physical_width; mutex_enter(&vdrz->vd_expand_lock); avl_add(&vdrz->vd_expand_txgs, re); mutex_exit(&vdrz->vd_expand_lock); vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); /* * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS * will get written (based on vd_expand_txgs). */ vdev_config_dirty(vd); /* * Before we change vre_state, the on-disk state must reflect that we * have completed all copying, so that vdev_raidz_io_start() can use * vre_state to determine if the reflow is in progress. See also the * end of spa_raidz_expand_thread(). */ VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, raidvd->vdev_ms_count << raidvd->vdev_ms_shift); vre->vre_end_time = gethrestime_sec(); vre->vre_state = DSS_FINISHED; uint64_t state = vre->vre_state; VERIFY0(zap_update(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, sizeof (state), 1, &state, tx)); uint64_t end_time = vre->vre_end_time; VERIFY0(zap_update(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, sizeof (end_time), 1, &end_time, tx)); spa->spa_uberblock.ub_raidz_reflow_info = 0; spa_history_log_internal(spa, "raidz vdev expansion completed", tx, "%s vdev %llu new width %llu", spa_name(spa), (unsigned long long)vd->vdev_id, (unsigned long long)vd->vdev_children); spa->spa_raidz_expand = NULL; raidvd->vdev_rz_expanding = B_FALSE; spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); spa_notify_waiters(spa); /* * While we're in syncing context take the opportunity to * setup a scrub. All the data has been sucessfully copied * but we have not validated any checksums. */ setup_sync_arg_t setup_sync_arg = { .func = POOL_SCAN_SCRUB, .txgstart = 0, .txgend = 0, }; if (zfs_scrub_after_expand && dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) { dsl_scan_setup_sync(&setup_sync_arg, tx); } } /* * State of one copy batch. */ typedef struct raidz_reflow_arg { vdev_raidz_expand_t *rra_vre; /* Global expantion state. */ zfs_locked_range_t *rra_lr; /* Range lock of this batch. */ uint64_t rra_txg; /* TXG of this batch. */ uint_t rra_ashift; /* Ashift of the vdev. */ uint32_t rra_tbd; /* Number of in-flight ZIOs. */ uint32_t rra_writes; /* Number of write ZIOs. */ zio_t *rra_zio[]; /* Write ZIO pointers. */ } raidz_reflow_arg_t; /* * Write of the new location on one child is done. Once all of them are done * we can unlock and free everything. */ static void raidz_reflow_write_done(zio_t *zio) { raidz_reflow_arg_t *rra = zio->io_private; vdev_raidz_expand_t *vre = rra->rra_vre; abd_free(zio->io_abd); mutex_enter(&vre->vre_lock); if (zio->io_error != 0) { /* Force a reflow pause on errors */ vre->vre_failed_offset = MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); } ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); vre->vre_outstanding_bytes -= zio->io_size; if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < vre->vre_failed_offset) { vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += zio->io_size; } cv_signal(&vre->vre_cv); boolean_t done = (--rra->rra_tbd == 0); mutex_exit(&vre->vre_lock); if (!done) return; spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); zfs_rangelock_exit(rra->rra_lr); kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes); } /* * Read of the old location on one child is done. Once all of them are done * writes should have all the data and we can issue them. */ static void raidz_reflow_read_done(zio_t *zio) { raidz_reflow_arg_t *rra = zio->io_private; vdev_raidz_expand_t *vre = rra->rra_vre; /* Reads of only one block use write ABDs. For bigger free gangs. */ if (zio->io_size > (1 << rra->rra_ashift)) abd_free(zio->io_abd); /* * If the read failed, or if it was done on a vdev that is not fully * healthy (e.g. a child that has a resilver in progress), we may not * have the correct data. Note that it's OK if the write proceeds. * It may write garbage but the location is otherwise unused and we * will retry later due to vre_failed_offset. */ if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", (long long)rra->rra_lr->lr_offset, (long long)rra->rra_lr->lr_length, (long long)rra->rra_txg, zio->io_error, vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), vdev_dtl_empty(zio->io_vd, DTL_MISSING)); mutex_enter(&vre->vre_lock); /* Force a reflow pause on errors */ vre->vre_failed_offset = MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); mutex_exit(&vre->vre_lock); } if (atomic_dec_32_nv(&rra->rra_tbd) > 0) return; uint32_t writes = rra->rra_tbd = rra->rra_writes; for (uint64_t i = 0; i < writes; i++) zio_nowait(rra->rra_zio[i]); } static void raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, dmu_tx_t *tx) { int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (offset == 0) return; mutex_enter(&vre->vre_lock); ASSERT3U(vre->vre_offset, <=, offset); vre->vre_offset = offset; mutex_exit(&vre->vre_lock); if (vre->vre_offset_pertxg[txgoff] == 0) { dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, spa, tx); } vre->vre_offset_pertxg[txgoff] = offset; } static boolean_t vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) { for (int i = 0; i < raidz_vd->vdev_children; i++) { /* Quick check if a child is being replaced */ if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) return (B_TRUE); } return (B_FALSE); } static boolean_t raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint_t ashift = vd->vdev_top->vdev_ashift; zfs_range_seg_t *rs = zfs_range_tree_first(rt); if (rt == NULL) return (B_FALSE); uint64_t offset = zfs_rs_get_start(rs, rt); ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); uint64_t size = zfs_rs_get_end(rs, rt) - offset; ASSERT3U(size, >=, 1 << ashift); ASSERT(IS_P2ALIGNED(size, 1 << ashift)); uint64_t blkid = offset >> ashift; uint_t old_children = vd->vdev_children - 1; /* * We can only progress to the point that writes will not overlap * with blocks whose progress has not yet been recorded on disk. * Since partially-copied rows are still read from the old location, * we need to stop one row before the sector-wise overlap, to prevent * row-wise overlap. * * Note that even if we are skipping over a large unallocated region, * we can't move the on-disk progress to `offset`, because concurrent * writes/allocations could still use the currently-unallocated * region. */ uint64_t ubsync_blkid = RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; uint64_t next_overwrite_blkid = ubsync_blkid + ubsync_blkid / old_children - old_children; VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); if (blkid >= next_overwrite_blkid) { raidz_reflow_record_progress(vre, next_overwrite_blkid << ashift, tx); return (B_TRUE); } size = MIN(size, raidz_expand_max_copy_bytes); size = MIN(size, (uint64_t)old_children * MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE)); size = MAX(size, 1 << ashift); uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid); size = (uint64_t)blocks << ashift; zfs_range_tree_remove(rt, offset, size); uint_t reads = MIN(blocks, old_children); uint_t writes = MIN(blocks, vd->vdev_children); raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) + sizeof (zio_t *) * writes, KM_SLEEP); rra->rra_vre = vre; rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, offset, size, RL_WRITER); rra->rra_txg = dmu_tx_get_txg(tx); rra->rra_ashift = ashift; rra->rra_tbd = reads; rra->rra_writes = writes; raidz_reflow_record_progress(vre, offset + size, tx); /* * SCL_STATE will be released when the read and write are done, * by raidz_reflow_write_done(). */ spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* check if a replacing vdev was added, if so treat it as an error */ if (vdev_raidz_expand_child_replacing(vd)) { zfs_dbgmsg("replacing vdev encountered, reflow paused at " "offset=%llu txg=%llu", (long long)rra->rra_lr->lr_offset, (long long)rra->rra_txg); mutex_enter(&vre->vre_lock); vre->vre_failed_offset = MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); cv_signal(&vre->vre_cv); mutex_exit(&vre->vre_lock); /* drop everything we acquired */ spa_config_exit(spa, SCL_STATE, spa); zfs_rangelock_exit(rra->rra_lr); kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes); return (B_TRUE); } mutex_enter(&vre->vre_lock); vre->vre_outstanding_bytes += size; mutex_exit(&vre->vre_lock); /* Allocate ABD and ZIO for each child we write. */ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; zio_t *pio = spa->spa_txg_zio[txgoff]; uint_t b = blocks / vd->vdev_children; uint_t bb = blocks % vd->vdev_children; for (uint_t i = 0; i < writes; i++) { uint_t n = b + (i < bb); abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE); rra->rra_zio[i] = zio_vdev_child_io(pio, NULL, vd->vdev_child[(blkid + i) % vd->vdev_children], ((blkid + i) / vd->vdev_children) << ashift, abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra); } /* * Allocate and issue ZIO for each child we read. For reads of only * one block we can use respective writer ABDs, since they will also * have only one block. For bigger reads create gang ABDs and fill * them with respective blocks from writer ABDs. */ b = blocks / old_children; bb = blocks % old_children; for (uint_t i = 0; i < reads; i++) { uint_t n = b + (i < bb); abd_t *abd; if (n > 1) { abd = abd_alloc_gang(); for (uint_t j = 0; j < n; j++) { uint_t b = j * old_children + i; abd_t *cabd = abd_get_offset_size( rra->rra_zio[b % vd->vdev_children]->io_abd, (b / vd->vdev_children) << ashift, 1 << ashift); abd_gang_add(abd, cabd, B_TRUE); } } else { abd = rra->rra_zio[i]->io_abd; } zio_nowait(zio_vdev_child_io(pio, NULL, vd->vdev_child[(blkid + i) % old_children], ((blkid + i) / old_children) << ashift, abd, n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra)); } return (B_FALSE); } /* * For testing (ztest specific) */ static void raidz_expand_pause(uint_t pause_point) { while (raidz_expand_pause_point != 0 && raidz_expand_pause_point <= pause_point) delay(hz); } static void raidz_scratch_child_done(zio_t *zio) { zio_t *pio = zio->io_private; mutex_enter(&pio->io_lock); pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); } /* * Reflow the beginning portion of the vdev into an intermediate scratch area * in memory and on disk. This operation must be persisted on disk before we * proceed to overwrite the beginning portion with the reflowed data. * * This multi-step task can fail to complete if disk errors are encountered * and we can return here after a pause (waiting for disk to become healthy). */ static void raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) { vdev_raidz_expand_t *vre = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; zio_t *pio; int error; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); int ashift = raidvd->vdev_ashift; uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, uint64_t); uint64_t logical_size = write_size * raidvd->vdev_children; uint64_t read_size = P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 1 << ashift); /* * The scratch space must be large enough to get us to the point * that one row does not overlap itself when moved. This is checked * by vdev_raidz_attach_check(). */ VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); VERIFY3U(write_size, <=, read_size); zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 0, logical_size, RL_WRITER); abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), KM_SLEEP); for (int i = 0; i < raidvd->vdev_children; i++) { abds[i] = abd_alloc_linear(read_size, B_FALSE); } raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); /* * If we have already written the scratch area then we must read from * there, since new writes were redirected there while we were paused * or the original location may have been partially overwritten with * reflowed data. */ if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); /* * Read from scratch space. */ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (int i = 0; i < raidvd->vdev_children; i++) { /* * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE * to the offset to calculate the physical offset to * write to. Passing in a negative offset makes us * access the scratch area. */ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); if (error != 0) { zfs_dbgmsg("reflow: error %d reading scratch location", error); goto io_error_exit; } goto overwrite; } /* * Read from original location. */ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (int i = 0; i < raidvd->vdev_children - 1; i++) { ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 0, abds[i], read_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); if (error != 0) { zfs_dbgmsg("reflow: error %d reading original location", error); io_error_exit: for (int i = 0; i < raidvd->vdev_children; i++) abd_free(abds[i]); kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); zfs_rangelock_exit(lr); spa_config_exit(spa, SCL_STATE, FTAG); return; } raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); /* * Reflow in memory. */ uint64_t logical_sectors = logical_size >> ashift; for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { int oldchild = i % (raidvd->vdev_children - 1); uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; int newchild = i % raidvd->vdev_children; uint64_t newoff = (i / raidvd->vdev_children) << ashift; /* a single sector should not be copying over itself */ ASSERT(!(newchild == oldchild && newoff == oldoff)); abd_copy_off(abds[newchild], abds[oldchild], newoff, oldoff, 1 << ashift); } /* * Verify that we filled in everything we intended to (write_size on * each child). */ VERIFY0(logical_sectors % raidvd->vdev_children); VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, write_size); /* * Write to scratch location (boot area). */ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (int i = 0; i < raidvd->vdev_children; i++) { /* * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to * the offset to calculate the physical offset to write to. * Passing in a negative offset lets us access the boot area. */ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); if (error != 0) { zfs_dbgmsg("reflow: error %d writing scratch location", error); goto io_error_exit; } pio = zio_root(spa, NULL, NULL, 0); zio_flush(pio, raidvd); zio_wait(pio); zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", (long long)logical_size); raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); /* * Update uberblock to indicate that scratch space is valid. This is * needed because after this point, the real location may be * overwritten. If we crash, we need to get the data from the * scratch space, rather than the real location. * * Note: ub_timestamp is bumped so that vdev_uberblock_compare() * will prefer this uberblock. */ RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); spa->spa_ubsync.ub_timestamp++; ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); if (spa_multihost(spa)) mmp_update_uberblock(spa, &spa->spa_ubsync); zfs_dbgmsg("reflow: uberblock updated " "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", (long long)spa->spa_ubsync.ub_txg, (long long)logical_size, (long long)spa->spa_ubsync.ub_timestamp); raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); /* * Overwrite with reflow'ed data. */ overwrite: pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (int i = 0; i < raidvd->vdev_children; i++) { zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 0, abds[i], write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); if (error != 0) { /* * When we exit early here and drop the range lock, new * writes will go into the scratch area so we'll need to * read from there when we return after pausing. */ zfs_dbgmsg("reflow: error %d writing real location", error); /* * Update the uberblock that is written when this txg completes. */ RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, logical_size); goto io_error_exit; } pio = zio_root(spa, NULL, NULL, 0); zio_flush(pio, raidvd); zio_wait(pio); zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", (long long)logical_size); for (int i = 0; i < raidvd->vdev_children; i++) abd_free(abds[i]); kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); /* * Update uberblock to indicate that the initial part has been * reflow'ed. This is needed because after this point (when we exit * the rangelock), we allow regular writes to this region, which will * be written to the new location only (because reflow_offset_next == * reflow_offset_synced). If we crashed and re-copied from the * scratch space, we would lose the regular writes. */ RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, logical_size); spa->spa_ubsync.ub_timestamp++; ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); if (spa_multihost(spa)) mmp_update_uberblock(spa, &spa->spa_ubsync); zfs_dbgmsg("reflow: uberblock updated " "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", (long long)spa->spa_ubsync.ub_txg, (long long)logical_size, (long long)spa->spa_ubsync.ub_timestamp); raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); /* * Update progress. */ vre->vre_offset = logical_size; zfs_rangelock_exit(lr); spa_config_exit(spa, SCL_STATE, FTAG); int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; vre->vre_offset_pertxg[txgoff] = vre->vre_offset; vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; /* * Note - raidz_reflow_sync() will update the uberblock state to * RRSS_SCRATCH_INVALID_SYNCED_REFLOW */ raidz_reflow_sync(spa, tx); raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); } /* * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work * here. No other i/o can be in progress, so we don't need the vre_rangelock. */ void vdev_raidz_reflow_copy_scratch(spa_t *spa) { vdev_raidz_expand_t *vre = spa->spa_raidz_expand; uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); ASSERT0(logical_size % raidvd->vdev_children); uint64_t write_size = logical_size / raidvd->vdev_children; zio_t *pio; /* * Read from scratch space. */ abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), KM_SLEEP); for (int i = 0; i < raidvd->vdev_children; i++) { abds[i] = abd_alloc_linear(write_size, B_FALSE); } pio = zio_root(spa, NULL, NULL, 0); for (int i = 0; i < raidvd->vdev_children; i++) { /* * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to * the offset to calculate the physical offset to write to. * Passing in a negative offset lets us access the boot area. */ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0, raidz_scratch_child_done, pio)); } zio_wait(pio); /* * Overwrite real location with reflow'ed data. */ pio = zio_root(spa, NULL, NULL, 0); for (int i = 0; i < raidvd->vdev_children; i++) { zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 0, abds[i], write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 0, raidz_scratch_child_done, pio)); } zio_wait(pio); pio = zio_root(spa, NULL, NULL, 0); zio_flush(pio, raidvd); zio_wait(pio); zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " "to real location", (long long)logical_size); for (int i = 0; i < raidvd->vdev_children; i++) abd_free(abds[i]); kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); /* * Update uberblock. */ RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); spa->spa_ubsync.ub_timestamp++; VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); if (spa_multihost(spa)) mmp_update_uberblock(spa, &spa->spa_ubsync); zfs_dbgmsg("reflow recovery: uberblock updated " "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", (long long)spa->spa_ubsync.ub_txg, (long long)logical_size, (long long)spa->spa_ubsync.ub_timestamp); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, spa_first_txg(spa)); int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; vre->vre_offset = logical_size; vre->vre_offset_pertxg[txgoff] = vre->vre_offset; vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; /* * Note that raidz_reflow_sync() will update the uberblock once more */ raidz_reflow_sync(spa, tx); dmu_tx_commit(tx); spa_config_exit(spa, SCL_STATE, FTAG); } static boolean_t spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) { (void) zthr; spa_t *spa = arg; return (spa->spa_raidz_expand != NULL && !spa->spa_raidz_expand->vre_waiting_for_resilver); } /* * RAIDZ expansion background thread * * Can be called multiple times if the reflow is paused */ static void spa_raidz_expand_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_raidz_expand_t *vre = spa->spa_raidz_expand; if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) vre->vre_offset = 0; else vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); /* Reflow the begining portion using the scratch area */ if (vre->vre_offset == 0) { VERIFY0(dsl_sync_task(spa_name(spa), NULL, raidz_reflow_scratch_sync, vre, 0, ZFS_SPACE_CHECK_NONE)); /* if we encountered errors then pause */ if (vre->vre_offset == 0) { mutex_enter(&vre->vre_lock); vre->vre_waiting_for_resilver = B_TRUE; mutex_exit(&vre->vre_lock); return; } } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); uint64_t guid = raidvd->vdev_guid; /* Iterate over all the remaining metaslabs */ for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; i < raidvd->vdev_ms_count && !zthr_iscancelled(zthr) && vre->vre_failed_offset == UINT64_MAX; i++) { metaslab_t *msp = raidvd->vdev_ms[i]; metaslab_disable(msp); mutex_enter(&msp->ms_lock); /* * The metaslab may be newly created (for the expanded * space), in which case its trees won't exist yet, * so we need to bail out early. */ if (msp->ms_new) { mutex_exit(&msp->ms_lock); metaslab_enable(msp, B_FALSE, B_FALSE); continue; } VERIFY0(metaslab_load(msp)); /* * We want to copy everything except the free (allocatable) * space. Note that there may be a little bit more free * space (e.g. in ms_defer), and it's fine to copy that too. */ uint64_t shift, start; zfs_range_seg_type_t type = metaslab_calculate_range_tree_type( raidvd, msp, &start, &shift); zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL, start, shift); zfs_range_tree_add(rt, msp->ms_start, msp->ms_size); zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, rt); mutex_exit(&msp->ms_lock); /* * Force the last sector of each metaslab to be copied. This * ensures that we advance the on-disk progress to the end of * this metaslab while the metaslab is disabled. Otherwise, we * could move past this metaslab without advancing the on-disk * progress, and then an allocation to this metaslab would not * be copied. */ int sectorsz = 1 << raidvd->vdev_ashift; uint64_t ms_last_offset = msp->ms_start + msp->ms_size - sectorsz; if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) { zfs_range_tree_add(rt, ms_last_offset, sectorsz); } /* * When we are resuming from a paused expansion (i.e. * when importing a pool with a expansion in progress), * discard any state that we have already processed. */ if (vre->vre_offset > msp->ms_start) { zfs_range_tree_clear(rt, msp->ms_start, vre->vre_offset - msp->ms_start); } while (!zthr_iscancelled(zthr) && !zfs_range_tree_is_empty(rt) && vre->vre_failed_offset == UINT64_MAX) { /* * We need to periodically drop the config lock so that * writers can get in. Additionally, we can't wait * for a txg to sync while holding a config lock * (since a waiting writer could cause a 3-way deadlock * with the sync thread, which also gets a config * lock for reader). So we can't hold the config lock * while calling dmu_tx_assign(). */ spa_config_exit(spa, SCL_CONFIG, FTAG); /* * If requested, pause the reflow when the amount * specified by raidz_expand_max_reflow_bytes is reached * * This pause is only used during testing or debugging. */ while (raidz_expand_max_reflow_bytes != 0 && raidz_expand_max_reflow_bytes <= vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { delay(hz); } mutex_enter(&vre->vre_lock); while (vre->vre_outstanding_bytes > raidz_expand_max_copy_bytes) { cv_wait(&vre->vre_cv, &vre->vre_lock); } mutex_exit(&vre->vre_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); /* * Reacquire the vdev_config lock. Theoretically, the * vdev_t that we're expanding may have changed. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); boolean_t needsync = raidz_reflow_impl(raidvd, vre, rt, tx); dmu_tx_commit(tx); if (needsync) { spa_config_exit(spa, SCL_CONFIG, FTAG); txg_wait_synced(spa->spa_dsl_pool, txg); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); } } spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_enable(msp, B_FALSE, B_FALSE); zfs_range_tree_vacate(rt, NULL, NULL); zfs_range_tree_destroy(rt); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); } spa_config_exit(spa, SCL_CONFIG, FTAG); /* * The txg_wait_synced() here ensures that all reflow zio's have * completed, and vre_failed_offset has been set if necessary. It * also ensures that the progress of the last raidz_reflow_sync() is * written to disk before raidz_reflow_complete_sync() changes the * in-memory vre_state. vdev_raidz_io_start() uses vre_state to * determine if a reflow is in progress, in which case we may need to * write to both old and new locations. Therefore we can only change * vre_state once this is not necessary, which is once the on-disk * progress (in spa_ubsync) has been set past any possible writes (to * the end of the last metaslab). */ txg_wait_synced(spa->spa_dsl_pool, 0); if (!zthr_iscancelled(zthr) && vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { /* * We are not being canceled or paused, so the reflow must be * complete. In that case also mark it as completed on disk. */ ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); VERIFY0(dsl_sync_task(spa_name(spa), NULL, raidz_reflow_complete_sync, spa, 0, ZFS_SPACE_CHECK_NONE)); (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); } else { /* * Wait for all copy zio's to complete and for all the * raidz_reflow_sync() synctasks to be run. */ spa_history_log_internal(spa, "reflow pause", NULL, "offset=%llu failed_offset=%lld", (long long)vre->vre_offset, (long long)vre->vre_failed_offset); mutex_enter(&vre->vre_lock); if (vre->vre_failed_offset != UINT64_MAX) { /* * Reset progress so that we will retry everything * after the point that something failed. */ vre->vre_offset = vre->vre_failed_offset; vre->vre_failed_offset = UINT64_MAX; vre->vre_waiting_for_resilver = B_TRUE; } mutex_exit(&vre->vre_lock); } } void spa_start_raidz_expansion_thread(spa_t *spa) { ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", spa_raidz_expand_thread_check, spa_raidz_expand_thread, spa, defclsyspri); } void raidz_dtl_reassessed(vdev_t *vd) { spa_t *spa = vd->vdev_spa; if (spa->spa_raidz_expand != NULL) { vdev_raidz_expand_t *vre = spa->spa_raidz_expand; /* * we get called often from vdev_dtl_reassess() so make * sure it's our vdev and any replacing is complete */ if (vd->vdev_top->vdev_id == vre->vre_vdev_id && !vdev_raidz_expand_child_replacing(vd->vdev_top)) { mutex_enter(&vre->vre_lock); if (vre->vre_waiting_for_resilver) { vdev_dbgmsg(vd, "DTL reassessed, " "continuing raidz expansion"); vre->vre_waiting_for_resilver = B_FALSE; zthr_wakeup(spa->spa_raidz_expand_zthr); } mutex_exit(&vre->vre_lock); } } } int vdev_raidz_attach_check(vdev_t *new_child) { vdev_t *raidvd = new_child->vdev_parent; uint64_t new_children = raidvd->vdev_children; /* * We use the "boot" space as scratch space to handle overwriting the * initial part of the vdev. If it is too small, then this expansion * is not allowed. This would be very unusual (e.g. ashift > 13 and * >200 children). */ if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { return (EINVAL); } return (0); } void vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) { vdev_t *new_child = arg; spa_t *spa = new_child->vdev_spa; vdev_t *raidvd = new_child->vdev_parent; vdev_raidz_t *vdrz = raidvd->vdev_tsd; ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); ASSERT3P(raidvd->vdev_top, ==, raidvd); ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, new_child); spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); vdrz->vd_physical_width++; VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; vdrz->vn_vre.vre_offset = 0; vdrz->vn_vre.vre_failed_offset = UINT64_MAX; spa->spa_raidz_expand = &vdrz->vn_vre; zthr_wakeup(spa->spa_raidz_expand_zthr); /* * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get * written to the config. */ vdev_config_dirty(raidvd); vdrz->vn_vre.vre_start_time = gethrestime_sec(); vdrz->vn_vre.vre_end_time = 0; vdrz->vn_vre.vre_state = DSS_SCANNING; vdrz->vn_vre.vre_bytes_copied = 0; uint64_t state = vdrz->vn_vre.vre_state; VERIFY0(zap_update(spa->spa_meta_objset, raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, sizeof (state), 1, &state, tx)); uint64_t start_time = vdrz->vn_vre.vre_start_time; VERIFY0(zap_update(spa->spa_meta_objset, raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, sizeof (start_time), 1, &start_time, tx)); (void) zap_remove(spa->spa_meta_objset, raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); (void) zap_remove(spa->spa_meta_objset, raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); spa_history_log_internal(spa, "raidz vdev expansion started", tx, "%s vdev %llu new width %llu", spa_name(spa), (unsigned long long)raidvd->vdev_id, (unsigned long long)raidvd->vdev_children); } int vdev_raidz_load(vdev_t *vd) { vdev_raidz_t *vdrz = vd->vdev_tsd; int err; uint64_t state = DSS_NONE; uint64_t start_time = 0; uint64_t end_time = 0; uint64_t bytes_copied = 0; if (vd->vdev_top_zap != 0) { err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, sizeof (state), 1, &state); if (err != 0 && err != ENOENT) return (err); err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, sizeof (start_time), 1, &start_time); if (err != 0 && err != ENOENT) return (err); err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, sizeof (end_time), 1, &end_time); if (err != 0 && err != ENOENT) return (err); err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, sizeof (bytes_copied), 1, &bytes_copied); if (err != 0 && err != ENOENT) return (err); } /* * If we are in the middle of expansion, vre_state should have * already been set by vdev_raidz_init(). */ EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; vdrz->vn_vre.vre_start_time = start_time; vdrz->vn_vre.vre_end_time = end_time; vdrz->vn_vre.vre_bytes_copied = bytes_copied; return (0); } int spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) { vdev_raidz_expand_t *vre = spa->spa_raidz_expand; if (vre == NULL) { /* no removal in progress; find most recent completed */ for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; if (vd->vdev_ops == &vdev_raidz_ops) { vdev_raidz_t *vdrz = vd->vdev_tsd; if (vdrz->vn_vre.vre_end_time != 0 && (vre == NULL || vdrz->vn_vre.vre_end_time > vre->vre_end_time)) { vre = &vdrz->vn_vre; } } } } if (vre == NULL) { return (SET_ERROR(ENOENT)); } pres->pres_state = vre->vre_state; pres->pres_expanding_vdev = vre->vre_vdev_id; vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); pres->pres_to_reflow = vd->vdev_stat.vs_alloc; mutex_enter(&vre->vre_lock); pres->pres_reflowed = vre->vre_bytes_copied; for (int i = 0; i < TXG_SIZE; i++) pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; mutex_exit(&vre->vre_lock); pres->pres_start_time = vre->vre_start_time; pres->pres_end_time = vre->vre_end_time; pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; return (0); } /* * Initialize private RAIDZ specific fields from the nvlist. */ static int vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) { uint_t children; nvlist_t **child; int error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error != 0) return (SET_ERROR(EINVAL)); uint64_t nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * device. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) return (SET_ERROR(EINVAL)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ nparity = 1; } vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); vdrz->vn_vre.vre_vdev_id = -1; vdrz->vn_vre.vre_offset = UINT64_MAX; vdrz->vn_vre.vre_failed_offset = UINT64_MAX; mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); vdrz->vd_physical_width = children; vdrz->vd_nparity = nparity; /* note, the ID does not exist when creating a pool */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &vdrz->vn_vre.vre_vdev_id); boolean_t reflow_in_progress = nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); if (reflow_in_progress) { spa->spa_raidz_expand = &vdrz->vn_vre; vdrz->vn_vre.vre_state = DSS_SCANNING; } vdrz->vd_original_width = children; uint64_t *txgs; unsigned int txgs_size = 0; error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, &txgs, &txgs_size); if (error == 0) { for (int i = 0; i < txgs_size; i++) { reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); re->re_txg = txgs[txgs_size - i - 1]; re->re_logical_width = vdrz->vd_physical_width - i; if (reflow_in_progress) re->re_logical_width--; avl_add(&vdrz->vd_expand_txgs, re); } vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; } if (reflow_in_progress) { vdrz->vd_original_width--; zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", children, txgs_size); } *tsd = vdrz; return (0); } static void vdev_raidz_fini(vdev_t *vd) { vdev_raidz_t *vdrz = vd->vdev_tsd; if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) vd->vdev_spa->spa_raidz_expand = NULL; reflow_node_t *re; void *cookie = NULL; avl_tree_t *tree = &vdrz->vd_expand_txgs; while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) kmem_free(re, sizeof (*re)); avl_destroy(&vdrz->vd_expand_txgs); mutex_destroy(&vdrz->vd_expand_lock); mutex_destroy(&vdrz->vn_vre.vre_lock); cv_destroy(&vdrz->vn_vre.vre_cv); zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); kmem_free(vdrz, sizeof (*vdrz)); } /* * Add RAIDZ specific fields to the config nvlist. */ static void vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) { ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); vdev_raidz_t *vdrz = vd->vdev_tsd; /* * Make sure someone hasn't managed to sneak a fancy new vdev * into a crufty old storage pool. */ ASSERT(vdrz->vd_nparity == 1 || (vdrz->vd_nparity <= 2 && spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || (vdrz->vd_nparity <= 3 && spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); /* * Note that we'll add these even on storage pools where they * aren't strictly required -- older software will just ignore * it. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); if (vdrz->vn_vre.vre_state == DSS_SCANNING) { fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); } mutex_enter(&vdrz->vd_expand_lock); if (!avl_is_empty(&vdrz->vd_expand_txgs)) { uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, KM_SLEEP); uint64_t i = 0; for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { txgs[i++] = re->re_txg; } fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, txgs, count); kmem_free(txgs, sizeof (uint64_t) * count); } mutex_exit(&vdrz->vd_expand_lock); } static uint64_t vdev_raidz_nparity(vdev_t *vd) { vdev_raidz_t *vdrz = vd->vdev_tsd; return (vdrz->vd_nparity); } static uint64_t vdev_raidz_ndisks(vdev_t *vd) { return (vd->vdev_children); } vdev_ops_t vdev_raidz_ops = { .vdev_op_init = vdev_raidz_init, .vdev_op_fini = vdev_raidz_fini, .vdev_op_open = vdev_raidz_open, .vdev_op_close = vdev_raidz_close, - .vdev_op_asize = vdev_raidz_asize, + .vdev_op_psize_to_asize = vdev_raidz_psize_to_asize, + .vdev_op_asize_to_psize = vdev_raidz_asize_to_psize, .vdev_op_min_asize = vdev_raidz_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_raidz_io_start, .vdev_op_io_done = vdev_raidz_io_done, .vdev_op_state_change = vdev_raidz_state_change, .vdev_op_need_resilver = vdev_raidz_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_raidz_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = vdev_raidz_config_generate, .vdev_op_nparity = vdev_raidz_nparity, .vdev_op_ndisks = vdev_raidz_ndisks, .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, "For testing, pause RAIDZ expansion after reflowing this many bytes"); ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, "Max amount of concurrent i/o for RAIDZ expansion"); ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, "For expanded RAIDZ, aggregate reads that have more rows than this"); ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index ea6f86993088..21cb57e38b12 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -1,1182 +1,1182 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * * Copyright (c) 2018, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. * Copyright (c) 2024 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include /* * This file contains the sequential reconstruction implementation for * resilvering. This form of resilvering is internally referred to as device * rebuild to avoid conflating it with the traditional healing reconstruction * performed by the dsl scan code. * * When replacing a device, or scrubbing the pool, ZFS has historically used * a process called resilvering which is a form of healing reconstruction. * This approach has the advantage that as blocks are read from disk their * checksums can be immediately verified and the data repaired. Unfortunately, * it also results in a random IO pattern to the disk even when extra care * is taken to sequentialize the IO as much as possible. This substantially * increases the time required to resilver the pool and restore redundancy. * * For mirrored devices it's possible to implement an alternate sequential * reconstruction strategy when resilvering. Sequential reconstruction * behaves like a traditional RAID rebuild and reconstructs a device in LBA * order without verifying the checksum. After this phase completes a second * scrub phase is started to verify all of the checksums. This two phase * process will take longer than the healing reconstruction described above. * However, it has that advantage that after the reconstruction first phase * completes redundancy has been restored. At this point the pool can incur * another device failure without risking data loss. * * There are a few noteworthy limitations and other advantages of resilvering * using sequential reconstruction vs healing reconstruction. * * Limitations: * * - Sequential reconstruction is not possible on RAIDZ due to its * variable stripe width. Note dRAID uses a fixed stripe width which * avoids this issue, but comes at the expense of some usable capacity. * * - Block checksums are not verified during sequential reconstruction. * Similar to traditional RAID the parity/mirror data is reconstructed * but cannot be immediately double checked. For this reason when the * last active resilver completes the pool is automatically scrubbed * by default. * * - Deferred resilvers using sequential reconstruction are not currently * supported. When adding another vdev to an active top-level resilver * it must be restarted. * * Advantages: * * - Sequential reconstruction is performed in LBA order which may be faster * than healing reconstruction particularly when using HDDs (or * especially with SMR devices). Only allocated capacity is resilvered. * * - Sequential reconstruction is not constrained by ZFS block boundaries. * This allows it to issue larger IOs to disk which span multiple blocks * allowing all of these logical blocks to be repaired with a single IO. * * - Unlike a healing resilver or scrub which are pool wide operations, * sequential reconstruction is handled by the top-level vdevs. This * allows for it to be started or canceled on a top-level vdev without * impacting any other top-level vdevs in the pool. * * - Data only referenced by a pool checkpoint will be repaired because * that space is reflected in the space maps. This differs for a * healing resilver or scrub which will not repair that data. */ /* * Size of rebuild reads; defaults to 1MiB per data disk and is capped at * SPA_MAXBLOCKSIZE. */ static uint64_t zfs_rebuild_max_segment = 1024 * 1024; /* * Maximum number of parallelly executed bytes per leaf vdev caused by a * sequential resilver. We attempt to strike a balance here between keeping * the vdev queues full of I/Os at all times and not overflowing the queues * to cause long latency, which would cause long txg sync times. * * A large default value can be safely used here because the default target * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep * the queue depth short. * * 64MB was observed to deliver the best performance and set as the default. * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c) * and a rebuild rate of 1.2GB/s was measured to the distribute spare. * Smaller values were unable to fully saturate the available pool I/O. */ static uint64_t zfs_rebuild_vdev_limit = 64 << 20; /* * Automatically start a pool scrub when the last active sequential resilver * completes in order to verify the checksums of all blocks which have been * resilvered. This option is enabled by default and is strongly recommended. */ static int zfs_rebuild_scrub_enabled = 1; /* * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). */ static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg); static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx); /* * Clear the per-vdev rebuild bytes value for a vdev tree. */ static void clear_rebuild_bytes(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (uint64_t i = 0; i < vd->vdev_children; i++) clear_rebuild_bytes(vd->vdev_child[i]); mutex_enter(&vd->vdev_stat_lock); vs->vs_rebuild_processed = 0; mutex_exit(&vd->vdev_stat_lock); } /* * Determines whether a vdev_rebuild_thread() should be stopped. */ static boolean_t vdev_rebuild_should_stop(vdev_t *vd) { return (!vdev_writeable(vd) || vd->vdev_removing || vd->vdev_rebuild_exit_wanted || vd->vdev_rebuild_cancel_wanted || vd->vdev_rebuild_reset_wanted); } /* * Determine if the rebuild should be canceled. This may happen when all * vdevs with MISSING DTLs are detached. */ static boolean_t vdev_rebuild_should_cancel(vdev_t *vd) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)) return (B_TRUE); return (B_FALSE); } /* * The sync task for updating the on-disk state of a rebuild. This is * scheduled by vdev_rebuild_range(). */ static void vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t txg = dmu_tx_get_txg(tx); mutex_enter(&vd->vdev_rebuild_lock); if (vr->vr_scan_offset[txg & TXG_MASK] > 0) { vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK]; vr->vr_scan_offset[txg & TXG_MASK] = 0; } vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms + NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); mutex_exit(&vd->vdev_rebuild_lock); } /* * Initialize the on-disk state for a new rebuild, start the rebuild thread. */ static void vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; ASSERT(vd->vdev_rebuilding); spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); mutex_enter(&vd->vdev_rebuild_lock); memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; vrp->vrp_min_txg = 0; vrp->vrp_max_txg = dmu_tx_get_txg(tx); vrp->vrp_start_time = gethrestime_sec(); vrp->vrp_scan_time_ms = 0; vr->vr_prev_scan_time_ms = 0; /* * Rebuilds are currently only used when replacing a device, in which * case there must be DTL_MISSING entries. In the future, we could * allow rebuilds to be used in a way similar to a scrub. This would * be useful because it would allow us to rebuild the space used by * pool checkpoints. */ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu started", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); vd->vdev_rebuild_thread = thread_create(NULL, 0, vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&vd->vdev_rebuild_lock); } static void vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, const char *name) { nvlist_t *aux = fnvlist_alloc(); fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential"); spa_event_notify(spa, vd, aux, name); nvlist_free(aux); } /* * Called to request that a new rebuild be started. The feature will remain * active for the duration of the rebuild, then revert to the enabled state. */ static void vdev_rebuild_initiate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_top == vd); ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock)); ASSERT(!vd->vdev_rebuilding); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); vd->vdev_rebuilding = B_TRUE; dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx); dmu_tx_commit(tx); vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START); } /* * Update the on-disk state to completed when a rebuild finishes. */ static void vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); /* * Handle a second device failure if it occurs after all rebuild I/O * has completed but before this sync task has been executed. */ if (vd->vdev_rebuild_reset_wanted) { mutex_exit(&vd->vdev_rebuild_lock); vdev_rebuild_reset_sync(arg, tx); return; } vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; vrp->vrp_end_time = gethrestime_sec(); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu complete", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); /* Handles detaching of spares */ spa_async_request(spa, SPA_ASYNC_REBUILD_DONE); vd->vdev_rebuilding = B_FALSE; mutex_exit(&vd->vdev_rebuild_lock); /* * While we're in syncing context take the opportunity to * setup the scrub when there are no more active rebuilds. */ setup_sync_arg_t setup_sync_arg = { .func = POOL_SCAN_SCRUB, .txgstart = 0, .txgend = 0, }; if (dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0 && zfs_rebuild_scrub_enabled) { dsl_scan_setup_sync(&setup_sync_arg, tx); } cv_broadcast(&vd->vdev_rebuild_cv); /* Clear recent error events (i.e. duplicate events tracking) */ zfs_ereport_clear(spa, NULL); } /* * Update the on-disk state to canceled when a rebuild finishes. */ static void vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED; vrp->vrp_end_time = gethrestime_sec(); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu canceled", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); vd->vdev_rebuild_cancel_wanted = B_FALSE; vd->vdev_rebuilding = B_FALSE; mutex_exit(&vd->vdev_rebuild_lock); spa_notify_waiters(spa); cv_broadcast(&vd->vdev_rebuild_cv); } /* * Resets the progress of a running rebuild. This will occur when a new * vdev is added to rebuild. */ static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); vrp->vrp_last_offset = 0; vrp->vrp_min_txg = 0; vrp->vrp_max_txg = dmu_tx_get_txg(tx); vrp->vrp_bytes_scanned = 0; vrp->vrp_bytes_issued = 0; vrp->vrp_bytes_rebuilt = 0; vrp->vrp_bytes_est = 0; vrp->vrp_scan_time_ms = 0; vr->vr_prev_scan_time_ms = 0; /* See vdev_rebuild_initiate_sync comment */ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu reset", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); vd->vdev_rebuild_reset_wanted = B_FALSE; ASSERT(vd->vdev_rebuilding); vd->vdev_rebuild_thread = thread_create(NULL, 0, vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&vd->vdev_rebuild_lock); } /* * Clear the last rebuild status. */ void vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; objset_t *mos = spa_meta_objset(spa); mutex_enter(&vd->vdev_rebuild_lock); if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) || vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) { mutex_exit(&vd->vdev_rebuild_lock); return; } clear_rebuild_bytes(vd); memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { VERIFY0(zap_update(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); } mutex_exit(&vd->vdev_rebuild_lock); } /* * The zio_done_func_t callback for each rebuild I/O issued. It's responsible * for updating the rebuild stats and limiting the number of in flight I/Os. */ static void vdev_rebuild_cb(zio_t *zio) { vdev_rebuild_t *vr = zio->io_private; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vdev_t *vd = vr->vr_top_vdev; mutex_enter(&vr->vr_io_lock); if (zio->io_error == ENXIO && !vdev_writeable(vd)) { /* * The I/O failed because the top-level vdev was unavailable. * Attempt to roll back to the last completed offset, in order * resume from the correct location if the pool is resumed. * (This works because spa_sync waits on spa_txg_zio before * it runs sync tasks.) */ uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK]; *off = MIN(*off, zio->io_offset); } else if (zio->io_error) { vrp->vrp_errors++; } abd_free(zio->io_abd); ASSERT3U(vr->vr_bytes_inflight, >, 0); vr->vr_bytes_inflight -= zio->io_size; cv_broadcast(&vr->vr_io_cv); mutex_exit(&vr->vr_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* * Initialize a block pointer that can be used to read the given segment * for sequential rebuild. */ static void vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start, uint64_t asize) { ASSERT(vd->vdev_ops == &vdev_draid_ops || vd->vdev_ops == &vdev_mirror_ops || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); uint64_t psize = vd->vdev_ops == &vdev_draid_ops ? - vdev_draid_asize_to_psize(vd, asize) : asize; + vdev_draid_asize_to_psize(vd, asize, 0) : asize; BP_ZERO(bp); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], start); DVA_SET_GANG(&bp->blk_dva[0], 0); DVA_SET_ASIZE(&bp->blk_dva[0], asize); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, psize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); } /* * Issues a rebuild I/O and takes care of rate limiting the number of queued * rebuild I/Os. The provided start and size must be properly aligned for the * top-level vdev type being rebuilt. */ static int vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) { uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; vdev_t *vd = vr->vr_top_vdev; spa_t *spa = vd->vdev_spa; blkptr_t blk; ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); vr->vr_pass_bytes_scanned += size; vr->vr_rebuild_phys.vrp_bytes_scanned += size; /* * Rebuild the data in this range by constructing a special block * pointer. It has no relation to any existing blocks in the pool. * However, by disabling checksum verification and issuing a scrub IO * we can reconstruct and repair any children with missing data. */ vdev_rebuild_blkptr_init(&blk, vd, start, size); uint64_t psize = BP_GET_PSIZE(&blk); if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) { vr->vr_pass_bytes_skipped += size; return (0); } mutex_enter(&vr->vr_io_lock); /* Limit in flight rebuild I/Os */ while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max) cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); vr->vr_bytes_inflight += psize; mutex_exit(&vr->vr_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); mutex_enter(&vd->vdev_rebuild_lock); /* This is the first I/O for this txg. */ if (vr->vr_scan_offset[txg & TXG_MASK] == 0) { vr->vr_scan_offset[txg & TXG_MASK] = start; dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_update_sync, (void *)(uintptr_t)vd->vdev_id, tx); } /* When exiting write out our progress. */ if (vdev_rebuild_should_stop(vd)) { mutex_enter(&vr->vr_io_lock); vr->vr_bytes_inflight -= psize; mutex_exit(&vr->vr_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); mutex_exit(&vd->vdev_rebuild_lock); dmu_tx_commit(tx); return (SET_ERROR(EINTR)); } mutex_exit(&vd->vdev_rebuild_lock); dmu_tx_commit(tx); vr->vr_scan_offset[txg & TXG_MASK] = start + size; vr->vr_pass_bytes_issued += size; vr->vr_rebuild_phys.vrp_bytes_issued += size; zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk, abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | ZIO_FLAG_RESILVER, NULL)); return (0); } /* * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree. */ static int vdev_rebuild_ranges(vdev_rebuild_t *vr) { vdev_t *vd = vr->vr_top_vdev; zfs_btree_t *t = &vr->vr_scan_tree->rt_root; zfs_btree_index_t idx; int error; for (zfs_range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; rs = zfs_btree_next(t, &idx, &idx)) { uint64_t start = zfs_rs_get_start(rs, vr->vr_scan_tree); uint64_t size = zfs_rs_get_end(rs, vr->vr_scan_tree) - start; /* * zfs_scan_suspend_progress can be set to disable rebuild * progress for testing. See comment in dsl_scan_sync(). */ while (zfs_scan_suspend_progress && !vdev_rebuild_should_stop(vd)) { delay(hz); } while (size > 0) { uint64_t chunk_size; /* * Split range into legally-sized logical chunks * given the constraints of the top-level vdev * being rebuilt (dRAID or mirror). */ ASSERT3P(vd->vdev_ops, !=, NULL); chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd, start, size, zfs_rebuild_max_segment); error = vdev_rebuild_range(vr, start, chunk_size); if (error != 0) return (error); size -= chunk_size; start += chunk_size; } } return (0); } /* * Calculates the estimated capacity which remains to be scanned. Since * we traverse the pool in metaslab order only allocated capacity beyond * the vrp_last_offset need be considered. All lower offsets must have * already been rebuilt and are thus already included in vrp_bytes_scanned. */ static void vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t bytes_est = vrp->vrp_bytes_scanned; if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start) return; for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_ms[i]; mutex_enter(&msp->ms_lock); bytes_est += metaslab_allocated_space(msp); mutex_exit(&msp->ms_lock); } vrp->vrp_bytes_est = bytes_est; } /* * Load from disk the top-level vdev's rebuild information. */ int vdev_rebuild_load(vdev_t *vd) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; spa_t *spa = vd->vdev_spa; int err = 0; mutex_enter(&vd->vdev_rebuild_lock); vd->vdev_rebuilding = B_FALSE; if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); mutex_exit(&vd->vdev_rebuild_lock); return (SET_ERROR(ENOTSUP)); } ASSERT(vd->vdev_top == vd); err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp); /* * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should * not prevent a pool from being imported. Clear the rebuild * status allowing a new resilver/rebuild to be started. */ if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); } else if (err) { mutex_exit(&vd->vdev_rebuild_lock); return (err); } vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms; vr->vr_top_vdev = vd; mutex_exit(&vd->vdev_rebuild_lock); return (0); } /* * Each scan thread is responsible for rebuilding a top-level vdev. The * rebuild progress in tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. */ static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int error = 0; /* * If there's a scrub in process request that it be stopped. This * is not required for a correct rebuild, but we do want rebuilds to * emulate the resilver behavior as much as possible. */ dsl_pool_t *dsl = spa_get_dsl(spa); if (dsl_scan_scrubbing(dsl)) dsl_scan_cancel(dsl); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&vd->vdev_rebuild_lock); ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(vd->vdev_rebuild_thread, !=, NULL); ASSERT(vd->vdev_rebuilding); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vr->vr_top_vdev = vd; vr->vr_scan_msp = NULL; vr->vr_scan_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); vr->vr_pass_start_time = gethrtime(); vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; vr->vr_pass_bytes_skipped = 0; uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); clear_rebuild_bytes(vr->vr_top_vdev); mutex_exit(&vd->vdev_rebuild_lock); /* * Systematically walk the metaslabs and issue rebuild I/Os for * all ranges in the allocated space map. */ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_ms[i]; vr->vr_scan_msp = msp; /* * Calculate the max number of in-flight bytes for top-level * vdev scanning operations (minimum 1MB, maximum 1/2 of * arc_c_max shared by all top-level vdevs). Limits for the * issuing phase are done per top-level vdev and are handled * separately. */ uint64_t limit = (arc_c_max / 2) / MAX(rvd->vdev_children, 1); vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20, zfs_rebuild_vdev_limit * vd->vdev_children)); /* * Removal of vdevs from the vdev tree may eliminate the need * for the rebuild, in which case it should be canceled. The * vdev_rebuild_cancel_wanted flag is set until the sync task * completes. This may be after the rebuild thread exits. */ if (vdev_rebuild_should_cancel(vd)) { vd->vdev_rebuild_cancel_wanted = B_TRUE; error = EINTR; break; } ASSERT0(zfs_range_tree_space(vr->vr_scan_tree)); /* Disable any new allocations to this metaslab */ spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * If there are outstanding allocations wait for them to be * synced. This is needed to ensure all allocated ranges are * on disk and therefore will be rebuilt. */ for (int j = 0; j < TXG_SIZE; j++) { if (zfs_range_tree_space(msp->ms_allocating[j])) { mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); txg_wait_synced(dsl, 0); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); break; } } /* * When a metaslab has been allocated from read its allocated * ranges from the space map object into the vr_scan_tree. * Then add inflight / unflushed ranges and remove inflight / * unflushed frees. This is the minimum range to be rebuilt. */ if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, vr->vr_scan_tree, SM_ALLOC)); for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(zfs_range_tree_space( msp->ms_allocating[i])); } zfs_range_tree_walk(msp->ms_unflushed_allocs, zfs_range_tree_add, vr->vr_scan_tree); zfs_range_tree_walk(msp->ms_unflushed_frees, zfs_range_tree_remove, vr->vr_scan_tree); /* * Remove ranges which have already been rebuilt based * on the last offset. This can happen when restarting * a scan after exporting and re-importing the pool. */ zfs_range_tree_clear(vr->vr_scan_tree, 0, vrp->vrp_last_offset); } mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); /* * To provide an accurate estimate re-calculate the estimated * size every 5 minutes to account for recent allocations and * frees made to space maps which have not yet been rebuilt. */ if (gethrtime() > update_est_time + SEC2NSEC(300)) { update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, i); } /* * Walk the allocated space map and issue the rebuild I/O. */ error = vdev_rebuild_ranges(vr); zfs_range_tree_vacate(vr->vr_scan_tree, NULL, NULL); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); metaslab_enable(msp, B_FALSE, B_FALSE); if (error != 0) break; } zfs_range_tree_destroy(vr->vr_scan_tree); spa_config_exit(spa, SCL_CONFIG, FTAG); /* Wait for any remaining rebuild I/O to complete */ mutex_enter(&vr->vr_io_lock); while (vr->vr_bytes_inflight > 0) cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); mutex_exit(&vr->vr_io_lock); mutex_destroy(&vr->vr_io_lock); cv_destroy(&vr->vr_io_cv); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); dsl_pool_t *dp = spa_get_dsl(spa); dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); mutex_enter(&vd->vdev_rebuild_lock); if (error == 0) { /* * After a successful rebuild clear the DTLs of all ranges * which were missing when the rebuild was started. These * ranges must have been rebuilt as a consequence of rebuilding * all allocated space. Note that unlike a scrub or resilver * the rebuild operation will reconstruct data only referenced * by a pool checkpoint. See the dsl_scan_done() comments. */ dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync, (void *)(uintptr_t)vd->vdev_id, tx); } else if (vd->vdev_rebuild_cancel_wanted) { /* * The rebuild operation was canceled. This will occur when * a device participating in the rebuild is detached. */ dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync, (void *)(uintptr_t)vd->vdev_id, tx); } else if (vd->vdev_rebuild_reset_wanted) { /* * Reset the running rebuild without canceling and restarting * it. This will occur when a new device is attached and must * participate in the rebuild. */ dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync, (void *)(uintptr_t)vd->vdev_id, tx); } else { /* * The rebuild operation should be suspended. This may occur * when detaching a child vdev or when exporting the pool. The * rebuild is left in the active state so it will be resumed. */ ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); vd->vdev_rebuilding = B_FALSE; } dmu_tx_commit(tx); vd->vdev_rebuild_thread = NULL; mutex_exit(&vd->vdev_rebuild_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); cv_broadcast(&vd->vdev_rebuild_cv); thread_exit(); } /* * Returns B_TRUE if any top-level vdev are rebuilding. */ boolean_t vdev_rebuild_active(vdev_t *vd) { spa_t *spa = vd->vdev_spa; boolean_t ret = B_FALSE; if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) { ret = vdev_rebuild_active(vd->vdev_child[i]); if (ret) return (ret); } } else if (vd->vdev_top_zap != 0) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); mutex_exit(&vd->vdev_rebuild_lock); } return (ret); } /* * Start a rebuild operation. The rebuild may be restarted when the * top-level vdev is currently actively rebuilding. */ void vdev_rebuild(vdev_t *vd) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys; ASSERT(vd->vdev_top == vd); ASSERT(vdev_is_concrete(vd)); ASSERT(!vd->vdev_removing); ASSERT(spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD)); mutex_enter(&vd->vdev_rebuild_lock); if (vd->vdev_rebuilding) { ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE); /* * Signal a running rebuild operation that it should restart * from the beginning because a new device was attached. The * vdev_rebuild_reset_wanted flag is set until the sync task * completes. This may be after the rebuild thread exits. */ if (!vd->vdev_rebuild_reset_wanted) vd->vdev_rebuild_reset_wanted = B_TRUE; } else { vdev_rebuild_initiate(vd); } mutex_exit(&vd->vdev_rebuild_lock); } static void vdev_rebuild_restart_impl(vdev_t *vd) { spa_t *spa = vd->vdev_spa; if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_rebuild_restart_impl(vd->vdev_child[i]); } else if (vd->vdev_top_zap != 0) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE && vdev_writeable(vd) && !vd->vdev_rebuilding) { ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); vd->vdev_rebuilding = B_TRUE; vd->vdev_rebuild_thread = thread_create(NULL, 0, vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); } mutex_exit(&vd->vdev_rebuild_lock); } } /* * Conditionally restart all of the vdev_rebuild_thread's for a pool. The * feature flag must be active and the rebuild in the active state. This * cannot be used to start a new rebuild. */ void vdev_rebuild_restart(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_load_thread == curthread); vdev_rebuild_restart_impl(spa->spa_root_vdev); } /* * Stop and wait for all of the vdev_rebuild_thread's associated with the * vdev tree provide to be terminated (canceled or stopped). */ void vdev_rebuild_stop_wait(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_export_thread == curthread); if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_rebuild_stop_wait(vd->vdev_child[i]); } else if (vd->vdev_top_zap != 0) { ASSERT(vd == vd->vdev_top); mutex_enter(&vd->vdev_rebuild_lock); if (vd->vdev_rebuild_thread != NULL) { vd->vdev_rebuild_exit_wanted = B_TRUE; while (vd->vdev_rebuilding) { cv_wait(&vd->vdev_rebuild_cv, &vd->vdev_rebuild_lock); } vd->vdev_rebuild_exit_wanted = B_FALSE; } mutex_exit(&vd->vdev_rebuild_lock); } } /* * Stop all rebuild operations but leave them in the active state so they * will be resumed when importing the pool. */ void vdev_rebuild_stop_all(spa_t *spa) { vdev_rebuild_stop_wait(spa->spa_root_vdev); } /* * Rebuild statistics reported per top-level vdev. */ int vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) { spa_t *spa = tvd->vdev_spa; if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (SET_ERROR(ENOTSUP)); if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0) return (SET_ERROR(EINVAL)); int error = zap_contains(spa_meta_objset(spa), tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); if (error == ENOENT) { memset(vrs, 0, sizeof (vdev_rebuild_stat_t)); vrs->vrs_state = VDEV_REBUILD_NONE; error = 0; } else if (error == 0) { vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&tvd->vdev_rebuild_lock); vrs->vrs_state = vrp->vrp_rebuild_state; vrs->vrs_start_time = vrp->vrp_start_time; vrs->vrs_end_time = vrp->vrp_end_time; vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms; vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned; vrs->vrs_bytes_issued = vrp->vrp_bytes_issued; vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt; vrs->vrs_bytes_est = vrp->vrp_bytes_est; vrs->vrs_errors = vrp->vrp_errors; vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped; mutex_exit(&tvd->vdev_rebuild_lock); } return (error); } ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, U64, ZMOD_RW, "Max segment size in bytes of rebuild reads"); ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, U64, ZMOD_RW, "Max bytes in flight per leaf vdev for sequential resilvers"); ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW, "Automatically scrub after sequential resilver completes"); diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 8f6e49f25e1e..21a81d6d25b9 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -1,168 +1,169 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #include #include #include #include #include /* * Virtual device vector for the pool's root vdev. */ static uint64_t vdev_root_core_tvds(vdev_t *vd) { uint64_t tvds = 0; for (uint64_t c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (!cvd->vdev_ishole && !cvd->vdev_islog && cvd->vdev_ops != &vdev_indirect_ops) { tvds++; } } return (tvds); } /* * We should be able to tolerate one failure with absolutely no damage * to our metadata. Two failures will take out space maps, a bunch of * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy * place to live. When we get smarter, we can liberalize this policy. * e.g. If we haven't lost two consecutive top-level vdevs, then we are * probably fine. Adding bean counters during alloc/free can make this * future guesswork more accurate. */ static boolean_t too_many_errors(vdev_t *vd, uint64_t numerrors) { uint64_t tvds; if (numerrors == 0) return (B_FALSE); tvds = vdev_root_core_tvds(vd); ASSERT3U(numerrors, <=, tvds); if (numerrors == tvds) return (B_TRUE); return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa)); } static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *ashift, uint64_t *pshift) { spa_t *spa = vd->vdev_spa; int lasterror = 0; int numerrors = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } vdev_open_children(vd); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error && !cvd->vdev_islog && cvd->vdev_ops != &vdev_indirect_ops) { lasterror = cvd->vdev_open_error; numerrors++; } } if (spa_load_state(spa) != SPA_LOAD_NONE) spa_set_missing_tvds(spa, numerrors); if (too_many_errors(vd, numerrors)) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } *asize = 0; *max_asize = 0; *ashift = 0; *pshift = 0; return (0); } static void vdev_root_close(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } static void vdev_root_state_change(vdev_t *vd, int faulted, int degraded) { if (too_many_errors(vd, faulted)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); } else if (degraded || faulted) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } } vdev_ops_t vdev_root_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_root_open, .vdev_op_close = vdev_root_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = NULL, /* not applicable to the root */ .vdev_op_io_done = NULL, /* not applicable to the root */ .vdev_op_state_change = vdev_root_state_change, .vdev_op_need_resilver = NULL, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index eb08a6eac3ed..1769606ebb8a 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1,5842 +1,5881 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, 2023, 2024, 2025, Klara, Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. * Copyright (c) 2021, 2024 by George Melikov. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *const zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. */ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; static int zio_deadman_log_all = B_FALSE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ static kmem_cache_t *zio_cache; static kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif /* Mark IOs as "slow" if they take longer than 30 seconds */ static uint_t zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. * * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable * compression (including of metadata). In practice, we don't have this * many sync passes, so this has no effect. * * The original intent was that disabling compression would help the sync * passes to converge. However, in practice disabling compression increases * the average number of sync passes, because when we turn compression off, a * lot of block's size will change and thus we have to re-allocate (not * overwrite) them. It also increases the number of 128KB allocations (e.g. * for indirect blocks and spacemaps) because these will not be compressed. * The 128K allocations are especially detrimental to performance on highly * fragmented systems, which may have very few free segments of this size, * and may need to load new metaslabs to satisfy 128K allocations. */ /* defer frees starting in this pass */ uint_t zfs_sync_pass_deferred_free = 2; /* don't compress starting in this pass */ static uint_t zfs_sync_pass_dont_compress = 8; /* rewrite new bps starting in this pass */ static uint_t zfs_sync_pass_rewrite = 2; /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) /* * Enable smaller cores by excluding metadata * allocations as well. */ int zio_exclude_metadata = 0; static int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG static const int zio_buf_debug_limit = 16384; #else static const int zio_buf_debug_limit = 0; #endif typedef struct zio_stats { kstat_named_t ziostat_total_allocations; kstat_named_t ziostat_alloc_class_fallbacks; kstat_named_t ziostat_gang_writes; kstat_named_t ziostat_gang_multilevel; } zio_stats_t; static zio_stats_t zio_stats = { { "total_allocations", KSTAT_DATA_UINT64 }, { "alloc_class_fallbacks", KSTAT_DATA_UINT64 }, { "gang_writes", KSTAT_DATA_UINT64 }, { "gang_multilevel", KSTAT_DATA_UINT64 }, }; struct { wmsum_t ziostat_total_allocations; wmsum_t ziostat_alloc_class_fallbacks; wmsum_t ziostat_gang_writes; wmsum_t ziostat_gang_multilevel; } ziostat_sums; #define ZIOSTAT_BUMP(stat) wmsum_add(&ziostat_sums.stat, 1); static kstat_t *zio_ksp; static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); static int zio_kstats_update(kstat_t *ksp, int rw) { zio_stats_t *zs = ksp->ks_data; if (rw == KSTAT_WRITE) return (EACCES); zs->ziostat_total_allocations.value.ui64 = wmsum_value(&ziostat_sums.ziostat_total_allocations); zs->ziostat_alloc_class_fallbacks.value.ui64 = wmsum_value(&ziostat_sums.ziostat_alloc_class_fallbacks); zs->ziostat_gang_writes.value.ui64 = wmsum_value(&ziostat_sums.ziostat_gang_writes); zs->ziostat_gang_multilevel.value.ui64 = wmsum_value(&ziostat_sums.ziostat_gang_multilevel); return (0); } void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); wmsum_init(&ziostat_sums.ziostat_total_allocations, 0); wmsum_init(&ziostat_sums.ziostat_alloc_class_fallbacks, 0); wmsum_init(&ziostat_sums.ziostat_gang_writes, 0); wmsum_init(&ziostat_sums.ziostat_gang_multilevel, 0); zio_ksp = kstat_create("zfs", 0, "zio_stats", "misc", KSTAT_TYPE_NAMED, sizeof (zio_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zio_ksp != NULL) { zio_ksp->ks_data = &zio_stats; zio_ksp->ks_update = zio_kstats_update; kstat_install(zio_ksp); } for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t align, cflags, data_cflags; char name[32]; /* * Create cache for each half-power of 2 size, starting from * SPA_MINBLOCKSIZE. It should give us memory space efficiency * of ~7/8, sufficient for transient allocations mostly using * these caches. */ size_t p2 = size; while (!ISP2(p2)) p2 &= p2 - 1; if (!IS_P2ALIGNED(size, p2 / 2)) continue; #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; #endif if (IS_P2ALIGNED(size, PAGESIZE)) align = PAGESIZE; else align = 1 << (highbit64(size ^ (size - 1)) - 1); cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; data_cflags = KMC_NODEBUG; if (abd_size_alloc_linear(size)) { cflags |= KMC_RECLAIMABLE; data_cflags |= KMC_RECLAIMABLE; } if (cflags == data_cflags) { /* * Resulting kmem caches would be identical. * Save memory by creating only one. */ (void) snprintf(name, sizeof (name), "zio_buf_comb_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); zio_data_buf_cache[c] = zio_buf_cache[c]; continue; } (void) snprintf(name, sizeof (name), "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, data_cflags); } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } zio_inject_init(); lz4_init(); } void zio_fini(void) { size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; #if defined(ZFS_DEBUG) && !defined(_KERNEL) for (size_t i = 0; i < n; i++) { if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i]) (void) printf("zio_fini: [%d] %llu != %llu\n", (int)((i + 1) << SPA_MINBLOCKSHIFT), (long long unsigned)zio_buf_cache_allocs[i], (long long unsigned)zio_buf_cache_frees[i]); } #endif /* * The same kmem cache can show up multiple times in both zio_buf_cache * and zio_data_buf_cache. Do a wasteful but trivially correct scan to * sort it out. */ for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_buf_cache[j]) zio_buf_cache[j] = NULL; if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_data_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { VERIFY3P(zio_buf_cache[i], ==, NULL); VERIFY3P(zio_data_buf_cache[i], ==, NULL); } if (zio_ksp != NULL) { kstat_delete(zio_ksp); zio_ksp = NULL; } wmsum_fini(&ziostat_sums.ziostat_total_allocations); wmsum_fini(&ziostat_sums.ziostat_alloc_class_fallbacks); wmsum_fini(&ziostat_sums.ziostat_gang_writes); wmsum_fini(&ziostat_sums.ziostat_gang_multilevel); kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); lz4_fini(); } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ #if defined(ZFS_DEBUG) && defined(_KERNEL) #define ZFS_ZIO_BUF_CANARY 1 #endif #ifdef ZFS_ZIO_BUF_CANARY static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; /* * Use empty space after the buffer to detect overflows. * * Since zio_init() creates kmem caches only for certain set of buffer sizes, * allocations of different sizes may have some unused space after the data. * Filling part of that space with a known pattern on allocation and checking * it on free should allow us to detect some buffer overflows. */ static void zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) { size_t off = P2ROUNDUP(size, sizeof (ulong_t)); ulong_t *canary = p + off / sizeof (ulong_t); size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && cache[c] == cache[c + 1]) asize = (c + 2) << SPA_MINBLOCKSHIFT; for (; off < asize; canary++, off += sizeof (ulong_t)) *canary = zio_buf_canary; } static void zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) { size_t off = P2ROUNDUP(size, sizeof (ulong_t)); ulong_t *canary = p + off / sizeof (ulong_t); size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && cache[c] == cache[c + 1]) asize = (c + 2) << SPA_MINBLOCKSHIFT; for (; off < asize; canary++, off += sizeof (ulong_t)) { if (unlikely(*canary != zio_buf_canary)) { PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx", p, size, (canary - p) * sizeof (ulong_t), *canary, zio_buf_canary); } } } #endif /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE); #ifdef ZFS_ZIO_BUF_CANARY zio_buf_put_canary(p, size, zio_buf_cache, c); #endif return (p); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE); #ifdef ZFS_ZIO_BUF_CANARY zio_buf_put_canary(p, size, zio_data_buf_cache, c); #endif return (p); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif #ifdef ZFS_ZIO_BUF_CANARY zio_buf_check_canary(buf, size, zio_buf_cache, c); #endif kmem_cache_free(zio_buf_cache[c], buf); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #ifdef ZFS_ZIO_BUF_CANARY zio_buf_check_canary(buf, size, zio_data_buf_cache, c); #endif kmem_cache_free(zio_data_buf_cache[c], buf); } static void zio_abd_free(void *abd, size_t size) { (void) size; abd_free((abd_t *)abd); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks, decompression, and decryption * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, data, zio->io_size, size, &zio->io_prop.zp_complevel); if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } static void zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) { int ret; void *tmp; blkptr_t *bp = zio->io_bp; spa_t *spa = zio->io_spa; uint64_t dsobj = zio->io_bookmark.zb_objset; uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t ot = BP_GET_TYPE(bp); uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(size, !=, 0); if (zio->io_error != 0) return; /* * Verify the cksum of MACs stored in an indirect bp. It will always * be possible to verify this since it does not require an encryption * key. */ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { zio_crypt_decode_mac_bp(bp, mac); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* * We haven't decompressed the data yet, but * zio_crypt_do_indirect_mac_checksum() requires * decompressed data to be able to parse out the MACs * from the indirect block. We decompress it now and * throw away the result after we are finished. */ abd_t *abd = abd_alloc_linear(lsize, B_TRUE); ret = zio_decompress_data(BP_GET_COMPRESS(bp), zio->io_abd, abd, zio->io_size, lsize, &zio->io_prop.zp_complevel); if (ret != 0) { abd_free(abd); ret = SET_ERROR(EIO); goto error; } ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, abd, lsize, BP_SHOULD_BYTESWAP(bp), mac); abd_free(abd); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); } abd_copy(data, zio->io_abd, size); if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } if (ret != 0) goto error; return; } /* * If this is an authenticated block, just check the MAC. It would be * nice to separate this out into its own flag, but when this was done, * we had run out of bits in what is now zio_flag_t. Future cleanup * could make this a flag bit. */ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); } else { zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, mac); if (zio_injection_enabled && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } } abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; } zio_crypt_decode_params_bp(bp, salt, iv); if (ot == DMU_OT_INTENT_LOG) { tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmp, mac); abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, mac); } ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, zio->io_abd, &no_crypt); if (no_crypt) abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; error: /* assert that the key was found unless this was speculative */ ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); /* * If there was a decryption / authentication error return EIO as * the io_error. If this was not a speculative zio, create an ereport. */ if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark, BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; ASSERT(MUTEX_HELD(&pio->io_lock)); *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); /* Parent should not have READY stage if child doesn't have it. */ IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && (cio->io_child_type != ZIO_CHILD_VDEV), (pio->io_pipeline & ZIO_STAGE_READY) == 0); zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } void zio_add_child_first(zio_t *pio, zio_t *cio) { /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); /* Parent should not have READY stage if child doesn't have it. */ IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && (cio->io_child_type != ZIO_CHILD_VDEV), (pio->io_pipeline & ZIO_STAGE_READY) == 0); zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; ASSERT(list_is_empty(&cio->io_parent_list)); list_insert_head(&cio->io_parent_list, zl); mutex_enter(&pio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); mutex_exit(&pio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } __attribute__((always_inline)) static inline void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); /* * Propogate the Direct I/O checksum verify failure to the parent. */ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * If we can tell the caller to execute this parent next, do * so. We do this if the parent's zio type matches the child's * type, or if it's a zio_null() with no done callback, and so * has no actual work to do. Otherwise dispatch the parent zio * in its own taskq. * * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch * overhead, and has no recursion penalty. Note that one * read from disk typically causes at least 3 zio's: a * zio_null(), the logical zio_read(), and then a physical * zio. When the physical ZIO completes, we are able to call * zio_done() on all 3 of these zio's from one invocation of * zio_execute() by returning the parent back to * zio_execute(). Since the parent isn't executed until this * thread returns back to zio_execute(), the caller should do * so promptly. * * In other cases, dispatching the parent prevents * overflowing the stack when we have deeply nested * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. */ if (next_to_executep != NULL && *next_to_executep == NULL && (pio->io_type == zio->io_type || (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); } } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, zio_flag_t flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); memset(zio, 0, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) { zio->io_bp_copy = *bp; zio->io_bp = &zio->io_bp_copy; /* so caller can free */ } else { zio->io_bp = (blkptr_t *)bp; } zio->io_bp_orig = *bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; + if (flags & ZIO_FLAG_PREALLOCATED) { + BP_ZERO_DVAS(zio->io_bp); + BP_SET_BIRTH(zio->io_bp, 0, 0); + } } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_allocator = ZIO_ALLOCATOR_NONE; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) || (pipeline & ZIO_STAGE_READY) == 0; zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child_first(pio, zio); } taskq_init_ent(&zio->io_tqent); return (zio); } void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } /* * ZIO intended to be between others. Provides synchronization at READY * and DONE pipeline stages and calls the respective callbacks. */ zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } /* * ZIO intended to be a root of a tree. Unlike null ZIO does not have a * READY pipeline stage (is ready on creation), so it should not be used * as child of any ZIO that may need waiting for grandchildren READY stage * (any other ZIO type). */ zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; zio = zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE); return (zio); } static int zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, enum blk_verify_flag blk_verify, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("bad blkptr at %px: " "DVA[0]=%#llx/%#llx " "DVA[1]=%#llx/%#llx " "DVA[2]=%#llx/%#llx " "prop=%#llx " "pad=%#llx,%#llx " "phys_birth=%#llx " "birth=%#llx " "fill=%#llx " "cksum=%#llx/%#llx/%#llx/%#llx", bp, (long long)bp->blk_dva[0].dva_word[0], (long long)bp->blk_dva[0].dva_word[1], (long long)bp->blk_dva[1].dva_word[0], (long long)bp->blk_dva[1].dva_word[1], (long long)bp->blk_dva[2].dva_word[0], (long long)bp->blk_dva[2].dva_word[1], (long long)bp->blk_prop, (long long)bp->blk_pad[0], (long long)bp->blk_pad[1], (long long)BP_GET_PHYSICAL_BIRTH(bp), (long long)BP_GET_LOGICAL_BIRTH(bp), (long long)bp->blk_fill, (long long)bp->blk_cksum.zc_word[0], (long long)bp->blk_cksum.zc_word[1], (long long)bp->blk_cksum.zc_word[2], (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: zfs_dbgmsg("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_ONLY: break; } return (1); } /* * Verify the block pointer fields contain reasonable values. This means * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * * If everything checks out 0 is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. * * Values for blk_verify_flag: * BLK_VERIFY_ONLY: evaluate the block * BLK_VERIFY_LOG: evaluate the block and log problems * BLK_VERIFY_HALT: call zfs_panic_recover on error * * Values for blk_config_flag: * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be * obtained for reader * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better * performance */ int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { int errors = 0; if (unlikely(!DMU_OT_IS_VALID(BP_GET_TYPE(bp)))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (unlikely(BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (unlikely(BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (unlikely(BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } if (unlikely(BPE_GET_PSIZE(bp) > BPE_PAYLOAD_SIZE)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BPE_GET_PSIZE(bp)); } return (errors ? ECKSUM : 0); } else if (BP_IS_HOLE(bp)) { /* * Holes are allowed (expected, even) to have no DVAs, no * checksum, and no psize. */ return (errors ? ECKSUM : 0); } else if (unlikely(!DVA_IS_VALID(&bp->blk_dva[0]))) { /* Non-hole, non-embedded BPs _must_ have at least one DVA */ errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has no valid DVAs", bp); } if (unlikely(BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (unlikely(BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } /* * Do not verify individual DVAs if the config is not trusted. This * will be done once the zio is executed in vdev_mirror_map_alloc. */ if (unlikely(!spa->spa_trust_config)) return (errors ? ECKSUM : 0); switch (blk_config) { case BLK_CONFIG_HELD: ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); break; case BLK_CONFIG_NEEDED: spa_config_enter(spa, SCL_VDEV, bp, RW_READER); break; case BLK_CONFIG_NEEDED_TRY: if (!spa_config_tryenter(spa, SCL_VDEV, bp, RW_READER)) return (EBUSY); break; case BLK_CONFIG_SKIP: return (errors ? ECKSUM : 0); default: panic("invalid blk_config %u", blk_config); } /* * Pool-specific checks. * * Note: it would be nice to verify that the logical birth * and physical birth are not too large. However, * spa_freeze() allows the birth time of log blocks (and * dmu_sync()-ed blocks that are in the log) to be arbitrarily * large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { const dva_t *dva = &bp->blk_dva[i]; uint64_t vdevid = DVA_GET_VDEV(dva); if (unlikely(vdevid >= spa->spa_root_vdev->vdev_children)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (unlikely(vd == NULL)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (unlikely(vd->vdev_ops == &vdev_hole_ops)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. */ continue; } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (unlikely(offset + asize > vd->vdev_asize)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } if (blk_config == BLK_CONFIG_NEEDED || blk_config == BLK_CONFIG_NEEDED_TRY) spa_config_exit(spa, SCL_VDEV, bp); return (errors ? ECKSUM : 0); } boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) { (void) bp; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) return (B_FALSE); vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) return (B_FALSE); if (vd->vdev_ops == &vdev_hole_ops) return (B_FALSE); if (vd->vdev_ops == &vdev_missing_ops) { return (B_FALSE); } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) return (B_FALSE); return (B_TRUE); } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; enum zio_stage pipeline = zp->zp_direct_write == B_TRUE ? ZIO_DIRECT_WRITE_PIPELINE : (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE; zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, pipeline); zio->io_ready = ready; zio->io_children_ready = children_ready; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). Encrypted * dedup blocks need data as well so we also disable dedup in this * case. */ if (data == NULL && (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies, boolean_t nopwrite, boolean_t brtwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); ASSERT(!brtwrite || !nopwrite); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_brtwrite = brtwrite; zio->io_prop.zp_copies = copies; zio->io_prop.zp_gang_copies = gang_copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. * * Note that we only defer frees after zfs_sync_pass_deferred_free * when the log space map feature is disabled. [see relevant comment * in spa_sync_iterate_to_convergence()] */ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) || brt_maybe_exists(spa, bp)) { metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); } } /* * To improve performance, this function may return NULL if we were able * to do the free immediately. This avoids the cost of creating a zio * (and linking it to the parent, etc). */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_flag_t flags) { ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); if (BP_IS_EMBEDDED(bp)) return (NULL); metaslab_check_free(spa, bp); arc_freed(spa, bp); dsl_scan_freed(spa, bp); if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || brt_maybe_exists(spa, bp)) { /* * GANG, DEDUP and BRT blocks can induce a read (for the gang * block header, the DDT or the BRT), so issue them * asynchronously so that this thread is not tied up. */ enum zio_stage stage = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage)); } else { metaslab_free(spa, bp, txg, B_FALSE); return (NULL); } } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ? BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, enum trim_flag trim_flags) { zio_t *zio; ASSERT0(vd->vdev_children); ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift)); ASSERT3U(size, !=, 0); zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done, private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE); zio->io_trim_flags = trim_flags; return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; /* * vdev child I/Os do not propagate their error to the parent. * Therefore, for correct operation the caller *must* check for * and handle the error in the child i/o's done callback. * The only exceptions are i/os that we don't care about * (OPTIONAL or REPAIR). */ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; /* * We never allow the mirror VDEV to attempt reading from any * additional data copies after the first Direct I/O checksum * verify failure. This is to avoid bad data being written out * through the mirror during self healing. See comment in * vdev_mirror_io_done() for more details. */ ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); } else if (type == ZIO_TYPE_WRITE && pio->io_prop.zp_direct_write == B_TRUE) { /* * By default we only will verify checksums for Direct I/O * writes for Linux. FreeBSD is able to place user pages under * write protection before issuing them to the ZIO pipeline. * * Checksum validation errors will only be reported through * the top-level VDEV, which is set by this child ZIO. */ ASSERT3P(bp, !=, NULL); ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { ASSERT0(vd->vdev_children); offset += VDEV_LABEL_START_SIZE; } flags |= ZIO_VDEV_CHILD_FLAGS(pio); /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } /* * Send a flush command to the given vdev. Unlike most zio creation functions, * the flush zios are issued immediately. You can wait on pio to pause until * the flushes complete. */ void zio_flush(zio_t *pio, vdev_t *vd) { const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY; if (vd->vdev_nowritecache) return; if (vd->vdev_children == 0) { zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0, NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE)); } else { for (uint64_t c = 0; c < vd->vdev_children; c++) zio_flush(pio, vd->vdev_child[c]); } } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3U(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * Round provided allocation size up to a value that can be allocated * by at least some vdev(s) in the pool with minimum or no additional * padding and without extra space usage on others */ static uint64_t zio_roundup_alloc_size(spa_t *spa, uint64_t size) { if (size > spa->spa_min_alloc) return (roundup(size, spa->spa_gcd_alloc)); return (spa->spa_min_alloc); } size_t zio_get_compression_max_size(enum zio_compress compress, uint64_t gcd_alloc, uint64_t min_alloc, size_t s_len) { size_t d_len; /* minimum 12.5% must be saved (legacy value, may be changed later) */ d_len = s_len - (s_len >> 3); /* ZLE can't use exactly d_len bytes, it needs more, so ignore it */ if (compress == ZIO_COMPRESS_ZLE) return (d_len); d_len = d_len - d_len % gcd_alloc; if (d_len < min_alloc) return (BPE_PAYLOAD_SIZE); return (d_len); } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || BP_HAS_INDIRECT_MAC_CKSUM(bp)) && zio->io_child_type == ZIO_CHILD_LOGICAL) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decrypt); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (zio); } static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zp->zp_brtwrite) return (zio); ASSERT(!BP_GET_DEDUP(zio->io_bp_override)); if (BP_IS_EMBEDDED(bp)) return (zio); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (zio); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (zio); } static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; uint32_t pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (NULL); } if (!IO_IS_ALLOCATING(zio)) return (zio); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) || MIN(zp->zp_copies, spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { abd_t *cabd = NULL; if (abd_cmp_zero(zio->io_abd, lsize) == 0) psize = 0; else if (compress == ZIO_COMPRESS_EMPTY) psize = lsize; else psize = zio_compress_data(compress, zio->io_abd, &cabd, lsize, zio_get_compression_max_size(compress, spa->spa_gcd_alloc, spa->spa_min_alloc, lsize), zp->zp_complevel); if (psize == 0) { compress = ZIO_COMPRESS_OFF; } else if (psize >= lsize) { compress = ZIO_COMPRESS_OFF; if (cabd != NULL) abd_free(cabd); } else if (psize <= BPE_PAYLOAD_SIZE && !zp->zp_encrypt && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { void *cbuf = abd_borrow_buf_copy(cabd, lsize); encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); abd_return_buf(cabd, cbuf, lsize); abd_free(cabd); BP_SET_LOGICAL_BIRTH(bp, zio->io_txg); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (zio); } else { /* * Round compressed size up to the minimum allocation * size of the smallest-ashift device, and zero the * tail. This ensures that the compressed size of the * BP (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ size_t rounded = (size_t)zio_roundup_alloc_size(spa, psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; abd_free(cabd); psize = lsize; } else { abd_zero_off(cabd, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cabd, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && zp->zp_type == DMU_OT_DNODE) { /* * The DMU actually relies on the zio layer's compression * to free metadnode blocks that have had all contained * dnodes freed. As a result, even when doing a raw * receive, we must check whether the block can be compressed * to a hole. */ if (abd_cmp_zero(zio->io_abd, lsize) == 0) { psize = 0; compress = ZIO_COMPRESS_OFF; } else { psize = lsize; } } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) { /* * If we are raw receiving an encrypted dataset we should not * take this codepath because it will change the on-disk block * and decryption will fail. */ size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize), lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); abd_copy_off(cdata, zio->io_abd, 0, 0, psize); psize = rounded; zio_push_transform(zio, cdata, psize, rounded, NULL); } } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!zp->zp_encrypt || DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (zio); } static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); return (zio); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available or cut the line otherwise. */ if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) { if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; else cutinline = B_TRUE; } ASSERT3U(q, <, ZIO_TASKQ_TYPES); spa_taskq_dispatch(spa, t, q, zio_execute, zio, cutinline); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { spa_t *spa = zio->io_spa; taskq_t *tq = taskq_of_curthread(); for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (tqs->stqs_taskq[i] == tq) return (B_TRUE); } } return (B_FALSE); } static zio_t * zio_issue_async(zio_t *zio) { ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio)); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } void zio_interrupt(void *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; int ticks = MAX(1, NSEC_TO_TICK(diff)); clock_t expire_at_tick = ddi_get_lbolt() + ticks; DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); tid = taskq_dispatch_delay(system_taskq, zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just finish the * zio without a delay. */ zio_interrupt(zio); } } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } static void zio_deadman_impl(zio_t *pio, int ziodepth) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; vdev_t *vd = pio->io_vd; if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) { vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL; zbookmark_phys_t *zb = &pio->io_bookmark; uint64_t delta = gethrtime() - pio->io_timestamp; uint64_t failmode = spa_get_deadman_failmode(pio->io_spa); zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu " "delta=%llu queued=%llu io=%llu " "path=%s " "last=%llu type=%d " "priority=%d flags=0x%llx stage=0x%x " "pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu " "level=%llu blkid=%llu " "offset=%llu size=%llu " "error=%d", ziodepth, pio, pio->io_timestamp, (u_longlong_t)delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0, pio->io_type, pio->io_priority, (u_longlong_t)pio->io_flags, pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { zio_interrupt(pio); } } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_deadman_impl(cio, ziodepth + 1); } mutex_exit(&pio->io_lock); } /* * Log the critical information describing this zio and all of its children * using the zfs_dbgmsg() interface then post deadman event for the ZED. */ void zio_deadman(zio_t *pio, const char *tag) { spa_t *spa = pio->io_spa; char *name = spa_name(spa); if (!zfs_deadman_enabled || spa_suspended(spa)) return; zio_deadman_impl(pio, 0); switch (spa_get_deadman_failmode(spa)) { case ZIO_FAILURE_MODE_WAIT: zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_CONTINUE: zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_PANIC: fm_panic("%s determined I/O to pool '%s' is hung.", tag, name); break; } } /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for * dependent child I/Os; (3) the I/O issues, so we're waiting * for an I/O completion interrupt; (4) the I/O is delegated by * vdev-level caching or aggregation; (5) the I/O is deferred * due to vdev-level queueing; (6) the I/O is handed off to * another thread. In all cases, the pipeline stops whenever * there's no CPU work; it never burns a thread in cv_wait_io(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; /* * zio_execute() is a wrapper around the static function * __zio_execute() so that we can force __zio_execute() to be * inlined. This reduces stack overhead which is important * because __zio_execute() is called recursively in several zio * code paths. zio_execute() itself cannot be inlined because * it is externally visible. */ void zio_execute(void *zio) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); __zio_execute(zio); spl_fstrans_unmark(cookie); } /* * Used to determine if in the current context the stack is sized large * enough to allow zio_execute() to be called recursively. A minimum * stack size of 16K is required to avoid needing to re-dispatch the zio. */ static boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) dsl_pool_t *dp = spa_get_dsl(zio->io_spa); /* Executing in txg_sync_thread() context. */ if (dp && curthread == dp->dp_tx.tx_sync_thread) return (B_TRUE); /* Pool initialization outside of zio_taskq context. */ if (dp && spa_is_initializing(dp->dp_spa) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); #else (void) zio; #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); } __attribute__((always_inline)) static inline void __zio_execute(zio_t *zio) { ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* * If the current context doesn't have large enough stacks * the zio must be issued asynchronously to prevent overflow. */ if (zio_execute_stack_check(zio)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; /* * The zio pipeline stage returns the next zio to execute * (typically the same as this one), or NULL if we should * stop. */ zio = zio_pipeline[highbit64(stage) - 1](zio); if (zio == NULL) return; } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { /* * Some routines, like zio_free_sync(), may return a NULL zio * to avoid the performance overhead of creating and then destroying * an unneeded zio. For the callers' simplicity, we accept a NULL * zio and ignore it. */ if (zio == NULL) return (0); long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); if (zio->io_type == ZIO_TYPE_WRITE) { spa_select_allocator(zio); } __zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) { error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, ddi_get_lbolt() + timeout); if (zfs_deadman_enabled && error == -1 && gethrtime() - zio->io_queued_timestamp > spa_deadman_ziotime(zio->io_spa)) { mutex_exit(&zio->io_lock); timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); zio_deadman(zio, FTAG); mutex_enter(&zio->io_lock); } } mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { /* * See comment in zio_wait(). */ if (zio == NULL) return; ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE]; zio_add_child(pio, zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); if (zio->io_type == ZIO_TYPE_WRITE) { spa_select_allocator(zio); } __zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(void *arg) { zio_t *pio = arg; zio_t *cio, *cio_next, *gio; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); mutex_enter(&pio->io_lock); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) || (pio->io_pipeline & ZIO_STAGE_READY) == 0; pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE); /* * It's possible for a failed ZIO to be a descendant of more than one * ZIO tree. When reexecuting it, we have to be sure to add its wait * states to all parent wait counts. * * Those parents, in turn, may have other children that are currently * active, usually because they've already been reexecuted after * resuming. Those children may be executing and may call * zio_notify_parent() at the same time as we're updating our parent's * counts. To avoid races while updating the counts, we take * gio->io_lock before each update. */ zio_link_t *zl = NULL; while ((gio = zio_walk_parents(pio, &zl)) != NULL) { mutex_enter(&gio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) { gio->io_children[pio->io_child_type][w] += !pio->io_state[w]; } mutex_exit(&gio->io_lock); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zl = NULL; for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); } mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on it. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); if (reason != ZIO_SUSPEND_MMP) { cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " "I/O failure and has been suspended.", spa_name(spa)); } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = reason; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspended != ZIO_SUSPEND_NONE) cmn_err(CE_WARN, "Pool '%s' was suspended and is being " "resumed. Failed I/O will be retried.", spa_name(spa)); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_free(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_free(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. */ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio)); if (zio == NULL) { zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)); } return (zio); } static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(list_is_empty(&zio->io_child_list)); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_free(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. */ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (zio); } static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } static void zio_gang_inherit_allocator(zio_t *pio, zio_t *cio) { cio->io_allocator = pio->io_allocator; } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; zio_t *gio __maybe_unused = zio->io_gang_leader; if (BP_IS_HOLE(zio->io_bp)) return; - ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); + /* + * If we're getting direct-invoked from zio_write_gang_block(), + * the bp_orig will be set. + */ + ASSERT(BP_IS_HOLE(&zio->io_bp_orig) || + zio->io_flags & ZIO_FLAG_PREALLOCATED); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { /* * The io_abd field will be NULL for a zio with no data. The io_flags * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) abd_free(zio->io_abd); } static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; - uint64_t psize; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* * Store multiple copies of the GBH, so that we can still traverse * all the data (e.g. to free or scrub) even if a block is damaged. * This value respects the redundant_metadata property. */ int gbh_copies = gio->io_prop.zp_gang_copies; if (gbh_copies == 0) { /* * This should only happen in the case where we're filling in * DDT entries for a parent that wants more copies than the DDT * has. In that case, we cannot gang without creating a mixed * blkptr, which is illegal. */ ASSERT3U(gio->io_child_type, ==, ZIO_CHILD_DDT); pio->io_error = EAGAIN; return (pio); } ASSERT3S(gbh_copies, >, 0); ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP); ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, &pio->io_alloc_list, pio->io_allocator, pio); if (error) { pio->io_error = error; return (pio); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; memset(gbh, 0, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); zio_gang_inherit_allocator(pio, zio); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { boolean_t more; VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies, zio, B_TRUE, &more)); } /* - * Create and nowait the gang children. + * Create and nowait the gang children. First, we try to do + * opportunistic allocations. If that fails to generate enough + * space, we fall back to normal zio_write calls for nested gang. */ - for (int g = 0; resid != 0; resid -= psize, g++) { - psize = zio_roundup_alloc_size(spa, - resid / (SPA_GBH_NBLKPTRS - g)); - psize = MIN(resid, psize); - ASSERT3U(psize, >=, SPA_MINBLOCKSIZE); - + for (int g = 0; resid != 0; g++) { + flags &= METASLAB_ASYNC_ALLOC; + flags |= METASLAB_GANG_CHILD; zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_complevel = gio->io_prop.zp_complevel; zp.zp_type = zp.zp_storage_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_gang_copies = gio->io_prop.zp_gang_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; zp.zp_direct_write = B_FALSE; memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); - zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - has_data ? abd_get_offset(pio->io_abd, pio->io_size - - resid) : NULL, psize, psize, &zp, - zio_write_gang_member_ready, NULL, + uint64_t min_size = zio_roundup_alloc_size(spa, + resid / (SPA_GBH_NBLKPTRS - g)); + min_size = MIN(min_size, resid); + bp = &gbh->zg_blkptr[g]; + + zio_alloc_list_t cio_list; + metaslab_trace_init(&cio_list); + uint64_t allocated_size = UINT64_MAX; + error = metaslab_alloc_range(spa, mc, min_size, resid, + bp, gio->io_prop.zp_copies, txg, NULL, + flags, &cio_list, zio->io_allocator, NULL, &allocated_size); + + boolean_t allocated = error == 0; + + uint64_t psize = allocated ? MIN(resid, allocated_size) : + min_size; + + zio_t *cio = zio_write(zio, spa, txg, bp, has_data ? + abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, + psize, psize, &zp, zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + ZIO_GANG_CHILD_FLAGS(pio) | + (allocated ? ZIO_FLAG_PREALLOCATED : 0), &pio->io_bookmark); + resid -= psize; zio_gang_inherit_allocator(zio, cio); + if (allocated) { + metaslab_trace_move(&cio_list, &cio->io_alloc_list); + metaslab_group_alloc_increment_all(spa, + &cio->io_bp_orig, zio->io_allocator, flags, psize, + cio); + } /* * We do not reserve for the child writes, since we already * reserved for the parent. Unreserve though will be called * for individual children. We can do this since sum of all * child's physical sizes is equal to parent's physical size. * It would not work for potentially bigger allocation sizes. */ zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio_nowait(zio); return (pio); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_IS_HOLE(bp)); ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (zio); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop); /* * If we're overwriting a block that is currently on an * indirect vdev, then ignore the nopwrite request and * allow a new block to be allocated on a concrete vdev. */ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) { vdev_t *tvd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(&bp_orig->blk_dva[d])); if (tvd->vdev_ops == &vdev_indirect_ops) { spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); return (zio); } } spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (zio); } /* * ========================================================================== * Block Reference Table * ========================================================================== */ static zio_t * zio_brt_free(zio_t *zio) { blkptr_t *bp; bp = zio->io_bp; if (BP_GET_LEVEL(bp) > 0 || BP_IS_METADATA(bp) || !brt_maybe_exists(zio->io_spa, bp)) { return (zio); } if (!brt_entry_decref(zio->io_spa, bp)) { /* * This isn't the last reference, so we cannot free * the data yet. */ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } return (zio); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_t *ddt; ddt_entry_t *dde = zio->io_private; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddt = ddt_select(zio->io_spa, bp); if (zio->io_error == 0) { ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); /* this phys variant doesn't need repair */ ddt_phys_clear(dde->dde_phys, v); } if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL) dde->dde_io->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp); ddt_univ_phys_t *ddp = dde->dde_phys; blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (v_self == DDT_PHYS_NONE) return (zio); /* issue I/O for the other copies */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); if (ddt_phys_birth(ddp, v) == 0 || v == v_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, v, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (zio); } static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } if (dde->dde_io->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (zio); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these * cases we can compare the io_abd directly. This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { if (DDT_PHYS_IS_DITTO(ddt, p)) continue; if (dde->dde_io == NULL) continue; zio_t *lio = dde->dde_io->dde_lead_zio[p]; if (lio == NULL) continue; if (do_raw) return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v); if (phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) return (B_TRUE); ddt_exit(ddt); tmpabd = abd_alloc_for_io(psize, B_TRUE); error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_done(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; zio_link_t *zl = NULL; ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); ddt_univ_phys_t *ddp = dde->dde_phys; ddt_enter(ddt); /* we're the lead, so once we're done there's no one else outstanding */ if (dde->dde_io->dde_lead_zio[p] == zio) dde->dde_io->dde_lead_zio[p] = NULL; ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys; if (zio->io_error != 0) { /* * The write failed, so we're about to abort the entire IO * chain. We need to revert the entry back to what it was at * the last time it was successfully extended. */ ddt_phys_unextend(ddp, orig, v); ddt_phys_clear(orig, v); ddt_exit(ddt); return; } /* * Add references for all dedup writes that were waiting on the * physical one, skipping any other physical writes that are waiting. */ zio_t *pio; zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) { if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) ddt_phys_addref(ddp, v); } /* * We've successfully added new DVAs to the entry. Clear the saved * state or, if there's still outstanding IO, remember it so we can * revert to a known good state if that IO fails. */ if (dde->dde_io->dde_lead_zio[p] == NULL) ddt_phys_clear(orig, v); else ddt_phys_copy(orig, ddp, v); ddt_exit(ddt); } static void zio_ddt_child_write_ready(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; zio_link_t *zl = NULL; ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); if (ddt_phys_is_gang(dde->dde_phys, v)) { for (int i = 0; i < BP_GET_NDVAS(zio->io_bp); i++) { dva_t *d = &zio->io_bp->blk_dva[i]; metaslab_group_alloc_decrement(zio->io_spa, DVA_GET_VDEV(d), zio->io_allocator, METASLAB_ASYNC_ALLOC, zio->io_size, zio); } zio->io_error = EAGAIN; } if (zio->io_error != 0) return; ddt_enter(ddt); ddt_phys_extend(dde->dde_phys, v, zio->io_bp); zio_t *pio; zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) { if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg); } ddt_exit(ddt); } static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); /* * Deduplication will not take place for Direct I/O writes. The * ddt_tree will be emptied in syncing context. Direct I/O writes take * place in the open-context. Direct I/O write can not attempt to * modify the ddt_tree while issuing out a write. */ ASSERT3B(zio->io_prop.zp_direct_write, ==, B_FALSE); ddt_enter(ddt); /* * Search DDT for matching entry. Skip DVAs verification here, since * they can go only from override, and once we get here the override * pointer can't have "D" flag to be confused with pruned DDT entries. */ IMPLY(zio->io_bp_override, !BP_GET_DEDUP(zio->io_bp_override)); dde = ddt_lookup(ddt, bp, B_FALSE); if (dde == NULL) { /* DDT size is over its quota so no new entries */ zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); if (zio->io_bp_override == NULL) zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); } ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); ddt_univ_phys_t *ddp = dde->dde_phys; /* * In the common cases, at this point we have a regular BP with no * allocated DVAs, and the corresponding DDT entry for its checksum. * Our goal is to fill the BP with enough DVAs to satisfy its copies= * requirement. * * One of three things needs to happen to fulfill this: * * - if the DDT entry has enough DVAs to satisfy the BP, we just copy * them out of the entry and return; * * - if the DDT entry has no DVAs (ie its brand new), then we have to * issue the write as normal so that DVAs can be allocated and the * data land on disk. We then copy the DVAs into the DDT entry on * return. * * - if the DDT entry has some DVAs, but too few, we have to issue the * write, adjusted to have allocate fewer copies. When it returns, we * add the new DVAs to the DDT entry, and update the BP to have the * full amount it originally requested. * * In all cases, if there's already a writing IO in flight, we need to * defer the action until after the write is done. If our action is to * write, we need to adjust our request for additional DVAs to match * what will be in the DDT entry after it completes. In this way every * IO can be guaranteed to recieve enough DVAs simply by joining the * end of the chain and letting the sequence play out. */ /* * Number of DVAs in the DDT entry. If the BP is encrypted we ignore * the third one as normal. */ int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp)); IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0); boolean_t is_ganged = ddt_phys_is_gang(ddp, v); /* Number of DVAs requested by the IO. */ uint8_t need_dvas = zp->zp_copies; /* Number of DVAs in outstanding writes for this dde. */ uint8_t parent_dvas = 0; /* * What we do next depends on whether or not there's IO outstanding that * will update this entry. */ if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) { /* * No IO outstanding, so we only need to worry about ourselves. */ /* * Override BPs bring their own DVAs and their own problems. */ if (zio->io_bp_override) { /* * For a brand-new entry, all the work has been done * for us, and we can just fill it out from the provided * block and leave. */ if (have_dvas == 0) { ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_extend(ddp, v, bp); ddt_phys_addref(ddp, v); ddt_exit(ddt); return (zio); } /* * If we already have this entry, then we want to treat * it like a regular write. To do this we just wipe * them out and proceed like a regular write. * * Even if there are some DVAs in the entry, we still * have to clear them out. We can't use them to fill * out the dedup entry, as they are all referenced * together by a bp already on disk, and will be freed * as a group. */ BP_ZERO_DVAS(bp); BP_SET_BIRTH(bp, 0, 0); } /* * If there are enough DVAs in the entry to service our request, * then we can just use them as-is. */ if (have_dvas >= need_dvas) { ddt_bp_fill(ddp, v, bp, txg); ddt_phys_addref(ddp, v); ddt_exit(ddt); return (zio); } /* * Otherwise, we have to issue IO to fill the entry up to the * amount we need. */ need_dvas -= have_dvas; } else { /* * There's a write in-flight. If there's already enough DVAs on * the entry, then either there were already enough to start * with, or the in-flight IO is between READY and DONE, and so * has extended the entry with new DVAs. Either way, we don't * need to do anything, we can just slot in behind it. */ if (zio->io_bp_override) { /* * If there's a write out, then we're soon going to * have our own copies of this block, so clear out the * override block and treat it as a regular dedup * write. See comment above. */ BP_ZERO_DVAS(bp); BP_SET_BIRTH(bp, 0, 0); } if (have_dvas >= need_dvas) { /* * A minor point: there might already be enough * committed DVAs in the entry to service our request, * but we don't know which are completed and which are * allocated but not yet written. In this case, should * the IO for the new DVAs fail, we will be on the end * of the IO chain and will also recieve an error, even * though our request could have been serviced. * * This is an extremely rare case, as it requires the * original block to be copied with a request for a * larger number of DVAs, then copied again requesting * the same (or already fulfilled) number of DVAs while * the first request is active, and then that first * request errors. In return, the logic required to * catch and handle it is complex. For now, I'm just * not going to bother with it. */ /* * We always fill the bp here as we may have arrived * after the in-flight write has passed READY, and so * missed out. */ ddt_bp_fill(ddp, v, bp, txg); zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); ddt_exit(ddt); return (zio); } /* * There's not enough in the entry yet, so we need to look at * the write in-flight and see how many DVAs it will have once * it completes. * * The in-flight write has potentially had its copies request * reduced (if we're filling out an existing entry), so we need * to reach in and get the original write to find out what it is * expecting. * * Note that the parent of the lead zio will always have the * highest zp_copies of any zio in the chain, because ones that * can be serviced without additional IO are always added to * the back of the chain. */ zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl); ASSERT(pio); parent_dvas = pio->io_prop.zp_copies; if (parent_dvas >= need_dvas) { zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); ddt_exit(ddt); return (zio); } /* * Still not enough, so we will need to issue to get the * shortfall. */ need_dvas -= parent_dvas; } if (is_ganged) { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } /* * We need to write. We will create a new write with the copies * property adjusted to match the number of DVAs we need to need to * grow the DDT entry by to satisfy the request. */ zio_prop_t czp = *zp; if (have_dvas > 0 || parent_dvas > 0) { czp.zp_copies = need_dvas; czp.zp_gang_copies = 0; } else { ASSERT3U(czp.zp_copies, ==, need_dvas); } zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); /* * We are the new lead zio, because our parent has the highest * zp_copies that has been requested for this entry so far. */ ddt_alloc_entry_io(dde); if (dde->dde_io->dde_lead_zio[p] == NULL) { /* * First time out, take a copy of the stable entry to revert * to if there's an error (see zio_ddt_child_write_done()) */ ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v); } else { /* * Make the existing chain our child, because it cannot * complete until we have. */ zio_add_child(cio, dde->dde_io->dde_lead_zio[p]); } dde->dde_io->dde_lead_zio[p] = cio; ddt_exit(ddt); zio_nowait(cio); return (zio); } static ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde = NULL; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); if (v != DDT_PHYS_NONE) ddt_phys_decref(dde->dde_phys, v); } ddt_exit(ddt); /* * When no entry was found, it must have been pruned, * so we can free it now instead of decrementing the * refcount in the DDT. */ if (!dde) { BP_SET_DEDUP(bp, 0); zio->io_pipeline |= ZIO_STAGE_DVA_FREE; } return (zio); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(metaslab_class_allocator_t *mca, boolean_t *more) { zio_t *zio; ASSERT(MUTEX_HELD(&mca->mca_lock)); zio = avl_first(&mca->mca_tree); if (zio == NULL) { *more = B_FALSE; return (NULL); } ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, zio, B_FALSE, more)) { return (NULL); } avl_remove(&mca->mca_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); if (avl_is_empty(&mca->mca_tree)) *more = B_FALSE; return (zio); } static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; metaslab_class_t *mc; boolean_t more; /* * If not already chosen, choose an appropriate allocation class. */ mc = zio->io_metaslab_class; if (mc == NULL) mc = spa_preferred_class(spa, zio); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (zio); } ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(ZIO_HAS_ALLOCATOR(zio)); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); zio->io_metaslab_class = mc; metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; mutex_enter(&mca->mca_lock); avl_add(&mca->mca_tree, zio); nio = zio_io_to_allocate(mca, &more); mutex_exit(&mca->mca_lock); return (nio); } static void zio_allocate_dispatch(metaslab_class_t *mc, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; zio_t *zio; boolean_t more; do { mutex_enter(&mca->mca_lock); zio = zio_io_to_allocate(mca, &more); mutex_exit(&mca->mca_lock); if (zio == NULL) return; ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); ASSERT0(zio->io_error); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } while (more); } static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } + if (zio->io_flags & ZIO_FLAG_PREALLOCATED) { + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_GANG); + memcpy(zio->io_bp->blk_dva, zio->io_bp_orig.blk_dva, + 3 * sizeof (dva_t)); + BP_SET_BIRTH(zio->io_bp, BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig), + BP_GET_PHYSICAL_BIRTH(&zio->io_bp_orig)); + return (zio); + } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; /* * If not already chosen, choose an appropriate allocation class. */ mc = zio->io_metaslab_class; if (mc == NULL) { mc = spa_preferred_class(spa, zio); zio->io_metaslab_class = mc; } ZIOSTAT_BUMP(ziostat_total_allocations); again: /* * Try allocating the block in the usual metaslab class. * If that's full, allocate it in the normal class. * If that's full, allocate as a gang block, * and if all are full, the allocation fails (which shouldn't happen). * * Note that we do not fall back on embedded slog (ZIL) space, to * preserve unfragmented slog space, which is critical for decent * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ ASSERT(ZIO_HAS_ALLOCATOR(zio)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio->io_allocator, zio); /* * Fallback to normal class when an alloc class is full */ if (error == ENOSPC && mc != spa_normal_class(spa)) { /* * When the dedup or special class is spilling into the normal * class, there can still be significant space available due * to deferred frees that are in-flight. We track the txg when * this occurred and back off adding new DDT entries for a few * txgs to allow the free blocks to be processed. */ if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) && mc == spa_special_class(spa))) && spa->spa_dedup_class_full_txg != zio->io_txg) { spa->spa_dedup_class_full_txg = zio->io_txg; zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, " "%llu allocated of %llu", spa_name(spa), (int)zio->io_txg, mc == spa_dedup_class(spa) ? "dedup" : "special", (int)zio->io_size, (u_longlong_t)metaslab_class_get_alloc(mc), (u_longlong_t)metaslab_class_get_space(mc)); } /* * If we are holding old class reservation, drop it. * Dispatch the next ZIO(s) there if some are waiting. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { if (metaslab_class_throttle_unreserve(mc, zio->io_prop.zp_copies, zio)) { zio_allocate_dispatch(zio->io_metaslab_class, zio->io_allocator); } zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; } if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying normal class: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } zio->io_metaslab_class = mc = spa_normal_class(spa); ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); /* * If normal class uses throttling, return to that pipeline * stage. Otherwise just do another allocation attempt. */ if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && mc->mc_alloc_throttle_enabled && zio->io_child_type != ZIO_CHILD_GANG && !(zio->io_flags & ZIO_FLAG_NODATA)) { zio->io_stage = ZIO_STAGE_DVA_THROTTLE >> 1; return (zio); } goto again; } if (error == ENOSPC && zio->io_size > spa->spa_min_alloc) { if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying ganging: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } ZIOSTAT_BUMP(ziostat_gang_writes); if (flags & METASLAB_GANG_CHILD) ZIOSTAT_BUMP(ziostat_gang_multilevel); return (zio_write_gang_block(zio, mc)); } if (error != 0) { if (error != ENOSPC || (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " "size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } zio->io_error = error; } return (zio); } static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (zio); } static zio_t * zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (zio); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) { metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp), B_TRUE); } if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); /* * Block pointer fields are useful to metaslabs for stats and debugging. * Fill in the obvious ones before calling into metaslab_alloc(). */ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_PSIZE(new_bp, size); BP_SET_LEVEL(new_bp, 0); /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ int flags = METASLAB_ZIL; int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; ZIOSTAT_BUMP(ziostat_total_allocations); error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, NULL); *slog = (error == 0); if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, NULL); } if (error != 0) { ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, NULL); } metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); /* * encrypted blocks will require an IV and salt. We generate * these now since we will not be rewriting the bp at * rewrite time. */ if (os->os_encrypted) { uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t salt[ZIO_DATA_SALT_LEN]; BP_SET_CRYPT(new_bp, B_TRUE); VERIFY0(spa_crypt_get_salt(spa, dmu_objset_id(os), salt)); VERIFY0(zio_crypt_generate_iv(iv)); zio_crypt_encode_params_bp(new_bp, salt, iv); } } else { zfs_dbgmsg("%s: zil block allocation failure: " "size %llu, error %d", spa_name(spa), (u_longlong_t)size, error); } return (error); } /* * ========================================================================== * Read and write to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; zio->io_delay = 0; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (NULL); } ASSERT3P(zio->io_logical, !=, zio); if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(spa->spa_trust_config); /* * Note: the code can handle other kinds of writes, * but we don't expect them. */ if (zio->io_vd->vdev_noalloc) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering. * * There are a few ways that we can end up creating these spurious * resilver i/os: * * 1. A resilver i/o will be issued if any DVA in the BP has a * dirty DTL. The mirror code will issue resilver writes to * each DVA, including the one(s) that are not on vdevs with dirty * DTLs. * * 2. With nested replication, which happens when we have a * "replacing" or "spare" vdev that's a child of a mirror or raidz. * For example, given mirror(replacing(A+B), C), it's likely that * only A is out of date (it's the new device). In this case, we'll * read from C, then use the data to resilver A+B -- but we don't * actually want to resilver B, just A. The top-level mirror has no * way to know this, so instead we just discard unnecessary repairs * as we work our way down the vdev tree. * * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. * The same logic applies to any form of nested replication: ditto * + mirror, RAID-Z + replacing, etc. * * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. * * Leaf DTL_PARTIAL can be empty when a legitimate write comes from * a dRAID spare vdev. For example, when a dRAID spare is first * used, its spare blocks need to be written to but the leaf vdev's * of such blocks can have empty DTL_PARTIAL. * * There seemed no clean way to allow such writes while bypassing * spurious ones. At this point, just avoid all bypassing for dRAID * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } /* * Select the next best leaf I/O to process. Distributed spares are * excluded since they dispatch the I/O directly to a leaf vdev after * applying the dRAID mapping. */ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (NULL); } zio->io_delay = gethrtime(); if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) { /* * "no-op" injections return success, but do no actual * work. Just return it. */ zio_delay_interrupt(zio); return (NULL); } } vd->vdev_ops->vdev_op_io_start(zio); return (NULL); } static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FLUSH || zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { if (zio->io_type != ZIO_TYPE_FLUSH) vdev_queue_io_done(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error && zio->io_type != ZIO_TYPE_FLUSH && zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error && vd->vdev_remove_wanted == B_FALSE) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); } /* * This function is used to change the priority of an existing zio that is * currently in-flight. This is used by the arc to upgrade priority in the * event that a demand read is made for a block that is currently queued * as a scrub or async read IO. Otherwise, the high priority read request * would end up having to wait for the lower priority IO. */ void zio_change_priority(zio_t *pio, zio_priority_t priority) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { vdev_queue_change_io_priority(pio, priority); } else { pio->io_priority = priority; } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_change_priority(cio, priority); } mutex_exit(&pio->io_lock); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. */ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = abd; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_abd_free; } static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } /* * If a Direct I/O operation has a checksum verify error then this I/O * should not attempt to be issued again. */ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL); ASSERT3U(zio->io_error, ==, EIO); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (NULL); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting " "cant_write=TRUE due to write failure with ENXIO", zio); vd->vdev_cant_write = B_TRUE; } /* * If a cache flush returns ENOTSUP we know that no future * attempts will ever succeed. In this case we set a persistent * boolean flag so that we don't bother with it in the future, and * then we act like the flush succeeded. */ if (zio->io_error == ENOTSUP && zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) { vd->vdev_nowritecache = B_TRUE; zio->io_error = 0; } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Encrypt and store encryption parameters * ========================================================================== */ /* * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for * managing the storage of encryption parameters and passing them to the * lower-level encryption functions. */ static zio_t * zio_encrypt(zio_t *zio) { zio_prop_t *zp = &zio->io_prop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_GET_PSIZE(bp); uint64_t dsobj = zio->io_bookmark.zb_objset; dmu_object_type_t ot = BP_GET_TYPE(bp); void *enc_buf = NULL; abd_t *eabd = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* the root zio already encrypted the data */ if (zio->io_child_type == ZIO_CHILD_GANG) return (zio); /* only ZIL blocks are re-encrypted on rewrite */ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) return (zio); if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { BP_SET_CRYPT(bp, B_FALSE); return (zio); } /* if we are doing raw encryption set the provided encryption params */ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { ASSERT0(BP_GET_LEVEL(bp)); BP_SET_CRYPT(bp, B_TRUE); BP_SET_BYTEORDER(bp, zp->zp_byteorder); if (ot != DMU_OT_OBJSET) zio_crypt_encode_mac_bp(bp, zp->zp_mac); /* dnode blocks must be written out in the provided byteorder */ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && ot == DMU_OT_DNODE) { void *bswap_buf = zio_buf_alloc(psize); abd_t *babd = abd_get_from_buf(bswap_buf, psize); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); abd_copy_to_buf(bswap_buf, zio->io_abd, psize); dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, psize); abd_take_ownership_of_buf(babd, B_TRUE); zio_push_transform(zio, babd, psize, psize, NULL); } if (DMU_OT_IS_ENCRYPTED(ot)) zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); return (zio); } /* indirect blocks only maintain a cksum of the lower level MACs */ if (BP_GET_LEVEL(bp) > 0) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Objset blocks are a special case since they have 2 256-bit MACs * embedded within them. */ if (ot == DMU_OT_OBJSET) { ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); return (zio); } /* unencrypted object types are only authenticated with a MAC */ if (!DMU_OT_IS_ENCRYPTED(ot)) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Later passes of sync-to-convergence may decide to rewrite data * in place to avoid more disk reallocations. This presents a problem * for encryption because this constitutes rewriting the new data with * the same encryption key and IV. However, this only applies to blocks * in the MOS (particularly the spacemaps) and we do not encrypt the * MOS. We assert that the zio is allocating or an intent log write * to enforce this. */ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); ASSERT3U(psize, !=, 0); enc_buf = zio_buf_alloc(psize); eabd = abd_get_from_buf(enc_buf, psize); abd_take_ownership_of_buf(eabd, B_TRUE); /* * For an explanation of what encryption parameters are stored * where, see the block comment in zio_crypt.c. */ if (ot == DMU_OT_INTENT_LOG) { zio_crypt_decode_params_bp(bp, salt, iv); } else { BP_SET_CRYPT(bp, B_TRUE); } /* Perform the encryption. This should not fail */ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); /* encode encryption metadata into the bp */ if (ot == DMU_OT_INTENT_LOG) { /* * ZIL blocks store the MAC in the embedded checksum, so the * transform must always be applied. */ zio_crypt_encode_mac_zil(enc_buf, mac); zio_push_transform(zio, eabd, psize, psize, NULL); } else { BP_SET_CRYPT(bp, B_TRUE); zio_crypt_encode_params_bp(bp, salt, iv); zio_crypt_encode_mac_bp(bp, mac); if (no_crypt) { ASSERT3U(ot, ==, DMU_OT_DNODE); abd_free(eabd); } else { zio_push_transform(zio, eabd, psize, psize, NULL); } } return (zio); } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (zio); } static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ, !(zio->io_flags & ZIO_FLAG_SPECULATIVE)); if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { if (zio->io_flags & ZIO_FLAG_DIO_READ) { zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; zio_t *pio = zio_unique_parent(zio); /* * Any Direct I/O read that has a checksum * error must be treated as suspicous as the * contents of the buffer could be getting * manipulated while the I/O is taking place. * * The checksum verify error will only be * reported here for disk and file VDEV's and * will be reported on those that the failure * occurred on. Other types of VDEV's report the * verify failure in their own code paths. */ if (pio->io_child_type == ZIO_CHILD_LOGICAL) { zio_dio_chksum_verify_error_report(zio); } } else { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_checksum_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, &info); } } } return (zio); } static zio_t * zio_dio_checksum_verify(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); int error; ASSERT3P(zio->io_vd, !=, NULL); ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE); ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); if (zfs_vdev_direct_write_verify == 0 || zio->io_error != 0) goto out; if ((error = zio_checksum_error(zio, NULL)) != 0) { zio->io_error = error; if (error == ECKSUM) { zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); } } out: return (zio); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * Report Direct I/O checksum verify error and create ZED event. */ void zio_dio_chksum_verify_error_report(zio_t *zio) { ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); if (zio->io_child_type == ZIO_CHILD_LOGICAL) return; mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_dio_verify_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); if (zio->io_type == ZIO_TYPE_WRITE) { /* * Convert checksum error for writes into EIO. */ zio->io_error = SET_ERROR(EIO); /* * Report dio_verify_wr ZED event. */ (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_WR, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); } else { /* * Report dio_verify_rd ZED event. */ (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_RD, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); } } /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } #ifdef ZFS_DEBUG if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; #endif if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ if (metaslab_class_throttle_unreserve( zio->io_metaslab_class, zio->io_prop.zp_copies, zio)) { zio_allocate_dispatch(zio->io_metaslab_class, zio->io_allocator); } } } mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (bp != NULL && BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (zio); } /* * Update the allocation throttle accounting. */ static void zio_dva_throttle_done(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; const void *tag = pio; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT(vd != NULL); ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); /* * Parents of gang children can have two flavors -- ones that allocated * the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that * allocated the constituent blocks. The first use their parent as tag. */ if (pio->io_child_type == ZIO_CHILD_GANG && (pio->io_flags & ZIO_FLAG_IO_REWRITE)) tag = zio_unique_parent(pio); ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG && (pio->io_flags & ZIO_FLAG_IO_REWRITE))); ASSERT(ZIO_HAS_ALLOCATOR(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); ASSERT(zio->io_metaslab_class != NULL); ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio->io_allocator, flags, pio->io_size, tag); if (metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio)) { zio_allocate_dispatch(zio->io_metaslab_class, pio->io_allocator); } } static zio_t * zio_done(zio_t *zio) { /* * Always attempt to keep stack usage minimal here since * we can be called recursively up to 19 levels deep. */ const uint64_t psize = zio->io_size; zio_t *pio, *pio_next; zio_link_t *zl = NULL; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { return (NULL); } /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) zio_dva_throttle_done(zio); for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. */ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); abd_t *adata = zio->io_abd; if (adata != NULL && asize != psize) { adata = abd_alloc(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, adata); zfs_ereport_free_checksum(zcr); if (adata != NULL && asize != psize) abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev is slow, exceeding * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. */ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { /* * We want to only increment our slow IO counters if * the IO is valid (i.e. not if the drive is removed). * * zfs_ereport_post() will also do these checks, but * it can also ratelimit and have other failures, so we * need to increment the slow_io counters independent * of it. */ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_slow_ios++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); } } } if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd) && !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { mutex_enter(&zio->io_vd->vdev_stat_lock); if (zio->io_type == ZIO_TYPE_READ) zio->io_vd->vdev_stat.vs_read_errors++; else if (zio->io_type == ZIO_TYPE_WRITE) zio->io_vd->vdev_stat.vs_write_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); } } if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark, BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } if (zio->io_error && zio == zio->io_logical) { /* * A DDT child tried to create a mixed gang/non-gang BP. We're * going to have to just retry as a non-dedup IO. */ if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) && zio->io_prop.zp_dedup) { zio->io_reexecute |= ZIO_REEXECUTE_NOW; zio->io_prop.zp_dedup = B_FALSE; } /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). */ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL) && !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; if (zio->io_reexecute) { /* * A Direct I/O operation that has a checksum verify error * should not attempt to reexecute. Instead, the error should * just be propagated back. */ ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)); /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); /* * This is a rare code path, so we don't * bother with "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; /* * This is a rare code path, so we don't bother with * "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ spa_taskq_dispatch(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, zio_reexecute, zio, B_FALSE); } return (NULL); } ASSERT(list_is_empty(&zio->io_child_list)); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. */ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * We are done executing this zio. We may want to execute a parent * next. See the comment in zio_notify_parent(). */ zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (next_to_execute); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_write_bp_init, zio_free_bp_init, zio_issue_async, zio_write_compress, zio_encrypt, zio_checksum_generate, zio_nop_write, zio_brt_free, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_dio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). * zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. */ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT0(last_block->zb_level); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to make sure that this code still works afterwards. */ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } /* * This function is similar to zbookmark_subtree_completed(), but returns true * if subtree_root is equal or ahead of last_block, i.e. still to be done. */ boolean_t zbookmark_subtree_tbd(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { ASSERT0(last_block->zb_level); if (dnp == NULL) return (B_FALSE); return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root, last_block) >= 0); } EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, "Prioritize requeued I/O"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, UINT, ZMOD_RW, "Defer frees starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW, "Don't compress starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW, "Rewrite new bps starting in this pass"); ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, "Throttle block allocations in the ZIO pipeline"); ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs"); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 9373b39a184a..d7f3c75c7948 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -1,1090 +1,1091 @@ # SPDX-License-Identifier: CDDL-1.0 # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # This run file contains all of the common functional tests. When # adding a new test consider also adding it to the sanity.run file # if the new test runs to completion in only a few seconds. # # Approximate run time: 4-5 hours # [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 600 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe tags = ['functional'] [tests/functional/acl/off] tests = ['dosmode', 'posixmode'] tags = ['functional', 'acl'] [tests/functional/alloc_class] tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', 'alloc_class_013_pos', 'alloc_class_014_neg', 'alloc_class_015_pos'] tags = ['functional', 'alloc_class'] [tests/functional/append] tests = ['file_append', 'threadsappend_001_pos'] tags = ['functional', 'append'] [tests/functional/arc] tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'dbufstats_003_pos', 'arcstats_runtime_tuning'] tags = ['functional', 'arc'] [tests/functional/atime] tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] tags = ['functional', 'atime'] [tests/functional/bclone] tests = ['bclone_crossfs_corner_cases_limited', 'bclone_crossfs_data', 'bclone_crossfs_embedded', 'bclone_crossfs_hole', 'bclone_diffprops_all', 'bclone_diffprops_checksum', 'bclone_diffprops_compress', 'bclone_diffprops_copies', 'bclone_diffprops_recordsize', 'bclone_prop_sync', 'bclone_samefs_corner_cases_limited', 'bclone_samefs_data', 'bclone_samefs_embedded', 'bclone_samefs_hole'] tags = ['functional', 'bclone'] timeout = 7200 [tests/functional/block_cloning] tests = ['block_cloning_clone_mmap_cached', 'block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_copyfilerange_fallback', 'block_cloning_disabled_copyfilerange', 'block_cloning_copyfilerange_cross_dataset', 'block_cloning_cross_enc_dataset', 'block_cloning_copyfilerange_fallback_same_txg', 'block_cloning_replay', 'block_cloning_replay_encrypted', 'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write', 'block_cloning_rlimit_fsize', 'block_cloning_large_offset'] tags = ['functional', 'block_cloning'] [tests/functional/bootfs] tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', 'bootfs_008_pos'] tags = ['functional', 'bootfs'] [tests/functional/btree] tests = ['btree_positive', 'btree_negative'] tags = ['functional', 'btree'] pre = post = [tests/functional/cache] tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', 'cache_009_pos', 'cache_010_pos', 'cache_011_pos', 'cache_012_pos'] tags = ['functional', 'cache'] [tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', 'cachefile_004_pos'] tags = ['functional', 'cachefile'] [tests/functional/casenorm] tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure', 'sensitive_none_lookup', 'sensitive_none_delete', 'sensitive_formd_lookup', 'sensitive_formd_delete', 'insensitive_none_lookup', 'insensitive_none_delete', 'insensitive_formd_lookup', 'insensitive_formd_delete', 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete', 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete'] tags = ['functional', 'casenorm'] [tests/functional/channel_program/lua_core] tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', 'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout'] tags = ['functional', 'channel_program', 'lua_core'] [tests/functional/channel_program/synctask_core] tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', 'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks', 'tst.list_children', 'tst.list_clones', 'tst.list_holds', 'tst.list_snapshots', 'tst.list_system_props', 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg', 'tst.snapshot_recursive', 'tst.snapshot_rename', 'tst.snapshot_simple', 'tst.bookmark.create', 'tst.bookmark.copy', 'tst.terminate_by_signal' ] tags = ['functional', 'channel_program', 'synctask_core'] [tests/functional/checksum] tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'run_blake3_test', 'filetest_001_pos', 'filetest_002_pos'] tags = ['functional', 'checksum'] [tests/functional/clean_mirror] tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos', 'clean_mirror_003_pos', 'clean_mirror_004_pos'] tags = ['functional', 'clean_mirror'] [tests/functional/cli_root/json] tests = ['json_sanity'] tags = ['functional', 'cli_root', 'json'] [tests/functional/cli_root/zinject] tests = ['zinject_args', 'zinject_counts', 'zinject_probe'] pre = post = tags = ['functional', 'cli_root', 'zinject'] [tests/functional/cli_root/zdb] tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup'] pre = post = tags = ['functional', 'cli_root', 'zdb'] timeout = 1200 [tests/functional/cli_root/zfs] tests = ['zfs_001_neg', 'zfs_002_pos'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_bookmark] tests = ['zfs_bookmark_cliargs'] tags = ['functional', 'cli_root', 'zfs_bookmark'] [tests/functional/cli_root/zfs_change-key] tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', 'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones'] tags = ['functional', 'cli_root', 'zfs_change-key'] [tests/functional/cli_root/zfs_clone] tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested', 'zfs_clone_rm_nested'] tags = ['functional', 'cli_root', 'zfs_clone'] [tests/functional/cli_root/zfs_copies] tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos', 'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos'] tags = ['functional', 'cli_root', 'zfs_copies'] [tests/functional/cli_root/zfs_create] tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', 'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg', 'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos', 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', 'zfs_create_crypt_combos', 'zfs_create_dryrun', 'zfs_create_nomount', 'zfs_create_verbose'] tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zpool_prefetch] tests = ['zpool_prefetch_001_pos'] tags = ['functional', 'cli_root', 'zpool_prefetch'] [tests/functional/cli_root/zfs_destroy] tests = ['zfs_clone_livelist_condense_and_disable', 'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup', 'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', 'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist', 'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] tags = ['functional', 'cli_root', 'zfs_destroy'] [tests/functional/cli_root/zfs_diff] tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp', 'zfs_diff_types', 'zfs_diff_encrypted', 'zfs_diff_mangle'] tags = ['functional', 'cli_root', 'zfs_diff'] [tests/functional/cli_root/zfs_get] tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos', 'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg', 'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg'] tags = ['functional', 'cli_root', 'zfs_get'] [tests/functional/cli_root/zfs_ids_to_path] tests = ['zfs_ids_to_path_001_pos'] tags = ['functional', 'cli_root', 'zfs_ids_to_path'] [tests/functional/cli_root/zfs_inherit] tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos', 'zfs_inherit_mountpoint'] tags = ['functional', 'cli_root', 'zfs_inherit'] [tests/functional/cli_root/zfs_load-key] tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', 'zfs_load-key_https', 'zfs_load-key_location', 'zfs_load-key_noop', 'zfs_load-key_recursive'] tags = ['functional', 'cli_root', 'zfs_load-key'] [tests/functional/cli_root/zfs_mount] tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] tests = ['zfs_program_json'] tags = ['functional', 'cli_root', 'zfs_program'] [tests/functional/cli_root/zfs_promote] tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] tags = ['functional', 'cli_root', 'zfs_promote'] [tests/functional/cli_root/zfs_property] tests = ['zfs_written_property_001_pos'] tags = ['functional', 'cli_root', 'zfs_property'] [tests/functional/cli_root/zfs_receive] tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', 'zfs_receive_016_pos', 'receive-o-x_props_override', 'receive-o-x_props_aliases', 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', 'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e', 'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props', 'zfs_receive_-wR-encrypted-mix', 'zfs_receive_corrective', 'zfs_receive_compressed_corrective', 'zfs_receive_large_block_corrective'] tags = ['functional', 'cli_root', 'zfs_receive'] [tests/functional/cli_root/zfs_rename] tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', 'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child', 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint', 'zfs_rename_nounmount'] tags = ['functional', 'cli_root', 'zfs_rename'] [tests/functional/cli_root/zfs_reservation] tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rollback] tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos', 'zfs_rollback_003_neg', 'zfs_rollback_004_neg'] tags = ['functional', 'cli_root', 'zfs_rollback'] [tests/functional/cli_root/zfs_send] tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos', 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_encrypted_unloaded', 'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b', 'zfs_send_skip_missing'] tags = ['functional', 'cli_root', 'zfs_send'] [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', 'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos', 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', 'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', 'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation', 'zfs_set_feature_activation', 'zfs_set_nomount'] tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_share] tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', 'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg', 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares', 'zfs_share_after_mount'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_snapshot] tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', 'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg', 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg', 'zfs_snapshot_009_pos'] tags = ['functional', 'cli_root', 'zfs_snapshot'] [tests/functional/cli_root/zfs_unload-key] tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] tags = ['functional', 'cli_root', 'zfs_unload-key'] [tests/functional/cli_root/zfs_unmount] tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', 'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos', 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', 'zfs_unmount_all_001_pos', 'zfs_unmount_nested', 'zfs_unmount_unload_keys'] tags = ['functional', 'cli_root', 'zfs_unmount'] [tests/functional/cli_root/zfs_unshare] tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', 'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos', 'zfs_unshare_007_pos'] tags = ['functional', 'cli_root', 'zfs_unshare'] [tests/functional/cli_root/zfs_upgrade] tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', 'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg', 'zfs_upgrade_007_neg'] tags = ['functional', 'cli_root', 'zfs_upgrade'] [tests/functional/cli_root/zfs_wait] tests = ['zfs_wait_deleteq', 'zfs_wait_getsubopt'] tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zhack] tests = ['zhack_label_repair_001', 'zhack_label_repair_002', 'zhack_label_repair_003', 'zhack_label_repair_004'] pre = post = tags = ['functional', 'cli_root', 'zhack'] [tests/functional/cli_root/zpool] tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors'] tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] tests = ['zpool_attach_001_neg', 'attach-o_ashift'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg', 'zpool_clear_readonly'] tags = ['functional', 'cli_root', 'zpool_clear'] [tests/functional/cli_root/zpool_create] tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos', 'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', 'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg', 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', 'zpool_create_024_pos', 'zpool_create_encrypted', 'zpool_create_crypt_combos', 'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos', 'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos', 'zpool_create_features_006_pos', 'zpool_create_features_007_pos', 'zpool_create_features_008_pos', 'zpool_create_features_009_pos', 'create-o_ashift', 'zpool_create_tempname', 'zpool_create_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_create'] [tests/functional/cli_root/zpool_destroy] tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', 'zpool_destroy_003_neg'] pre = post = tags = ['functional', 'cli_root', 'zpool_destroy'] [tests/functional/cli_root/zpool_detach] tests = ['zpool_detach_001_neg'] tags = ['functional', 'cli_root', 'zpool_detach'] [tests/functional/cli_root/zpool_events] tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', 'zpool_events_poolname', 'zpool_events_errors', 'zpool_events_duplicates', 'zpool_events_clear_retained'] tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg', 'zpool_export_004_pos', 'zpool_export_parallel_pos', 'zpool_export_parallel_admin'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', 'zpool_get_004_neg', 'zpool_get_005_pos', 'vdev_get_001_pos', 'vdev_get_all'] tags = ['functional', 'cli_root', 'zpool_get'] [tests/functional/cli_root/zpool_history] tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] tags = ['functional', 'cli_root', 'zpool_history'] [tests/functional/cli_root/zpool_import] tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos', 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', 'zpool_import_015_pos', 'zpool_import_016_pos', 'zpool_import_017_pos', 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', 'zpool_import_errata3', 'zpool_import_errata4', 'import_cachefile_device_added', 'import_cachefile_device_removed', 'import_cachefile_device_replaced', 'import_cachefile_mirror_attached', 'import_cachefile_mirror_detached', 'import_cachefile_paths_changed', 'import_cachefile_shared_device', 'import_devices_missing', 'import_log_missing', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced', 'zpool_import_status', 'zpool_import_parallel_pos', 'zpool_import_parallel_neg', 'zpool_import_parallel_admin'] tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 [tests/functional/cli_root/zpool_labelclear] tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', 'zpool_labelclear_removed', 'zpool_labelclear_valid'] pre = post = tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_attach_detach_add_remove', 'zpool_initialize_fault_export_import_online', 'zpool_initialize_import_export', 'zpool_initialize_offline_export_import_online', 'zpool_initialize_online_offline', 'zpool_initialize_split', 'zpool_initialize_start_and_cancel_neg', 'zpool_initialize_start_and_cancel_pos', 'zpool_initialize_suspend_resume', 'zpool_initialize_uninit', 'zpool_initialize_unsupported_vdevs', 'zpool_initialize_verify_checksums', 'zpool_initialize_verify_initialized'] pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg', 'zpool_offline_003_pos'] tags = ['functional', 'cli_root', 'zpool_offline'] [tests/functional/cli_root/zpool_online] tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tags = ['functional', 'cli_root', 'zpool_online'] [tests/functional/cli_root/zpool_reguid] tests = ['zpool_reguid_001_pos', 'zpool_reguid_002_neg'] tags = ['functional', 'cli_root', 'zpool_reguid'] [tests/functional/cli_root/zpool_remove] tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', 'zpool_remove_003_pos'] tags = ['functional', 'cli_root', 'zpool_remove'] [tests/functional/cli_root/zpool_replace] tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart', 'zpool_resilver_concurrent'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies', 'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos', 'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos', 'user_property_001_pos', 'user_property_002_neg', 'zpool_set_clear_userprop'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_resilver', 'zpool_split_indirect', 'zpool_split_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', 'zpool_status_007_pos', 'zpool_status_008_pos', 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg'] tags = ['functional', 'cli_root', 'zpool_sync'] [tests/functional/cli_root/zpool_trim] tests = ['zpool_trim_attach_detach_add_remove', 'zpool_trim_fault_export_import_online', 'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg', 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', 'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg', 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', 'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume', 'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums', 'zpool_trim_verify_trimmed'] tags = ['functional', 'zpool_trim'] [tests/functional/cli_root/zpool_upgrade] tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', 'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos', 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', 'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos', 'zpool_upgrade_009_neg', 'zpool_upgrade_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_upgrade'] [tests/functional/cli_root/zpool_wait] tests = ['zpool_wait_discard', 'zpool_wait_freeing', 'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel', 'zpool_wait_initialize_flag', 'zpool_wait_multiple', 'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel', 'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag', 'zpool_wait_usage'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_root/zpool_wait/scan] tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild', 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', 'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_user/misc] tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', 'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', 'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg', 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege', 'zilstat_001_pos'] user = tags = ['functional', 'cli_user', 'misc'] [tests/functional/cli_user/zfs_list] tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', 'zfs_list_004_neg', 'zfs_list_005_neg', 'zfs_list_007_pos', 'zfs_list_008_neg'] user = tags = ['functional', 'cli_user', 'zfs_list'] [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', 'zpool_iostat_005_pos', 'zpool_iostat_-c_disable', 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_iostat'] [tests/functional/cli_user/zpool_list] tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] user = tags = ['functional', 'cli_user', 'zpool_list'] [tests/functional/cli_user/zpool_status] tests = ['zpool_status_003_pos', 'zpool_status_-c_disable', 'zpool_status_-c_homedir', 'zpool_status_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_status'] [tests/functional/compression] tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', 'l2arc_compressed_arc', 'l2arc_compressed_arc_disabled', 'l2arc_encrypted', 'l2arc_encrypted_no_compressed_arc'] tags = ['functional', 'compression'] [tests/functional/cp_files] tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress'] tags = ['functional', 'cp_files'] [tests/functional/zap_shrink] tests = ['zap_shrink_001_pos'] tags = ['functional', 'zap_shrink'] [tests/functional/crtime] tests = ['crtime_001_pos' ] tags = ['functional', 'crtime'] [tests/functional/crypto] tests = ['icp_aes_ccm', 'icp_aes_gcm'] pre = post = tags = ['functional', 'crypto'] [tests/functional/ctime] tests = ['ctime_001_pos' ] tags = ['functional', 'ctime'] [tests/functional/deadman] tests = ['deadman_ratelimit', 'deadman_sync', 'deadman_zio'] pre = post = tags = ['functional', 'deadman'] [tests/functional/dedup] tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_fdt_pacing', 'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota', 'dedup_prune', 'dedup_zap_shrink'] pre = post = tags = ['functional', 'dedup'] [tests/functional/delegate] tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', 'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg', 'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg', 'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos', 'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos', 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] tags = ['functional', 'delegate'] [tests/functional/direct] tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines', 'dio_compression', 'dio_dedup', 'dio_encryption', 'dio_grow_block', 'dio_max_recordsize', 'dio_mixed', 'dio_mmap', 'dio_overwrites', 'dio_property', 'dio_random', 'dio_read_verify', 'dio_recordsize', 'dio_unaligned_block', 'dio_unaligned_filesize'] tags = ['functional', 'direct'] [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] [tests/functional/fallocate] tests = ['fallocate_punch-hole'] tags = ['functional', 'fallocate'] [tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] tags = ['functional', 'features', 'async_destroy'] [tests/functional/features/large_dnode] tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/gang_blocks] -tests = ['gang_blocks_redundant', 'gang_blocks_ddt_copies'] +tests = ['gang_blocks_001_pos', 'gang_blocks_redundant', + 'gang_blocks_ddt_copies'] tags = ['functional', 'gang_blocks'] [tests/functional/grow] pre = post = tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] tags = ['functional', 'grow'] [tests/functional/history] tests = ['history_001_pos', 'history_002_pos', 'history_003_pos', 'history_004_pos', 'history_005_neg', 'history_006_neg', 'history_007_pos', 'history_008_pos', 'history_009_pos', 'history_010_pos'] tags = ['functional', 'history'] [tests/functional/hkdf] pre = post = tests = ['hkdf_test'] tags = ['functional', 'hkdf'] [tests/functional/inheritance] tests = ['inherit_001_pos'] pre = tags = ['functional', 'inheritance'] [tests/functional/io] tests = ['mmap', 'posixaio', 'psync', 'sync'] tags = ['functional', 'io'] [tests/functional/inuse] tests = ['inuse_004_pos', 'inuse_005_pos', 'inuse_008_pos', 'inuse_009_pos'] post = tags = ['functional', 'inuse'] [tests/functional/large_files] tests = ['large_files_001_pos', 'large_files_002_pos'] tags = ['functional', 'large_files'] [tests/functional/limits] tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count', 'snapshot_limit'] tags = ['functional', 'limits'] [tests/functional/link_count] tests = ['link_count_001', 'link_count_root_inode'] tags = ['functional', 'link_count'] [tests/functional/migration] tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] tags = ['functional', 'migration'] [tests/functional/mmap] tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos', 'mmap_sync_001_pos', 'mmap_write_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mount] tests = ['umount_001', 'umountall_001'] tags = ['functional', 'mount'] [tests/functional/mv_files] tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation'] tags = ['functional', 'mv_files'] [tests/functional/nestedfs] tests = ['nestedfs_001_pos'] tags = ['functional', 'nestedfs'] [tests/functional/no_space] tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', 'enospc_df', 'enospc_ganging', 'enospc_rm'] tags = ['functional', 'no_space'] [tests/functional/nopwrite] tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative', 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync', 'nopwrite_varying_compression', 'nopwrite_volume'] tags = ['functional', 'nopwrite'] [tests/functional/online_offline] tests = ['online_offline_001_pos', 'online_offline_002_neg', 'online_offline_003_neg'] tags = ['functional', 'online_offline'] [tests/functional/pool_checkpoint] tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind', 'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard', 'checkpoint_discard_busy', 'checkpoint_discard_many', 'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz', 'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind', 'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice', 'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat'] tags = ['functional', 'pool_checkpoint'] timeout = 1800 [tests/functional/pool_names] tests = ['pool_names_001_pos', 'pool_names_002_neg'] pre = post = tags = ['functional', 'pool_names'] [tests/functional/poolversion] tests = ['poolversion_001_pos', 'poolversion_002_pos'] tags = ['functional', 'poolversion'] [tests/functional/pyzfs] tests = ['pyzfs_unittest'] pre = post = tags = ['functional', 'pyzfs'] [tests/functional/quota] tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] tags = ['functional', 'quota'] [tests/functional/redacted_send] tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', 'redacted_disabled_feature', 'redacted_embedded', 'redacted_holes', 'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones', 'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative', 'redacted_origin', 'redacted_panic', 'redacted_props', 'redacted_resume', 'redacted_size', 'redacted_volume'] tags = ['functional', 'redacted_send'] [tests/functional/raidz] tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_expand_001_pos', 'raidz_expand_002_pos', 'raidz_expand_003_neg', 'raidz_expand_003_pos', 'raidz_expand_004_pos', 'raidz_expand_005_pos', 'raidz_expand_006_neg', 'raidz_expand_007_neg'] tags = ['functional', 'raidz'] timeout = 1200 [tests/functional/redundancy] tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', 'redundancy_draid3', 'redundancy_draid_damaged1', 'redundancy_draid_damaged2', 'redundancy_draid_spare1', 'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_stripe'] tags = ['functional', 'redundancy'] timeout = 1200 [tests/functional/refquota] tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg', 'refquota_007_neg', 'refquota_008_neg'] tags = ['functional', 'refquota'] [tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz', 'refreserv_raidz'] tags = ['functional', 'refreserv'] [tests/functional/removal] pre = tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space', 'removal_condense_export', 'removal_multiple_indirection', 'removal_nopwrite', 'removal_remap_deadlists', 'removal_resume_export', 'removal_sanity', 'removal_with_add', 'removal_with_create_fs', 'removal_with_dedup', 'removal_with_errors', 'removal_with_export', 'removal_with_indirect', 'removal_with_ganging', 'removal_with_faulted', 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', 'removal_with_send_recv', 'removal_with_snapshot', 'removal_with_write', 'removal_with_zdb', 'remove_expanded', 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz', 'remove_indirect', 'remove_attach_mirror', 'removal_reservation', 'removal_with_hole'] tags = ['functional', 'removal'] [tests/functional/rename_dirs] tests = ['rename_dirs_001_pos'] tags = ['functional', 'rename_dirs'] [tests/functional/replacement] tests = ['attach_import', 'attach_multiple', 'attach_rebuild', 'attach_resilver', 'detach', 'rebuild_disabled_feature', 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', 'scrub_cancel'] tags = ['functional', 'replacement'] [tests/functional/reservation] tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', 'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos', 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', 'reservation_022_pos'] tags = ['functional', 'reservation'] [tests/functional/rootpool] tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos'] tags = ['functional', 'rootpool'] [tests/functional/rsend] tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos', 'rsend_014_pos', 'rsend_016_neg', 'rsend_019_pos', 'rsend_020_pos', 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', 'rsend_025_pos', 'rsend_026_neg', 'rsend_027_pos', 'rsend_028_neg', 'rsend_029_neg', 'rsend_030_pos', 'rsend_031_pos', 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props', 'send-c_incremental', 'send-c_volume', 'send-c_zstream_recompress', 'send-c_zstreamdump', 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_incremental', 'send_encrypted_freeobjects', 'send_encrypted_hierarchy', 'send_encrypted_props', 'send_encrypted_truncated_files', 'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files', 'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid', 'send_doall', 'send_raw_spill_block', 'send_raw_ashift', 'send_raw_large_blocks'] tags = ['functional', 'rsend'] [tests/functional/scrub_mirror] tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos', 'scrub_mirror_003_pos', 'scrub_mirror_004_pos'] tags = ['functional', 'scrub_mirror'] [tests/functional/slog] tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', 'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg', 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001', 'slog_replay_fs_002', 'slog_replay_volume', 'slog_016_pos'] tags = ['functional', 'slog'] [tests/functional/snapshot] tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', 'snapshot_017_pos', 'snapshot_018_pos'] tags = ['functional', 'snapshot'] [tests/functional/snapused] tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos', 'snapused_004_pos', 'snapused_005_pos'] tags = ['functional', 'snapused'] [tests/functional/sparse] tests = ['sparse_001_pos'] tags = ['functional', 'sparse'] [tests/functional/stat] tests = ['stat_001_pos', 'statx_dioalign'] tags = ['functional', 'stat'] [tests/functional/suid] tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', 'suid_write_to_none', 'suid_write_zil_replay'] tags = ['functional', 'suid'] [tests/functional/trim] tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity', 'trim_integrity', 'trim_config', 'trim_l2arc'] tags = ['functional', 'trim'] [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] tags = ['functional', 'truncate'] [tests/functional/upgrade] tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] tags = ['functional', 'upgrade'] [tests/functional/userquota] tests = [ 'defaultuserquota_001_pos', 'defaultuserquota_002_pos', 'defaultuserquota_003_pos', 'defaultuserquota_004_neg', 'defaultuserquota_005_pos', 'defaultuserquota_006_pos', 'defaultuserquota_007_pos', 'defaultuserquota_008_pos', 'defaultuserquota_009_pos', 'defaultuserquota_010_neg', 'defaultuserquota_011_neg', 'defaultuserquota_012_neg', 'defaultuserquota_013_neg', 'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos', 'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos', 'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos', 'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg', 'userspace_001_pos', 'userspace_002_pos', 'userspace_004_pos', 'userspace_encrypted', 'userspace_send_encrypted', 'userspace_encrypted_13709'] tags = ['functional', 'userquota'] [tests/functional/vdev_disk:Linux] pre = post = tests = ['page_alignment'] tags = ['functional', 'vdev_disk'] [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', 'vdev_zaps_007_pos'] tags = ['functional', 'vdev_zaps'] [tests/functional/write_dirs] tests = ['write_dirs_001_pos', 'write_dirs_002_pos'] tags = ['functional', 'write_dirs'] [tests/functional/xattr] tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos', 'xattr_compat'] tags = ['functional', 'xattr'] [tests/functional/zvol/zvol_ENOSPC] tests = ['zvol_ENOSPC_001_pos'] tags = ['functional', 'zvol', 'zvol_ENOSPC'] [tests/functional/zvol/zvol_cli] tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] tags = ['functional', 'zvol', 'zvol_cli'] [tests/functional/zvol/zvol_misc] tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', 'zvol_misc_snapdev', 'zvol_misc_trim', 'zvol_misc_volmode', 'zvol_misc_zil'] tags = ['functional', 'zvol', 'zvol_misc'] [tests/functional/zvol/zvol_stress] tests = ['zvol_stress'] tags = ['functional', 'zvol', 'zvol_stress'] [tests/functional/zvol/zvol_swap] tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos'] tags = ['functional', 'zvol', 'zvol_swap'] [tests/functional/libzfs] tests = ['many_fds', 'libzfs_input'] tags = ['functional', 'libzfs'] [tests/functional/log_spacemap] tests = ['log_spacemap_import_logs'] pre = post = tags = ['functional', 'log_spacemap'] [tests/functional/l2arc] tests = ['l2arc_arcstats_pos', 'l2arc_mfuonly_pos', 'l2arc_l2miss_pos', 'persist_l2arc_001_pos', 'persist_l2arc_002_pos', 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos'] tags = ['functional', 'l2arc'] [tests/functional/zpool_influxdb] tests = ['zpool_influxdb'] tags = ['functional', 'zpool_influxdb'] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index ddd2d431a5b6..6362a2606260 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -1,640 +1,644 @@ # SPDX-License-Identifier: CDDL-1.0 # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # This run file contains a subset of functional tests which exercise # as much functionality as possible while still executing relatively # quickly. The included tests should take no more than a few seconds # each to run at most. This provides a convenient way to sanity test a # change before committing to a full test run which takes several hours. # # Approximate run time: 15 minutes # [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 180 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe tags = ['functional'] [tests/functional/acl/off] tests = ['posixmode'] tags = ['functional', 'acl'] [tests/functional/alloc_class] tests = ['alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_008_pos', 'alloc_class_010_pos', 'alloc_class_011_neg'] tags = ['functional', 'alloc_class'] [tests/functional/arc] tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'arcstats_runtime_tuning'] tags = ['functional', 'arc'] [tests/functional/bootfs] tests = ['bootfs_004_neg', 'bootfs_007_pos'] tags = ['functional', 'bootfs'] [tests/functional/cache] tests = ['cache_004_neg', 'cache_005_neg', 'cache_007_neg', 'cache_010_pos'] tags = ['functional', 'cache'] [tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', 'cachefile_004_pos'] tags = ['functional', 'cachefile'] [tests/functional/casenorm] tests = ['case_all_values', 'norm_all_values', 'sensitive_none_lookup', 'sensitive_none_delete', 'insensitive_none_lookup', 'insensitive_none_delete', 'mixed_none_lookup', 'mixed_none_delete'] tags = ['functional', 'casenorm'] [tests/functional/channel_program/lua_core] tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', 'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout'] tags = ['functional', 'channel_program', 'lua_core'] [tests/functional/channel_program/synctask_core] tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', 'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks', 'tst.list_children', 'tst.list_clones', 'tst.list_holds', 'tst.list_snapshots', 'tst.list_system_props', 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg', 'tst.snapshot_recursive', 'tst.snapshot_simple', 'tst.bookmark.create', 'tst.bookmark.copy'] tags = ['functional', 'channel_program', 'synctask_core'] [tests/functional/cli_root/zdb] tests = ['zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos'] pre = post = tags = ['functional', 'cli_root', 'zdb'] [tests/functional/cli_root/zfs] tests = ['zfs_001_neg', 'zfs_002_pos'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_bookmark] tests = ['zfs_bookmark_cliargs'] tags = ['functional', 'cli_root', 'zfs_bookmark'] [tests/functional/cli_root/zfs_change-key] tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', 'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones'] tags = ['functional', 'cli_root', 'zfs_change-key'] [tests/functional/cli_root/zfs_clone] tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', 'zfs_clone_encrypted'] tags = ['functional', 'cli_root', 'zfs_clone'] [tests/functional/cli_root/zfs_create] tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', 'zfs_create_007_pos', 'zfs_create_011_pos', 'zfs_create_012_pos', 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', 'zfs_create_dryrun', 'zfs_create_verbose'] tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zfs_destroy] tests = ['zfs_destroy_002_pos', 'zfs_destroy_003_pos', 'zfs_destroy_004_pos', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] tags = ['functional', 'cli_root', 'zfs_destroy'] [tests/functional/cli_root/zfs_diff] tests = ['zfs_diff_cliargs', 'zfs_diff_encrypted'] tags = ['functional', 'cli_root', 'zfs_diff'] [tests/functional/cli_root/zfs_get] tests = ['zfs_get_003_pos', 'zfs_get_006_neg', 'zfs_get_007_neg', 'zfs_get_010_neg'] tags = ['functional', 'cli_root', 'zfs_get'] [tests/functional/cli_root/zfs_inherit] tests = ['zfs_inherit_001_neg', 'zfs_inherit_003_pos', 'zfs_inherit_mountpoint'] tags = ['functional', 'cli_root', 'zfs_inherit'] [tests/functional/cli_root/zfs_load-key] tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', 'zfs_load-key_https', 'zfs_load-key_location', 'zfs_load-key_noop', 'zfs_load-key_recursive'] tags = ['functional', 'cli_root', 'zfs_load-key'] [tests/functional/cli_root/zfs_mount] tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] tests = ['zfs_program_json'] tags = ['functional', 'cli_root', 'zfs_program'] [tests/functional/cli_root/zfs_promote] tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] tags = ['functional', 'cli_root', 'zfs_promote'] [tests/functional/cli_root/zfs_receive] tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', 'zfs_receive_016_pos', 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', 'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e', 'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props'] tags = ['functional', 'cli_root', 'zfs_receive'] [tests/functional/cli_root/zfs_rename] tests = ['zfs_rename_003_pos', 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', 'zfs_rename_013_pos', 'zfs_rename_encrypted_child', 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint', 'zfs_rename_nounmount'] tags = ['functional', 'cli_root', 'zfs_rename'] [tests/functional/cli_root/zfs_reservation] tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rollback] tests = ['zfs_rollback_003_neg', 'zfs_rollback_004_neg'] tags = ['functional', 'cli_root', 'zfs_rollback'] [tests/functional/cli_root/zfs_send] tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_encrypted', 'zfs_send_raw'] tags = ['functional', 'cli_root', 'zfs_send'] [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', 'mountpoint_002_pos', 'user_property_002_pos', 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', 'user_property_004_pos', 'version_001_neg', 'zfs_set_003_neg', 'property_alias_001_pos', 'zfs_set_keylocation', 'zfs_set_feature_activation', 'zfs_set_nomount'] tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_snapshot] tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', 'zfs_snapshot_003_neg', 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg'] tags = ['functional', 'cli_root', 'zfs_snapshot'] [tests/functional/cli_root/zfs_unload-key] tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] tags = ['functional', 'cli_root', 'zfs_unload-key'] [tests/functional/cli_root/zfs_unmount] tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', 'zfs_unmount_004_pos', 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', 'zfs_unmount_unload_keys'] tags = ['functional', 'cli_root', 'zfs_unmount'] [tests/functional/cli_root/zfs_upgrade] tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_006_neg', 'zfs_upgrade_007_neg'] tags = ['functional', 'cli_root', 'zfs_upgrade'] [tests/functional/cli_root/zfs_wait] tests = ['zfs_wait_deleteq', 'zfs_wait_getsubopt'] tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zpool] tests = ['zpool_001_neg', 'zpool_003_pos', 'zpool_colors'] tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] tests = ['zpool_attach_001_neg'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] tests = ['zpool_clear_002_neg'] tags = ['functional', 'cli_root', 'zpool_clear'] [tests/functional/cli_root/zpool_create] tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', 'zpool_create_010_neg', 'zpool_create_011_neg', 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_encrypted', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos'] tags = ['functional', 'cli_root', 'zpool_create'] [tests/functional/cli_root/zpool_destroy] tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', 'zpool_destroy_003_neg'] pre = post = tags = ['functional', 'cli_root', 'zpool_destroy'] [tests/functional/cli_root/zpool_detach] tests = ['zpool_detach_001_neg'] tags = ['functional', 'cli_root', 'zpool_detach'] [tests/functional/cli_root/zpool_events] tests = ['zpool_events_clear', 'zpool_events_follow', 'zpool_events_poolname'] tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', 'zpool_get_004_neg', 'zpool_get_005_pos'] tags = ['functional', 'cli_root', 'zpool_get'] [tests/functional/cli_root/zpool_history] tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] tags = ['functional', 'cli_root', 'zpool_history'] [tests/functional/cli_root/zpool_import] tests = ['zpool_import_003_pos', 'zpool_import_010_pos', 'zpool_import_011_neg', 'zpool_import_014_pos', 'zpool_import_features_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted'] tags = ['functional', 'cli_root', 'zpool_import'] [tests/functional/cli_root/zpool_labelclear] tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', 'zpool_labelclear_removed', 'zpool_labelclear_valid'] pre = post = tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_online_offline'] pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg'] tags = ['functional', 'cli_root', 'zpool_offline'] [tests/functional/cli_root/zpool_online] tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tags = ['functional', 'cli_root', 'zpool_online'] [tests/functional/cli_root/zpool_remove] tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', 'zpool_remove_003_pos'] tags = ['functional', 'cli_root', 'zpool_remove'] [tests/functional/cli_root/zpool_replace] tests = ['zpool_replace_001_neg'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] tests = ['zpool_resilver_bad_args'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_003_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_indirect'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] tests = ['zpool_sync_002_neg'] tags = ['functional', 'cli_root', 'zpool_sync'] [tests/functional/cli_root/zpool_trim] tests = ['zpool_trim_attach_detach_add_remove', 'zpool_trim_neg', 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', 'zpool_trim_rate_neg', 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', 'zpool_trim_start_and_cancel_pos'] tags = ['functional', 'zpool_trim'] [tests/functional/cli_root/zpool_upgrade] tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_003_pos', 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', 'zpool_upgrade_009_neg'] tags = ['functional', 'cli_root', 'zpool_upgrade'] [tests/functional/cli_root/zpool_wait] tests = ['zpool_wait_no_activity', 'zpool_wait_usage'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_root/zpool_wait/scan] tests = ['zpool_wait_scrub_flag'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_user/misc] tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', 'zfs_unmount_001_neg', 'zfs_upgrade_001_neg', 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', 'zpool_history_001_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege', 'zilstat_001_pos'] user = tags = ['functional', 'cli_user', 'misc'] [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', 'zpool_iostat_-c_disable', 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_iostat'] [tests/functional/cli_user/zpool_list] tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] user = tags = ['functional', 'cli_user', 'zpool_list'] [tests/functional/compression] tests = ['compress_003_pos','compress_zstd_bswap'] tags = ['functional', 'compression'] [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] [tests/functional/features/large_dnode] tests = ['large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_007_neg'] tags = ['functional', 'features', 'large_dnode'] +[tests/functional/gang_blocks] +tests = ['gang_blocks_001_pos'] +tags = ['functional', 'gang_blocks'] + [tests/functional/grow] pre = post = tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] tags = ['functional', 'grow'] [tests/functional/history] tests = ['history_004_pos', 'history_005_neg', 'history_007_pos', 'history_009_pos'] tags = ['functional', 'history'] [tests/functional/hkdf] pre = post = tests = ['hkdf_test'] tags = ['functional', 'hkdf'] [tests/functional/inuse] tests = ['inuse_004_pos', 'inuse_005_pos'] post = tags = ['functional', 'inuse'] [tests/functional/large_files] tests = ['large_files_001_pos', 'large_files_002_pos'] tags = ['functional', 'large_files'] [tests/functional/libzfs] tests = ['many_fds', 'libzfs_input'] tags = ['functional', 'libzfs'] [tests/functional/limits] tests = ['filesystem_count', 'snapshot_count'] tags = ['functional', 'limits'] [tests/functional/link_count] tests = ['link_count_root_inode'] tags = ['functional', 'link_count'] [tests/functional/log_spacemap] tests = ['log_spacemap_import_logs'] pre = post = tags = ['functional', 'log_spacemap'] [tests/functional/migration] tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] tags = ['functional', 'migration'] [tests/functional/mmap] tests = ['mmap_read_001_pos'] tags = ['functional', 'mmap'] [tests/functional/nestedfs] tests = ['nestedfs_001_pos'] tags = ['functional', 'nestedfs'] [tests/functional/nopwrite] tests = ['nopwrite_sync', 'nopwrite_volume'] tags = ['functional', 'nopwrite'] [tests/functional/pool_checkpoint] tests = ['checkpoint_conf_change', 'checkpoint_discard_many', 'checkpoint_removal', 'checkpoint_sm_scale', 'checkpoint_twice'] tags = ['functional', 'pool_checkpoint'] timeout = 1800 [tests/functional/poolversion] tests = ['poolversion_001_pos', 'poolversion_002_pos'] tags = ['functional', 'poolversion'] [tests/functional/redacted_send] tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', 'redacted_disabled_feature', 'redacted_incrementals', 'redacted_largeblocks', 'redacted_mixed_recsize', 'redacted_negative', 'redacted_origin', 'redacted_props', 'redacted_resume', 'redacted_size'] tags = ['functional', 'redacted_send'] [tests/functional/raidz] tests = ['raidz_001_neg'] tags = ['functional', 'raidz'] [tests/functional/refquota] tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg', 'refquota_007_neg'] tags = ['functional', 'refquota'] [tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', 'refreserv_005_pos', 'refreserv_multi_raidz'] tags = ['functional', 'refreserv'] [tests/functional/removal] pre = tests = ['removal_all_vdev', 'removal_sanity', 'removal_with_dedup', 'removal_with_ganging', 'removal_with_faulted'] tags = ['functional', 'removal'] [tests/functional/replacement] tests = ['rebuild_raidz'] tags = ['functional', 'replacement'] [tests/functional/reservation] tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', 'reservation_014_pos', 'reservation_015_pos', 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', 'reservation_022_pos'] tags = ['functional', 'reservation'] [tests/functional/rsend] tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', 'rsend_006_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_014_pos', 'rsend_016_neg', 'send-c_verify_contents', 'send-c_volume', 'send-c_zstreamdump', 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props', 'send_encrypted_freeobjects', 'send_encrypted_truncated_files', 'send_freeobjects', 'send_holds', 'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid'] tags = ['functional', 'rsend'] [tests/functional/scrub_mirror] tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos'] tags = ['functional', 'scrub_mirror'] [tests/functional/slog] tests = ['slog_008_neg', 'slog_009_neg', 'slog_010_neg'] tags = ['functional', 'slog'] [tests/functional/snapshot] tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', 'snapshot_001_pos', 'snapshot_002_pos', 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', 'snapshot_017_pos', 'snapshot_018_pos'] tags = ['functional', 'snapshot'] [tests/functional/snapused] tests = ['snapused_002_pos', 'snapused_004_pos', 'snapused_005_pos'] tags = ['functional', 'snapused'] [tests/functional/sparse] tests = ['sparse_001_pos'] tags = ['functional', 'sparse'] [tests/functional/suid] tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', 'suid_write_to_none'] tags = ['functional', 'suid'] [tests/functional/append] tests = ['threadsappend_001_pos'] tags = ['functional', 'threadsappend'] [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos'] tags = ['functional', 'truncate'] [tests/functional/upgrade] tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] tags = ['functional', 'upgrade'] [tests/functional/vdev_disk:Linux] pre = post = tests = ['page_alignment'] tags = ['functional', 'vdev_disk'] [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos'] tags = ['functional', 'vdev_zaps'] [tests/functional/xattr] tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', 'xattr_011_pos', 'xattr_013_pos', 'xattr_compat'] tags = ['functional', 'xattr'] [tests/functional/zvol/zvol_ENOSPC] tests = ['zvol_ENOSPC_001_pos'] tags = ['functional', 'zvol', 'zvol_ENOSPC'] [tests/functional/zvol/zvol_cli] tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] tags = ['functional', 'zvol', 'zvol_cli'] [tests/functional/zvol/zvol_swap] tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos'] tags = ['functional', 'zvol', 'zvol_swap'] [tests/functional/zpool_influxdb] tests = ['zpool_influxdb'] tags = ['functional', 'zpool_influxdb'] [tests/functional/pyzfs] tests = ['pyzfs_unittest'] pre = post = tags = ['functional', 'pyzfs'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index db1ef0d03aaf..4c102b3aa1b8 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1,2225 +1,2226 @@ CLEANFILES = dist_noinst_DATA = include $(top_srcdir)/config/Substfiles.am datadir_zfs_tests_testsdir = $(datadir)/$(PACKAGE)/zfs-tests/tests nobase_dist_datadir_zfs_tests_tests_DATA = \ perf/nfs-sample.cfg \ perf/perf.shlib \ \ perf/fio/mkfiles.fio \ perf/fio/random_reads.fio \ perf/fio/random_readwrite.fio \ perf/fio/random_readwrite_fixed.fio \ perf/fio/random_writes.fio \ perf/fio/sequential_reads.fio \ perf/fio/sequential_readwrite.fio \ perf/fio/sequential_writes.fio nobase_dist_datadir_zfs_tests_tests_SCRIPTS = \ perf/regression/random_reads.ksh \ perf/regression/random_readwrite.ksh \ perf/regression/random_readwrite_fixed.ksh \ perf/regression/random_writes.ksh \ perf/regression/random_writes_zil.ksh \ perf/regression/sequential_reads_arc_cached_clone.ksh \ perf/regression/sequential_reads_arc_cached.ksh \ perf/regression/sequential_reads_dbuf_cached.ksh \ perf/regression/sequential_reads.ksh \ perf/regression/sequential_writes.ksh \ perf/regression/setup.ksh \ \ perf/scripts/prefetch_io.sh # These lists can be regenerated by running make regen-tests at the root, or, on a *clean* source: # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable -name '*.in' | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' -executable -name '*.in' | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -name '*.in' ! -name '*.c' | grep -Fe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # # simd and tmpfile are Linux-only and not installed elsewhere # # C programs are specced in ../Makefile.am above as part of the main Makefile find_common := find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' regen: @$(MAKE) -C $(top_builddir) clean @$(MAKE) clean $(SED) $(ac_inplace) '/^# -- >8 --/q' Makefile.am echo >> Makefile.am echo 'nobase_nodist_datadir_zfs_tests_tests_DATA = \' >> Makefile.am $(find_common) ! -executable -name '*.in' | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo 'nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \' >> Makefile.am $(find_common) -executable -name '*.in' | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo >> Makefile.am echo 'SUBSTFILES += $$(nobase_nodist_datadir_zfs_tests_tests_DATA) $$(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS)' >> Makefile.am echo >> Makefile.am echo 'if BUILD_LINUX' >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am $(find_common) ! -name '*.in' ! -name '*.c' | grep -Fe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo 'endif' >> Makefile.am echo >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_DATA += \' >> Makefile.am $(find_common) ! -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am $(find_common) -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am # -- >8 -- nobase_nodist_datadir_zfs_tests_tests_DATA = \ functional/pam/utilities.kshlib nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \ functional/pyzfs/pyzfs_unittest.ksh SUBSTFILES += $(nobase_nodist_datadir_zfs_tests_tests_DATA) $(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS) if BUILD_LINUX nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/simd/simd_supported.ksh \ functional/tmpfile/cleanup.ksh \ functional/tmpfile/setup.ksh \ functional/luks/luks_sanity.ksh endif nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/acl/acl.cfg \ functional/acl/acl_common.kshlib \ functional/alloc_class/alloc_class.cfg \ functional/alloc_class/alloc_class.kshlib \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ functional/bclone/bclone.cfg \ functional/bclone/bclone_common.kshlib \ functional/bclone/bclone_corner_cases.kshlib \ functional/block_cloning/block_cloning.kshlib \ functional/cache/cache.cfg \ functional/cache/cache.kshlib \ functional/cachefile/cachefile.cfg \ functional/cachefile/cachefile.kshlib \ functional/casenorm/casenorm.cfg \ functional/casenorm/casenorm.kshlib \ functional/channel_program/channel_common.kshlib \ functional/channel_program/lua_core/tst.args_to_lua.out \ functional/channel_program/lua_core/tst.args_to_lua.zcp \ functional/channel_program/lua_core/tst.divide_by_zero.err \ functional/channel_program/lua_core/tst.divide_by_zero.zcp \ functional/channel_program/lua_core/tst.exists.zcp \ functional/channel_program/lua_core/tst.large_prog.out \ functional/channel_program/lua_core/tst.large_prog.zcp \ functional/channel_program/lua_core/tst.lib_base.lua \ functional/channel_program/lua_core/tst.lib_coroutine.lua \ functional/channel_program/lua_core/tst.lib_strings.lua \ functional/channel_program/lua_core/tst.lib_table.lua \ functional/channel_program/lua_core/tst.nested_neg.zcp \ functional/channel_program/lua_core/tst.nested_pos.zcp \ functional/channel_program/lua_core/tst.recursive.zcp \ functional/channel_program/lua_core/tst.return_large.zcp \ functional/channel_program/lua_core/tst.return_recursive_table.zcp \ functional/channel_program/lua_core/tst.stack_gsub.err \ functional/channel_program/lua_core/tst.stack_gsub.zcp \ functional/channel_program/lua_core/tst.timeout.zcp \ functional/channel_program/synctask_core/tst.bookmark.copy.zcp \ functional/channel_program/synctask_core/tst.bookmark.create.zcp \ functional/channel_program/synctask_core/tst.get_index_props.out \ functional/channel_program/synctask_core/tst.get_index_props.zcp \ functional/channel_program/synctask_core/tst.get_number_props.out \ functional/channel_program/synctask_core/tst.get_number_props.zcp \ functional/channel_program/synctask_core/tst.get_string_props.out \ functional/channel_program/synctask_core/tst.get_string_props.zcp \ functional/channel_program/synctask_core/tst.promote_conflict.zcp \ functional/channel_program/synctask_core/tst.set_props.zcp \ functional/channel_program/synctask_core/tst.snapshot_destroy.zcp \ functional/channel_program/synctask_core/tst.snapshot_neg.zcp \ functional/channel_program/synctask_core/tst.snapshot_recursive.zcp \ functional/channel_program/synctask_core/tst.snapshot_rename.zcp \ functional/channel_program/synctask_core/tst.snapshot_simple.zcp \ functional/checksum/default.cfg \ functional/clean_mirror/clean_mirror_common.kshlib \ functional/clean_mirror/default.cfg \ functional/crypto/aes_ccm_test.json \ functional/crypto/aes_ccm_test.txt \ functional/crypto/aes_gcm_test.json \ functional/crypto/aes_gcm_test.txt \ functional/cli_root/cli_common.kshlib \ functional/cli_root/zfs_copies/zfs_copies.cfg \ functional/cli_root/zfs_copies/zfs_copies.kshlib \ functional/cli_root/zfs_create/properties.kshlib \ functional/cli_root/zfs_create/zfs_create.cfg \ functional/cli_root/zfs_create/zfs_create_common.kshlib \ functional/cli_root/zfs_destroy/zfs_destroy.cfg \ functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib \ functional/cli_root/zfs_get/zfs_get_common.kshlib \ functional/cli_root/zfs_get/zfs_get_list_d.kshlib \ functional/cli_root/zfs_jail/jail.conf \ functional/cli_root/zfs_load-key/HEXKEY \ functional/cli_root/zfs_load-key/PASSPHRASE \ functional/cli_root/zfs_load-key/RAWKEY \ functional/cli_root/zfs_load-key/zfs_load-key.cfg \ functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib \ functional/cli_root/zfs_mount/zfs_mount.cfg \ functional/cli_root/zfs_mount/zfs_mount.kshlib \ functional/cli_root/zfs_promote/zfs_promote.cfg \ functional/cli_root/zfs_receive/zstd_test_data.txt \ functional/cli_root/zfs_rename/zfs_rename.cfg \ functional/cli_root/zfs_rename/zfs_rename.kshlib \ functional/cli_root/zfs_rollback/zfs_rollback.cfg \ functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib \ functional/cli_root/zfs_send/zfs_send.cfg \ functional/cli_root/zfs_set/zfs_set_common.kshlib \ functional/cli_root/zfs_share/zfs_share.cfg \ functional/cli_root/zfs_snapshot/zfs_snapshot.cfg \ functional/cli_root/zfs_unmount/zfs_unmount.cfg \ functional/cli_root/zfs_unmount/zfs_unmount.kshlib \ functional/cli_root/zfs_upgrade/zfs_upgrade.kshlib \ functional/cli_root/zfs_wait/zfs_wait.kshlib \ functional/cli_root/zpool_add/zpool_add.cfg \ functional/cli_root/zpool_add/zpool_add.kshlib \ functional/cli_root/zpool_clear/zpool_clear.cfg \ functional/cli_root/zpool_create/draidcfg.gz \ functional/cli_root/zpool_create/zpool_create.cfg \ functional/cli_root/zpool_create/zpool_create.shlib \ functional/cli_root/zpool_destroy/zpool_destroy.cfg \ functional/cli_root/zpool_events/zpool_events.cfg \ functional/cli_root/zpool_events/zpool_events.kshlib \ functional/cli_root/zpool_expand/zpool_expand.cfg \ functional/cli_root/zpool_export/zpool_export.cfg \ functional/cli_root/zpool_export/zpool_export.kshlib \ functional/cli_root/zpool_get/vdev_get.cfg \ functional/cli_root/zpool_get/zpool_get.cfg \ functional/cli_root/zpool_get/zpool_get_parsable.cfg \ functional/cli_root/zpool_import/blockfiles/cryptv0.dat.bz2 \ functional/cli_root/zpool_import/blockfiles/missing_ivset.dat.bz2 \ functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 \ functional/cli_root/zpool_import/zpool_import.cfg \ functional/cli_root/zpool_import/zpool_import.kshlib \ functional/cli_root/zpool_initialize/zpool_initialize.kshlib \ functional/cli_root/zpool_labelclear/labelclear.cfg \ functional/cli_root/zpool_remove/zpool_remove.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.shlib \ functional/cli_root/zpool_resilver/zpool_resilver.cfg \ functional/cli_root/zpool_scrub/zpool_scrub.cfg \ functional/cli_root/zpool_split/zpool_split.cfg \ functional/cli_root/zpool_trim/zpool_trim.kshlib \ functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v10.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v11.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v12.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v13.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v14.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz21.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz22.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz23.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v4.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v5.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v6.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v7.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v8.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v999.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v9.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-vBROKEN.dat.bz2 \ functional/cli_root/zpool_upgrade/zpool_upgrade.cfg \ functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib \ functional/cli_root/zpool_wait/zpool_wait.kshlib \ functional/cli_root/zhack/library.kshlib \ functional/cli_user/misc/misc.cfg \ functional/cli_user/zfs_list/zfs_list.cfg \ functional/cli_user/zfs_list/zfs_list.kshlib \ functional/compression/compress.cfg \ functional/compression/testpool_zstd.tar.gz \ functional/deadman/deadman.cfg \ functional/delegate/delegate.cfg \ functional/delegate/delegate_common.kshlib \ functional/devices/devices.cfg \ functional/devices/devices_common.kshlib \ functional/direct/dio.cfg \ functional/direct/dio.kshlib \ functional/events/events.cfg \ functional/events/events_common.kshlib \ functional/fault/fault.cfg \ functional/gang_blocks/gang_blocks.kshlib \ functional/grow/grow.cfg \ functional/history/history.cfg \ functional/history/history_common.kshlib \ functional/history/i386.migratedpool.DAT.Z \ functional/history/i386.orig_history.txt \ functional/history/sparc.migratedpool.DAT.Z \ functional/history/sparc.orig_history.txt \ functional/history/zfs-pool-v4.dat.Z \ functional/inheritance/config001.cfg \ functional/inheritance/config002.cfg \ functional/inheritance/config003.cfg \ functional/inheritance/config004.cfg \ functional/inheritance/config005.cfg \ functional/inheritance/config006.cfg \ functional/inheritance/config007.cfg \ functional/inheritance/config008.cfg \ functional/inheritance/config009.cfg \ functional/inheritance/config010.cfg \ functional/inheritance/config011.cfg \ functional/inheritance/config012.cfg \ functional/inheritance/config013.cfg \ functional/inheritance/config014.cfg \ functional/inheritance/config015.cfg \ functional/inheritance/config016.cfg \ functional/inheritance/config017.cfg \ functional/inheritance/config018.cfg \ functional/inheritance/config019.cfg \ functional/inheritance/config020.cfg \ functional/inheritance/config021.cfg \ functional/inheritance/config022.cfg \ functional/inheritance/config023.cfg \ functional/inheritance/config024.cfg \ functional/inheritance/inherit.kshlib \ functional/inheritance/README.config \ functional/inheritance/README.state \ functional/inheritance/state001.cfg \ functional/inheritance/state002.cfg \ functional/inheritance/state003.cfg \ functional/inheritance/state004.cfg \ functional/inheritance/state005.cfg \ functional/inheritance/state006.cfg \ functional/inheritance/state007.cfg \ functional/inheritance/state008.cfg \ functional/inheritance/state009.cfg \ functional/inheritance/state010.cfg \ functional/inheritance/state011.cfg \ functional/inheritance/state012.cfg \ functional/inheritance/state013.cfg \ functional/inheritance/state014.cfg \ functional/inheritance/state015.cfg \ functional/inheritance/state016.cfg \ functional/inheritance/state017.cfg \ functional/inheritance/state018.cfg \ functional/inheritance/state019.cfg \ functional/inheritance/state020.cfg \ functional/inheritance/state021.cfg \ functional/inheritance/state022.cfg \ functional/inheritance/state023.cfg \ functional/inheritance/state024.cfg \ functional/inuse/inuse.cfg \ functional/io/io.cfg \ functional/l2arc/l2arc.cfg \ functional/largest_pool/largest_pool.cfg \ functional/migration/migration.cfg \ functional/migration/migration.kshlib \ functional/mmap/mmap.cfg \ functional/mmp/mmp.cfg \ functional/mmp/mmp.kshlib \ functional/mv_files/mv_files.cfg \ functional/mv_files/mv_files_common.kshlib \ functional/nopwrite/nopwrite.shlib \ functional/no_space/enospc.cfg \ functional/online_offline/online_offline.cfg \ functional/pool_checkpoint/pool_checkpoint.kshlib \ functional/projectquota/projectquota.cfg \ functional/projectquota/projectquota_common.kshlib \ functional/quota/quota.cfg \ functional/quota/quota.kshlib \ functional/redacted_send/redacted.cfg \ functional/redacted_send/redacted.kshlib \ functional/redundancy/redundancy.cfg \ functional/redundancy/redundancy.kshlib \ functional/refreserv/refreserv.cfg \ functional/removal/removal.kshlib \ functional/replacement/replacement.cfg \ functional/reservation/reservation.cfg \ functional/reservation/reservation.shlib \ functional/rsend/dedup_encrypted_zvol.bz2 \ functional/rsend/dedup_encrypted_zvol.zsend.bz2 \ functional/rsend/dedup.zsend.bz2 \ functional/rsend/fs.tar.gz \ functional/rsend/rsend.cfg \ functional/rsend/rsend.kshlib \ functional/scrub_mirror/default.cfg \ functional/scrub_mirror/scrub_mirror_common.kshlib \ functional/slog/slog.cfg \ functional/slog/slog.kshlib \ functional/snapshot/snapshot.cfg \ functional/snapused/snapused.kshlib \ functional/sparse/sparse.cfg \ functional/trim/trim.cfg \ functional/trim/trim.kshlib \ functional/truncate/truncate.cfg \ functional/upgrade/upgrade_common.kshlib \ functional/user_namespace/user_namespace.cfg \ functional/user_namespace/user_namespace_common.kshlib \ functional/userquota/13709_reproducer.bz2 \ functional/userquota/userquota.cfg \ functional/userquota/userquota_common.kshlib \ functional/vdev_zaps/vdev_zaps.kshlib \ functional/xattr/xattr.cfg \ functional/xattr/xattr_common.kshlib \ functional/zvol/zvol.cfg \ functional/zvol/zvol_cli/zvol_cli.cfg \ functional/zvol/zvol_common.shlib \ functional/zvol/zvol_ENOSPC/zvol_ENOSPC.cfg \ functional/zvol/zvol_misc/zvol_misc_common.kshlib \ functional/zvol/zvol_swap/zvol_swap.cfg \ functional/idmap_mount/idmap_mount.cfg \ functional/idmap_mount/idmap_mount_common.kshlib nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/acl/off/cleanup.ksh \ functional/acl/off/dosmode.ksh \ functional/acl/off/posixmode.ksh \ functional/acl/off/setup.ksh \ functional/acl/posix/cleanup.ksh \ functional/acl/posix/posix_001_pos.ksh \ functional/acl/posix/posix_002_pos.ksh \ functional/acl/posix/posix_003_pos.ksh \ functional/acl/posix/posix_004_pos.ksh \ functional/acl/posix-sa/cleanup.ksh \ functional/acl/posix-sa/posix_001_pos.ksh \ functional/acl/posix-sa/posix_002_pos.ksh \ functional/acl/posix-sa/posix_003_pos.ksh \ functional/acl/posix-sa/posix_004_pos.ksh \ functional/acl/posix-sa/setup.ksh \ functional/acl/posix/setup.ksh \ functional/alloc_class/alloc_class_001_pos.ksh \ functional/alloc_class/alloc_class_002_neg.ksh \ functional/alloc_class/alloc_class_003_pos.ksh \ functional/alloc_class/alloc_class_004_pos.ksh \ functional/alloc_class/alloc_class_005_pos.ksh \ functional/alloc_class/alloc_class_006_pos.ksh \ functional/alloc_class/alloc_class_007_pos.ksh \ functional/alloc_class/alloc_class_008_pos.ksh \ functional/alloc_class/alloc_class_009_pos.ksh \ functional/alloc_class/alloc_class_010_pos.ksh \ functional/alloc_class/alloc_class_011_neg.ksh \ functional/alloc_class/alloc_class_012_pos.ksh \ functional/alloc_class/alloc_class_013_pos.ksh \ functional/alloc_class/alloc_class_014_neg.ksh \ functional/alloc_class/alloc_class_015_pos.ksh \ functional/alloc_class/cleanup.ksh \ functional/alloc_class/setup.ksh \ functional/append/file_append.ksh \ functional/append/threadsappend_001_pos.ksh \ functional/append/cleanup.ksh \ functional/append/setup.ksh \ functional/arc/arcstats_runtime_tuning.ksh \ functional/arc/cleanup.ksh \ functional/arc/dbufstats_001_pos.ksh \ functional/arc/dbufstats_002_pos.ksh \ functional/arc/dbufstats_003_pos.ksh \ functional/arc/setup.ksh \ functional/atime/atime_001_pos.ksh \ functional/atime/atime_002_neg.ksh \ functional/atime/atime_003_pos.ksh \ functional/atime/cleanup.ksh \ functional/atime/root_atime_off.ksh \ functional/atime/root_atime_on.ksh \ functional/atime/root_relatime_on.ksh \ functional/atime/setup.ksh \ functional/bclone/bclone_crossfs_corner_cases.ksh \ functional/bclone/bclone_crossfs_corner_cases_limited.ksh \ functional/bclone/bclone_crossfs_data.ksh \ functional/bclone/bclone_crossfs_embedded.ksh \ functional/bclone/bclone_crossfs_hole.ksh \ functional/bclone/bclone_diffprops_all.ksh \ functional/bclone/bclone_diffprops_checksum.ksh \ functional/bclone/bclone_diffprops_compress.ksh \ functional/bclone/bclone_diffprops_copies.ksh \ functional/bclone/bclone_diffprops_recordsize.ksh \ functional/bclone/bclone_prop_sync.ksh \ functional/bclone/bclone_samefs_corner_cases.ksh \ functional/bclone/bclone_samefs_corner_cases_limited.ksh \ functional/bclone/bclone_samefs_data.ksh \ functional/bclone/bclone_samefs_embedded.ksh \ functional/bclone/bclone_samefs_hole.ksh \ functional/bclone/cleanup.ksh \ functional/bclone/setup.ksh \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ functional/block_cloning/block_cloning_clone_mmap_cached.ksh \ functional/block_cloning/block_cloning_clone_mmap_write.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ functional/block_cloning/block_cloning_disabled_ficlone.ksh \ functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlone.ksh \ functional/block_cloning/block_cloning_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlonerange_partial.ksh \ functional/block_cloning/block_cloning_cross_enc_dataset.ksh \ functional/block_cloning/block_cloning_replay.ksh \ functional/block_cloning/block_cloning_replay_encrypted.ksh \ functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh \ functional/block_cloning/block_cloning_rlimit_fsize.ksh \ functional/block_cloning/block_cloning_large_offset.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ functional/bootfs/bootfs_004_neg.ksh \ functional/bootfs/bootfs_005_neg.ksh \ functional/bootfs/bootfs_006_pos.ksh \ functional/bootfs/bootfs_007_pos.ksh \ functional/bootfs/bootfs_008_pos.ksh \ functional/bootfs/cleanup.ksh \ functional/bootfs/setup.ksh \ functional/btree/btree_negative.ksh \ functional/btree/btree_positive.ksh \ functional/cache/cache_001_pos.ksh \ functional/cache/cache_002_pos.ksh \ functional/cache/cache_003_pos.ksh \ functional/cache/cache_004_neg.ksh \ functional/cache/cache_005_neg.ksh \ functional/cache/cache_006_pos.ksh \ functional/cache/cache_007_neg.ksh \ functional/cache/cache_008_neg.ksh \ functional/cache/cache_009_pos.ksh \ functional/cache/cache_010_pos.ksh \ functional/cache/cache_011_pos.ksh \ functional/cache/cache_012_pos.ksh \ functional/cache/cleanup.ksh \ functional/cachefile/cachefile_001_pos.ksh \ functional/cachefile/cachefile_002_pos.ksh \ functional/cachefile/cachefile_003_pos.ksh \ functional/cachefile/cachefile_004_pos.ksh \ functional/cachefile/cleanup.ksh \ functional/cachefile/setup.ksh \ functional/cache/setup.ksh \ functional/casenorm/case_all_values.ksh \ functional/casenorm/cleanup.ksh \ functional/casenorm/insensitive_formd_delete.ksh \ functional/casenorm/insensitive_formd_lookup.ksh \ functional/casenorm/insensitive_none_delete.ksh \ functional/casenorm/insensitive_none_lookup.ksh \ functional/casenorm/mixed_create_failure.ksh \ functional/casenorm/mixed_formd_delete.ksh \ functional/casenorm/mixed_formd_lookup_ci.ksh \ functional/casenorm/mixed_formd_lookup.ksh \ functional/casenorm/mixed_none_delete.ksh \ functional/casenorm/mixed_none_lookup_ci.ksh \ functional/casenorm/mixed_none_lookup.ksh \ functional/casenorm/norm_all_values.ksh \ functional/casenorm/sensitive_formd_delete.ksh \ functional/casenorm/sensitive_formd_lookup.ksh \ functional/casenorm/sensitive_none_delete.ksh \ functional/casenorm/sensitive_none_lookup.ksh \ functional/casenorm/setup.ksh \ functional/channel_program/lua_core/cleanup.ksh \ functional/channel_program/lua_core/setup.ksh \ functional/channel_program/lua_core/tst.args_to_lua.ksh \ functional/channel_program/lua_core/tst.divide_by_zero.ksh \ functional/channel_program/lua_core/tst.exists.ksh \ functional/channel_program/lua_core/tst.integer_illegal.ksh \ functional/channel_program/lua_core/tst.integer_overflow.ksh \ functional/channel_program/lua_core/tst.language_functions_neg.ksh \ functional/channel_program/lua_core/tst.language_functions_pos.ksh \ functional/channel_program/lua_core/tst.large_prog.ksh \ functional/channel_program/lua_core/tst.libraries.ksh \ functional/channel_program/lua_core/tst.memory_limit.ksh \ functional/channel_program/lua_core/tst.nested_neg.ksh \ functional/channel_program/lua_core/tst.nested_pos.ksh \ functional/channel_program/lua_core/tst.nvlist_to_lua.ksh \ functional/channel_program/lua_core/tst.recursive_neg.ksh \ functional/channel_program/lua_core/tst.recursive_pos.ksh \ functional/channel_program/lua_core/tst.return_large.ksh \ functional/channel_program/lua_core/tst.return_nvlist_neg.ksh \ functional/channel_program/lua_core/tst.return_nvlist_pos.ksh \ functional/channel_program/lua_core/tst.return_recursive_table.ksh \ functional/channel_program/lua_core/tst.stack_gsub.ksh \ functional/channel_program/lua_core/tst.timeout.ksh \ functional/channel_program/synctask_core/cleanup.ksh \ functional/channel_program/synctask_core/setup.ksh \ functional/channel_program/synctask_core/tst.bookmark.copy.ksh \ functional/channel_program/synctask_core/tst.bookmark.create.ksh \ functional/channel_program/synctask_core/tst.destroy_fs.ksh \ functional/channel_program/synctask_core/tst.destroy_snap.ksh \ functional/channel_program/synctask_core/tst.get_count_and_limit.ksh \ functional/channel_program/synctask_core/tst.get_index_props.ksh \ functional/channel_program/synctask_core/tst.get_mountpoint.ksh \ functional/channel_program/synctask_core/tst.get_neg.ksh \ functional/channel_program/synctask_core/tst.get_number_props.ksh \ functional/channel_program/synctask_core/tst.get_string_props.ksh \ functional/channel_program/synctask_core/tst.get_type.ksh \ functional/channel_program/synctask_core/tst.get_userquota.ksh \ functional/channel_program/synctask_core/tst.get_written.ksh \ functional/channel_program/synctask_core/tst.inherit.ksh \ functional/channel_program/synctask_core/tst.list_bookmarks.ksh \ functional/channel_program/synctask_core/tst.list_children.ksh \ functional/channel_program/synctask_core/tst.list_clones.ksh \ functional/channel_program/synctask_core/tst.list_holds.ksh \ functional/channel_program/synctask_core/tst.list_snapshots.ksh \ functional/channel_program/synctask_core/tst.list_system_props.ksh \ functional/channel_program/synctask_core/tst.list_user_props.ksh \ functional/channel_program/synctask_core/tst.parse_args_neg.ksh \ functional/channel_program/synctask_core/tst.promote_conflict.ksh \ functional/channel_program/synctask_core/tst.promote_multiple.ksh \ functional/channel_program/synctask_core/tst.promote_simple.ksh \ functional/channel_program/synctask_core/tst.rollback_mult.ksh \ functional/channel_program/synctask_core/tst.rollback_one.ksh \ functional/channel_program/synctask_core/tst.set_props.ksh \ functional/channel_program/synctask_core/tst.snapshot_destroy.ksh \ functional/channel_program/synctask_core/tst.snapshot_neg.ksh \ functional/channel_program/synctask_core/tst.snapshot_recursive.ksh \ functional/channel_program/synctask_core/tst.snapshot_rename.ksh \ functional/channel_program/synctask_core/tst.snapshot_simple.ksh \ functional/channel_program/synctask_core/tst.terminate_by_signal.ksh \ functional/chattr/chattr_001_pos.ksh \ functional/chattr/chattr_002_neg.ksh \ functional/chattr/cleanup.ksh \ functional/chattr/setup.ksh \ functional/checksum/cleanup.ksh \ functional/checksum/filetest_001_pos.ksh \ functional/checksum/filetest_002_pos.ksh \ functional/checksum/run_blake3_test.ksh \ functional/checksum/run_edonr_test.ksh \ functional/checksum/run_sha2_test.ksh \ functional/checksum/run_skein_test.ksh \ functional/checksum/setup.ksh \ functional/clean_mirror/clean_mirror_001_pos.ksh \ functional/clean_mirror/clean_mirror_002_pos.ksh \ functional/clean_mirror/clean_mirror_003_pos.ksh \ functional/clean_mirror/clean_mirror_004_pos.ksh \ functional/clean_mirror/cleanup.ksh \ functional/clean_mirror/setup.ksh \ functional/cli_root/json/cleanup.ksh \ functional/cli_root/json/setup.ksh \ functional/cli_root/json/json_sanity.ksh \ functional/cli_root/zinject/zinject_args.ksh \ functional/cli_root/zinject/zinject_counts.ksh \ functional/cli_root/zinject/zinject_probe.ksh \ functional/cli_root/zdb/zdb_002_pos.ksh \ functional/cli_root/zdb/zdb_003_pos.ksh \ functional/cli_root/zdb/zdb_004_pos.ksh \ functional/cli_root/zdb/zdb_005_pos.ksh \ functional/cli_root/zdb/zdb_006_pos.ksh \ functional/cli_root/zdb/zdb_args_neg.ksh \ functional/cli_root/zdb/zdb_args_pos.ksh \ functional/cli_root/zdb/zdb_backup.ksh \ functional/cli_root/zdb/zdb_block_size_histogram.ksh \ functional/cli_root/zdb/zdb_checksum.ksh \ functional/cli_root/zdb/zdb_decompress.ksh \ functional/cli_root/zdb/zdb_decompress_zstd.ksh \ functional/cli_root/zdb/zdb_display_block.ksh \ functional/cli_root/zdb/zdb_encrypted.ksh \ functional/cli_root/zdb/zdb_label_checksum.ksh \ functional/cli_root/zdb/zdb_object_range_neg.ksh \ functional/cli_root/zdb/zdb_object_range_pos.ksh \ functional/cli_root/zdb/zdb_objset_id.ksh \ functional/cli_root/zdb/zdb_recover_2.ksh \ functional/cli_root/zdb/zdb_recover.ksh \ functional/cli_root/zfs_bookmark/cleanup.ksh \ functional/cli_root/zfs_bookmark/setup.ksh \ functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh \ functional/cli_root/zfs_change-key/cleanup.ksh \ functional/cli_root/zfs_change-key/setup.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_child.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_format.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh \ functional/cli_root/zfs_change-key/zfs_change-key.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_load.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_location.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh \ functional/cli_root/zfs/cleanup.ksh \ functional/cli_root/zfs_clone/cleanup.ksh \ functional/cli_root/zfs_clone/setup.ksh \ functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_deeply_nested.ksh \ functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh \ functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh \ functional/cli_root/zfs_copies/cleanup.ksh \ functional/cli_root/zfs_copies/setup.ksh \ functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_004_neg.ksh \ functional/cli_root/zfs_copies/zfs_copies_005_neg.ksh \ functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh \ functional/cli_root/zfs_create/cleanup.ksh \ functional/cli_root/zfs_create/setup.ksh \ functional/cli_root/zfs_create/zfs_create_001_pos.ksh \ functional/cli_root/zfs_create/zfs_create_002_pos.ksh \ functional/cli_root/zfs_create/zfs_create_003_pos.ksh \ functional/cli_root/zfs_create/zfs_create_004_pos.ksh \ functional/cli_root/zfs_create/zfs_create_005_pos.ksh \ functional/cli_root/zfs_create/zfs_create_006_pos.ksh \ functional/cli_root/zfs_create/zfs_create_007_pos.ksh \ functional/cli_root/zfs_create/zfs_create_008_neg.ksh \ functional/cli_root/zfs_create/zfs_create_009_neg.ksh \ functional/cli_root/zfs_create/zfs_create_010_neg.ksh \ functional/cli_root/zfs_create/zfs_create_011_pos.ksh \ functional/cli_root/zfs_create/zfs_create_012_pos.ksh \ functional/cli_root/zfs_create/zfs_create_013_pos.ksh \ functional/cli_root/zfs_create/zfs_create_014_pos.ksh \ functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh \ functional/cli_root/zfs_create/zfs_create_dryrun.ksh \ functional/cli_root/zfs_create/zfs_create_encrypted.ksh \ functional/cli_root/zfs_create/zfs_create_nomount.ksh \ functional/cli_root/zfs_create/zfs_create_verbose.ksh \ functional/cli_root/zfs_destroy/cleanup.ksh \ functional/cli_root/zfs_destroy/setup.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_006_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_008_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_009_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_011_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_012_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_013_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh \ functional/cli_root/zfs_diff/cleanup.ksh \ functional/cli_root/zfs_diff/setup.ksh \ functional/cli_root/zfs_diff/zfs_diff_changes.ksh \ functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh \ functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh \ functional/cli_root/zfs_diff/zfs_diff_mangle.ksh \ functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh \ functional/cli_root/zfs_diff/zfs_diff_types.ksh \ functional/cli_root/zfs_get/cleanup.ksh \ functional/cli_root/zfs_get/setup.ksh \ functional/cli_root/zfs_get/zfs_get_001_pos.ksh \ functional/cli_root/zfs_get/zfs_get_002_pos.ksh \ functional/cli_root/zfs_get/zfs_get_003_pos.ksh \ functional/cli_root/zfs_get/zfs_get_004_pos.ksh \ functional/cli_root/zfs_get/zfs_get_005_neg.ksh \ functional/cli_root/zfs_get/zfs_get_006_neg.ksh \ functional/cli_root/zfs_get/zfs_get_007_neg.ksh \ functional/cli_root/zfs_get/zfs_get_008_pos.ksh \ functional/cli_root/zfs_get/zfs_get_009_pos.ksh \ functional/cli_root/zfs_get/zfs_get_010_neg.ksh \ functional/cli_root/zfs_ids_to_path/cleanup.ksh \ functional/cli_root/zfs_ids_to_path/setup.ksh \ functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh \ functional/cli_root/zfs_inherit/cleanup.ksh \ functional/cli_root/zfs_inherit/setup.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_001_neg.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_mountpoint.ksh \ functional/cli_root/zfs_jail/cleanup.ksh \ functional/cli_root/zfs_jail/setup.ksh \ functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh \ functional/cli_root/zfs_load-key/cleanup.ksh \ functional/cli_root/zfs_load-key/setup.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_all.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_file.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_https.ksh \ functional/cli_root/zfs_load-key/zfs_load-key.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_location.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh \ functional/cli_root/zfs_mount/cleanup.ksh \ functional/cli_root/zfs_mount/setup.ksh \ functional/cli_root/zfs_mount/zfs_mount_001_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_002_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_003_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_004_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_009_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_001_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \ functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \ functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \ functional/cli_root/zfs_mount/zfs_mount_remount.ksh \ functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \ functional/cli_root/zfs_mount/zfs_multi_mount.ksh \ functional/cli_root/zfs_program/cleanup.ksh \ functional/cli_root/zfs_program/setup.ksh \ functional/cli_root/zfs_program/zfs_program_json.ksh \ functional/cli_root/zfs_promote/cleanup.ksh \ functional/cli_root/zfs_promote/setup.ksh \ functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_002_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh \ functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh \ functional/cli_root/zfs_promote/zfs_promote_008_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh \ functional/cli_root/zfs_property/cleanup.ksh \ functional/cli_root/zfs_property/setup.ksh \ functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh \ functional/cli_root/zfs_receive/cleanup.ksh \ functional/cli_root/zfs_receive/receive-o-x_props_aliases.ksh \ functional/cli_root/zfs_receive/receive-o-x_props_override.ksh \ functional/cli_root/zfs_receive/setup.ksh \ functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_006_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_007_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_008_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_011_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_012_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_015_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_-e.ksh \ functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh \ functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh \ functional/cli_root/zfs_receive/zfs_receive_new_props.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw.ksh \ functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh \ functional/cli_root/zfs_receive/zfs_receive_-wR-encrypted-mix.ksh \ functional/cli_root/zfs_receive/zfs_receive_corrective.ksh \ functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh \ functional/cli_root/zfs_receive/zfs_receive_large_block_corrective.ksh \ functional/cli_root/zfs_rename/cleanup.ksh \ functional/cli_root/zfs_rename/setup.ksh \ functional/cli_root/zfs_rename/zfs_rename_001_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_002_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_004_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_009_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_010_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_012_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh \ functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh \ functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh \ functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh \ functional/cli_root/zfs_reservation/cleanup.ksh \ functional/cli_root/zfs_reservation/setup.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_001_pos.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_002_pos.ksh \ functional/cli_root/zfs_rollback/cleanup.ksh \ functional/cli_root/zfs_rollback/setup.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_002_pos.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh \ functional/cli_root/zfs_send/cleanup.ksh \ functional/cli_root/zfs_send/setup.ksh \ functional/cli_root/zfs_send/zfs_send_001_pos.ksh \ functional/cli_root/zfs_send/zfs_send_002_pos.ksh \ functional/cli_root/zfs_send/zfs_send_003_pos.ksh \ functional/cli_root/zfs_send/zfs_send_004_neg.ksh \ functional/cli_root/zfs_send/zfs_send_005_pos.ksh \ functional/cli_root/zfs_send/zfs_send_006_pos.ksh \ functional/cli_root/zfs_send/zfs_send_007_pos.ksh \ functional/cli_root/zfs_send/zfs_send-b.ksh \ functional/cli_root/zfs_send/zfs_send_encrypted.ksh \ functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh \ functional/cli_root/zfs_send/zfs_send_raw.ksh \ functional/cli_root/zfs_send/zfs_send_skip_missing.ksh \ functional/cli_root/zfs_send/zfs_send_sparse.ksh \ functional/cli_root/zfs_set/cache_001_pos.ksh \ functional/cli_root/zfs_set/cache_002_neg.ksh \ functional/cli_root/zfs_set/canmount_001_pos.ksh \ functional/cli_root/zfs_set/canmount_002_pos.ksh \ functional/cli_root/zfs_set/canmount_003_pos.ksh \ functional/cli_root/zfs_set/canmount_004_pos.ksh \ functional/cli_root/zfs_set/checksum_001_pos.ksh \ functional/cli_root/zfs_set/cleanup.ksh \ functional/cli_root/zfs_set/compression_001_pos.ksh \ functional/cli_root/zfs_set/mountpoint_001_pos.ksh \ functional/cli_root/zfs_set/mountpoint_002_pos.ksh \ functional/cli_root/zfs_set/mountpoint_003_pos.ksh \ functional/cli_root/zfs_set/onoffs_001_pos.ksh \ functional/cli_root/zfs_set/property_alias_001_pos.ksh \ functional/cli_root/zfs_set/readonly_001_pos.ksh \ functional/cli_root/zfs_set/reservation_001_neg.ksh \ functional/cli_root/zfs_set/ro_props_001_pos.ksh \ functional/cli_root/zfs_set/setup.ksh \ functional/cli_root/zfs_set/share_mount_001_neg.ksh \ functional/cli_root/zfs_set/snapdir_001_pos.ksh \ functional/cli_root/zfs/setup.ksh \ functional/cli_root/zfs_set/user_property_001_pos.ksh \ functional/cli_root/zfs_set/user_property_002_pos.ksh \ functional/cli_root/zfs_set/user_property_003_neg.ksh \ functional/cli_root/zfs_set/user_property_004_pos.ksh \ functional/cli_root/zfs_set/version_001_neg.ksh \ functional/cli_root/zfs_set/zfs_set_001_neg.ksh \ functional/cli_root/zfs_set/zfs_set_002_neg.ksh \ functional/cli_root/zfs_set/zfs_set_003_neg.ksh \ functional/cli_root/zfs_set/zfs_set_feature_activation.ksh \ functional/cli_root/zfs_set/zfs_set_keylocation.ksh \ functional/cli_root/zfs_set/zfs_set_nomount.ksh \ functional/cli_root/zfs_share/cleanup.ksh \ functional/cli_root/zfs_share/setup.ksh \ functional/cli_root/zfs_share/zfs_share_001_pos.ksh \ functional/cli_root/zfs_share/zfs_share_002_pos.ksh \ functional/cli_root/zfs_share/zfs_share_003_pos.ksh \ functional/cli_root/zfs_share/zfs_share_004_pos.ksh \ functional/cli_root/zfs_share/zfs_share_005_pos.ksh \ functional/cli_root/zfs_share/zfs_share_006_pos.ksh \ functional/cli_root/zfs_share/zfs_share_007_neg.ksh \ functional/cli_root/zfs_share/zfs_share_008_neg.ksh \ functional/cli_root/zfs_share/zfs_share_009_neg.ksh \ functional/cli_root/zfs_share/zfs_share_010_neg.ksh \ functional/cli_root/zfs_share/zfs_share_011_pos.ksh \ functional/cli_root/zfs_share/zfs_share_012_pos.ksh \ functional/cli_root/zfs_share/zfs_share_013_pos.ksh \ functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh \ functional/cli_root/zfs_share/zfs_share_after_mount.ksh \ functional/cli_root/zfs_snapshot/cleanup.ksh \ functional/cli_root/zfs_snapshot/setup.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_003_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh \ functional/cli_root/zfs_sysfs/cleanup.ksh \ functional/cli_root/zfs_sysfs/setup.ksh \ functional/cli_root/zfs_sysfs/zfeature_set_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_get_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_set_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_sysfs_live.ksh \ functional/cli_root/zfs_sysfs/zpool_get_unsupported.ksh \ functional/cli_root/zfs_sysfs/zpool_set_unsupported.ksh \ functional/cli_root/zfs_unload-key/cleanup.ksh \ functional/cli_root/zfs_unload-key/setup.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh \ functional/cli_root/zfs_unmount/cleanup.ksh \ functional/cli_root/zfs_unmount/setup.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_002_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_003_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_004_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_005_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_006_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_007_neg.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_all_001_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh \ functional/cli_root/zfs_unshare/cleanup.ksh \ functional/cli_root/zfs_unshare/setup.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_005_neg.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_008_pos.ksh \ functional/cli_root/zfs_upgrade/cleanup.ksh \ functional/cli_root/zfs_upgrade/setup.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_002_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_006_neg.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_007_neg.ksh \ functional/cli_root/zfs_wait/cleanup.ksh \ functional/cli_root/zfs_wait/setup.ksh \ functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh \ functional/cli_root/zfs_wait/zfs_wait_getsubopt.ksh \ functional/cli_root/zfs/zfs_001_neg.ksh \ functional/cli_root/zfs/zfs_002_pos.ksh \ functional/cli_root/zfs/zfs_003_neg.ksh \ functional/cli_root/zhack/zhack_label_repair_001.ksh \ functional/cli_root/zhack/zhack_label_repair_002.ksh \ functional/cli_root/zhack/zhack_label_repair_003.ksh \ functional/cli_root/zhack/zhack_label_repair_004.ksh \ functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \ functional/cli_root/zpool_add/add-o_ashift.ksh \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ functional/cli_root/zpool_add/zpool_add_004_pos.ksh \ functional/cli_root/zpool_add/zpool_add_005_pos.ksh \ functional/cli_root/zpool_add/zpool_add_006_pos.ksh \ functional/cli_root/zpool_add/zpool_add_007_neg.ksh \ functional/cli_root/zpool_add/zpool_add_008_neg.ksh \ functional/cli_root/zpool_add/zpool_add_009_neg.ksh \ functional/cli_root/zpool_add/zpool_add_010_pos.ksh \ functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \ functional/cli_root/zpool_attach/attach-o_ashift.ksh \ functional/cli_root/zpool_attach/cleanup.ksh \ functional/cli_root/zpool_attach/setup.ksh \ functional/cli_root/zpool_attach/zpool_attach_001_neg.ksh \ functional/cli_root/zpool/cleanup.ksh \ functional/cli_root/zpool_clear/cleanup.ksh \ functional/cli_root/zpool_clear/setup.ksh \ functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh \ functional/cli_root/zpool_clear/zpool_clear_002_neg.ksh \ functional/cli_root/zpool_clear/zpool_clear_003_neg.ksh \ functional/cli_root/zpool_clear/zpool_clear_readonly.ksh \ functional/cli_root/zpool_create/cleanup.ksh \ functional/cli_root/zpool_create/create-o_ashift.ksh \ functional/cli_root/zpool_create/setup.ksh \ functional/cli_root/zpool_create/zpool_create_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_004_pos.ksh \ functional/cli_root/zpool_create/zpool_create_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_006_pos.ksh \ functional/cli_root/zpool_create/zpool_create_007_neg.ksh \ functional/cli_root/zpool_create/zpool_create_008_pos.ksh \ functional/cli_root/zpool_create/zpool_create_009_neg.ksh \ functional/cli_root/zpool_create/zpool_create_010_neg.ksh \ functional/cli_root/zpool_create/zpool_create_011_neg.ksh \ functional/cli_root/zpool_create/zpool_create_012_neg.ksh \ functional/cli_root/zpool_create/zpool_create_014_neg.ksh \ functional/cli_root/zpool_create/zpool_create_015_neg.ksh \ functional/cli_root/zpool_create/zpool_create_016_pos.ksh \ functional/cli_root/zpool_create/zpool_create_017_neg.ksh \ functional/cli_root/zpool_create/zpool_create_018_pos.ksh \ functional/cli_root/zpool_create/zpool_create_019_pos.ksh \ functional/cli_root/zpool_create/zpool_create_020_pos.ksh \ functional/cli_root/zpool_create/zpool_create_021_pos.ksh \ functional/cli_root/zpool_create/zpool_create_022_pos.ksh \ functional/cli_root/zpool_create/zpool_create_023_neg.ksh \ functional/cli_root/zpool_create/zpool_create_024_pos.ksh \ functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh \ functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh \ functional/cli_root/zpool_create/zpool_create_encrypted.ksh \ functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_004_neg.ksh \ functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_006_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_008_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_009_pos.ksh \ functional/cli_root/zpool_create/zpool_create_tempname.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_003_neg.ksh \ functional/cli_root/zpool_detach/cleanup.ksh \ functional/cli_root/zpool_detach/setup.ksh \ functional/cli_root/zpool_detach/zpool_detach_001_neg.ksh \ functional/cli_root/zpool_events/cleanup.ksh \ functional/cli_root/zpool_events/setup.ksh \ functional/cli_root/zpool_events/zpool_events_clear.ksh \ functional/cli_root/zpool_events/zpool_events_clear_retained.ksh \ functional/cli_root/zpool_events/zpool_events_cliargs.ksh \ functional/cli_root/zpool_events/zpool_events_duplicates.ksh \ functional/cli_root/zpool_events/zpool_events_errors.ksh \ functional/cli_root/zpool_events/zpool_events_follow.ksh \ functional/cli_root/zpool_events/zpool_events_poolname.ksh \ functional/cli_root/zpool_expand/cleanup.ksh \ functional/cli_root/zpool_expand/setup.ksh \ functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh \ functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh \ functional/cli_root/zpool_export/cleanup.ksh \ functional/cli_root/zpool_export/setup.ksh \ functional/cli_root/zpool_export/zpool_export_001_pos.ksh \ functional/cli_root/zpool_export/zpool_export_002_pos.ksh \ functional/cli_root/zpool_export/zpool_export_003_neg.ksh \ functional/cli_root/zpool_export/zpool_export_004_pos.ksh \ functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh \ functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh \ functional/cli_root/zpool_get/cleanup.ksh \ functional/cli_root/zpool_get/setup.ksh \ functional/cli_root/zpool_get/vdev_get_001_pos.ksh \ functional/cli_root/zpool_get/vdev_get_all.ksh \ functional/cli_root/zpool_get/zpool_get_001_pos.ksh \ functional/cli_root/zpool_get/zpool_get_002_pos.ksh \ functional/cli_root/zpool_get/zpool_get_003_pos.ksh \ functional/cli_root/zpool_get/zpool_get_004_neg.ksh \ functional/cli_root/zpool_get/zpool_get_005_pos.ksh \ functional/cli_root/zpool_history/cleanup.ksh \ functional/cli_root/zpool_history/setup.ksh \ functional/cli_root/zpool_history/zpool_history_001_neg.ksh \ functional/cli_root/zpool_history/zpool_history_002_pos.ksh \ functional/cli_root/zpool_import/cleanup.ksh \ functional/cli_root/zpool_import/import_cachefile_device_added.ksh \ functional/cli_root/zpool_import/import_cachefile_device_removed.ksh \ functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh \ functional/cli_root/zpool_import/import_cachefile_mirror_attached.ksh \ functional/cli_root/zpool_import/import_cachefile_mirror_detached.ksh \ functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh \ functional/cli_root/zpool_import/import_cachefile_shared_device.ksh \ functional/cli_root/zpool_import/import_devices_missing.ksh \ functional/cli_root/zpool_import/import_log_missing.ksh \ functional/cli_root/zpool_import/import_paths_changed.ksh \ functional/cli_root/zpool_import/import_rewind_config_changed.ksh \ functional/cli_root/zpool_import/import_rewind_device_replaced.ksh \ functional/cli_root/zpool_import/setup.ksh \ functional/cli_root/zpool_import/zpool_import_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_004_pos.ksh \ functional/cli_root/zpool_import/zpool_import_005_pos.ksh \ functional/cli_root/zpool_import/zpool_import_006_pos.ksh \ functional/cli_root/zpool_import/zpool_import_007_pos.ksh \ functional/cli_root/zpool_import/zpool_import_008_pos.ksh \ functional/cli_root/zpool_import/zpool_import_009_neg.ksh \ functional/cli_root/zpool_import/zpool_import_010_pos.ksh \ functional/cli_root/zpool_import/zpool_import_011_neg.ksh \ functional/cli_root/zpool_import/zpool_import_012_pos.ksh \ functional/cli_root/zpool_import/zpool_import_013_neg.ksh \ functional/cli_root/zpool_import/zpool_import_014_pos.ksh \ functional/cli_root/zpool_import/zpool_import_015_pos.ksh \ functional/cli_root/zpool_import/zpool_import_016_pos.ksh \ functional/cli_root/zpool_import/zpool_import_017_pos.ksh \ functional/cli_root/zpool_import/zpool_import_all_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_encrypted.ksh \ functional/cli_root/zpool_import/zpool_import_encrypted_load.ksh \ functional/cli_root/zpool_import/zpool_import_errata3.ksh \ functional/cli_root/zpool_import/zpool_import_errata4.ksh \ functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \ functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \ functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_status.ksh \ functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh \ functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh \ functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh \ functional/cli_root/zpool_initialize/cleanup.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_split.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_removed.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh \ functional/cli_root/zpool_offline/cleanup.ksh \ functional/cli_root/zpool_offline/setup.ksh \ functional/cli_root/zpool_offline/zpool_offline_001_pos.ksh \ functional/cli_root/zpool_offline/zpool_offline_002_neg.ksh \ functional/cli_root/zpool_offline/zpool_offline_003_pos.ksh \ functional/cli_root/zpool_online/cleanup.ksh \ functional/cli_root/zpool_online/setup.ksh \ functional/cli_root/zpool_online/zpool_online_001_pos.ksh \ functional/cli_root/zpool_online/zpool_online_002_neg.ksh \ functional/cli_root/zpool_prefetch/cleanup.ksh \ functional/cli_root/zpool_prefetch/setup.ksh \ functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh \ functional/cli_root/zpool_reguid/cleanup.ksh \ functional/cli_root/zpool_reguid/setup.ksh \ functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh \ functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh \ functional/cli_root/zpool_remove/cleanup.ksh \ functional/cli_root/zpool_remove/setup.ksh \ functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh \ functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh \ functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh \ functional/cli_root/zpool_reopen/cleanup.ksh \ functional/cli_root/zpool_reopen/setup.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_001_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_002_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_006_neg.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh \ functional/cli_root/zpool_replace/cleanup.ksh \ functional/cli_root/zpool_replace/replace-o_ashift.ksh \ functional/cli_root/zpool_replace/replace_prop_ashift.ksh \ functional/cli_root/zpool_replace/setup.ksh \ functional/cli_root/zpool_replace/zpool_replace_001_neg.ksh \ functional/cli_root/zpool_resilver/cleanup.ksh \ functional/cli_root/zpool_resilver/setup.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \ functional/cli_root/zpool_scrub/cleanup.ksh \ functional/cli_root/zpool_scrub/setup.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh \ functional/cli_root/zpool_set/cleanup.ksh \ functional/cli_root/zpool_set/setup.ksh \ functional/cli_root/zpool/setup.ksh \ functional/cli_root/zpool_set/vdev_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_common.kshlib \ functional/cli_root/zpool_set/zpool_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_003_neg.ksh \ functional/cli_root/zpool_set/zpool_set_ashift.ksh \ functional/cli_root/zpool_set/user_property_001_pos.ksh \ functional/cli_root/zpool_set/user_property_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_features.ksh \ functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh \ functional/cli_root/zpool_split/cleanup.ksh \ functional/cli_root/zpool_split/setup.ksh \ functional/cli_root/zpool_split/zpool_split_cliargs.ksh \ functional/cli_root/zpool_split/zpool_split_devices.ksh \ functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh \ functional/cli_root/zpool_split/zpool_split_encryption.ksh \ functional/cli_root/zpool_split/zpool_split_indirect.ksh \ functional/cli_root/zpool_split/zpool_split_props.ksh \ functional/cli_root/zpool_split/zpool_split_resilver.ksh \ functional/cli_root/zpool_split/zpool_split_vdevs.ksh \ functional/cli_root/zpool_split/zpool_split_wholedisk.ksh \ functional/cli_root/zpool_status/cleanup.ksh \ functional/cli_root/zpool_status/setup.ksh \ functional/cli_root/zpool_status/zpool_status_001_pos.ksh \ functional/cli_root/zpool_status/zpool_status_002_pos.ksh \ functional/cli_root/zpool_status/zpool_status_003_pos.ksh \ functional/cli_root/zpool_status/zpool_status_004_pos.ksh \ functional/cli_root/zpool_status/zpool_status_005_pos.ksh \ functional/cli_root/zpool_status/zpool_status_006_pos.ksh \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ functional/cli_root/zpool_sync/zpool_sync_002_neg.ksh \ functional/cli_root/zpool_trim/cleanup.ksh \ functional/cli_root/zpool_trim/setup.ksh \ functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh \ functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_import_export.ksh \ functional/cli_root/zpool_trim/zpool_trim_multiple.ksh \ functional/cli_root/zpool_trim/zpool_trim_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh \ functional/cli_root/zpool_trim/zpool_trim_partial.ksh \ functional/cli_root/zpool_trim/zpool_trim_rate.ksh \ functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_secure.ksh \ functional/cli_root/zpool_trim/zpool_trim_split.ksh \ functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh \ functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh \ functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh \ functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh \ functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh \ functional/cli_root/zpool_upgrade/cleanup.ksh \ functional/cli_root/zpool_upgrade/setup.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_001_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_002_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_003_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_004_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_005_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_006_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_009_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_features_001_pos.ksh \ functional/cli_root/zpool_wait/cleanup.ksh \ functional/cli_root/zpool_wait/scan/cleanup.ksh \ functional/cli_root/zpool_wait/scan/setup.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh \ functional/cli_root/zpool_wait/setup.ksh \ functional/cli_root/zpool_wait/zpool_wait_discard.ksh \ functional/cli_root/zpool_wait/zpool_wait_freeing.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh \ functional/cli_root/zpool_wait/zpool_wait_multiple.ksh \ functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh \ functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_remove.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh \ functional/cli_root/zpool_wait/zpool_wait_usage.ksh \ functional/cli_root/zpool/zpool_001_neg.ksh \ functional/cli_root/zpool/zpool_002_pos.ksh \ functional/cli_root/zpool/zpool_003_pos.ksh \ functional/cli_root/zpool/zpool_colors.ksh \ functional/cli_user/misc/arcstat_001_pos.ksh \ functional/cli_user/misc/arc_summary_001_pos.ksh \ functional/cli_user/misc/arc_summary_002_neg.ksh \ functional/cli_user/misc/zilstat_001_pos.ksh \ functional/cli_user/misc/cleanup.ksh \ functional/cli_user/misc/setup.ksh \ functional/cli_user/misc/zdb_001_neg.ksh \ functional/cli_user/misc/zfs_001_neg.ksh \ functional/cli_user/misc/zfs_allow_001_neg.ksh \ functional/cli_user/misc/zfs_clone_001_neg.ksh \ functional/cli_user/misc/zfs_create_001_neg.ksh \ functional/cli_user/misc/zfs_destroy_001_neg.ksh \ functional/cli_user/misc/zfs_get_001_neg.ksh \ functional/cli_user/misc/zfs_inherit_001_neg.ksh \ functional/cli_user/misc/zfs_mount_001_neg.ksh \ functional/cli_user/misc/zfs_promote_001_neg.ksh \ functional/cli_user/misc/zfs_receive_001_neg.ksh \ functional/cli_user/misc/zfs_rename_001_neg.ksh \ functional/cli_user/misc/zfs_rollback_001_neg.ksh \ functional/cli_user/misc/zfs_send_001_neg.ksh \ functional/cli_user/misc/zfs_set_001_neg.ksh \ functional/cli_user/misc/zfs_share_001_neg.ksh \ functional/cli_user/misc/zfs_snapshot_001_neg.ksh \ functional/cli_user/misc/zfs_unallow_001_neg.ksh \ functional/cli_user/misc/zfs_unmount_001_neg.ksh \ functional/cli_user/misc/zfs_unshare_001_neg.ksh \ functional/cli_user/misc/zfs_upgrade_001_neg.ksh \ functional/cli_user/misc/zpool_001_neg.ksh \ functional/cli_user/misc/zpool_add_001_neg.ksh \ functional/cli_user/misc/zpool_attach_001_neg.ksh \ functional/cli_user/misc/zpool_clear_001_neg.ksh \ functional/cli_user/misc/zpool_create_001_neg.ksh \ functional/cli_user/misc/zpool_destroy_001_neg.ksh \ functional/cli_user/misc/zpool_detach_001_neg.ksh \ functional/cli_user/misc/zpool_export_001_neg.ksh \ functional/cli_user/misc/zpool_get_001_neg.ksh \ functional/cli_user/misc/zpool_history_001_neg.ksh \ functional/cli_user/misc/zpool_import_001_neg.ksh \ functional/cli_user/misc/zpool_import_002_neg.ksh \ functional/cli_user/misc/zpool_offline_001_neg.ksh \ functional/cli_user/misc/zpool_online_001_neg.ksh \ functional/cli_user/misc/zpool_remove_001_neg.ksh \ functional/cli_user/misc/zpool_replace_001_neg.ksh \ functional/cli_user/misc/zpool_scrub_001_neg.ksh \ functional/cli_user/misc/zpool_set_001_neg.ksh \ functional/cli_user/misc/zpool_status_001_neg.ksh \ functional/cli_user/misc/zpool_upgrade_001_neg.ksh \ functional/cli_user/misc/zpool_wait_privilege.ksh \ functional/cli_user/zfs_list/cleanup.ksh \ functional/cli_user/zfs_list/setup.ksh \ functional/cli_user/zfs_list/zfs_list_001_pos.ksh \ functional/cli_user/zfs_list/zfs_list_002_pos.ksh \ functional/cli_user/zfs_list/zfs_list_003_pos.ksh \ functional/cli_user/zfs_list/zfs_list_004_neg.ksh \ functional/cli_user/zfs_list/zfs_list_005_neg.ksh \ functional/cli_user/zfs_list/zfs_list_007_pos.ksh \ functional/cli_user/zfs_list/zfs_list_008_neg.ksh \ functional/cli_user/zpool_iostat/cleanup.ksh \ functional/cli_user/zpool_iostat/setup.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_001_neg.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_disable.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh \ functional/cli_user/zpool_list/cleanup.ksh \ functional/cli_user/zpool_list/setup.ksh \ functional/cli_user/zpool_list/zpool_list_001_pos.ksh \ functional/cli_user/zpool_list/zpool_list_002_neg.ksh \ functional/cli_user/zpool_status/cleanup.ksh \ functional/cli_user/zpool_status/setup.ksh \ functional/cli_user/zpool_status/zpool_status_003_pos.ksh \ functional/cli_user/zpool_status/zpool_status_-c_disable.ksh \ functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh \ functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh \ functional/compression/cleanup.ksh \ functional/compression/compress_001_pos.ksh \ functional/compression/compress_002_pos.ksh \ functional/compression/compress_003_pos.ksh \ functional/compression/compress_004_pos.ksh \ functional/compression/compress_zstd_bswap.ksh \ functional/compression/l2arc_compressed_arc_disabled.ksh \ functional/compression/l2arc_compressed_arc.ksh \ functional/compression/l2arc_encrypted.ksh \ functional/compression/l2arc_encrypted_no_compressed_arc.ksh \ functional/compression/setup.ksh \ functional/cp_files/cleanup.ksh \ functional/cp_files/cp_files_001_pos.ksh \ functional/cp_files/cp_files_002_pos.ksh \ functional/cp_files/cp_stress.ksh \ functional/cp_files/setup.ksh \ functional/crtime/cleanup.ksh \ functional/crtime/crtime_001_pos.ksh \ functional/crtime/setup.ksh \ functional/crypto/icp_aes_ccm.ksh \ functional/crypto/icp_aes_gcm.ksh \ functional/deadman/deadman_ratelimit.ksh \ functional/deadman/deadman_sync.ksh \ functional/deadman/deadman_zio.ksh \ functional/dedup/cleanup.ksh \ functional/dedup/setup.ksh \ functional/dedup/dedup_fdt_create.ksh \ functional/dedup/dedup_fdt_import.ksh \ functional/dedup/dedup_fdt_pacing.ksh \ functional/dedup/dedup_legacy_create.ksh \ functional/dedup/dedup_legacy_import.ksh \ functional/dedup/dedup_legacy_fdt_upgrade.ksh \ functional/dedup/dedup_legacy_fdt_mixed.ksh \ functional/dedup/dedup_prune.ksh \ functional/dedup/dedup_quota.ksh \ functional/dedup/dedup_zap_shrink.ksh \ functional/delegate/cleanup.ksh \ functional/delegate/setup.ksh \ functional/delegate/zfs_allow_001_pos.ksh \ functional/delegate/zfs_allow_002_pos.ksh \ functional/delegate/zfs_allow_003_pos.ksh \ functional/delegate/zfs_allow_004_pos.ksh \ functional/delegate/zfs_allow_005_pos.ksh \ functional/delegate/zfs_allow_006_pos.ksh \ functional/delegate/zfs_allow_007_pos.ksh \ functional/delegate/zfs_allow_008_pos.ksh \ functional/delegate/zfs_allow_009_neg.ksh \ functional/delegate/zfs_allow_010_pos.ksh \ functional/delegate/zfs_allow_011_neg.ksh \ functional/delegate/zfs_allow_012_neg.ksh \ functional/delegate/zfs_unallow_001_pos.ksh \ functional/delegate/zfs_unallow_002_pos.ksh \ functional/delegate/zfs_unallow_003_pos.ksh \ functional/delegate/zfs_unallow_004_pos.ksh \ functional/delegate/zfs_unallow_005_pos.ksh \ functional/delegate/zfs_unallow_006_pos.ksh \ functional/delegate/zfs_unallow_007_neg.ksh \ functional/delegate/zfs_unallow_008_neg.ksh \ functional/devices/cleanup.ksh \ functional/devices/devices_001_pos.ksh \ functional/devices/devices_002_neg.ksh \ functional/devices/devices_003_pos.ksh \ functional/devices/setup.ksh \ functional/direct/dio_aligned_block.ksh \ functional/direct/dio_async_always.ksh \ functional/direct/dio_async_fio_ioengines.ksh \ functional/direct/dio_compression.ksh \ functional/direct/dio_dedup.ksh \ functional/direct/dio_encryption.ksh \ functional/direct/dio_grow_block.ksh \ functional/direct/dio_loopback_dev.ksh \ functional/direct/dio_max_recordsize.ksh \ functional/direct/dio_mixed.ksh \ functional/direct/dio_mmap.ksh \ functional/direct/dio_overwrites.ksh \ functional/direct/dio_property.ksh \ functional/direct/dio_random.ksh \ functional/direct/dio_read_verify.ksh \ functional/direct/dio_recordsize.ksh \ functional/direct/dio_unaligned_block.ksh \ functional/direct/dio_unaligned_filesize.ksh \ functional/direct/dio_write_verify.ksh \ functional/direct/dio_write_stable_pages.ksh \ functional/direct/setup.ksh \ functional/direct/cleanup.ksh \ functional/dos_attributes/cleanup.ksh \ functional/dos_attributes/read_dos_attrs_001.ksh \ functional/dos_attributes/setup.ksh \ functional/dos_attributes/write_dos_attrs_001.ksh \ functional/events/cleanup.ksh \ functional/events/events_001_pos.ksh \ functional/events/events_002_pos.ksh \ functional/events/setup.ksh \ functional/events/zed_cksum_config.ksh \ functional/events/zed_cksum_reported.ksh \ functional/events/zed_diagnose_multiple.ksh \ functional/events/zed_fd_spill.ksh \ functional/events/zed_io_config.ksh \ functional/events/zed_rc_filter.ksh \ functional/events/zed_slow_io.ksh \ functional/events/zed_slow_io_many_vdevs.ksh \ functional/exec/cleanup.ksh \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ functional/exec/setup.ksh \ functional/fadvise/cleanup.ksh \ functional/fadvise/fadvise_sequential.ksh \ functional/fadvise/setup.ksh \ functional/fallocate/cleanup.ksh \ functional/fallocate/fallocate_prealloc.ksh \ functional/fallocate/fallocate_punch-hole.ksh \ functional/fallocate/fallocate_zero-range.ksh \ functional/fallocate/setup.ksh \ functional/fault/auto_offline_001_pos.ksh \ functional/fault/auto_online_001_pos.ksh \ functional/fault/auto_online_002_pos.ksh \ functional/fault/auto_replace_001_pos.ksh \ functional/fault/auto_replace_002_pos.ksh \ functional/fault/auto_spare_001_pos.ksh \ functional/fault/auto_spare_002_pos.ksh \ functional/fault/auto_spare_ashift.ksh \ functional/fault/auto_spare_double.ksh \ functional/fault/auto_spare_multiple.ksh \ functional/fault/auto_spare_shared.ksh \ functional/fault/cleanup.ksh \ functional/fault/decompress_fault.ksh \ functional/fault/decrypt_fault.ksh \ functional/fault/fault_limits.ksh \ functional/fault/scrub_after_resilver.ksh \ functional/fault/suspend_on_probe_errors.ksh \ functional/fault/suspend_resume_single.ksh \ functional/fault/setup.ksh \ functional/fault/zpool_status_-s.ksh \ functional/features/async_destroy/async_destroy_001_pos.ksh \ functional/features/async_destroy/cleanup.ksh \ functional/features/async_destroy/setup.ksh \ functional/features/large_dnode/cleanup.ksh \ functional/features/large_dnode/large_dnode_001_pos.ksh \ functional/features/large_dnode/large_dnode_002_pos.ksh \ functional/features/large_dnode/large_dnode_003_pos.ksh \ functional/features/large_dnode/large_dnode_004_neg.ksh \ functional/features/large_dnode/large_dnode_005_pos.ksh \ functional/features/large_dnode/large_dnode_006_pos.ksh \ functional/features/large_dnode/large_dnode_007_neg.ksh \ functional/features/large_dnode/large_dnode_008_pos.ksh \ functional/features/large_dnode/large_dnode_009_pos.ksh \ functional/features/large_dnode/setup.ksh \ functional/gang_blocks/cleanup.ksh \ + functional/gang_blocks/gang_blocks_001_pos.ksh \ functional/gang_blocks/gang_blocks_ddt_copies.ksh \ functional/gang_blocks/gang_blocks_redundant.ksh \ functional/gang_blocks/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ functional/history/cleanup.ksh \ functional/history/history_001_pos.ksh \ functional/history/history_002_pos.ksh \ functional/history/history_003_pos.ksh \ functional/history/history_004_pos.ksh \ functional/history/history_005_neg.ksh \ functional/history/history_006_neg.ksh \ functional/history/history_007_pos.ksh \ functional/history/history_008_pos.ksh \ functional/history/history_009_pos.ksh \ functional/history/history_010_pos.ksh \ functional/history/setup.ksh \ functional/inheritance/cleanup.ksh \ functional/inheritance/inherit_001_pos.ksh \ functional/inuse/inuse_001_pos.ksh \ functional/inuse/inuse_003_pos.ksh \ functional/inuse/inuse_004_pos.ksh \ functional/inuse/inuse_005_pos.ksh \ functional/inuse/inuse_006_pos.ksh \ functional/inuse/inuse_007_pos.ksh \ functional/inuse/inuse_008_pos.ksh \ functional/inuse/inuse_009_pos.ksh \ functional/inuse/setup.ksh \ functional/io/cleanup.ksh \ functional/io/io_uring.ksh \ functional/io/libaio.ksh \ functional/io/mmap.ksh \ functional/io/posixaio.ksh \ functional/io/psync.ksh \ functional/io/setup.ksh \ functional/io/sync.ksh \ functional/l2arc/cleanup.ksh \ functional/l2arc/l2arc_arcstats_pos.ksh \ functional/l2arc/l2arc_l2miss_pos.ksh \ functional/l2arc/l2arc_mfuonly_pos.ksh \ functional/l2arc/persist_l2arc_001_pos.ksh \ functional/l2arc/persist_l2arc_002_pos.ksh \ functional/l2arc/persist_l2arc_003_neg.ksh \ functional/l2arc/persist_l2arc_004_pos.ksh \ functional/l2arc/persist_l2arc_005_pos.ksh \ functional/l2arc/setup.ksh \ functional/large_files/cleanup.ksh \ functional/large_files/large_files_001_pos.ksh \ functional/large_files/large_files_002_pos.ksh \ functional/large_files/setup.ksh \ functional/largest_pool/largest_pool_001_pos.ksh \ functional/libzfs/cleanup.ksh \ functional/libzfs/libzfs_input.ksh \ functional/libzfs/setup.ksh \ functional/limits/cleanup.ksh \ functional/limits/filesystem_count.ksh \ functional/limits/filesystem_limit.ksh \ functional/limits/setup.ksh \ functional/limits/snapshot_count.ksh \ functional/limits/snapshot_limit.ksh \ functional/link_count/cleanup.ksh \ functional/link_count/link_count_001.ksh \ functional/link_count/link_count_root_inode.ksh \ functional/link_count/setup.ksh \ functional/longname/cleanup.ksh \ functional/longname/longname_001_pos.ksh \ functional/longname/longname_002_pos.ksh \ functional/longname/longname_003_pos.ksh \ functional/longname/setup.ksh \ functional/log_spacemap/log_spacemap_import_logs.ksh \ functional/migration/cleanup.ksh \ functional/migration/migration_001_pos.ksh \ functional/migration/migration_002_pos.ksh \ functional/migration/migration_003_pos.ksh \ functional/migration/migration_004_pos.ksh \ functional/migration/migration_005_pos.ksh \ functional/migration/migration_006_pos.ksh \ functional/migration/migration_007_pos.ksh \ functional/migration/migration_008_pos.ksh \ functional/migration/migration_009_pos.ksh \ functional/migration/migration_010_pos.ksh \ functional/migration/migration_011_pos.ksh \ functional/migration/migration_012_pos.ksh \ functional/migration/setup.ksh \ functional/mmap/cleanup.ksh \ functional/mmap/mmap_libaio_001_pos.ksh \ functional/mmap/mmap_mixed.ksh \ functional/mmap/mmap_read_001_pos.ksh \ functional/mmap/mmap_seek_001_pos.ksh \ functional/mmap/mmap_sync_001_pos.ksh \ functional/mmap/mmap_write_001_pos.ksh \ functional/mmap/setup.ksh \ functional/mmp/cleanup.ksh \ functional/mmp/mmp_active_import.ksh \ functional/mmp/mmp_exported_import.ksh \ functional/mmp/mmp_hostid.ksh \ functional/mmp/mmp_inactive_import.ksh \ functional/mmp/mmp_interval.ksh \ functional/mmp/mmp_on_off.ksh \ functional/mmp/mmp_on_thread.ksh \ functional/mmp/mmp_on_uberblocks.ksh \ functional/mmp/mmp_on_zdb.ksh \ functional/mmp/mmp_reset_interval.ksh \ functional/mmp/mmp_write_distribution.ksh \ functional/mmp/mmp_write_slow_disk.ksh \ functional/mmp/mmp_write_uberblocks.ksh \ functional/mmp/multihost_history.ksh \ functional/mmp/setup.ksh \ functional/mount/cleanup.ksh \ functional/mount/setup.ksh \ functional/mount/umount_001.ksh \ functional/mount/umountall_001.ksh \ functional/mount/umount_unlinked_drain.ksh \ functional/mv_files/cleanup.ksh \ functional/mv_files/mv_files_001_pos.ksh \ functional/mv_files/mv_files_002_pos.ksh \ functional/mv_files/random_creation.ksh \ functional/mv_files/setup.ksh \ functional/nestedfs/cleanup.ksh \ functional/nestedfs/nestedfs_001_pos.ksh \ functional/nestedfs/setup.ksh \ functional/nopwrite/cleanup.ksh \ functional/nopwrite/nopwrite_copies.ksh \ functional/nopwrite/nopwrite_mtime.ksh \ functional/nopwrite/nopwrite_negative.ksh \ functional/nopwrite/nopwrite_promoted_clone.ksh \ functional/nopwrite/nopwrite_recsize.ksh \ functional/nopwrite/nopwrite_sync.ksh \ functional/nopwrite/nopwrite_varying_compression.ksh \ functional/nopwrite/nopwrite_volume.ksh \ functional/nopwrite/setup.ksh \ functional/no_space/cleanup.ksh \ functional/no_space/enospc_001_pos.ksh \ functional/no_space/enospc_002_pos.ksh \ functional/no_space/enospc_003_pos.ksh \ functional/no_space/enospc_df.ksh \ functional/no_space/enospc_ganging.ksh \ functional/no_space/enospc_rm.ksh \ functional/no_space/setup.ksh \ functional/online_offline/cleanup.ksh \ functional/online_offline/online_offline_001_pos.ksh \ functional/online_offline/online_offline_002_neg.ksh \ functional/online_offline/online_offline_003_neg.ksh \ functional/online_offline/setup.ksh \ functional/pam/cleanup.ksh \ functional/pam/pam_basic.ksh \ functional/pam/pam_change_unmounted.ksh \ functional/pam/pam_mount_recursively.ksh \ functional/pam/pam_nounmount.ksh \ functional/pam/pam_recursive.ksh \ functional/pam/pam_short_password.ksh \ functional/pam/setup.ksh \ functional/pool_checkpoint/checkpoint_after_rewind.ksh \ functional/pool_checkpoint/checkpoint_big_rewind.ksh \ functional/pool_checkpoint/checkpoint_capacity.ksh \ functional/pool_checkpoint/checkpoint_conf_change.ksh \ functional/pool_checkpoint/checkpoint_discard_busy.ksh \ functional/pool_checkpoint/checkpoint_discard.ksh \ functional/pool_checkpoint/checkpoint_discard_many.ksh \ functional/pool_checkpoint/checkpoint_indirect.ksh \ functional/pool_checkpoint/checkpoint_invalid.ksh \ functional/pool_checkpoint/checkpoint_lun_expsz.ksh \ functional/pool_checkpoint/checkpoint_open.ksh \ functional/pool_checkpoint/checkpoint_removal.ksh \ functional/pool_checkpoint/checkpoint_rewind.ksh \ functional/pool_checkpoint/checkpoint_ro_rewind.ksh \ functional/pool_checkpoint/checkpoint_sm_scale.ksh \ functional/pool_checkpoint/checkpoint_twice.ksh \ functional/pool_checkpoint/checkpoint_vdev_add.ksh \ functional/pool_checkpoint/checkpoint_zdb.ksh \ functional/pool_checkpoint/checkpoint_zhack_feat.ksh \ functional/pool_checkpoint/cleanup.ksh \ functional/pool_checkpoint/setup.ksh \ functional/pool_names/pool_names_001_pos.ksh \ functional/pool_names/pool_names_002_neg.ksh \ functional/poolversion/cleanup.ksh \ functional/poolversion/poolversion_001_pos.ksh \ functional/poolversion/poolversion_002_pos.ksh \ functional/poolversion/setup.ksh \ functional/privilege/cleanup.ksh \ functional/privilege/privilege_001_pos.ksh \ functional/privilege/privilege_002_pos.ksh \ functional/privilege/setup.ksh \ functional/procfs/cleanup.ksh \ functional/procfs/pool_state.ksh \ functional/procfs/procfs_list_basic.ksh \ functional/procfs/procfs_list_concurrent_readers.ksh \ functional/procfs/procfs_list_stale_read.ksh \ functional/procfs/setup.ksh \ functional/projectquota/cleanup.ksh \ functional/projectquota/projectid_001_pos.ksh \ functional/projectquota/projectid_002_pos.ksh \ functional/projectquota/projectid_003_pos.ksh \ functional/projectquota/projectquota_001_pos.ksh \ functional/projectquota/projectquota_002_pos.ksh \ functional/projectquota/projectquota_003_pos.ksh \ functional/projectquota/projectquota_004_neg.ksh \ functional/projectquota/projectquota_005_pos.ksh \ functional/projectquota/projectquota_006_pos.ksh \ functional/projectquota/projectquota_007_pos.ksh \ functional/projectquota/projectquota_008_pos.ksh \ functional/projectquota/projectquota_009_pos.ksh \ functional/projectquota/defaultprojectquota_001_pos.ksh \ functional/projectquota/defaultprojectquota_002_pos.ksh \ functional/projectquota/defaultprojectquota_003_neg.ksh \ functional/projectquota/defaultprojectquota_004_pos.ksh \ functional/projectquota/defaultprojectquota_005_pos.ksh \ functional/projectquota/defaultprojectquota_006_pos.ksh \ functional/projectquota/defaultprojectquota_007_pos.ksh \ functional/projectquota/projectspace_001_pos.ksh \ functional/projectquota/projectspace_002_pos.ksh \ functional/projectquota/projectspace_003_pos.ksh \ functional/projectquota/projectspace_004_pos.ksh \ functional/projectquota/projectspace_005_pos.ksh \ functional/projectquota/projecttree_001_pos.ksh \ functional/projectquota/projecttree_002_pos.ksh \ functional/projectquota/projecttree_003_neg.ksh \ functional/projectquota/setup.ksh \ functional/quota/cleanup.ksh \ functional/quota/quota_001_pos.ksh \ functional/quota/quota_002_pos.ksh \ functional/quota/quota_003_pos.ksh \ functional/quota/quota_004_pos.ksh \ functional/quota/quota_005_pos.ksh \ functional/quota/quota_006_neg.ksh \ functional/quota/setup.ksh \ functional/raidz/cleanup.ksh \ functional/raidz/raidz_001_neg.ksh \ functional/raidz/raidz_002_pos.ksh \ functional/raidz/raidz_expand_001_pos.ksh \ functional/raidz/raidz_expand_002_pos.ksh \ functional/raidz/raidz_expand_003_neg.ksh \ functional/raidz/raidz_expand_003_pos.ksh \ functional/raidz/raidz_expand_004_pos.ksh \ functional/raidz/raidz_expand_005_pos.ksh \ functional/raidz/raidz_expand_006_neg.ksh \ functional/raidz/raidz_expand_007_neg.ksh \ functional/raidz/setup.ksh \ functional/redacted_send/cleanup.ksh \ functional/redacted_send/redacted_compressed.ksh \ functional/redacted_send/redacted_contents.ksh \ functional/redacted_send/redacted_deleted.ksh \ functional/redacted_send/redacted_disabled_feature.ksh \ functional/redacted_send/redacted_embedded.ksh \ functional/redacted_send/redacted_holes.ksh \ functional/redacted_send/redacted_incrementals.ksh \ functional/redacted_send/redacted_largeblocks.ksh \ functional/redacted_send/redacted_many_clones.ksh \ functional/redacted_send/redacted_mixed_recsize.ksh \ functional/redacted_send/redacted_mounts.ksh \ functional/redacted_send/redacted_negative.ksh \ functional/redacted_send/redacted_origin.ksh \ functional/redacted_send/redacted_panic.ksh \ functional/redacted_send/redacted_props.ksh \ functional/redacted_send/redacted_resume.ksh \ functional/redacted_send/redacted_size.ksh \ functional/redacted_send/redacted_volume.ksh \ functional/redacted_send/setup.ksh \ functional/redundancy/cleanup.ksh \ functional/redundancy/redundancy_draid1.ksh \ functional/redundancy/redundancy_draid2.ksh \ functional/redundancy/redundancy_draid3.ksh \ functional/redundancy/redundancy_draid_damaged1.ksh \ functional/redundancy/redundancy_draid_damaged2.ksh \ functional/redundancy/redundancy_draid.ksh \ functional/redundancy/redundancy_draid_spare1.ksh \ functional/redundancy/redundancy_draid_spare2.ksh \ functional/redundancy/redundancy_draid_spare3.ksh \ functional/redundancy/redundancy_mirror.ksh \ functional/redundancy/redundancy_raidz1.ksh \ functional/redundancy/redundancy_raidz2.ksh \ functional/redundancy/redundancy_raidz3.ksh \ functional/redundancy/redundancy_raidz.ksh \ functional/redundancy/redundancy_stripe.ksh \ functional/redundancy/setup.ksh \ functional/refquota/cleanup.ksh \ functional/refquota/refquota_001_pos.ksh \ functional/refquota/refquota_002_pos.ksh \ functional/refquota/refquota_003_pos.ksh \ functional/refquota/refquota_004_pos.ksh \ functional/refquota/refquota_005_pos.ksh \ functional/refquota/refquota_006_neg.ksh \ functional/refquota/refquota_007_neg.ksh \ functional/refquota/refquota_008_neg.ksh \ functional/refquota/setup.ksh \ functional/refreserv/cleanup.ksh \ functional/refreserv/refreserv_001_pos.ksh \ functional/refreserv/refreserv_002_pos.ksh \ functional/refreserv/refreserv_003_pos.ksh \ functional/refreserv/refreserv_004_pos.ksh \ functional/refreserv/refreserv_005_pos.ksh \ functional/refreserv/refreserv_multi_raidz.ksh \ functional/refreserv/refreserv_raidz.ksh \ functional/refreserv/setup.ksh \ functional/removal/cleanup.ksh \ functional/removal/removal_all_vdev.ksh \ functional/removal/removal_cancel.ksh \ functional/removal/removal_check_space.ksh \ functional/removal/removal_condense_export.ksh \ functional/removal/removal_multiple_indirection.ksh \ functional/removal/removal_nopwrite.ksh \ functional/removal/removal_remap_deadlists.ksh \ functional/removal/removal_reservation.ksh \ functional/removal/removal_resume_export.ksh \ functional/removal/removal_sanity.ksh \ functional/removal/removal_with_add.ksh \ functional/removal/removal_with_create_fs.ksh \ functional/removal/removal_with_dedup.ksh \ functional/removal/removal_with_errors.ksh \ functional/removal/removal_with_export.ksh \ functional/removal/removal_with_faulted.ksh \ functional/removal/removal_with_ganging.ksh \ functional/removal/removal_with_hole.ksh \ functional/removal/removal_with_indirect.ksh \ functional/removal/removal_with_remove.ksh \ functional/removal/removal_with_scrub.ksh \ functional/removal/removal_with_send.ksh \ functional/removal/removal_with_send_recv.ksh \ functional/removal/removal_with_snapshot.ksh \ functional/removal/removal_with_write.ksh \ functional/removal/removal_with_zdb.ksh \ functional/removal/remove_attach_mirror.ksh \ functional/removal/remove_expanded.ksh \ functional/removal/remove_indirect.ksh \ functional/removal/remove_mirror.ksh \ functional/removal/remove_mirror_sanity.ksh \ functional/removal/remove_raidz.ksh \ functional/rename_dirs/cleanup.ksh \ functional/rename_dirs/rename_dirs_001_pos.ksh \ functional/rename_dirs/setup.ksh \ functional/renameat2/cleanup.ksh \ functional/renameat2/setup.ksh \ functional/renameat2/renameat2_exchange.ksh \ functional/renameat2/renameat2_noreplace.ksh \ functional/renameat2/renameat2_whiteout.ksh \ functional/replacement/attach_import.ksh \ functional/replacement/attach_multiple.ksh \ functional/replacement/attach_rebuild.ksh \ functional/replacement/attach_resilver.ksh \ functional/replacement/cleanup.ksh \ functional/replacement/detach.ksh \ functional/replacement/rebuild_disabled_feature.ksh \ functional/replacement/rebuild_multiple.ksh \ functional/replacement/rebuild_raidz.ksh \ functional/replacement/replace_import.ksh \ functional/replacement/replace_rebuild.ksh \ functional/replacement/replace_resilver.ksh \ functional/replacement/resilver_restart_001.ksh \ functional/replacement/resilver_restart_002.ksh \ functional/replacement/scrub_cancel.ksh \ functional/replacement/setup.ksh \ functional/reservation/cleanup.ksh \ functional/reservation/reservation_001_pos.ksh \ functional/reservation/reservation_002_pos.ksh \ functional/reservation/reservation_003_pos.ksh \ functional/reservation/reservation_004_pos.ksh \ functional/reservation/reservation_005_pos.ksh \ functional/reservation/reservation_006_pos.ksh \ functional/reservation/reservation_007_pos.ksh \ functional/reservation/reservation_008_pos.ksh \ functional/reservation/reservation_009_pos.ksh \ functional/reservation/reservation_010_pos.ksh \ functional/reservation/reservation_011_pos.ksh \ functional/reservation/reservation_012_pos.ksh \ functional/reservation/reservation_013_pos.ksh \ functional/reservation/reservation_014_pos.ksh \ functional/reservation/reservation_015_pos.ksh \ functional/reservation/reservation_016_pos.ksh \ functional/reservation/reservation_017_pos.ksh \ functional/reservation/reservation_018_pos.ksh \ functional/reservation/reservation_019_pos.ksh \ functional/reservation/reservation_020_pos.ksh \ functional/reservation/reservation_021_neg.ksh \ functional/reservation/reservation_022_pos.ksh \ functional/reservation/setup.ksh \ functional/rootpool/cleanup.ksh \ functional/rootpool/rootpool_002_neg.ksh \ functional/rootpool/rootpool_003_neg.ksh \ functional/rootpool/rootpool_007_pos.ksh \ functional/rootpool/setup.ksh \ functional/rsend/cleanup.ksh \ functional/rsend/recv_dedup_encrypted_zvol.ksh \ functional/rsend/recv_dedup.ksh \ functional/rsend/rsend_001_pos.ksh \ functional/rsend/rsend_002_pos.ksh \ functional/rsend/rsend_003_pos.ksh \ functional/rsend/rsend_004_pos.ksh \ functional/rsend/rsend_005_pos.ksh \ functional/rsend/rsend_006_pos.ksh \ functional/rsend/rsend_007_pos.ksh \ functional/rsend/rsend_008_pos.ksh \ functional/rsend/rsend_009_pos.ksh \ functional/rsend/rsend_010_pos.ksh \ functional/rsend/rsend_011_pos.ksh \ functional/rsend/rsend_012_pos.ksh \ functional/rsend/rsend_013_pos.ksh \ functional/rsend/rsend_014_pos.ksh \ functional/rsend/rsend_016_neg.ksh \ functional/rsend/rsend_019_pos.ksh \ functional/rsend/rsend_020_pos.ksh \ functional/rsend/rsend_021_pos.ksh \ functional/rsend/rsend_022_pos.ksh \ functional/rsend/rsend_024_pos.ksh \ functional/rsend/rsend_025_pos.ksh \ functional/rsend/rsend_026_neg.ksh \ functional/rsend/rsend_027_pos.ksh \ functional/rsend/rsend_028_neg.ksh \ functional/rsend/rsend_029_neg.ksh \ functional/rsend/rsend_030_pos.ksh \ functional/rsend/rsend_031_pos.ksh \ functional/rsend/send-c_embedded_blocks.ksh \ functional/rsend/send-c_incremental.ksh \ functional/rsend/send-c_longname.ksh \ functional/rsend/send-c_lz4_disabled.ksh \ functional/rsend/send-c_mixed_compression.ksh \ functional/rsend/send-c_props.ksh \ functional/rsend/send-c_recv_dedup.ksh \ functional/rsend/send-c_recv_lz4_disabled.ksh \ functional/rsend/send-c_resume.ksh \ functional/rsend/send-c_stream_size_estimate.ksh \ functional/rsend/send-c_verify_contents.ksh \ functional/rsend/send-c_verify_ratio.ksh \ functional/rsend/send-c_volume.ksh \ functional/rsend/send-c_zstream_recompress.ksh \ functional/rsend/send-c_zstreamdump.ksh \ functional/rsend/send-cpL_varied_recsize.ksh \ functional/rsend/send_doall.ksh \ functional/rsend/send_encrypted_incremental.ksh \ functional/rsend/send_encrypted_files.ksh \ functional/rsend/send_encrypted_freeobjects.ksh \ functional/rsend/send_encrypted_hierarchy.ksh \ functional/rsend/send_encrypted_props.ksh \ functional/rsend/send_encrypted_truncated_files.ksh \ functional/rsend/send_freeobjects.ksh \ functional/rsend/send_holds.ksh \ functional/rsend/send_hole_birth.ksh \ functional/rsend/send_invalid.ksh \ functional/rsend/send-L_toggle.ksh \ functional/rsend/send_mixed_raw.ksh \ functional/rsend/send_partial_dataset.ksh \ functional/rsend/send_raw_ashift.ksh \ functional/rsend/send_raw_spill_block.ksh \ functional/rsend/send_raw_large_blocks.ksh \ functional/rsend/send_realloc_dnode_size.ksh \ functional/rsend/send_realloc_encrypted_files.ksh \ functional/rsend/send_realloc_files.ksh \ functional/rsend/send_spill_block.ksh \ functional/rsend/send-wR_encrypted_zvol.ksh \ functional/rsend/setup.ksh \ functional/scrub_mirror/cleanup.ksh \ functional/scrub_mirror/scrub_mirror_001_pos.ksh \ functional/scrub_mirror/scrub_mirror_002_pos.ksh \ functional/scrub_mirror/scrub_mirror_003_pos.ksh \ functional/scrub_mirror/scrub_mirror_004_pos.ksh \ functional/scrub_mirror/setup.ksh \ functional/slog/cleanup.ksh \ functional/slog/setup.ksh \ functional/slog/slog_001_pos.ksh \ functional/slog/slog_002_pos.ksh \ functional/slog/slog_003_pos.ksh \ functional/slog/slog_004_pos.ksh \ functional/slog/slog_005_pos.ksh \ functional/slog/slog_006_pos.ksh \ functional/slog/slog_007_pos.ksh \ functional/slog/slog_008_neg.ksh \ functional/slog/slog_009_neg.ksh \ functional/slog/slog_010_neg.ksh \ functional/slog/slog_011_neg.ksh \ functional/slog/slog_012_neg.ksh \ functional/slog/slog_013_pos.ksh \ functional/slog/slog_014_pos.ksh \ functional/slog/slog_015_neg.ksh \ functional/slog/slog_016_pos.ksh \ functional/slog/slog_replay_fs_001.ksh \ functional/slog/slog_replay_fs_002.ksh \ functional/slog/slog_replay_volume.ksh \ functional/snapshot/cleanup.ksh \ functional/snapshot/clone_001_pos.ksh \ functional/snapshot/rollback_001_pos.ksh \ functional/snapshot/rollback_002_pos.ksh \ functional/snapshot/rollback_003_pos.ksh \ functional/snapshot/setup.ksh \ functional/snapshot/snapshot_001_pos.ksh \ functional/snapshot/snapshot_002_pos.ksh \ functional/snapshot/snapshot_003_pos.ksh \ functional/snapshot/snapshot_004_pos.ksh \ functional/snapshot/snapshot_005_pos.ksh \ functional/snapshot/snapshot_006_pos.ksh \ functional/snapshot/snapshot_007_pos.ksh \ functional/snapshot/snapshot_008_pos.ksh \ functional/snapshot/snapshot_009_pos.ksh \ functional/snapshot/snapshot_010_pos.ksh \ functional/snapshot/snapshot_011_pos.ksh \ functional/snapshot/snapshot_012_pos.ksh \ functional/snapshot/snapshot_013_pos.ksh \ functional/snapshot/snapshot_014_pos.ksh \ functional/snapshot/snapshot_015_pos.ksh \ functional/snapshot/snapshot_016_pos.ksh \ functional/snapshot/snapshot_017_pos.ksh \ functional/snapshot/snapshot_018_pos.ksh \ functional/snapused/cleanup.ksh \ functional/snapused/setup.ksh \ functional/snapused/snapused_001_pos.ksh \ functional/snapused/snapused_002_pos.ksh \ functional/snapused/snapused_003_pos.ksh \ functional/snapused/snapused_004_pos.ksh \ functional/snapused/snapused_005_pos.ksh \ functional/sparse/cleanup.ksh \ functional/sparse/setup.ksh \ functional/sparse/sparse_001_pos.ksh \ functional/stat/cleanup.ksh \ functional/stat/setup.ksh \ functional/stat/stat_001_pos.ksh \ functional/stat/statx_dioalign.ksh \ functional/suid/cleanup.ksh \ functional/suid/setup.ksh \ functional/suid/suid_write_to_none.ksh \ functional/suid/suid_write_to_sgid.ksh \ functional/suid/suid_write_to_suid.ksh \ functional/suid/suid_write_to_suid_sgid.ksh \ functional/suid/suid_write_zil_replay.ksh \ functional/trim/autotrim_config.ksh \ functional/trim/autotrim_integrity.ksh \ functional/trim/autotrim_trim_integrity.ksh \ functional/trim/cleanup.ksh \ functional/trim/setup.ksh \ functional/trim/trim_config.ksh \ functional/trim/trim_integrity.ksh \ functional/trim/trim_l2arc.ksh \ functional/truncate/cleanup.ksh \ functional/truncate/setup.ksh \ functional/truncate/truncate_001_pos.ksh \ functional/truncate/truncate_002_pos.ksh \ functional/truncate/truncate_timestamps.ksh \ functional/upgrade/cleanup.ksh \ functional/upgrade/setup.ksh \ functional/upgrade/upgrade_projectquota_001_pos.ksh \ functional/upgrade/upgrade_projectquota_002_pos.ksh \ functional/upgrade/upgrade_readonly_pool.ksh \ functional/upgrade/upgrade_userobj_001_pos.ksh \ functional/user_namespace/cleanup.ksh \ functional/user_namespace/setup.ksh \ functional/user_namespace/user_namespace_001.ksh \ functional/user_namespace/user_namespace_002.ksh \ functional/user_namespace/user_namespace_003.ksh \ functional/user_namespace/user_namespace_004.ksh \ functional/userquota/cleanup.ksh \ functional/userquota/groupspace_001_pos.ksh \ functional/userquota/groupspace_002_pos.ksh \ functional/userquota/groupspace_003_pos.ksh \ functional/userquota/groupspace_004_pos.ksh \ functional/userquota/setup.ksh \ functional/userquota/defaultuserquota_001_pos.ksh \ functional/userquota/defaultuserquota_002_pos.ksh \ functional/userquota/defaultuserquota_003_pos.ksh \ functional/userquota/defaultuserquota_004_neg.ksh \ functional/userquota/defaultuserquota_005_pos.ksh \ functional/userquota/defaultuserquota_006_pos.ksh \ functional/userquota/defaultuserquota_007_pos.ksh \ functional/userquota/defaultuserquota_008_pos.ksh \ functional/userquota/defaultuserquota_009_pos.ksh \ functional/userquota/defaultuserquota_010_neg.ksh \ functional/userquota/defaultuserquota_011_neg.ksh \ functional/userquota/defaultuserquota_012_neg.ksh \ functional/userquota/defaultuserquota_013_neg.ksh \ functional/userquota/userquota_001_pos.ksh \ functional/userquota/userquota_002_pos.ksh \ functional/userquota/userquota_003_pos.ksh \ functional/userquota/userquota_004_pos.ksh \ functional/userquota/userquota_005_neg.ksh \ functional/userquota/userquota_006_pos.ksh \ functional/userquota/userquota_007_pos.ksh \ functional/userquota/userquota_008_pos.ksh \ functional/userquota/userquota_009_pos.ksh \ functional/userquota/userquota_010_pos.ksh \ functional/userquota/userquota_011_pos.ksh \ functional/userquota/userquota_012_neg.ksh \ functional/userquota/userquota_013_pos.ksh \ functional/userquota/userspace_001_pos.ksh \ functional/userquota/userspace_002_pos.ksh \ functional/userquota/userspace_003_pos.ksh \ functional/userquota/userspace_004_pos.ksh \ functional/userquota/userspace_encrypted.ksh \ functional/userquota/userspace_send_encrypted.ksh \ functional/userquota/userspace_encrypted_13709.ksh \ functional/vdev_zaps/cleanup.ksh \ functional/vdev_zaps/setup.ksh \ functional/vdev_zaps/vdev_zaps_001_pos.ksh \ functional/vdev_zaps/vdev_zaps_002_pos.ksh \ functional/vdev_zaps/vdev_zaps_003_pos.ksh \ functional/vdev_zaps/vdev_zaps_004_pos.ksh \ functional/vdev_zaps/vdev_zaps_005_pos.ksh \ functional/vdev_zaps/vdev_zaps_006_pos.ksh \ functional/vdev_zaps/vdev_zaps_007_pos.ksh \ functional/write_dirs/cleanup.ksh \ functional/write_dirs/setup.ksh \ functional/write_dirs/write_dirs_001_pos.ksh \ functional/write_dirs/write_dirs_002_pos.ksh \ functional/xattr/cleanup.ksh \ functional/xattr/setup.ksh \ functional/xattr/xattr_001_pos.ksh \ functional/xattr/xattr_002_neg.ksh \ functional/xattr/xattr_003_neg.ksh \ functional/xattr/xattr_004_pos.ksh \ functional/xattr/xattr_005_pos.ksh \ functional/xattr/xattr_006_pos.ksh \ functional/xattr/xattr_007_neg.ksh \ functional/xattr/xattr_008_pos.ksh \ functional/xattr/xattr_009_neg.ksh \ functional/xattr/xattr_010_neg.ksh \ functional/xattr/xattr_011_pos.ksh \ functional/xattr/xattr_012_pos.ksh \ functional/xattr/xattr_013_pos.ksh \ functional/xattr/xattr_compat.ksh \ functional/zap_shrink/cleanup.ksh \ functional/zap_shrink/zap_shrink_001_pos.ksh \ functional/zap_shrink/setup.ksh \ functional/zpool_influxdb/cleanup.ksh \ functional/zpool_influxdb/setup.ksh \ functional/zpool_influxdb/zpool_influxdb.ksh \ functional/zvol/zvol_cli/cleanup.ksh \ functional/zvol/zvol_cli/setup.ksh \ functional/zvol/zvol_cli/zvol_cli_001_pos.ksh \ functional/zvol/zvol_cli/zvol_cli_002_pos.ksh \ functional/zvol/zvol_cli/zvol_cli_003_neg.ksh \ functional/zvol/zvol_ENOSPC/cleanup.ksh \ functional/zvol/zvol_ENOSPC/setup.ksh \ functional/zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos.ksh \ functional/zvol/zvol_misc/cleanup.ksh \ functional/zvol/zvol_misc/setup.ksh \ functional/zvol/zvol_misc/zvol_misc_001_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_002_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_003_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_004_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_005_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_006_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_fua.ksh \ functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh \ functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh \ functional/zvol/zvol_misc/zvol_misc_snapdev.ksh \ functional/zvol/zvol_misc/zvol_misc_trim.ksh \ functional/zvol/zvol_misc/zvol_misc_volmode.ksh \ functional/zvol/zvol_misc/zvol_misc_zil.ksh \ functional/zvol/zvol_stress/cleanup.ksh \ functional/zvol/zvol_stress/setup.ksh \ functional/zvol/zvol_stress/zvol_stress.ksh \ functional/zvol/zvol_swap/cleanup.ksh \ functional/zvol/zvol_swap/setup.ksh \ functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_002_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_003_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_004_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_005_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_006_pos.ksh \ functional/idmap_mount/cleanup.ksh \ functional/idmap_mount/setup.ksh \ functional/idmap_mount/idmap_mount_001.ksh \ functional/idmap_mount/idmap_mount_002.ksh \ functional/idmap_mount/idmap_mount_003.ksh \ functional/idmap_mount/idmap_mount_004.ksh \ functional/idmap_mount/idmap_mount_005.ksh diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh new file mode 100755 index 000000000000..3601f5422250 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh @@ -0,0 +1,59 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that gang block functionality behaves correctly. +# +# Strategy: +# 1. Create a pool without dynamic gang headers. +# 2. Set metaslab_force_ganging to force gang blocks to be created. +# 3. Verify that gang blocks can be read, written, and freed. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Gang blocks behave correctly." + +preamble +log_onexit cleanup + +log_must zpool create -f $TESTPOOL $DISKS +log_must zfs create -o recordsize=128k $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 100000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=128k count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 200 | grep -v hole | wc -l) +[[ "$leaves" -gt 1 ]] || log_fail "Only one leaf in gang block, should not be possible" + +orig_checksum="$(cat $path | xxh128digest)" + +log_must verify_pool $TESTPOOL +log_must zinject -a +new_checksum="$(cat $path | xxh128digest)" +[[ "$orig_checksum" == "$new_checksum" ]] || log_fail "Checksum mismatch" + +log_must rm $path +log_must verify_pool $TESTPOOL + +log_pass "Gang blocks behave correctly."