diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index dd521257ccb2..5e8f282e96c3 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1,9905 +1,9906 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. * Copyright (c) 2015, 2017, Intel Corporation. * Copyright (c) 2020 Datto Inc. * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome * Copyright (c) 2023, 2024, Klara Inc. 
* Copyright (c) 2023, Rob Norris */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdb.h" extern int reference_tracking_enable; extern int zfs_recover; extern uint_t zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern boolean_t spa_mode_readable_spacemaps; extern uint_t zfs_reconstruct_indirect_combinations_max; extern uint_t zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); static uint64_t *zopt_metaslab = NULL; static unsigned zopt_metaslab_args = 0; static zopt_object_range_t *zopt_object_ranges = NULL; static unsigned zopt_object_args = 0; static int flagbits[256]; static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ static int leaked_objects = 0; static zfs_range_tree_t *mos_refd_objs; static spa_t *spa; static objset_t *os; static boolean_t kernel_init_done; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, boolean_t); static void mos_obj_refd(uint64_t); static void mos_obj_refd_multiple(uint64_t); static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx); static void zdb_print_blkptr(const blkptr_t *bp, int flags); static void zdb_exit(int reason); typedef struct sublivelist_verify_block_refcnt { /* block pointer entry in livelist being verified */ blkptr_t svbr_blk; /* * Refcount gets incremented to 1 when we encounter the first * FREE entry for the svfbr block pointer and a node for it * is created in our ZDB 
verification/tracking metadata. * * As we encounter more FREE entries we increment this counter * and similarly decrement it whenever we find the respective * ALLOC entries for this block. * * When the refcount gets to 0 it means that all the FREE and * ALLOC entries of this block have paired up and we no longer * need to track it in our verification logic (e.g. the node * containing this struct in our verification data structure * should be freed). * * [refer to sublivelist_verify_blkptr() for the actual code] */ uint32_t svbr_refcnt; } sublivelist_verify_block_refcnt_t; static int sublivelist_block_refcnt_compare(const void *larg, const void *rarg) { const sublivelist_verify_block_refcnt_t *l = larg; const sublivelist_verify_block_refcnt_t *r = rarg; return (livelist_compare(&l->svbr_blk, &r->svbr_blk)); } static int sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx) { ASSERT3P(tx, ==, NULL); struct sublivelist_verify *sv = arg; sublivelist_verify_block_refcnt_t current = { .svbr_blk = *bp, /* * Start with 1 in case this is the first free entry. * This field is not used for our B-Tree comparisons * anyway. 
*/ .svbr_refcnt = 1, }; zfs_btree_index_t where; sublivelist_verify_block_refcnt_t *pair = zfs_btree_find(&sv->sv_pair, ¤t, &where); if (free) { if (pair == NULL) { /* first free entry for this block pointer */ zfs_btree_add(&sv->sv_pair, ¤t); } else { pair->svbr_refcnt++; } } else { if (pair == NULL) { /* block that is currently marked as allocated */ for (int i = 0; i < SPA_DVAS_PER_BP; i++) { if (DVA_IS_EMPTY(&bp->blk_dva[i])) break; sublivelist_verify_block_t svb = { .svb_dva = bp->blk_dva[i], .svb_allocated_txg = BP_GET_LOGICAL_BIRTH(bp) }; if (zfs_btree_find(&sv->sv_leftover, &svb, &where) == NULL) { zfs_btree_add_idx(&sv->sv_leftover, &svb, &where); } } } else { /* alloc matches a free entry */ pair->svbr_refcnt--; if (pair->svbr_refcnt == 0) { /* all allocs and frees have been matched */ zfs_btree_remove_idx(&sv->sv_pair, &where); } } } return (0); } static int sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) { int err; struct sublivelist_verify *sv = args; zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL, sizeof (sublivelist_verify_block_refcnt_t)); err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, sv, NULL); sublivelist_verify_block_refcnt_t *e; zfs_btree_index_t *cookie = NULL; while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &e->svbr_blk, B_TRUE); (void) printf("\tERROR: %d unmatched FREE(s): %s\n", e->svbr_refcnt, blkbuf); } zfs_btree_destroy(&sv->sv_pair); return (err); } static int livelist_block_compare(const void *larg, const void *rarg) { const sublivelist_verify_block_t *l = larg; const sublivelist_verify_block_t *r = rarg; if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva)) return (-1); else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva)) return (+1); if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva)) return (-1); else if (DVA_GET_OFFSET(&l->svb_dva) > 
DVA_GET_OFFSET(&r->svb_dva)) return (+1); if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva)) return (-1); else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva)) return (+1); return (0); } /* * Check for errors in a livelist while tracking all unfreed ALLOCs in the * sublivelist_verify_t: sv->sv_leftover */ static void livelist_verify(dsl_deadlist_t *dl, void *arg) { sublivelist_verify_t *sv = arg; dsl_deadlist_iterate(dl, sublivelist_verify_func, sv); } /* * Check for errors in the livelist entry and discard the intermediary * data structures */ static int sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) { (void) args; sublivelist_verify_t sv; zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); int err = sublivelist_verify_func(&sv, dle); zfs_btree_clear(&sv.sv_leftover); zfs_btree_destroy(&sv.sv_leftover); return (err); } typedef struct metaslab_verify { /* * Tree containing all the leftover ALLOCs from the livelists * that are part of this metaslab. */ zfs_btree_t mv_livelist_allocs; /* * Metaslab information. */ uint64_t mv_vdid; uint64_t mv_msid; uint64_t mv_start; uint64_t mv_end; /* * What's currently allocated for this metaslab. 
*/ zfs_range_tree_t *mv_allocated; } metaslab_verify_t; typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg); typedef struct unflushed_iter_cb_arg { spa_t *uic_spa; uint64_t uic_txg; void *uic_arg; zdb_log_sm_cb_t uic_cb; } unflushed_iter_cb_arg_t; static int iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) { unflushed_iter_cb_arg_t *uic = arg; return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); } static void iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { space_map_t *sm = NULL; VERIFY0(space_map_open(&sm, spa_meta_objset(spa), sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); unflushed_iter_cb_arg_t uic = { .uic_spa = spa, .uic_txg = sls->sls_txg, .uic_arg = arg, .uic_cb = cb }; VERIFY0(space_map_iterate(sm, space_map_length(sm), iterate_through_spacemap_logs_cb, &uic)); space_map_close(sm); } spa_config_exit(spa, SCL_CONFIG, FTAG); } static void verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, uint64_t offset, uint64_t size) { sublivelist_verify_block_t svb = {{{0}}}; DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid); DVA_SET_OFFSET(&svb.svb_dva, offset); DVA_SET_ASIZE(&svb.svb_dva, size); zfs_btree_index_t where; uint64_t end_offset = offset + size; /* * Look for an exact match for spacemap entry in the livelist entries. 
* Then, look for other livelist entries that fall within the range * of the spacemap entry as it may have been condensed */ sublivelist_verify_block_t *found = zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where); if (found == NULL) { found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where); } for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid && DVA_GET_OFFSET(&found->svb_dva) < end_offset; found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { if (found->svb_allocated_txg <= txg) { (void) printf("ERROR: Livelist ALLOC [%llx:%llx] " "from TXG %llx FREED at TXG %llx\n", (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva), (u_longlong_t)found->svb_allocated_txg, (u_longlong_t)txg); } } } static int metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) { metaslab_verify_t *mv = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; uint64_t txg = sme->sme_txg; if (sme->sme_type == SM_ALLOC) { if (zfs_range_tree_contains(mv->mv_allocated, offset, size)) { (void) printf("ERROR: DOUBLE ALLOC: " "%llu [%llx:%llx] " "%llu:%llu LOG_SM\n", (u_longlong_t)txg, (u_longlong_t)offset, (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); } else { zfs_range_tree_add(mv->mv_allocated, offset, size); } } else { if (!zfs_range_tree_contains(mv->mv_allocated, offset, size)) { (void) printf("ERROR: DOUBLE FREE: " "%llu [%llx:%llx] " "%llu:%llu LOG_SM\n", (u_longlong_t)txg, (u_longlong_t)offset, (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); } else { zfs_range_tree_remove(mv->mv_allocated, offset, size); } } if (sme->sme_type != SM_ALLOC) { /* * If something is freed in the spacemap, verify that * it is not listed as allocated in the livelist. 
*/ verify_livelist_allocs(mv, txg, offset, size); } return (0); } static int spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { metaslab_verify_t *mv = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = vdev_lookup_top(spa, vdev_id); /* skip indirect vdevs */ if (!vdev_is_concrete(vd)) return (0); if (vdev_id != mv->mv_vdid) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; if (ms->ms_id != mv->mv_msid) return (0); if (txg < metaslab_unflushed_txg(ms)) return (0); ASSERT3U(txg, ==, sme->sme_txg); return (metaslab_spacemap_validation_cb(sme, mv)); } static void spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv) { iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv); } static void spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv) { if (sm == NULL) return; VERIFY0(space_map_iterate(sm, space_map_length(sm), metaslab_spacemap_validation_cb, mv)); } static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg); /* * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if * they are part of that metaslab (mv_msid). 
*/
static void
mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
{
	zfs_btree_index_t idx;
	sublivelist_verify_block_t *blk;

	ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);

	/*
	 * Pass 1: copy into mv_livelist_allocs every leftover ALLOC on this
	 * vdev that lies entirely within [mv_start, mv_end). Blocks that
	 * straddle either edge are reported and left where they are.
	 */
	blk = zfs_btree_first(&sv->sv_leftover, &idx);
	for (; blk != NULL;
	    blk = zfs_btree_next(&sv->sv_leftover, &idx, &idx)) {
		const uint64_t vdev = DVA_GET_VDEV(&blk->svb_dva);
		const uint64_t off = DVA_GET_OFFSET(&blk->svb_dva);
		const uint64_t asize = DVA_GET_ASIZE(&blk->svb_dva);

		if (vdev != mv->mv_vdid)
			continue;

		if (off < mv->mv_start) {
			/* starts before us; complain only if it reaches in */
			if (off + asize > mv->mv_start) {
				(void) printf("ERROR: Found block that crosses "
				    "metaslab boundary: <%llu:%llx:%llx>\n",
				    (u_longlong_t)vdev,
				    (u_longlong_t)off,
				    (u_longlong_t)asize);
			}
			continue;
		}

		if (off >= mv->mv_end)
			continue;

		if (off + asize > mv->mv_end) {
			(void) printf("ERROR: Found block that crosses "
			    "metaslab boundary: <%llu:%llx:%llx>\n",
			    (u_longlong_t)vdev,
			    (u_longlong_t)off,
			    (u_longlong_t)asize);
			continue;
		}

		zfs_btree_add(&mv->mv_livelist_allocs, blk);
	}

	/*
	 * Pass 2: everything that was copied above is now owned by the
	 * metaslab tree, so drop it from the leftover tree.
	 */
	blk = zfs_btree_first(&mv->mv_livelist_allocs, &idx);
	while (blk != NULL) {
		zfs_btree_remove(&sv->sv_leftover, blk);
		blk = zfs_btree_next(&mv->mv_livelist_allocs, &idx, &idx);
	}
}

/*
 * [Livelist Check]
 * Iterate through all the sublivelists and:
 * - report leftover frees (**)
 * - record leftover ALLOCs together with their TXG [see Cross Check]
 *
 * (**) Note: Double ALLOCs are valid in datasets that have dedup
 *      enabled. Similarly double FREEs are allowed as well but
 *      only if they pair up with a corresponding ALLOC entry once
 *      we are done with our sublivelist iteration.
* * [Spacemap Check] * for each metaslab: * - iterate over spacemap and then the metaslab's entries in the * spacemap log, then report any double FREEs and ALLOCs (do not * blow up). * * [Cross Check] * After finishing the Livelist Check phase and while being in the * Spacemap Check phase, we find all the recorded leftover ALLOCs * of the livelist check that are part of the metaslab that we are * currently looking at in the Spacemap Check. We report any entries * that are marked as ALLOCs in the livelists but have been actually * freed (and potentially allocated again) after their TXG stamp in * the spacemaps. Also report any ALLOCs from the livelists that * belong to indirect vdevs (e.g. their vdev completed removal). * * Note that this will miss Log Spacemap entries that cancelled each other * out before being flushed to the metaslab, so we are not guaranteed * to match all erroneous ALLOCs. */ static void livelist_metaslab_validate(spa_t *spa) { (void) printf("Verifying deleted livelist entries\n"); sublivelist_verify_t sv; zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); iterate_deleted_livelists(spa, livelist_verify, &sv); (void) printf("Verifying metaslab entries\n"); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (!vdev_is_concrete(vd)) continue; for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) { metaslab_t *m = vd->vdev_ms[mid]; (void) fprintf(stderr, "\rverifying concrete vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)mid, (longlong_t)vd->vdev_ms_count); uint64_t shift, start; zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(vd, m, &start, &shift); metaslab_verify_t mv; mv.mv_allocated = zfs_range_tree_create(NULL, type, NULL, start, shift); mv.mv_vdid = vd->vdev_id; mv.mv_msid = m->ms_id; mv.mv_start = m->ms_start; mv.mv_end = m->ms_start + m->ms_size; 
zfs_btree_create(&mv.mv_livelist_allocs, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); mv_populate_livelist_allocs(&mv, &sv); spacemap_check_ms_sm(m->ms_sm, &mv); spacemap_check_sm_log(spa, &mv); zfs_range_tree_vacate(mv.mv_allocated, NULL, NULL); zfs_range_tree_destroy(mv.mv_allocated); zfs_btree_clear(&mv.mv_livelist_allocs); zfs_btree_destroy(&mv.mv_livelist_allocs); } } (void) fprintf(stderr, "\n"); /* * If there are any segments in the leftover tree after we walked * through all the metaslabs in the concrete vdevs then this means * that we have segments in the livelists that belong to indirect * vdevs and are marked as allocated. */ if (zfs_btree_numnodes(&sv.sv_leftover) == 0) { zfs_btree_destroy(&sv.sv_leftover); return; } (void) printf("ERROR: Found livelist blocks marked as allocated " "for indirect vdevs:\n"); zfs_btree_index_t *where = NULL; sublivelist_verify_block_t *svb; while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) != NULL) { int vdev_id = DVA_GET_VDEV(&svb->svb_dva); ASSERT3U(vdev_id, <, rvd->vdev_children); vdev_t *vd = rvd->vdev_child[vdev_id]; ASSERT(!vdev_is_concrete(vd)); (void) printf("<%d:%llx:%llx> TXG %llx\n", vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva), (u_longlong_t)svb->svb_allocated_txg); } (void) printf("\n"); zfs_btree_destroy(&sv.sv_leftover); } /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void usage(void) { (void) fprintf(stderr, "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p ...]] " "[-I ]\n" "\t\t[-o =]... [-t ] [-U ] [-x ]\n" "\t\t[-K ]\n" "\t\t[[/] [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] [-K ]\n" "\t\t[[/] [ ...]\n" "\t%s -B [-e [-V] [-p ...]] [-I ]\n" "\t\t[-o =]... 
[-t ] [-U ] [-x ]\n" "\t\t[-K ] / []\n" "\t%s [-v] \n" "\t%s -C [-A] [-U ] []\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" "\t%s -O [-K ] \n" "\t%s -r [-K ] \n" "\t%s -R [-A] [-e [-V] [-p ...]] [-U ]\n" "\t\t ::[:]\n" "\t%s -E [-A] word0:word1:...:word15\n" "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); (void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n"); (void) fprintf(stderr, " If object numbers or object number " "ranges are specified, only those\n" " objects or ranges are dumped.\n\n"); (void) fprintf(stderr, " Object ranges take the form :[:]\n" " start Starting object number\n" " end Ending object number, or -1 for no upper bound\n" " flags Optional flags to select object types:\n" " A All objects (this is the default)\n" " d ZFS directories\n" " f ZFS files \n" " m SPA space maps\n" " z ZAPs\n" " - Negate effect of next flag\n\n"); (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b --block-stats " "block statistics\n"); (void) fprintf(stderr, " -B --backup " "backup stream\n"); (void) fprintf(stderr, " -c --checksum " "checksum all metadata (twice for all data) blocks\n"); (void) fprintf(stderr, " -C --config " "config (or cachefile if alone)\n"); (void) fprintf(stderr, " -d --datasets " "dataset(s)\n"); (void) fprintf(stderr, " -D --dedup-stats " "dedup statistics\n"); (void) fprintf(stderr, " -E --embedded-block-pointer=INTEGER\n" " decode and display block " "from an embedded block pointer\n"); (void) fprintf(stderr, " -h --history " "pool history\n"); (void) fprintf(stderr, " -i --intent-logs " "intent logs\n"); (void) fprintf(stderr, " -l --label " "read label contents\n"); (void) fprintf(stderr, " -k 
--checkpointed-state " "examine the checkpointed state of the pool\n"); (void) fprintf(stderr, " -L --disable-leak-tracking " "disable leak tracking (do not load spacemaps)\n"); (void) fprintf(stderr, " -m --metaslabs " "metaslabs\n"); (void) fprintf(stderr, " -M --metaslab-groups " "metaslab groups\n"); (void) fprintf(stderr, " -O --object-lookups " "perform object lookups by path\n"); (void) fprintf(stderr, " -r --copy-object " "copy an object by path to file\n"); (void) fprintf(stderr, " -R --read-block " "read and display block from a device\n"); (void) fprintf(stderr, " -s --io-stats " "report stats on zdb's I/O\n"); (void) fprintf(stderr, " -S --simulate-dedup " "simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v --verbose " "verbose (applies to all others)\n"); (void) fprintf(stderr, " -y --livelist " "perform livelist and metaslab validation on any livelists being " "deleted\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A --ignore-assertions " "ignore assertions (-A), enable panic recovery (-AA) or both " "(-AAA)\n"); (void) fprintf(stderr, " -e --exported " "pool is exported/destroyed/has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -F --automatic-rewind " "attempt automatic rewind within safe range of transaction " "groups\n"); (void) fprintf(stderr, " -G --dump-debug-msg " "dump zfs_dbgmsg buffer before exiting\n"); (void) fprintf(stderr, " -I --inflight=INTEGER " "specify the maximum number of checksumming I/Os " "[default is 200]\n"); (void) fprintf(stderr, " -K --key=KEY " "decryption key for encrypted dataset\n"); (void) fprintf(stderr, " -o --option=\"OPTION=INTEGER\" " "set global variable to an unsigned 32-bit integer\n"); (void) fprintf(stderr, " -p --path==PATH " "use one or more with -e to specify path to vdev dir\n"); (void) fprintf(stderr, " -P --parseable " "print numbers in parseable form\n"); (void) fprintf(stderr, " -q --skip-label " 
"don't print label contents\n"); (void) fprintf(stderr, " -t --txg=INTEGER " "highest txg to use when searching for uberblocks\n"); (void) fprintf(stderr, " -T --brt-stats " "BRT statistics\n"); (void) fprintf(stderr, " -u --uberblock " "uberblock\n"); (void) fprintf(stderr, " -U --cachefile=PATH " "use alternate cachefile\n"); (void) fprintf(stderr, " -V --verbatim " "do verbatim import\n"); (void) fprintf(stderr, " -x --dump-blocks=PATH " "dump all read blocks into specified directory\n"); (void) fprintf(stderr, " -X --extreme-rewind " "attempt extreme rewind (does not work with dataset)\n"); (void) fprintf(stderr, " -Y --all-reconstruction " "attempt all reconstruction combinations for split blocks\n"); (void) fprintf(stderr, " -Z --zstd-headers " "show ZSTD headers \n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); zdb_exit(1); } static void dump_debug_buffer(void) { ssize_t ret __attribute__((unused)); if (!dump_opt['G']) return; /* * We use write() instead of printf() so that this function * is safe to call from a signal handler. */ ret = write(STDERR_FILENO, "\n", 1); zfs_dbgmsg_print(STDERR_FILENO, "zdb"); } static void sig_handler(int signo) { struct sigaction action; libspl_backtrace(STDERR_FILENO); dump_debug_buffer(); /* * Restore default action and re-raise signal so SIGSEGV and * SIGABRT can trigger a core dump. */ action.sa_handler = SIG_DFL; sigemptyset(&action.sa_mask); action.sa_flags = 0; (void) sigaction(signo, &action, NULL); raise(signo); } /* * Called for usage errors that are discovered after a call to spa_open(), * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. */ static void fatal(const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); (void) fprintf(stderr, "%s: ", cmdname); (void) vfprintf(stderr, fmt, ap); va_end(ap); (void) fprintf(stderr, "\n"); dump_debug_buffer(); zdb_exit(1); } static void dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) { (void) size; nvlist_t *nv; size_t nvsize = *(uint64_t *)data; char *packed = umem_alloc(nvsize, UMEM_NOFAIL); VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); umem_free(packed, nvsize); dump_nvlist(nv, 8); nvlist_free(nv); } static void dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) size; spa_history_phys_t *shp = data; if (shp == NULL) return; (void) printf("\t\tpool_create_len = %llu\n", (u_longlong_t)shp->sh_pool_create_len); (void) printf("\t\tphys_max_off = %llu\n", (u_longlong_t)shp->sh_phys_max_off); (void) printf("\t\tbof = %llu\n", (u_longlong_t)shp->sh_bof); (void) printf("\t\teof = %llu\n", (u_longlong_t)shp->sh_eof); (void) printf("\t\trecords_lost = %llu\n", (u_longlong_t)shp->sh_records_lost); } static void zdb_nicenum(uint64_t num, char *buf, size_t buflen) { if (dump_opt['P']) (void) snprintf(buf, buflen, "%llu", (longlong_t)num); else nicenum(num, buf, buflen); } static void zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen) { if (dump_opt['P']) (void) snprintf(buf, buflen, "%llu", (longlong_t)bytes); else zfs_nicebytes(bytes, buf, buflen); } static const char histo_stars[] = "****************************************"; static const uint64_t histo_width = sizeof (histo_stars) - 1; static void dump_histogram(const uint64_t *histo, int size, int offset) { int i; int minidx = size - 1; int maxidx = 0; uint64_t max = 0; for (i = 0; i < size; i++) { if (histo[i] == 0) continue; if (histo[i] > max) max = histo[i]; if (i > maxidx) maxidx = i; if (i < minidx) minidx = i; } if (max < histo_width) max = histo_width; for (i = minidx; i <= maxidx; 
i++) { (void) printf("\t\t\t%3u: %6llu %s\n", i + offset, (u_longlong_t)histo[i], &histo_stars[(max - histo[i]) * histo_width / max]); } } static void dump_zap_stats(objset_t *os, uint64_t object) { int error; zap_stats_t zs; error = zap_get_stats(os, object, &zs); if (error) return; if (zs.zs_ptrtbl_len == 0) { ASSERT(zs.zs_num_blocks == 1); (void) printf("\tmicrozap: %llu bytes, %llu entries\n", (u_longlong_t)zs.zs_blocksize, (u_longlong_t)zs.zs_num_entries); return; } (void) printf("\tFat ZAP stats:\n"); (void) printf("\t\tPointer table:\n"); (void) printf("\t\t\t%llu elements\n", (u_longlong_t)zs.zs_ptrtbl_len); (void) printf("\t\t\tzt_blk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_blk); (void) printf("\t\t\tzt_numblks: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_numblks); (void) printf("\t\t\tzt_shift: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_shift); (void) printf("\t\t\tzt_blks_copied: %llu\n", (u_longlong_t)zs.zs_ptrtbl_blks_copied); (void) printf("\t\t\tzt_nextblk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_nextblk); (void) printf("\t\tZAP entries: %llu\n", (u_longlong_t)zs.zs_num_entries); (void) printf("\t\tLeaf blocks: %llu\n", (u_longlong_t)zs.zs_num_leafs); (void) printf("\t\tTotal blocks: %llu\n", (u_longlong_t)zs.zs_num_blocks); (void) printf("\t\tzap_block_type: 0x%llx\n", (u_longlong_t)zs.zs_block_type); (void) printf("\t\tzap_magic: 0x%llx\n", (u_longlong_t)zs.zs_magic); (void) printf("\t\tzap_salt: 0x%llx\n", (u_longlong_t)zs.zs_salt); (void) printf("\t\tLeafs with 2^n pointers:\n"); dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks with n*5 entries:\n"); dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks n/10 full:\n"); dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tEntries with n chunks:\n"); dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBuckets with n entries:\n"); 
dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); } static void dump_none(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static void dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; (void) printf("\tUNKNOWN OBJECT TYPE\n"); } static void dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { uint64_t *arr; uint64_t oursize; if (dump_opt['d'] < 6) return; if (data == NULL) { dmu_object_info_t doi; VERIFY0(dmu_object_info(os, object, &doi)); size = doi.doi_max_offset; /* * We cap the size at 1 mebibyte here to prevent * allocation failures and nigh-infinite printing if the * object is extremely large. */ oursize = MIN(size, 1 << 20); arr = kmem_alloc(oursize, KM_SLEEP); int err = dmu_read(os, object, 0, oursize, arr, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(arr, oursize); return; } } else { /* * Even though the allocation is already done in this code path, * we still cap the size to prevent excessive printing. */ oursize = MIN(size, 1 << 20); arr = data; } if (size == 0) { if (data == NULL) kmem_free(arr, oursize); (void) printf("\t\t[]\n"); return; } (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]); for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) { if (i % 4 != 0) (void) printf(", %0llx", (u_longlong_t)arr[i]); else (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]); } if (oursize != size) (void) printf(", ... 
"); (void) printf("]\n"); if (data == NULL) kmem_free(arr, oursize); } static void dump_zap(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; zap_cursor_t zc; zap_attribute_t *attrp = zap_attribute_long_alloc(); void *prop; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, attrp) == 0; zap_cursor_advance(&zc)) { boolean_t key64 = !!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY); if (key64) (void) printf("\t\t0x%010" PRIu64 "x = ", *(uint64_t *)attrp->za_name); else (void) printf("\t\t%s = ", attrp->za_name); if (attrp->za_num_integers == 0) { (void) printf("\n"); continue; } prop = umem_zalloc(attrp->za_num_integers * attrp->za_integer_length, UMEM_NOFAIL); if (key64) (void) zap_lookup_uint64(os, object, (const uint64_t *)attrp->za_name, 1, attrp->za_integer_length, attrp->za_num_integers, prop); else (void) zap_lookup(os, object, attrp->za_name, attrp->za_integer_length, attrp->za_num_integers, prop); if (attrp->za_integer_length == 1 && !key64) { if (strcmp(attrp->za_name, DSL_CRYPTO_KEY_MASTER_KEY) == 0 || strcmp(attrp->za_name, DSL_CRYPTO_KEY_HMAC_KEY) == 0 || strcmp(attrp->za_name, DSL_CRYPTO_KEY_IV) == 0 || strcmp(attrp->za_name, DSL_CRYPTO_KEY_MAC) == 0 || strcmp(attrp->za_name, DMU_POOL_CHECKSUM_SALT) == 0) { uint8_t *u8 = prop; for (i = 0; i < attrp->za_num_integers; i++) { (void) printf("%02x", u8[i]); } } else { (void) printf("%s", (char *)prop); } } else { for (i = 0; i < attrp->za_num_integers; i++) { switch (attrp->za_integer_length) { case 1: (void) printf("%u ", ((uint8_t *)prop)[i]); break; case 2: (void) printf("%u ", ((uint16_t *)prop)[i]); break; case 4: (void) printf("%u ", ((uint32_t *)prop)[i]); break; case 8: (void) printf("%lld ", (u_longlong_t)((int64_t *)prop)[i]); break; } } } (void) printf("\n"); umem_free(prop, attrp->za_num_integers * attrp->za_integer_length); } zap_cursor_fini(&zc); zap_attribute_free(attrp); } static void 
dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
{
	/*
	 * Object viewer: print the bpobj_phys_t header passed in "data"
	 * (only the fields that fit in "size") and, at -ddddd or higher,
	 * each block pointer stored in the object. Does nothing when "data"
	 * is NULL.
	 */
	bpobj_phys_t *bpop = data;
	uint64_t i;
	char bytes[32], comp[32], uncomp[32];

	/* make sure the output won't get truncated */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");

	if (bpop == NULL)
		return;

	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));

	(void) printf("\t\tnum_blkptrs = %llu\n",
	    (u_longlong_t)bpop->bpo_num_blkptrs);
	(void) printf("\t\tbytes = %s\n", bytes);
	if (size >= BPOBJ_SIZE_V1) {
		(void) printf("\t\tcomp = %s\n", comp);
		(void) printf("\t\tuncomp = %s\n", uncomp);
	}
	if (size >= BPOBJ_SIZE_V2) {
		(void) printf("\t\tsubobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_subobjs);
		(void) printf("\t\tnum_subobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_num_subobjs);
	}
	if (size >= sizeof (*bpop)) {
		(void) printf("\t\tnum_freed = %llu\n",
		    (u_longlong_t)bpop->bpo_num_freed);
	}

	if (dump_opt['d'] < 5)
		return;

	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
		char blkbuf[BP_SPRINTF_LEN];
		blkptr_t bp;

		int err = dmu_read(os, object,
		    i * sizeof (bp), sizeof (bp), &bp, 0);
		if (err != 0) {
			(void) printf("got error %d from dmu_read\n", err);
			break;
		}
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
		    BP_GET_FREE(&bp));
		(void) printf("\t%s\n", blkbuf);
	}
}

/*
 * Object viewer: read the whole object as an array of 64-bit object
 * numbers and print them, one per line, up to and including the last
 * non-zero entry. "data" and "size" are unused.
 */
static void
dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	dmu_object_info_t doi;

	VERIFY0(dmu_object_info(os, object, &doi));
	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);

	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
	if (err != 0) {
		(void) printf("got error %d from dmu_read\n", err);
		kmem_free(subobjs, doi.doi_max_offset);
		return;
	}

	/*
	 * Count in unsigned arithmetic throughout: nprint is one past the
	 * index of the last non-zero entry, or 0 if every entry is zero.
	 */
	const uint64_t nentries = doi.doi_max_offset / sizeof (uint64_t);
	uint64_t nprint = 0;

	for (uint64_t i = 0; i < nentries; i++) {
		if (subobjs[i] != 0)
			nprint = i + 1;
	}

	for (uint64_t i = 0; i < nprint; i++) {
		(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
	}
	kmem_free(subobjs, doi.doi_max_offset);
}

/* Object viewer for DDT ZAPs: statistics only. */
static void
dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	dump_zap_stats(os, object);
	/* contents are printed elsewhere, properly decoded */
}

/*
 * Object viewer: print ZAP statistics, then each entry's name followed by
 * its first integer in hex and that integer decoded with ATTR_LENGTH,
 * ATTR_BSWAP and ATTR_NUM.
 */
static void
dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t *attrp = zap_attribute_alloc();

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, attrp) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attrp->za_name);
		if (attrp->za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		(void) printf(" %llx : [%d:%d:%d]\n",
		    (u_longlong_t)attrp->za_first_integer,
		    (int)ATTR_LENGTH(attrp->za_first_integer),
		    (int)ATTR_BSWAP(attrp->za_first_integer),
		    (int)ATTR_NUM(attrp->za_first_integer));
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(attrp);
}

/*
 * Object viewer: print ZAP statistics, then each entry as
 * "name = [ a  b  ... ]" where the values are the entry's 16-bit
 * integers.
 */
static void
dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t *attrp = zap_attribute_alloc();
	uint16_t *layout_attrs;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, attrp) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = [", attrp->za_name);
		if (attrp->za_num_integers == 0) {
			/* close the bracket opened above */
			(void) printf("]\n");
			continue;
		}

		VERIFY(attrp->za_integer_length == 2);
		layout_attrs = umem_zalloc(attrp->za_num_integers *
		    attrp->za_integer_length, UMEM_NOFAIL);

		VERIFY(zap_lookup(os, object, attrp->za_name,
		    attrp->za_integer_length,
		    attrp->za_num_integers, layout_attrs) == 0);

		for (i = 0; i != attrp->za_num_integers; i++)
			(void) printf(" %d ", (int)layout_attrs[i]);
		(void) printf("]\n");
		umem_free(layout_attrs,
		    attrp->za_num_integers * attrp->za_integer_length);
	}
zap_cursor_fini(&zc); zap_attribute_free(attrp); } static void dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; zap_cursor_t zc; zap_attribute_t *attrp = zap_attribute_long_alloc(); const char *typenames[] = { /* 0 */ "not specified", /* 1 */ "FIFO", /* 2 */ "Character Device", /* 3 */ "3 (invalid)", /* 4 */ "Directory", /* 5 */ "5 (invalid)", /* 6 */ "Block Device", /* 7 */ "7 (invalid)", /* 8 */ "Regular File", /* 9 */ "9 (invalid)", /* 10 */ "Symbolic Link", /* 11 */ "11 (invalid)", /* 12 */ "Socket", /* 13 */ "Door", /* 14 */ "Event Port", /* 15 */ "15 (invalid)", }; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, attrp) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = %lld (type: %s)\n", attrp->za_name, ZFS_DIRENT_OBJ(attrp->za_first_integer), typenames[ZFS_DIRENT_TYPE(attrp->za_first_integer)]); } zap_cursor_fini(&zc); zap_attribute_free(attrp); } static int get_dtl_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_ops->vdev_op_leaf) { space_map_t *sm = vd->vdev_dtl_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) return (1); return (0); } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_dtl_refcount(vd->vdev_child[c]); return (refcount); } static int get_metaslab_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd) { for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { space_map_t *sm = vd->vdev_ms[m]->ms_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) refcount++; } } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_metaslab_refcount(vd->vdev_child[c]); return (refcount); } static int get_obsolete_refcount(vdev_t *vd) { uint64_t obsolete_sm_object; int refcount = 0; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (vd->vdev_top == vd && obsolete_sm_object != 0) { dmu_object_info_t doi; 
/*
 * Print a space map.
 *
 * Always prints the object number, smp_length and smp_alloc of sm (a NULL
 * sm prints nothing).  When dump_opt['d'] >= 6 or dump_opt['m'] >= 4 it
 * additionally reads the space map object from os one 64-bit word at a
 * time and prints every entry: debug entries (or PADDING when their txg
 * is 0), and single-word or two-word ALLOC/FREE entries with their
 * decoded range, size and vdev.  While walking, it keeps a net total
 * (allocations minus frees) and reports when that total differs from
 * space_map_allocated(sm).
 */
static void
dump_spacemap(objset_t *os, space_map_t *sm)
{
	/* Names for the debug-entry action, indexed by the decoded action. */
	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
	    "INVALID", "INVALID", "INVALID", "INVALID" };

	if (sm == NULL)
		return;

	(void) printf("space map object %llu:\n",
	    (longlong_t)sm->sm_object);
	(void) printf(" smp_length = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_length);
	(void) printf(" smp_alloc = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_alloc);

	/* Per-entry output is only produced at high verbosity. */
	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
		return;

	/*
	 * Print out the freelist entries in both encoded and decoded form.
	 */
	uint8_t mapshift = sm->sm_shift;
	int64_t alloc = 0;	/* net of 'A' runs minus 'F' runs seen so far */
	uint64_t word, entry_id = 0;
	for (uint64_t offset = 0; offset < space_map_length(sm);
	    offset += sizeof (word)) {

		VERIFY0(dmu_read(os, space_map_object(sm), offset,
		    sizeof (word), &word, DMU_READ_PREFETCH));

		/* Debug entries carry a txg/sync pass, not a range. */
		if (sm_entry_is_debug(word)) {
			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
			if (de_txg == 0) {
				(void) printf(
				    "\t [%6llu] PADDING\n",
				    (u_longlong_t)entry_id);
			} else {
				(void) printf(
				    "\t [%6llu] %s: txg %llu pass %llu\n",
				    (u_longlong_t)entry_id,
				    ddata[SM_DEBUG_ACTION_DECODE(word)],
				    (u_longlong_t)de_txg,
				    (u_longlong_t)de_sync_pass);
			}
			entry_id++;
			continue;
		}

		uint8_t words;
		char entry_type;
		/* Single-word entries have no vdev; keep the sentinel. */
		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;

		if (sm_entry_is_single_word(word)) {
			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
			    sm->sm_start;
			entry_run = SM_RUN_DECODE(word) << mapshift;
			words = 1;
		} else {
			/* it is a two-word entry so we read another word */
			ASSERT(sm_entry_is_double_word(word));

			/*
			 * Advance offset here so the loop increment steps
			 * past the second word as well.
			 */
			uint64_t extra_word;
			offset += sizeof (extra_word);
			VERIFY0(dmu_read(os, space_map_object(sm), offset,
			    sizeof (extra_word), &extra_word,
			    DMU_READ_PREFETCH));

			ASSERT3U(offset, <=, space_map_length(sm));

			/* Run and vdev come from word 1, type/offset from 2. */
			entry_run = SM2_RUN_DECODE(word) << mapshift;
			entry_vdev = SM2_VDEV_DECODE(word);
			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
			    mapshift) + sm->sm_start;
			words = 2;
		}

		(void) printf("\t [%6llu] %c range:"
		    " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
		    (u_longlong_t)entry_id,
		    entry_type, (u_longlong_t)entry_off,
		    (u_longlong_t)(entry_off + entry_run),
		    (u_longlong_t)entry_run,
		    (u_longlong_t)entry_vdev, words);

		if (entry_type == 'A')
			alloc += entry_run;
		else
			alloc -= entry_run;
		entry_id++;
	}
	/* Cross-check the walked total against the space map's own figure. */
	if (alloc != space_map_allocated(sm)) {
		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
		    "with space map summary (%lld)\n",
		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
	}
}
&& sm != NULL && spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { /* * The space map histogram represents free space in chunks * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). */ (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", (u_longlong_t)msp->ms_fragmentation); dump_histogram(sm->sm_phys->smp_histogram, SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } if (vd->vdev_ops == &vdev_draid_ops) ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); else ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", (u_longlong_t)metaslab_unflushed_txg(msp)); } } static void print_vdev_metaslab_header(vdev_t *vd) { vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *bias_str = ""; if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { bias_str = VDEV_ALLOC_BIAS_LOG; } else if (alloc_bias == VDEV_BIAS_SPECIAL) { bias_str = VDEV_ALLOC_BIAS_SPECIAL; } else if (alloc_bias == VDEV_BIAS_DEDUP) { bias_str = VDEV_ALLOC_BIAS_DEDUP; } uint64_t ms_flush_data_obj = 0; if (vd->vdev_top_zap != 0) { int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &ms_flush_data_obj); if (error != ENOENT) { ASSERT0(error); } } (void) printf("\tvdev %10llu %s", (u_longlong_t)vd->vdev_id, bias_str); if (ms_flush_data_obj != 0) { (void) printf(" ms_unflushed_phys object %llu", (u_longlong_t)ms_flush_data_obj); } (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %12s\n", "---------------", "-------------------", "---------------", "------------"); } static void dump_metaslab_groups(spa_t *spa, boolean_t show_special) { vdev_t *rvd = spa->spa_root_vdev; metaslab_class_t *mc = spa_normal_class(spa); metaslab_class_t *smc = 
spa_special_class(spa); uint64_t fragmentation; metaslab_class_histogram_verify(mc); for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || (mg->mg_class != mc && (!show_special || mg->mg_class != smc))) continue; metaslab_group_histogram_verify(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" "fragmentation", (u_longlong_t)tvd->vdev_id, (u_longlong_t)tvd->vdev_ms_count); if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { (void) printf("%3s\n", "-"); } else { (void) printf("%3llu%%\n", (u_longlong_t)mg->mg_fragmentation); } - dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); + dump_histogram(mg->mg_histogram, + ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0); } (void) printf("\tpool %s\tfragmentation", spa_name(spa)); fragmentation = metaslab_class_fragmentation(mc); if (fragmentation == ZFS_FRAG_INVALID) (void) printf("\t%3s\n", "-"); else (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); - dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); + dump_histogram(mc->mc_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0); } static void print_vdev_indirect(vdev_t *vd) { vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib = vd->vdev_indirect_births; if (vim == NULL) { ASSERT3P(vib, ==, NULL); return; } ASSERT3U(vdev_indirect_mapping_object(vim), ==, vic->vic_mapping_object); ASSERT3U(vdev_indirect_births_object(vib), ==, vic->vic_births_object); (void) printf("indirect births obj %llu:\n", (longlong_t)vic->vic_births_object); (void) printf(" vib_count = %llu\n", (longlong_t)vdev_indirect_births_count(vib)); for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { vdev_indirect_birth_entry_phys_t *cur_vibe = &vib->vib_entries[i]; (void) printf("\toffset %llx -> txg %llu\n", (longlong_t)cur_vibe->vibe_offset, 
(longlong_t)cur_vibe->vibe_phys_birth_txg); } (void) printf("\n"); (void) printf("indirect mapping obj %llu:\n", (longlong_t)vic->vic_mapping_object); (void) printf(" vim_max_offset = 0x%llx\n", (longlong_t)vdev_indirect_mapping_max_offset(vim)); (void) printf(" vim_bytes_mapped = 0x%llx\n", (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); (void) printf(" vim_count = %llu\n", (longlong_t)vdev_indirect_mapping_num_entries(vim)); if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) return; uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; (void) printf("\t<%llx:%llx:%llx> -> " "<%llx:%llx:%llx> (%x obsolete)\n", (longlong_t)vd->vdev_id, (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), counts[i]); } (void) printf("\n"); uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; (void) printf("obsolete space map object %llu:\n", (u_longlong_t)obsolete_sm_object); ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, obsolete_sm_object); dump_spacemap(mos, vd->vdev_obsolete_sm); (void) printf("\n"); } } static void dump_metaslabs(spa_t *spa) { vdev_t *vd, *rvd = spa->spa_root_vdev; uint64_t m, c = 0, children = rvd->vdev_children; (void) printf("\nMetaslabs:\n"); if (!dump_opt['d'] && zopt_metaslab_args > 0) { c = zopt_metaslab[0]; if (c >= children) (void) fatal("bad vdev id: %llu", (u_longlong_t)c); if (zopt_metaslab_args > 1) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); for (m = 1; m < zopt_metaslab_args; m++) { if (zopt_metaslab[m] < vd->vdev_ms_count) dump_metaslab( 
vd->vdev_ms[zopt_metaslab[m]]); else (void) fprintf(stderr, "bad metaslab " "number %llu\n", (u_longlong_t)zopt_metaslab[m]); } (void) printf("\n"); return; } children = c + 1; } for (; c < children; c++) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); print_vdev_indirect(vd); for (m = 0; m < vd->vdev_ms_count; m++) dump_metaslab(vd->vdev_ms[m]); (void) printf("\n"); } } static void dump_log_spacemaps(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; (void) printf("\nLog Space Maps in Pool:\n"); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { space_map_t *sm = NULL; VERIFY0(space_map_open(&sm, spa_meta_objset(spa), sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); (void) printf("Log Spacemap object %llu txg %llu\n", (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); dump_spacemap(spa->spa_meta_objset, sm); space_map_close(sm); } (void) printf("\n"); } static void dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, uint64_t index) { const ddt_key_t *ddk = &ddlwe->ddlwe_key; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; int p; for (p = 0; p < DDT_NPHYS(ddt); p++) { const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); if (ddt_phys_birth(ddp, v) == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu phys %d %s\n", (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v), p, blkbuf); } } static void dump_dedup_ratio(const ddt_stat_t *dds) { double rL, rP, rD, D, dedup, compress, copies; if (dds->dds_blocks == 0) return; rL = (double)dds->dds_ref_lsize; rP = (double)dds->dds_ref_psize; rD = (double)dds->dds_ref_dsize; D = (double)dds->dds_dsize; dedup = rD / D; compress = rL / rP; copies = rD / rP; (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " "dedup * compress / copies = 
%.2f\n\n", dedup, compress, copies, dedup * compress / copies); } static void dump_ddt_log(ddt_t *ddt) { if (ddt->ddt_version != DDT_VERSION_FDT || !(ddt->ddt_flags & DDT_FLAG_LOG)) return; for (int n = 0; n < 2; n++) { ddt_log_t *ddl = &ddt->ddt_log[n]; char flagstr[64] = {0}; if (ddl->ddl_flags > 0) { flagstr[0] = ' '; int c = 1; if (ddl->ddl_flags & DDL_FLAG_FLUSHING) c += strlcpy(&flagstr[c], " FLUSHING", sizeof (flagstr) - c); if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT) c += strlcpy(&flagstr[c], " CHECKPOINT", sizeof (flagstr) - c); if (ddl->ddl_flags & ~(DDL_FLAG_FLUSHING|DDL_FLAG_CHECKPOINT)) c += strlcpy(&flagstr[c], " UNKNOWN", sizeof (flagstr) - c); flagstr[1] = '['; flagstr[c++] = ']'; } uint64_t count = avl_numnodes(&ddl->ddl_tree); printf(DMU_POOL_DDT_LOG ": flags=0x%02x%s; obj=%llu; " "len=%llu; txg=%llu; entries=%llu\n", zio_checksum_table[ddt->ddt_checksum].ci_name, n, ddl->ddl_flags, flagstr, (u_longlong_t)ddl->ddl_object, (u_longlong_t)ddl->ddl_length, (u_longlong_t)ddl->ddl_first_txg, (u_longlong_t)count); if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT) { const ddt_key_t *ddk = &ddl->ddl_checkpoint; printf(" checkpoint: " "%016llx:%016llx:%016llx:%016llx:%016llx\n", (u_longlong_t)ddk->ddk_cksum.zc_word[0], (u_longlong_t)ddk->ddk_cksum.zc_word[1], (u_longlong_t)ddk->ddk_cksum.zc_word[2], (u_longlong_t)ddk->ddk_cksum.zc_word[3], (u_longlong_t)ddk->ddk_prop); } if (count == 0 || dump_opt['D'] < 4) continue; ddt_lightweight_entry_t ddlwe; uint64_t index = 0; for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) { DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); dump_ddt_entry(ddt, &ddlwe, index++); } } } static void dump_ddt_object(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { char name[DDT_NAMELEN]; ddt_lightweight_entry_t ddlwe; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; int error; error = ddt_object_info(ddt, type, class, &doi); if (error == ENOENT) return; ASSERT(error == 
0); error = ddt_object_count(ddt, type, class, &count); ASSERT(error == 0); if (count == 0) return; dspace = doi.doi_physical_blocks_512 << 9; mspace = doi.doi_fill_count * doi.doi_data_block_size; ddt_object_name(ddt, type, class, name); (void) printf("%s: dspace=%llu; mspace=%llu; entries=%llu\n", name, (u_longlong_t)dspace, (u_longlong_t)mspace, (u_longlong_t)count); if (dump_opt['D'] < 3) return; zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); if (dump_opt['D'] < 4) return; if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) return; (void) printf("%s contents:\n\n", name); while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0) dump_ddt_entry(ddt, &ddlwe, walk); ASSERT3U(error, ==, ENOENT); (void) printf("\n"); } static void dump_ddt(ddt_t *ddt) { if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) return; char flagstr[64] = {0}; if (ddt->ddt_flags > 0) { flagstr[0] = ' '; int c = 1; if (ddt->ddt_flags & DDT_FLAG_FLAT) c += strlcpy(&flagstr[c], " FLAT", sizeof (flagstr) - c); if (ddt->ddt_flags & DDT_FLAG_LOG) c += strlcpy(&flagstr[c], " LOG", sizeof (flagstr) - c); if (ddt->ddt_flags & ~DDT_FLAG_MASK) c += strlcpy(&flagstr[c], " UNKNOWN", sizeof (flagstr) - c); flagstr[1] = '['; flagstr[c] = ']'; } printf("DDT-%s: version=%llu [%s]; flags=0x%02llx%s; rootobj=%llu\n", zio_checksum_table[ddt->ddt_checksum].ci_name, (u_longlong_t)ddt->ddt_version, (ddt->ddt_version == 0) ? "LEGACY" : (ddt->ddt_version == 1) ? 
"FDT" : "UNKNOWN", (u_longlong_t)ddt->ddt_flags, flagstr, (u_longlong_t)ddt->ddt_dir_object); for (ddt_type_t type = 0; type < DDT_TYPES; type++) for (ddt_class_t class = 0; class < DDT_CLASSES; class++) dump_ddt_object(ddt, type, class); dump_ddt_log(ddt); } static void dump_all_ddts(spa_t *spa) { ddt_histogram_t ddh_total = {{{0}}}; ddt_stat_t dds_total = {0}; for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) dump_ddt(spa->spa_ddt[c]); ddt_get_dedup_stats(spa, &dds_total); if (dds_total.dds_blocks == 0) { (void) printf("All DDTs are empty\n"); return; } (void) printf("\n"); if (dump_opt['D'] > 1) { (void) printf("DDT histogram (aggregated over all DDTs):\n"); ddt_get_dedup_histogram(spa, &ddh_total); zpool_dump_ddt(&dds_total, &ddh_total); } dump_dedup_ratio(&dds_total); /* * Dump a histogram of unique class entry age */ if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) { ddt_age_histo_t histogram; (void) printf("DDT walk unique, building age histogram...\n"); ddt_prune_walk(spa, 0, &histogram); /* * print out histogram for unique entry class birth */ if (histogram.dah_entries > 0) { (void) printf("%5s %9s %4s\n", "age", "blocks", "amnt"); (void) printf("%5s %9s %4s\n", "-----", "---------", "----"); for (int i = 0; i < HIST_BINS; i++) { (void) printf("%5d %9d %4d%%\n", 1 << i, (int)histogram.dah_age_histo[i], (int)((histogram.dah_age_histo[i] * 100) / histogram.dah_entries)); } } } } static void dump_brt(spa_t *spa) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) { printf("BRT: unsupported on this pool\n"); return; } if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { printf("BRT: empty\n"); return; } char count[32], used[32], saved[32]; zdb_nicebytes(brt_get_used(spa), used, sizeof (used)); zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved)); uint64_t ratio = brt_get_ratio(spa); printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved, (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 
100)); if (dump_opt['T'] < 2) return; for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; if (!brtvd->bv_initiated) { printf("BRT: vdev %" PRIu64 ": empty\n", vdevid); continue; } zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count)); zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used)); zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved)); printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n", vdevid, count, used, saved); } if (dump_opt['T'] < 3) return; /* -TTT shows a per-vdev histograms; -TTTT shows all entries */ boolean_t do_histo = dump_opt['T'] == 3; char dva[64]; if (!do_histo) printf("\n%-16s %-10s\n", "DVA", "REFCNT"); for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; if (!brtvd->bv_initiated) continue; uint64_t counts[64] = {}; zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, spa->spa_meta_objset, brtvd->bv_mos_entries); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { uint64_t refcnt; VERIFY0(zap_lookup_uint64(spa->spa_meta_objset, brtvd->bv_mos_entries, (const uint64_t *)za->za_name, 1, za->za_integer_length, za->za_num_integers, &refcnt)); if (do_histo) counts[highbit64(refcnt)]++; else { uint64_t offset = *(const uint64_t *)za->za_name; snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid, (u_longlong_t)offset); printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt); } } zap_cursor_fini(&zc); zap_attribute_free(za); if (do_histo) { printf("\nBRT: vdev %" PRIu64 ": DVAs with 2^n refcnts:\n", vdevid); dump_histogram(counts, 64, 0); } } } static void dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { char *prefix = arg; (void) printf("%s [%llu,%llu) length %llu\n", prefix, (u_longlong_t)start, (u_longlong_t)(start + size), (u_longlong_t)(size)); } static void dump_dtl(vdev_t *vd, int indent) { spa_t *spa = vd->vdev_spa; boolean_t required; const 
char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" }; char prefix[256]; spa_vdev_state_enter(spa, SCL_NONE); required = vdev_dtl_required(vd); (void) spa_vdev_state_exit(spa, NULL, 0); if (indent == 0) (void) printf("\nDirty time logs:\n\n"); (void) printf("\t%*s%s [%s]\n", indent, "", vd->vdev_path ? vd->vdev_path : vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), required ? "DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { zfs_range_tree_t *rt = vd->vdev_dtl[t]; if (zfs_range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); zfs_range_tree_walk(rt, dump_dtl_seg, prefix); if (dump_opt['d'] > 5 && vd->vdev_children == 0) dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } for (unsigned c = 0; c < vd->vdev_children; c++) dump_dtl(vd->vdev_child[c], indent + 4); } static void dump_history(spa_t *spa) { nvlist_t **events = NULL; char *buf; uint64_t resid, len, off = 0; uint_t num = 0; int error; char tbuf[30]; if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) { (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n", __func__); return; } do { len = SPA_OLD_MAXBLOCKSIZE; if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { (void) fprintf(stderr, "Unable to read history: " "error %d\n", error); free(buf); return; } if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) break; off -= resid; } while (len != 0); (void) printf("\nHistory:\n"); for (unsigned i = 0; i < num; i++) { boolean_t printed = B_FALSE; if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } else { tbuf[0] = '\0'; } if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) { (void) printf("%s %s\n", tbuf, fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD)); } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) { 
uint64_t ievent; ievent = fnvlist_lookup_uint64(events[i], ZPOOL_HIST_INT_EVENT); if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) goto next; (void) printf(" %s [internal %s txg:%ju] %s\n", tbuf, zfs_history_event_names[ievent], fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG), fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) { (void) printf("%s [txg:%ju] %s", tbuf, fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG), fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_NAME)); if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(events[i], ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64( events[i], ZPOOL_HIST_DSID)); } (void) printf(" %s\n", fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) { (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(events[i], ZPOOL_HIST_IOCTL)); if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(events[i], ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(events[i], ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(events[i], ZPOOL_HIST_ERRNO)); } } else { goto next; } printed = B_TRUE; next: if (dump_opt['h'] > 1) { if (!printed) (void) printf("unrecognized record:\n"); dump_nvlist(events[i], 2); } } free(buf); } static void dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static uint64_t blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { if (dnp == NULL) { ASSERT(zb->zb_level < 0); if (zb->zb_object == 0) return (zb->zb_blkid); return (zb->zb_blkid * BP_GET_LSIZE(bp)); } ASSERT(zb->zb_level >= 0); 
return ((zb->zb_blkid << (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } static void snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, const blkptr_t *bp) { static abd_t *pabd = NULL; void *buf; zio_t *zio; zfs_zstdhdr_t zstd_hdr; int error; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD) return; if (BP_IS_HOLE(bp)) return; if (BP_IS_EMBEDDED(bp)) { buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); zdb_exit(1); } decode_embedded_bp_compressed(bp, buf); memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); free(buf); zstd_hdr.c_len = BE_32(zstd_hdr.c_len); zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " ZSTD:size=%u:version=%u:level=%u:EMBEDDED", zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), zfs_get_hdrlevel(&zstd_hdr)); return; } if (!pabd) pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); zio = zio_root(spa, NULL, NULL, 0); /* Decrypt but don't decompress so we can read the compression header */ zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS, NULL)); error = zio_wait(zio); if (error) { (void) fprintf(stderr, "read failed: %d\n", error); return; } buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp)); memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); zstd_hdr.c_len = BE_32(zstd_hdr.c_len); zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " ZSTD:size=%u:version=%u:level=%u:NORMAL", zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), zfs_get_hdrlevel(&zstd_hdr)); abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp)); } static void snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, boolean_t bp_freed) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? 
BP_GET_NDVAS(bp) : 1; int i; if (dump_opt['b'] >= 6) { snprintf_blkptr(blkbuf, buflen, bp); if (bp_freed) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); } return; } if (BP_IS_EMBEDDED(bp)) { (void) sprintf(blkbuf, "EMBEDDED et=%u %llxL/%llxP B=%llu", (int)BPE_GET_ETYPE(bp), (u_longlong_t)BPE_GET_LSIZE(bp), (u_longlong_t)BPE_GET_PSIZE(bp), (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); return; } blkbuf[0] = '\0'; for (i = 0; i < ndvas; i++) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); if (BP_IS_HOLE(bp)) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL B=%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); } else { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), (u_longlong_t)BP_GET_BIRTH(bp)); if (bp_freed) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " cksum=%016llx:%016llx:%016llx:%016llx", (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], (u_longlong_t)bp->blk_cksum.zc_word[2], (u_longlong_t)bp->blk_cksum.zc_word[3]); } } static void print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; int l; if (!BP_IS_EMBEDDED(bp)) { ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); } (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { (void) printf("L%llx", (u_longlong_t)zb->zb_level); } else { (void) 
printf(" "); } } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD) snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_phys_t *zb) { int err = 0; if (BP_GET_LOGICAL_BIRTH(bp) == 0) return (0); print_indirect(spa, bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; ASSERT(!BP_IS_REDACTED(bp)); err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); ASSERT(buf->b_data); /* recursively visit blocks below this */ cbp = buf->b_data; for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = visit_indirect(spa, dnp, cbp, &czb); if (err) break; fill += BP_GET_FILL(cbp); } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); arc_buf_destroy(buf, &buf); } return (err); } static void dump_indirect(dnode_t *dn) { dnode_phys_t *dnp = dn->dn_phys; zbookmark_phys_t czb; (void) printf("Indirect blocks:\n"); SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), dn->dn_object, dnp->dn_nlevels - 1, 0); for (int j = 0; j < dnp->dn_nblkptr; j++) { czb.zb_blkid = j; (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, &dnp->dn_blkptr[j], &czb); } (void) printf("\n"); } static void dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object; dsl_dir_phys_t *dd = data; time_t crtime; char nice[32]; /* make sure nicenum has enough space */ _Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated"); if (dd == NULL) return; ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); crtime = dd->dd_creation_time; (void) printf("\t\tcreation_time = %s", 
ctime(&crtime)); (void) printf("\t\thead_dataset_obj = %llu\n", (u_longlong_t)dd->dd_head_dataset_obj); (void) printf("\t\tparent_dir_obj = %llu\n", (u_longlong_t)dd->dd_parent_obj); (void) printf("\t\torigin_obj = %llu\n", (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", (u_longlong_t)dd->dd_child_dir_zapobj); zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); (void) printf("\t\tused_bytes = %s\n", nice); zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); (void) printf("\t\tcompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); (void) printf("\t\tuncompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); (void) printf("\t\tquota = %s\n", nice); zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); (void) printf("\t\tdeleg_zapobj = %llu\n", (u_longlong_t)dd->dd_deleg_zapobj); (void) printf("\t\tflags = %llx\n", (u_longlong_t)dd->dd_flags); #define DO(which) \ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ sizeof (nice)); \ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) DO(HEAD); DO(SNAP); DO(CHILD); DO(CHILD_RSRV); DO(REFRSRV); #undef DO (void) printf("\t\tclones = %llu\n", (u_longlong_t)dd->dd_clones); } static void dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object; dsl_dataset_phys_t *ds = data; time_t crtime; char used[32], compressed[32], uncompressed[32], unique[32]; char blkbuf[BP_SPRINTF_LEN]; /* make sure nicenum has enough space */ _Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated"); _Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ, "compressed truncated"); _Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ, "uncompressed truncated"); _Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated"); if (ds == NULL) return; ASSERT(size == 
sizeof (*ds)); crtime = ds->ds_creation_time; zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, sizeof (uncompressed)); zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); (void) printf("\t\tprev_snap_obj = %llu\n", (u_longlong_t)ds->ds_prev_snap_obj); (void) printf("\t\tprev_snap_txg = %llu\n", (u_longlong_t)ds->ds_prev_snap_txg); (void) printf("\t\tnext_snap_obj = %llu\n", (u_longlong_t)ds->ds_next_snap_obj); (void) printf("\t\tsnapnames_zapobj = %llu\n", (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); (void) printf("\t\tuserrefs_obj = %llu\n", (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); (void) printf("\t\tdeadlist_obj = %llu\n", (u_longlong_t)ds->ds_deadlist_obj); (void) printf("\t\tused_bytes = %s\n", used); (void) printf("\t\tcompressed_bytes = %s\n", compressed); (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); (void) printf("\t\tunique = %s\n", unique); (void) printf("\t\tfsid_guid = %llu\n", (u_longlong_t)ds->ds_fsid_guid); (void) printf("\t\tguid = %llu\n", (u_longlong_t)ds->ds_guid); (void) printf("\t\tflags = %llx\n", (u_longlong_t)ds->ds_flags); (void) printf("\t\tnext_clones_obj = %llu\n", (u_longlong_t)ds->ds_next_clones_obj); (void) printf("\t\tprops_obj = %llu\n", (u_longlong_t)ds->ds_props_obj); (void) printf("\t\tbp = %s\n", blkbuf); } static int dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { (void) arg, (void) tx; char blkbuf[BP_SPRINTF_LEN]; if (BP_GET_LOGICAL_BIRTH(bp) != 0) { snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } return 
(0); } static void dump_bptree(objset_t *os, uint64_t obj, const char *name) { char bytes[32]; bptree_phys_t *bt; dmu_buf_t *db; /* make sure nicenum has enough space */ _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); if (dump_opt['d'] < 3) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); bt = db->db_data; zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); dmu_buf_rele(db, FTAG); if (dump_opt['d'] < 5) return; (void) printf("\n"); (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); } static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { (void) arg, (void) tx; char blkbuf[BP_SPRINTF_LEN]; ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0); snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); (void) printf("\t%s\n", blkbuf); return (0); } static void dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) { char bytes[32]; char comp[32]; char uncomp[32]; uint64_t i; /* make sure nicenum has enough space */ _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated"); _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated"); if (dump_opt['d'] < 3) return; zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); if (bpo->bpo_havefreed) { (void) printf(" %*s: object %llu, %llu local " "blkptrs, %llu freed, %llu subobjs in object %llu, " "%s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_freed, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); } else { 
(void) printf(" %*s: object %llu, %llu local "
			    "blkptrs, %llu subobjs in object %llu, "
			    "%s (%s/%s comp)\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
			    bytes, comp, uncomp);
		}

		/* Recurse into every sub-bpobj listed in the subobjs array. */
		for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
			uint64_t subobj;
			bpobj_t subbpo;
			int error;
			VERIFY0(dmu_read(bpo->bpo_os,
			    bpo->bpo_phys->bpo_subobjs,
			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
			if (error != 0) {
				(void) printf("ERROR %u while trying to open "
				    "subobj id %llu\n",
				    error, (u_longlong_t)subobj);
				continue;
			}
			dump_full_bpobj(&subbpo, "subobj", indent + 1);
			bpobj_close(&subbpo);
		}
	} else {
		if (bpo->bpo_havefreed) {
			(void) printf(" %*s: object %llu, %llu blkptrs, "
			    "%llu freed, %s\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
			    bytes);
		} else {
			(void) printf(" %*s: object %llu, %llu blkptrs, "
			    "%s\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    bytes);
		}
	}

	if (dump_opt['d'] < 5)
		return;

	/* Only the outermost call lists the individual block pointers. */
	if (indent == 0) {
		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
		(void) printf("\n");
	}
}

/*
 * Print a single bookmark, looked up by its full "dataset#bookmark" name.
 *
 * The basic bookmark fields are always printed.  With print_redact set and
 * a non-zero redaction object, the redaction progress and snapshot list are
 * printed too; with print_list also set, every redaction list entry is
 * dumped.  print_list implies print_redact.
 *
 * Returns 0 on success, or the error from dsl_bookmark_lookup() or
 * dmu_read().
 */
static int
dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
    boolean_t print_list)
{
	int err = 0;
	zfs_bookmark_phys_t prop;
	objset_t *mos = dp->dp_spa->spa_meta_objset;
	err = dsl_bookmark_lookup(dp, name, NULL, &prop);

	if (err != 0) {
		return (err);
	}

	/* Print only the part of the name that follows the '#'. */
	(void) printf("\t#%s: ", strchr(name, '#') + 1);
	(void) printf("{guid: %llx creation_txg: %llu creation_time: "
	    "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,
	    (u_longlong_t)prop.zbm_creation_txg,
	    (u_longlong_t)prop.zbm_creation_time,
	    (u_longlong_t)prop.zbm_redaction_obj);

	IMPLY(print_list, print_redact);
	if (!print_redact || prop.zbm_redaction_obj == 0)
		return (0);

	/* From here on rl is held and must be released on every exit path. */
	redaction_list_t *rl;
	VERIFY0(dsl_redaction_list_hold_obj(dp,
	    prop.zbm_redaction_obj, FTAG, &rl));

	redaction_list_phys_t *rlp = rl->rl_phys;
	(void) printf("\tRedacted:\n\t\tProgress: ");
	/* Both "last" fields equal to UINT64_MAX is reported as complete. */
	if (rlp->rlp_last_object != UINT64_MAX ||
	    rlp->rlp_last_blkid != UINT64_MAX) {
		(void) printf("%llu %llu (incomplete)\n",
		    (u_longlong_t)rlp->rlp_last_object,
		    (u_longlong_t)rlp->rlp_last_blkid);
	} else {
		(void) printf("complete\n");
	}
	(void) printf("\t\tSnapshots: [");
	for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) {
		if (i > 0)
			(void) printf(", ");
		(void) printf("%0llu",
		    (u_longlong_t)rlp->rlp_snaps[i]);
	}
	(void) printf("]\n\t\tLength: %llu\n",
	    (u_longlong_t)rlp->rlp_num_entries);

	if (!print_list) {
		dsl_redaction_list_rele(rl, FTAG);
		return (0);
	}

	if (rlp->rlp_num_entries == 0) {
		dsl_redaction_list_rele(rl, FTAG);
		(void) printf("\t\tRedaction List: []\n\n");
		return (0);
	}

	/* Read the whole redaction object into memory in one dmu_read(). */
	redact_block_phys_t *rbp_buf;
	uint64_t size;
	dmu_object_info_t doi;

	VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));
	size = doi.doi_max_offset;
	rbp_buf = kmem_alloc(size, KM_SLEEP);

	err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,
	    rbp_buf, 0);
	if (err != 0) {
		dsl_redaction_list_rele(rl, FTAG);
		kmem_free(rbp_buf, size);
		return (err);
	}

	/* The first entry opens the list; the rest are comma-separated. */
	(void) printf("\t\tRedaction List: [{object: %llx, offset: "
	    "%llx, blksz: %x, count: %llx}",
	    (u_longlong_t)rbp_buf[0].rbp_object,
	    (u_longlong_t)rbp_buf[0].rbp_blkid,
	    (uint_t)(redact_block_get_size(&rbp_buf[0])),
	    (u_longlong_t)redact_block_get_count(&rbp_buf[0]));

	for (size_t i = 1; i < rlp->rlp_num_entries; i++) {
		(void) printf(",\n\t\t{object: %llx, offset: %llx, "
		    "blksz: %x, count: %llx}",
		    (u_longlong_t)rbp_buf[i].rbp_object,
		    (u_longlong_t)rbp_buf[i].rbp_blkid,
		    (uint_t)(redact_block_get_size(&rbp_buf[i])),
		    (u_longlong_t)redact_block_get_count(&rbp_buf[i]));
	}
	dsl_redaction_list_rele(rl, FTAG);
	kmem_free(rbp_buf, size);
	(void) printf("]\n\n");
	return (0);
}

static void
dump_bookmarks(objset_t *os, int verbosity)
{
	zap_cursor_t zc;
	zap_attribute_t *attrp;
dsl_dataset_t *ds = dmu_objset_ds(os); dsl_pool_t *dp = spa_get_dsl(os->os_spa); objset_t *mos = os->os_spa->spa_meta_objset; if (verbosity < 4) return; attrp = zap_attribute_alloc(); dsl_pool_config_enter(dp, FTAG); for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); zap_cursor_retrieve(&zc, attrp) == 0; zap_cursor_advance(&zc)) { char osname[ZFS_MAX_DATASET_NAME_LEN]; char buf[ZFS_MAX_DATASET_NAME_LEN]; int len; dmu_objset_name(os, osname); len = snprintf(buf, sizeof (buf), "%s#%s", osname, attrp->za_name); VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN); (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6); } zap_cursor_fini(&zc); dsl_pool_config_exit(dp, FTAG); zap_attribute_free(attrp); } static void bpobj_count_refd(bpobj_t *bpo) { mos_obj_refd(bpo->bpo_object); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { mos_obj_refd(bpo->bpo_phys->bpo_subobjs); for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } bpobj_count_refd(&subbpo); bpobj_close(&subbpo); } } } static int dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle) { spa_t *spa = arg; uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dle->dle_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dle->dle_bpobj); return (0); } static int dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle) { ASSERT(arg == NULL); if (dump_opt['d'] >= 5) { char buf[128]; (void) snprintf(buf, sizeof (buf), "mintxg %llu -> obj %llu", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); dump_full_bpobj(&dle->dle_bpobj, buf, 0); } else { (void) printf("mintxg %llu -> obj %llu\n", (longlong_t)dle->dle_mintxg, 
(longlong_t)dle->dle_bpobj.bpo_object); } return (0); } static void dump_blkptr_list(dsl_deadlist_t *dl, const char *name) { char bytes[32]; char comp[32]; char uncomp[32]; char entries[32]; spa_t *spa = dmu_objset_spa(dl->dl_os); uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dl->dl_oldfmt) { if (dl->dl_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dl->dl_bpobj); } else { mos_obj_refd(dl->dl_object); dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa); } /* make sure nicenum has enough space */ _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated"); _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated"); _Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated"); if (dump_opt['d'] < 3) return; if (dl->dl_oldfmt) { dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); return; } zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries)); (void) printf("\n %s: %s (%s/%s comp), %s entries\n", name, bytes, comp, uncomp, entries); if (dump_opt['d'] < 4) return; (void) putchar('\n'); dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL); } static int verify_dd_livelist(objset_t *os) { uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp; dsl_pool_t *dp = spa_get_dsl(os->os_spa); dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; ASSERT(!dmu_objset_is_snapshot(os)); if (!dsl_deadlist_is_open(&dd->dd_livelist)) return (0); /* Iterate through the livelist to check for duplicates */ dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight, NULL); dsl_pool_config_enter(dp, FTAG); dsl_deadlist_space(&dd->dd_livelist, &ll_used, &ll_comp, &ll_uncomp); dsl_dataset_t *origin_ds; ASSERT(dsl_pool_config_held(dp)); VERIFY0(dsl_dataset_hold_obj(dp, 
dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds)); VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset, &used, &comp, &uncomp)); dsl_dataset_rele(origin_ds, FTAG); dsl_pool_config_exit(dp, FTAG); /* * It's possible that the dataset's uncomp space is larger than the * livelist's because livelists do not track embedded block pointers */ if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) { char nice_used[32], nice_comp[32], nice_uncomp[32]; (void) printf("Discrepancy in space accounting:\n"); zdb_nicenum(used, nice_used, sizeof (nice_used)); zdb_nicenum(comp, nice_comp, sizeof (nice_comp)); zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp)); (void) printf("dir: used %s, comp %s, uncomp %s\n", nice_used, nice_comp, nice_uncomp); zdb_nicenum(ll_used, nice_used, sizeof (nice_used)); zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp)); zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp)); (void) printf("livelist: used %s, comp %s, uncomp %s\n", nice_used, nice_comp, nice_uncomp); return (1); } return (0); } static char *key_material = NULL; static boolean_t zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out) { uint64_t keyformat, salt, iters; int i; unsigned char c; VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t), 1, &keyformat)); switch (keyformat) { case ZFS_KEYFORMAT_HEX: for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) { if (!isxdigit(key_material[i]) || !isxdigit(key_material[i+1])) return (B_FALSE); if (sscanf(&key_material[i], "%02hhx", &c) != 1) return (B_FALSE); key_out[i / 2] = c; } break; case ZFS_KEYFORMAT_PASSPHRASE: VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), sizeof (uint64_t), 1, &salt)); VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), sizeof (uint64_t), 1, &iters)); if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material), 
((uint8_t *)&salt), sizeof (uint64_t), iters, WRAPPING_KEY_LEN, key_out) != 1) return (B_FALSE); break; default: fatal("no support for key format %u\n", (unsigned int) keyformat); } return (B_TRUE); } static char encroot[ZFS_MAX_DATASET_NAME_LEN]; static boolean_t key_loaded = B_FALSE; static void zdb_load_key(objset_t *os) { dsl_pool_t *dp; dsl_dir_t *dd, *rdd; uint8_t key[WRAPPING_KEY_LEN]; uint64_t rddobj; int err; dp = spa_get_dsl(os->os_spa); dd = os->os_dsl_dataset->ds_dir; dsl_pool_config_enter(dp, FTAG); VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj)); VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd)); dsl_dir_name(rdd, encroot); dsl_dir_rele(rdd, FTAG); if (!zdb_derive_key(dd, key)) fatal("couldn't derive encryption key"); dsl_pool_config_exit(dp, FTAG); ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE); dsl_crypto_params_t *dcp; nvlist_t *crypto_args; crypto_args = fnvlist_alloc(); fnvlist_add_uint8_array(crypto_args, "wkeydata", (uint8_t *)key, WRAPPING_KEY_LEN); VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, crypto_args, &dcp)); err = spa_keystore_load_wkey(encroot, dcp, B_FALSE); dsl_crypto_params_free(dcp, (err != 0)); fnvlist_free(crypto_args); if (err != 0) fatal( "couldn't load encryption key for %s: %s", encroot, err == ZFS_ERR_CRYPTO_NOTSUP ? 
"crypto params not supported" : strerror(err)); ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE); printf("Unlocked encryption root: %s\n", encroot); key_loaded = B_TRUE; } static void zdb_unload_key(void) { if (!key_loaded) return; VERIFY0(spa_keystore_unload_wkey(encroot)); key_loaded = B_FALSE; } static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; static objset_t *sa_os = NULL; static sa_attr_type_t *sa_attr_table = NULL; static int open_objset(const char *path, const void *tag, objset_t **osp) { int err; uint64_t sa_attrs = 0; uint64_t version = 0; VERIFY3P(sa_os, ==, NULL); /* * We can't own an objset if it's redacted. Therefore, we do this * dance: hold the objset, then acquire a long hold on its dataset, then * release the pool (which is held as part of holding the objset). */ if (dump_opt['K']) { /* decryption requested, try to load keys */ err = dmu_objset_hold(path, tag, osp); if (err != 0) { (void) fprintf(stderr, "failed to hold dataset " "'%s': %s\n", path, strerror(err)); return (err); } dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); dsl_pool_rele(dmu_objset_pool(*osp), tag); /* succeeds or dies */ zdb_load_key(*osp); /* release it all */ dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); dsl_dataset_rele(dmu_objset_ds(*osp), tag); } int ds_hold_flags = key_loaded ? 
DS_HOLD_FLAG_DECRYPT : 0; err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp); if (err != 0) { (void) fprintf(stderr, "failed to hold dataset '%s': %s\n", path, strerror(err)); return (err); } dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); dsl_pool_rele(dmu_objset_pool(*osp), tag); if (dmu_objset_type(*osp) == DMU_OST_ZFS && (key_loaded || !(*osp)->os_encrypted)) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &version); if (version >= ZPL_VERSION_SA) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_attrs); } err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, &sa_attr_table); if (err != 0) { (void) fprintf(stderr, "sa_setup failed: %s\n", strerror(err)); dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); dsl_dataset_rele_flags(dmu_objset_ds(*osp), ds_hold_flags, tag); *osp = NULL; } } sa_os = *osp; return (err); } static void close_objset(objset_t *os, const void *tag) { VERIFY3P(os, ==, sa_os); if (os->os_sa != NULL) sa_tear_down(os); dsl_dataset_long_rele(dmu_objset_ds(os), tag); dsl_dataset_rele_flags(dmu_objset_ds(os), key_loaded ? DS_HOLD_FLAG_DECRYPT : 0, tag); sa_attr_table = NULL; sa_os = NULL; zdb_unload_key(); } static void fuid_table_destroy(void) { if (fuid_table_loaded) { zfs_fuid_table_destroy(&idx_tree, &domain_tree); fuid_table_loaded = B_FALSE; } } /* * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on * a live pool are normally cleaned up during ddt_sync(). We can't do that (and * wouldn't want to anyway), but if we don't clean up the presence of stuff on * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves. * * Note that this is not a particularly efficient way to do this, but * ddt_remove() is the only public method that can do the work we need, and it * requires the right locks and etc to do the job. This is only ever called * during zdb shutdown so efficiency is not especially important. 
*/ static void zdb_ddt_cleanup(spa_t *spa) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (!ddt) continue; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); ddt_enter(ddt); ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next; while (dde) { next = AVL_NEXT(&ddt->ddt_tree, dde); dde->dde_io = NULL; ddt_remove(ddt, dde); dde = next; } ddt_exit(ddt); spa_config_exit(spa, SCL_CONFIG, FTAG); } } static void zdb_exit(int reason) { if (spa != NULL) zdb_ddt_cleanup(spa); if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { spa_close(spa, FTAG); } fuid_table_destroy(); if (kernel_init_done) kernel_fini(); exit(reason); } /* * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. * For CIFS files with FUID the fuid is printed in hex followed by * the domain-rid string. */ static void print_idstr(uint64_t id, const char *id_type) { if (FUID_INDEX(id)) { const char *domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); (void) printf("\t%s %llx [%s-%d]\n", id_type, (u_longlong_t)id, domain, (int)FUID_RID(id)); } else { (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); } } static void dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) { uint32_t uid_idx, gid_idx; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); /* Load domain table, if not already loaded */ if (!fuid_table_loaded && (uid_idx || gid_idx)) { uint64_t fuid_obj; /* first find the fuid object. 
It lives in the master node */ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj) == 0); zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; } print_idstr(uid, "uid"); print_idstr(gid, "gid"); } static void dump_znode_sa_xattr(sa_handle_t *hdl) { nvlist_t *sa_xattr; nvpair_t *elem = NULL; int sa_xattr_size = 0; int sa_xattr_entries = 0; int error; char *sa_xattr_packed; error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size); if (error || sa_xattr_size == 0) return; sa_xattr_packed = malloc(sa_xattr_size); if (sa_xattr_packed == NULL) return; error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR], sa_xattr_packed, sa_xattr_size); if (error) { free(sa_xattr_packed); return; } error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0); if (error) { free(sa_xattr_packed); return; } while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) sa_xattr_entries++; (void) printf("\tSA xattrs: %d bytes, %d entries\n\n", sa_xattr_size, sa_xattr_entries); while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) { boolean_t can_print = !dump_opt['P']; uchar_t *value; uint_t cnt, idx; (void) printf("\t\t%s = ", nvpair_name(elem)); nvpair_value_byte_array(elem, &value, &cnt); for (idx = 0; idx < cnt; ++idx) { if (!isprint(value[idx])) { can_print = B_FALSE; break; } } for (idx = 0; idx < cnt; ++idx) { if (can_print) (void) putchar(value[idx]); else (void) printf("\\%3.3o", value[idx]); } (void) putchar('\n'); } nvlist_free(sa_xattr); free(sa_xattr_packed); } static void dump_znode_symlink(sa_handle_t *hdl) { int sa_symlink_size = 0; char linktarget[MAXPATHLEN]; int error; error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size); if (error || sa_symlink_size == 0) { return; } if (sa_symlink_size >= sizeof (linktarget)) { (void) printf("symlink size %d is too large\n", sa_symlink_size); return; } linktarget[sa_symlink_size] = '\0'; if 
(sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK], &linktarget, sa_symlink_size) == 0) (void) printf("\ttarget %s\n", linktarget); } static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ sa_handle_t *hdl; uint64_t xattr, rdev, gen; uint64_t uid, gid, mode, fsize, parent, links; uint64_t pflags; uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; time_t z_crtime, z_atime, z_mtime, z_ctime; sa_bulk_attr_t bulk[12]; int idx = 0; int error; VERIFY3P(os, ==, sa_os); if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { (void) printf("Failed to get handle for SA znode\n"); return; } SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, &fsize, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, acctm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, modtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, crtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, chgtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, &pflags, 8); if (sa_bulk_lookup(hdl, bulk, idx)) { (void) sa_handle_destroy(hdl); return; } z_crtime = (time_t)crtm[0]; z_atime = (time_t)acctm[0]; z_mtime = (time_t)modtm[0]; z_ctime = (time_t)chgtm[0]; if (dump_opt['d'] > 4) { error = zfs_obj_to_path(os, object, path, sizeof (path)); if (error == ESTALE) { (void) snprintf(path, sizeof (path), "on delete queue"); } else if (error != 0) { leaked_objects++; (void) snprintf(path, sizeof (path), "path not 
found, possibly leaked"); } (void) printf("\tpath %s\n", path); } if (S_ISLNK(mode)) dump_znode_symlink(hdl); dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); (void) printf("\tctime %s", ctime(&z_ctime)); (void) printf("\tcrtime %s", ctime(&z_crtime)); (void) printf("\tgen %llu\n", (u_longlong_t)gen); (void) printf("\tmode %llo\n", (u_longlong_t)mode); (void) printf("\tsize %llu\n", (u_longlong_t)fsize); (void) printf("\tparent %llu\n", (u_longlong_t)parent); (void) printf("\tlinks %llu\n", (u_longlong_t)links); (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) { uint64_t projid; if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid, sizeof (uint64_t)) == 0) (void) printf("\tprojid %llu\n", (u_longlong_t)projid); } if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, sizeof (uint64_t)) == 0) (void) printf("\txattr %llu\n", (u_longlong_t)xattr); if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, sizeof (uint64_t)) == 0) (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); dump_znode_sa_xattr(hdl); sa_handle_destroy(hdl); } static void dump_acl(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static void dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_none, /* unallocated */ dump_zap, /* object directory */ dump_uint64, /* object array */ dump_none, /* packed nvlist */ dump_packed_nvlist, /* packed nvlist size */ dump_none, /* bpobj */ dump_bpobj, /* bpobj header */ dump_none, /* SPA space map header */ dump_none, /* SPA space map */ dump_none, /* ZIL intent log */ dump_dnode, /* DMU dnode */ dump_dmu_objset, /* DMU objset */ dump_dsl_dir, /* DSL directory */ dump_zap, /* DSL directory child map */ dump_zap, /* DSL 
dataset snap map */ dump_zap, /* DSL props */ dump_dsl_dataset, /* DSL dataset */ dump_znode, /* ZFS znode */ dump_acl, /* ZFS V0 ACL */ dump_uint8, /* ZFS plain file */ dump_zpldir, /* ZFS directory */ dump_zap, /* ZFS master node */ dump_zap, /* ZFS delete queue */ dump_uint8, /* zvol object */ dump_zap, /* zvol prop */ dump_uint8, /* other uint8[] */ dump_uint64, /* other uint64[] */ dump_zap, /* other ZAP */ dump_zap, /* persistent error log */ dump_uint8, /* SPA history */ dump_history_offsets, /* SPA history offsets */ dump_zap, /* Pool properties */ dump_zap, /* DSL permissions */ dump_acl, /* ZFS ACL */ dump_uint8, /* ZFS SYSACL */ dump_none, /* FUID nvlist */ dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ dump_zap, /* ZFS user/group/project used */ dump_zap, /* ZFS user/group/project quota */ dump_zap, /* snapshot refcount tags */ dump_ddt_zap, /* DDT ZAP object */ dump_zap, /* DDT statistics */ dump_znode, /* SA object */ dump_zap, /* SA Master Node */ dump_sa_attrs, /* SA attribute registration */ dump_sa_layouts, /* SA attribute layouts */ dump_zap, /* DSL scrub translations */ dump_none, /* fake dedup BP */ dump_zap, /* deadlist */ dump_none, /* deadlist hdr */ dump_zap, /* dsl clones */ dump_bpobj_subobjs, /* bpobj subobjs */ dump_unknown, /* Unknown type, must be last */ }; static boolean_t match_object_type(dmu_object_type_t obj_type, uint64_t flags) { boolean_t match = B_TRUE; switch (obj_type) { case DMU_OT_DIRECTORY_CONTENTS: if (!(flags & ZOR_FLAG_DIRECTORY)) match = B_FALSE; break; case DMU_OT_PLAIN_FILE_CONTENTS: if (!(flags & ZOR_FLAG_PLAIN_FILE)) match = B_FALSE; break; case DMU_OT_SPACE_MAP: if (!(flags & ZOR_FLAG_SPACE_MAP)) match = B_FALSE; break; default: if (strcmp(zdb_ot_name(obj_type), "zap") == 0) { if (!(flags & ZOR_FLAG_ZAP)) match = B_FALSE; break; } /* * If all bits except some of the supported flags are * set, the user combined the all-types flag (A) with * a 
negated flag to exclude some types (e.g. A-f to * show all object types except plain files). */ if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES) match = B_FALSE; break; } return (match); } static void dump_object(objset_t *os, uint64_t object, int verbosity, boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags) { dmu_buf_t *db = NULL; dmu_object_info_t doi; dnode_t *dn; boolean_t dnode_held = B_FALSE; void *bonus = NULL; size_t bsize = 0; char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; char bonus_size[32]; char aux[50]; int error; /* make sure nicenum has enough space */ _Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated"); _Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated"); _Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated"); _Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated"); _Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ, "bonus_size truncated"); if (*print_header) { (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", "lsize", "%full", "type"); *print_header = 0; } if (object == 0) { dn = DMU_META_DNODE(os); dmu_object_info_from_dnode(dn, &doi); } else { /* * Encrypted datasets will have sensitive bonus buffers * encrypted. Therefore we cannot hold the bonus buffer and * must hold the dnode itself instead. */ error = dmu_object_info(os, object, &doi); if (error) fatal("dmu_object_info() failed, errno %u", error); if (!key_loaded && os->os_encrypted && DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) { error = dnode_hold(os, object, FTAG, &dn); if (error) fatal("dnode_hold() failed, errno %u", error); dnode_held = B_TRUE; } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); bonus = db->db_data; bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } } /* * Default to showing all object types if no flags were specified. 
*/ if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES && !match_object_type(doi.doi_type, flags)) goto out; if (dnode_slots_used) *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); (void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / doi.doi_max_offset); aux[0] = '\0'; if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); } if (doi.doi_compress == ZIO_COMPRESS_INHERIT && ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) { const char *compname = NULL; if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel), &compname) == 0) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s)", compname); } else { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s-unknown)", ZDB_COMPRESS_NAME(os->os_compress)); } } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress)); } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); } (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", (u_longlong_t)object, doi.doi_indirection, iblk, dblk, asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && 
verbosity > 3) { (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", "", "", "", "", "", "", bonus_size, "bonus", zdb_ot_name(doi.doi_bonus_type)); } if (verbosity >= 4) { (void) printf("\tdnode flags: %s%s%s%s\n", (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? "USED_BYTES " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? "USERUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? "USEROBJUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? "SPILL_BLKPTR" : ""); (void) printf("\tdnode maxblkid: %llu\n", (longlong_t)dn->dn_phys->dn_maxblkid); if (!dnode_held) { object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, bonus, bsize); } else { (void) printf("\t\t(bonus encrypted)\n"); } if (key_loaded || (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) { object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); } else { (void) printf("\t\t(object encrypted)\n"); } *print_header = B_TRUE; } if (verbosity >= 5) { if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE); (void) printf("\nSpill block: %s\n", blkbuf); } dump_indirect(dn); } if (verbosity >= 5) { /* * Report the list of segments that comprise the object. 
*/ uint64_t start = 0; uint64_t end; uint64_t blkfill = 1; int minlvl = 1; if (dn->dn_type == DMU_OT_DNODE) { minlvl = 0; blkfill = DNODES_PER_BLOCK; } for (;;) { char segsize[32]; /* make sure nicenum has enough space */ _Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ, "segsize truncated"); error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) break; end = start; error = dnode_next_offset(dn, DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); zdb_nicenum(end - start, segsize, sizeof (segsize)); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start, (u_longlong_t)end, segsize); if (error) break; start = end; } } out: if (db != NULL) dmu_buf_rele(db, FTAG); if (dnode_held) dnode_rele(dn, FTAG); } static void count_dir_mos_objects(dsl_dir_t *dd) { mos_obj_refd(dd->dd_object); mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_clones); /* * The dd_crypto_obj can be referenced by multiple dsl_dir's. * Ignore the references after the first one. */ mos_obj_refd_multiple(dd->dd_crypto_obj); } static void count_ds_mos_objects(dsl_dataset_t *ds) { mos_obj_refd(ds->ds_object); mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); mos_obj_refd(ds->ds_bookmarks_obj); if (!dsl_dataset_is_snapshot(ds)) { count_dir_mos_objects(ds->ds_dir); } } static const char *const objset_types[DMU_OST_NUMTYPES] = { "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; /* * Parse a string denoting a range of object IDs of the form * [:[:flags]], and store the results in zor. * Return 0 on success. On error, return 1 and update the msg * pointer to point to a descriptive error message. 
*/
/*
 * Forms handled by the code below:
 *   "<id>"                   a single object; start and end are equal and
 *                            zor_flags is not written
 *   "<start>:<end>"          a range; zor_flags is set to ZOR_FLAG_ALL_TYPES
 *   "<start>:<end>:<flags>"  a range restricted by flag characters, each
 *                            looked up in flagbits[]; a '-' prefix clears
 *                            the bit of the character that follows it
 *
 * Tokenizing is done on a private copy, so the caller's string is not
 * modified.
 *
 * NOTE(review): object IDs go through ZDB_MAP_OBJECT_ID() for the single-ID
 * form and for the form with flags, but not for the plain "<start>:<end>"
 * form, which jumps straight to cleanup. Confirm whether that asymmetry is
 * intended.
 */
static int
parse_object_range(char *range, zopt_object_range_t *zor, const char **msg)
{
	uint64_t flags = 0;
	char *p, *s, *dup, *flagstr, *tmp = NULL;
	size_t len;
	int i;
	int rc = 0;

	/* No colon at all: a single object ID. */
	if (strchr(range, ':') == NULL) {
		zor->zor_obj_start = strtoull(range, &p, 0);
		if (*p != '\0') {
			*msg = "Invalid characters in object ID";
			rc = 1;
		}
		zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
		zor->zor_obj_end = zor->zor_obj_start;
		return (rc);
	}

	if (strchr(range, ':') == range) {
		*msg = "Invalid leading colon";
		rc = 1;
		return (rc);
	}

	/*
	 * With a colon present that is neither first nor last, the string
	 * is guaranteed to yield at least two tokens below.
	 */
	len = strlen(range);
	if (range[len - 1] == ':') {
		*msg = "Invalid trailing colon";
		rc = 1;
		return (rc);
	}

	dup = strdup(range);
	if (dup == NULL) {
		/* Without the copy there is nothing safe to tokenize. */
		*msg = "Out of memory";
		return (1);
	}

	s = strtok_r(dup, ":", &tmp);
	zor->zor_obj_start = strtoull(s, &p, 0);
	if (*p != '\0') {
		*msg = "Invalid characters in start object ID";
		rc = 1;
		goto out;
	}

	s = strtok_r(NULL, ":", &tmp);
	zor->zor_obj_end = strtoull(s, &p, 0);
	if (*p != '\0') {
		*msg = "Invalid characters in end object ID";
		rc = 1;
		goto out;
	}

	if (zor->zor_obj_start > zor->zor_obj_end) {
		*msg = "Start object ID may not exceed end object ID";
		rc = 1;
		goto out;
	}

	/* Optional third field: the flags. A fourth field is an error. */
	s = strtok_r(NULL, ":", &tmp);
	if (s == NULL) {
		zor->zor_flags = ZOR_FLAG_ALL_TYPES;
		goto out;
	} else if (strtok_r(NULL, ":", &tmp) != NULL) {
		*msg = "Invalid colon-delimited field after flags";
		rc = 1;
		goto out;
	}

	flagstr = s;
	for (i = 0; flagstr[i]; i++) {
		int bit;
		boolean_t negation = (flagstr[i] == '-');

		if (negation) {
			/* The '-' applies to the next character. */
			i++;
			if (flagstr[i] == '\0') {
				*msg = "Invalid trailing negation operator";
				rc = 1;
				goto out;
			}
		}
		bit = flagbits[(uchar_t)flagstr[i]];
		if (bit == 0) {
			*msg = "Invalid flag";
			rc = 1;
			goto out;
		}
		if (negation)
			flags &= ~bit;
		else
			flags |= bit;
	}
	zor->zor_flags = flags;

	zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
	zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end);

out:
	free(dup);
	return (rc);
}

static void
dump_objset(objset_t *os)
{
	dmu_objset_stats_t dds = { 0 };
	uint64_t object, object_count;
	uint64_t refdbytes, usedobjs, scratch;
	char numbuf[32];
char blkbuf[BP_SPRINTF_LEN + 20]; char osname[ZFS_MAX_DATASET_NAME_LEN]; const char *type = "UNKNOWN"; int verbosity = dump_opt['d']; boolean_t print_header; unsigned i; int error; uint64_t total_slots_used = 0; uint64_t max_slot_used = 0; uint64_t dnode_slots; uint64_t obj_start; uint64_t obj_end; uint64_t flags; /* make sure nicenum has enough space */ _Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated"); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); print_header = B_TRUE; if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; usedobjs = BP_GET_FILL(os->os_rootbp); refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); if (verbosity >= 4) { (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); (void) snprintf_blkptr(blkbuf + strlen(blkbuf), sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } dmu_objset_name(os, osname); (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " "%s, %llu objects%s%s\n", osname, type, (u_longlong_t)dmu_objset_id(os), (u_longlong_t)dds.dds_creation_txg, numbuf, (u_longlong_t)usedobjs, blkbuf, (dds.dds_inconsistent) ? 
" (inconsistent)" : ""); for (i = 0; i < zopt_object_args; i++) { obj_start = zopt_object_ranges[i].zor_obj_start; obj_end = zopt_object_ranges[i].zor_obj_end; flags = zopt_object_ranges[i].zor_flags; object = obj_start; if (object == 0 || obj_start == obj_end) dump_object(os, object, verbosity, &print_header, NULL, flags); else object--; while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) && object <= obj_end) { dump_object(os, object, verbosity, &print_header, NULL, flags); } } if (zopt_object_args > 0) { (void) printf("\n"); return; } if (dump_opt['i'] != 0 || verbosity >= 2) dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) { dsl_dataset_t *ds = dmu_objset_ds(os); dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && !dmu_objset_is_snapshot(os)) { dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); if (verify_dd_livelist(os) != 0) fatal("livelist is incorrect"); } if (dsl_dataset_remap_deadlist_exists(ds)) { (void) printf("ds_remap_deadlist:\n"); dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); } count_ds_mos_objects(ds); } if (dmu_objset_ds(os) != NULL) dump_bookmarks(os, verbosity); if (verbosity < 2) return; if (BP_IS_HOLE(os->os_rootbp)) return; dump_object(os, 0, verbosity, &print_header, NULL, 0); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, NULL, 0); dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, NULL, 0); } if (DMU_PROJECTUSED_DNODE(os) != NULL && DMU_PROJECTUSED_DNODE(os)->dn_type != 0) dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, &print_header, NULL, 0); object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { dump_object(os, object, verbosity, &print_header, &dnode_slots, 0); object_count++; total_slots_used += dnode_slots; max_slot_used = object + dnode_slots - 1; } (void) printf("\n"); (void) printf(" Dnode 
slots:\n"); (void) printf("\tTotal used: %10llu\n", (u_longlong_t)total_slots_used); (void) printf("\tMax used: %10llu\n", (u_longlong_t)max_slot_used); (void) printf("\tPercent empty: %10lf\n", (double)(max_slot_used - total_slots_used)*100 / (double)max_slot_used); (void) printf("\n"); if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); } ASSERT3U(object_count, ==, usedobjs); if (leaked_objects != 0) { (void) printf("%d potentially leaked objects detected\n", leaked_objects); leaked_objects = 0; } } static void dump_uberblock(uberblock_t *ub, const char *header, const char *footer) { time_t timestamp = ub->ub_timestamp; (void) printf("%s", header ? header : ""); (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, ctime(×tamp)); char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\tbp = %s\n", blkbuf); (void) printf("\tmmp_magic = %016llx\n", (u_longlong_t)ub->ub_mmp_magic); if (MMP_VALID(ub)) { (void) printf("\tmmp_delay = %0llu\n", (u_longlong_t)ub->ub_mmp_delay); if (MMP_SEQ_VALID(ub)) (void) printf("\tmmp_seq = %u\n", (unsigned int) MMP_SEQ(ub)); if (MMP_FAIL_INT_VALID(ub)) (void) printf("\tmmp_fail = %u\n", (unsigned int) MMP_FAIL_INT(ub)); if (MMP_INTERVAL_VALID(ub)) (void) printf("\tmmp_write = %u\n", (unsigned int) MMP_INTERVAL(ub)); /* After MMP_* to make summarize_uberblock_mmp cleaner */ (void) printf("\tmmp_valid = %x\n", (unsigned int) ub->ub_mmp_config & 0xFF); } if (dump_opt['u'] >= 4) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf("\tcheckpoint_txg = %llu\n", 
(u_longlong_t)ub->ub_checkpoint_txg); (void) printf("\traidz_reflow state=%u off=%llu\n", (int)RRSS_GET_STATE(ub), (u_longlong_t)RRSS_GET_OFFSET(ub)); (void) printf("%s", footer ? footer : ""); } static void dump_config(spa_t *spa) { dmu_buf_t *db; size_t nvsize = 0; int error = 0; error = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object, FTAG, &db); if (error == 0) { nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); dump_packed_nvlist(spa->spa_meta_objset, spa->spa_config_object, (void *)&nvsize, 1); } else { (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", (u_longlong_t)spa->spa_config_object, error); } } static void dump_cachefile(const char *cachefile) { int fd; struct stat64 statbuf; char *buf; nvlist_t *config; if ((fd = open64(cachefile, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", cachefile, strerror(errno)); zdb_exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", cachefile, strerror(errno)); zdb_exit(1); } if ((buf = malloc(statbuf.st_size)) == NULL) { (void) fprintf(stderr, "failed to allocate %llu bytes\n", (u_longlong_t)statbuf.st_size); zdb_exit(1); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) fprintf(stderr, "failed to read %llu bytes\n", (u_longlong_t)statbuf.st_size); zdb_exit(1); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { (void) fprintf(stderr, "failed to unpack nvlist\n"); zdb_exit(1); } free(buf); dump_nvlist(config, 0); nvlist_free(config); } /* * ZFS label nvlist stats */ typedef struct zdb_nvl_stats { int zns_list_count; int zns_leaf_count; size_t zns_leaf_largest; size_t zns_leaf_total; nvlist_t *zns_string; nvlist_t *zns_uint64; nvlist_t *zns_boolean; } zdb_nvl_stats_t; static void collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats) { nvlist_t *list, **array; nvpair_t *nvp = NULL; const char *name; uint_t i, items; stats->zns_list_count++; while ((nvp = 
nvlist_next_nvpair(nvl, nvp)) != NULL) { name = nvpair_name(nvp); switch (nvpair_type(nvp)) { case DATA_TYPE_STRING: fnvlist_add_string(stats->zns_string, name, fnvpair_value_string(nvp)); break; case DATA_TYPE_UINT64: fnvlist_add_uint64(stats->zns_uint64, name, fnvpair_value_uint64(nvp)); break; case DATA_TYPE_BOOLEAN: fnvlist_add_boolean(stats->zns_boolean, name); break; case DATA_TYPE_NVLIST: if (nvpair_value_nvlist(nvp, &list) == 0) collect_nvlist_stats(list, stats); break; case DATA_TYPE_NVLIST_ARRAY: if (nvpair_value_nvlist_array(nvp, &array, &items) != 0) break; for (i = 0; i < items; i++) { collect_nvlist_stats(array[i], stats); /* collect stats on leaf vdev */ if (strcmp(name, "children") == 0) { size_t size; (void) nvlist_size(array[i], &size, NV_ENCODE_XDR); stats->zns_leaf_total += size; if (size > stats->zns_leaf_largest) stats->zns_leaf_largest = size; stats->zns_leaf_count++; } } break; default: (void) printf("skip type %d!\n", (int)nvpair_type(nvp)); } } } static void dump_nvlist_stats(nvlist_t *nvl, size_t cap) { zdb_nvl_stats_t stats = { 0 }; size_t size, sum = 0, total; size_t noise; /* requires nvlist with non-unique names for stat collection */ VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0)); VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0)); VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0)); VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR)); (void) printf("\n\nZFS Label NVList Config Stats:\n"); VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR)); (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n", (int)total, (int)(cap - total), 100.0 * total / cap); collect_nvlist_stats(nvl, &stats); VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:", (int)fnvlist_num_pairs(stats.zns_uint64), (int)size, 100.0 * size / total); VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d 
bytes (%5.2f%%)\n", "strings:", (int)fnvlist_num_pairs(stats.zns_string), (int)size, 100.0 * size / total); VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:", (int)fnvlist_num_pairs(stats.zns_boolean), (int)size, 100.0 * size / total); size = total - sum; /* treat remainder as nvlist overhead */ (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:", stats.zns_list_count, (int)size, 100.0 * size / total); if (stats.zns_leaf_count > 0) { size_t average = stats.zns_leaf_total / stats.zns_leaf_count; (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:", stats.zns_leaf_count, (int)average); (void) printf("%24d bytes largest\n", (int)stats.zns_leaf_largest); if (dump_opt['l'] >= 3 && average > 0) (void) printf(" space for %d additional leaf vdevs\n", (int)((cap - total) / average)); } (void) printf("\n"); nvlist_free(stats.zns_string); nvlist_free(stats.zns_uint64); nvlist_free(stats.zns_boolean); } typedef struct cksum_record { zio_cksum_t cksum; boolean_t labels[VDEV_LABELS]; avl_node_t link; } cksum_record_t; static int cksum_record_compare(const void *x1, const void *x2) { const cksum_record_t *l = (cksum_record_t *)x1; const cksum_record_t *r = (cksum_record_t *)x2; int arraysize = ARRAY_SIZE(l->cksum.zc_word); int difference = 0; for (int i = 0; i < arraysize; i++) { difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); if (difference) break; } return (difference); } static cksum_record_t * cksum_record_alloc(zio_cksum_t *cksum, int l) { cksum_record_t *rec; rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); rec->cksum = *cksum; rec->labels[l] = B_TRUE; return (rec); } static cksum_record_t * cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum) { cksum_record_t lookup = { .cksum = *cksum }; avl_index_t where; return (avl_find(tree, &lookup, &where)); } static cksum_record_t * cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l) 
{ cksum_record_t *rec; rec = cksum_record_lookup(tree, cksum); if (rec) { rec->labels[l] = B_TRUE; } else { rec = cksum_record_alloc(cksum, l); avl_add(tree, rec); } return (rec); } static int first_label(cksum_record_t *rec) { for (int i = 0; i < VDEV_LABELS; i++) if (rec->labels[i]) return (i); return (-1); } static void print_label_numbers(const char *prefix, const cksum_record_t *rec) { fputs(prefix, stdout); for (int i = 0; i < VDEV_LABELS; i++) if (rec->labels[i] == B_TRUE) printf("%d ", i); putchar('\n'); } #define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) typedef struct zdb_label { vdev_label_t label; uint64_t label_offset; nvlist_t *config_nv; cksum_record_t *config; cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; boolean_t header_printed; boolean_t read_failed; boolean_t cksum_valid; } zdb_label_t; static void print_label_header(zdb_label_t *label, int l) { if (dump_opt['q']) return; if (label->header_printed == B_TRUE) return; (void) printf("------------------------------------\n"); (void) printf("LABEL %d %s\n", l, label->cksum_valid ? 
"" : "(Bad label cksum)"); (void) printf("------------------------------------\n"); label->header_printed = B_TRUE; } static void print_l2arc_header(void) { (void) printf("------------------------------------\n"); (void) printf("L2ARC device header\n"); (void) printf("------------------------------------\n"); } static void print_l2arc_log_blocks(void) { (void) printf("------------------------------------\n"); (void) printf("L2ARC device log blocks\n"); (void) printf("------------------------------------\n"); } static void dump_l2arc_log_entries(uint64_t log_entries, l2arc_log_ent_phys_t *le, uint64_t i) { for (int j = 0; j < log_entries; j++) { dva_t dva = le[j].le_dva; (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " "vdev: %llu, offset: %llu\n", (u_longlong_t)i, j + 1, (u_longlong_t)DVA_GET_ASIZE(&dva), (u_longlong_t)DVA_GET_VDEV(&dva), (u_longlong_t)DVA_GET_OFFSET(&dva)); (void) printf("|\t\t\t\tbirth: %llu\n", (u_longlong_t)le[j].le_birth); (void) printf("|\t\t\t\tlsize: %llu\n", (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop)); (void) printf("|\t\t\t\tpsize: %llu\n", (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop)); (void) printf("|\t\t\t\tcompr: %llu\n", (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop)); (void) printf("|\t\t\t\tcomplevel: %llu\n", (u_longlong_t)(&le[j])->le_complevel); (void) printf("|\t\t\t\ttype: %llu\n", (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop)); (void) printf("|\t\t\t\tprotected: %llu\n", (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop)); (void) printf("|\t\t\t\tprefetch: %llu\n", (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); (void) printf("|\t\t\t\taddress: %llu\n", (u_longlong_t)le[j].le_daddr); (void) printf("|\t\t\t\tARC state: %llu\n", (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop)); (void) printf("|\n"); } (void) printf("\n"); } static void dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps) { (void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr); (void) printf("|\t\tpayload_asize: 
%llu\n", (u_longlong_t)lbps->lbp_payload_asize); (void) printf("|\t\tpayload_start: %llu\n", (u_longlong_t)lbps->lbp_payload_start); (void) printf("|\t\tlsize: %llu\n", (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop)); (void) printf("|\t\tasize: %llu\n", (u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop)); (void) printf("|\t\tcompralgo: %llu\n", (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop)); (void) printf("|\t\tcksumalgo: %llu\n", (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop)); (void) printf("|\n\n"); } static void dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr, l2arc_dev_hdr_phys_t *rebuild) { l2arc_log_blk_phys_t this_lb; uint64_t asize; l2arc_log_blkptr_t lbps[2]; zio_cksum_t cksum; int failed = 0; l2arc_dev_t dev; if (!dump_opt['q']) print_l2arc_log_blocks(); memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); dev.l2ad_evict = l2dhdr->dh_evict; dev.l2ad_start = l2dhdr->dh_start; dev.l2ad_end = l2dhdr->dh_end; if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) { /* no log blocks to read */ if (!dump_opt['q']) { (void) printf("No log blocks to read\n"); (void) printf("\n"); } return; } else { dev.l2ad_hand = lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); } dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); for (;;) { if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) break; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { if (!dump_opt['q']) { (void) printf("Error while reading next log " "block\n\n"); } break; } fletcher_4_native_varsize(&this_lb, asize, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { failed++; if (!dump_opt['q']) { (void) printf("Invalid cksum\n"); dump_l2arc_log_blkptr(&lbps[0]); } break; } switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { case ZIO_COMPRESS_OFF: break; default: { abd_t *abd = abd_alloc_linear(asize, B_TRUE); abd_copy_from_buf_off(abd, &this_lb, 0, asize); 
abd_t dabd; abd_get_from_buf_struct(&dabd, &this_lb, sizeof (this_lb)); int err = zio_decompress_data(L2BLK_GET_COMPRESS( (&lbps[0])->lbp_prop), abd, &dabd, asize, sizeof (this_lb), NULL); abd_free(&dabd); abd_free(abd); if (err != 0) { (void) printf("L2ARC block decompression " "failed\n"); goto out; } break; } } if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(&this_lb, sizeof (this_lb)); if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { if (!dump_opt['q']) (void) printf("Invalid log block magic\n\n"); break; } rebuild->dh_lb_count++; rebuild->dh_lb_asize += asize; if (dump_opt['l'] > 1 && !dump_opt['q']) { (void) printf("lb[%4llu]\tmagic: %llu\n", (u_longlong_t)rebuild->dh_lb_count, (u_longlong_t)this_lb.lb_magic); dump_l2arc_log_blkptr(&lbps[0]); } if (dump_opt['l'] > 2 && !dump_opt['q']) dump_l2arc_log_entries(l2dhdr->dh_log_entries, this_lb.lb_entries, rebuild->dh_lb_count); if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev.l2ad_evict) && !dev.l2ad_first) break; lbps[0] = lbps[1]; lbps[1] = this_lb.lb_prev_lbp; } out: if (!dump_opt['q']) { (void) printf("log_blk_count:\t %llu with valid cksum\n", (u_longlong_t)rebuild->dh_lb_count); (void) printf("\t\t %d with invalid cksum\n", failed); (void) printf("log_blk_asize:\t %llu\n\n", (u_longlong_t)rebuild->dh_lb_asize); } } static int dump_l2arc_header(int fd) { l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0}; int error = B_FALSE; if (pread64(fd, &l2dhdr, sizeof (l2dhdr), VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { error = B_TRUE; } else { if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr)); if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC) error = B_TRUE; } if (error) { (void) printf("L2ARC device header not found\n\n"); /* Do not return an error here for backward compatibility */ return (0); } else if (!dump_opt['q']) { print_l2arc_header(); (void) printf(" magic: %llu\n", (u_longlong_t)l2dhdr.dh_magic); 
(void) printf(" version: %llu\n", (u_longlong_t)l2dhdr.dh_version); (void) printf(" pool_guid: %llu\n", (u_longlong_t)l2dhdr.dh_spa_guid); (void) printf(" flags: %llu\n", (u_longlong_t)l2dhdr.dh_flags); (void) printf(" start_lbps[0]: %llu\n", (u_longlong_t) l2dhdr.dh_start_lbps[0].lbp_daddr); (void) printf(" start_lbps[1]: %llu\n", (u_longlong_t) l2dhdr.dh_start_lbps[1].lbp_daddr); (void) printf(" log_blk_ent: %llu\n", (u_longlong_t)l2dhdr.dh_log_entries); (void) printf(" start: %llu\n", (u_longlong_t)l2dhdr.dh_start); (void) printf(" end: %llu\n", (u_longlong_t)l2dhdr.dh_end); (void) printf(" evict: %llu\n", (u_longlong_t)l2dhdr.dh_evict); (void) printf(" lb_asize_refcount: %llu\n", (u_longlong_t)l2dhdr.dh_lb_asize); (void) printf(" lb_count_refcount: %llu\n", (u_longlong_t)l2dhdr.dh_lb_count); (void) printf(" trim_action_time: %llu\n", (u_longlong_t)l2dhdr.dh_trim_action_time); (void) printf(" trim_state: %llu\n\n", (u_longlong_t)l2dhdr.dh_trim_state); } dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild); /* * The total aligned size of log blocks and the number of log blocks * reported in the header of the device may be less than what zdb * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). * This happens because dump_l2arc_log_blocks() lacks the memory * pressure valve that l2arc_rebuild() has. Thus, if we are on a system * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize * and dh_lb_count will be lower to begin with than what exists on the * device. This is normal and zdb should not exit with an error. The * opposite case should never happen though, the values reported in the * header should never be higher than what dump_l2arc_log_blocks() and * l2arc_rebuild() report. If this happens there is a leak in the * accounting of log blocks. 
*/ if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || l2dhdr.dh_lb_count > rebuild.dh_lb_count) return (1); return (0); } static void dump_config_from_label(zdb_label_t *label, size_t buflen, int l) { if (dump_opt['q']) return; if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) return; print_label_header(label, l); dump_nvlist(label->config_nv, 4); print_label_numbers(" labels = ", label->config); if (dump_opt['l'] >= 2) dump_nvlist_stats(label->config_nv, buflen); } #define ZDB_MAX_UB_HEADER_SIZE 32 static void dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) { vdev_t vd; char header[ZDB_MAX_UB_HEADER_SIZE]; vd.vdev_ashift = ashift; vd.vdev_top = &vd; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); uberblock_t *ub = (void *)((char *)&label->label + uoff); cksum_record_t *rec = label->uberblocks[i]; if (rec == NULL) { if (dump_opt['u'] >= 2) { print_label_header(label, label_num); (void) printf(" Uberblock[%d] invalid\n", i); } continue; } if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) continue; if ((dump_opt['u'] < 4) && (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) continue; print_label_header(label, label_num); (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, " Uberblock[%d]\n", i); dump_uberblock(ub, header, ""); print_label_numbers(" labels = ", rec); } } static char curpath[PATH_MAX]; /* * Iterate through the path components, recursively passing * current one's obj and remaining path until we find the obj * for the last one. 
*/
static int
dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj)
{
	int err;
	boolean_t header = B_TRUE;
	uint64_t child_obj;
	char *s;
	dmu_buf_t *db;
	dmu_object_info_t doi;

	/*
	 * Cut the path at the first '/', in place, so that name is just the
	 * current component; s + 1 is then the remainder of the path.
	 */
	if ((s = strchr(name, '/')) != NULL)
		*s = '\0';
	err = zap_lookup(os, obj, name, 8, 1, &child_obj);

	/* Appended before the error check so the message shows the path. */
	(void) strlcat(curpath, name, sizeof (curpath));

	if (err != 0) {
		(void) fprintf(stderr, "failed to lookup %s: %s\n",
		    curpath, strerror(err));
		return (err);
	}

	child_obj = ZFS_DIRENT_OBJ(child_obj);
	err = sa_buf_hold(os, child_obj, FTAG, &db);
	if (err != 0) {
		(void) fprintf(stderr,
		    "failed to get SA dbuf for obj %llu: %s\n",
		    (u_longlong_t)child_obj, strerror(err));
		return (EINVAL);
	}
	dmu_object_info_from_db(db, &doi);
	sa_buf_rele(db, FTAG);

	if (doi.doi_bonus_type != DMU_OT_SA &&
	    doi.doi_bonus_type != DMU_OT_ZNODE) {
		(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
		    doi.doi_bonus_type, (u_longlong_t)child_obj);
		return (EINVAL);
	}

	if (dump_opt['v'] > 6) {
		(void) printf("obj=%llu %s type=%d bonustype=%d\n",
		    (u_longlong_t)child_obj, curpath, doi.doi_type,
		    doi.doi_bonus_type);
	}

	(void) strlcat(curpath, "/", sizeof (curpath));

	switch (doi.doi_type) {
	case DMU_OT_DIRECTORY_CONTENTS:
		/* More components follow: descend into this directory. */
		if (s != NULL && *(s + 1) != '\0')
			return (dump_path_impl(os, child_obj, s + 1, retobj));
		zfs_fallthrough;
	case DMU_OT_PLAIN_FILE_CONTENTS:
		/* Final object: hand back its number, or dump it. */
		if (retobj != NULL) {
			*retobj = child_obj;
		} else {
			dump_object(os, child_obj, dump_opt['v'], &header,
			    NULL, 0);
		}
		return (0);
	default:
		/*
		 * doi describes child_obj, so that is the object to name
		 * here, not the parent directory obj.
		 */
		(void) fprintf(stderr, "object %llu has non-file/directory "
		    "type %d\n", (u_longlong_t)child_obj, doi.doi_type);
		break;
	}

	return (EINVAL);
}

/*
 * Dump the blocks for the object specified by path inside the dataset.
*/ static int dump_path(char *ds, char *path, uint64_t *retobj) { int err; objset_t *os; uint64_t root_obj; err = open_objset(ds, FTAG, &os); if (err != 0) return (err); err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); if (err != 0) { (void) fprintf(stderr, "can't lookup root znode: %s\n", strerror(err)); close_objset(os, FTAG); return (EINVAL); } (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); err = dump_path_impl(os, root_obj, path, retobj); close_objset(os, FTAG); return (err); } static int dump_backup_bytes(objset_t *os, void *buf, int len, void *arg) { const char *p = (const char *)buf; ssize_t nwritten; (void) os; (void) arg; /* Write the data out, handling short writes and signals. */ while ((nwritten = write(STDOUT_FILENO, p, len)) < len) { if (nwritten < 0) { if (errno == EINTR) continue; return (errno); } p += nwritten; len -= nwritten; } return (0); } static void dump_backup(const char *pool, uint64_t objset_id, const char *flagstr) { boolean_t embed = B_FALSE; boolean_t large_block = B_FALSE; boolean_t compress = B_FALSE; boolean_t raw = B_FALSE; const char *c; for (c = flagstr; c != NULL && *c != '\0'; c++) { switch (*c) { case 'e': embed = B_TRUE; break; case 'L': large_block = B_TRUE; break; case 'c': compress = B_TRUE; break; case 'w': raw = B_TRUE; break; default: fprintf(stderr, "dump_backup: invalid flag " "'%c'\n", *c); return; } } if (isatty(STDOUT_FILENO)) { fprintf(stderr, "dump_backup: stream cannot be written " "to a terminal\n"); return; } offset_t off = 0; dmu_send_outparams_t out = { .dso_outfunc = dump_backup_bytes, .dso_dryrun = B_FALSE, }; int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed, large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO, &off, &out); if (err != 0) { fprintf(stderr, "dump_backup: dmu_send_obj: %s\n", strerror(err)); return; } } static int zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) { int err = 0; uint64_t size, readsize, oursize, 
offset; ssize_t writesize; sa_handle_t *hdl; (void) printf("Copying object %" PRIu64 " to file %s\n", srcobj, destfile); VERIFY3P(os, ==, sa_os); if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) { (void) printf("Failed to get handle for SA znode\n"); return (err); } if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) { (void) sa_handle_destroy(hdl); return (err); } (void) sa_handle_destroy(hdl); (void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj, size); if (size == 0) { return (EINVAL); } int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd == -1) return (errno); /* * We cap the size at 1 mebibyte here to prevent * allocation failures and nigh-infinite printing if the * object is extremely large. */ oursize = MIN(size, 1 << 20); offset = 0; char *buf = kmem_alloc(oursize, KM_NOSLEEP); if (buf == NULL) { (void) close(fd); return (ENOMEM); } while (offset < size) { readsize = MIN(size - offset, 1 << 20); err = dmu_read(os, srcobj, offset, readsize, buf, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(buf, oursize); (void) close(fd); return (err); } if (dump_opt['v'] > 3) { (void) printf("Read offset=%" PRIu64 " size=%" PRIu64 " error=%d\n", offset, readsize, err); } writesize = write(fd, buf, readsize); if (writesize < 0) { err = errno; break; } else if (writesize != readsize) { /* Incomplete write */ (void) fprintf(stderr, "Short write, only wrote %llu of" " %" PRIu64 " bytes, exiting...\n", (u_longlong_t)writesize, readsize); break; } offset += readsize; } (void) close(fd); if (buf != NULL) kmem_free(buf, oursize); return (err); } static boolean_t label_cksum_valid(vdev_label_t *label, uint64_t offset) { zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; zio_cksum_t expected_cksum; zio_cksum_t actual_cksum; zio_cksum_t verifier; zio_eck_t *eck; int byteswap; void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys); eck = (zio_eck_t *)((char *)(data) 
+ VDEV_PHYS_SIZE) - 1; offset += offsetof(vdev_label_t, vl_vdev_phys); ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE); ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum); abd_free(abd); if (byteswap) byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (B_TRUE); return (B_FALSE); } static int dump_label(const char *dev) { char path[MAXPATHLEN]; zdb_label_t labels[VDEV_LABELS] = {{{{0}}}}; uint64_t psize, ashift, l2cache; struct stat64 statbuf; boolean_t config_found = B_FALSE; boolean_t error = B_FALSE; boolean_t read_l2arc_header = B_FALSE; avl_tree_t config_tree; avl_tree_t uberblock_tree; void *node, *cookie; int fd; /* * Check if we were given absolute path and use it as is. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. 
*/ (void) strlcpy(path, dev, sizeof (path)); if (dev[0] != '/' && stat64(path, &statbuf) != 0) { int error; error = zfs_resolve_shortname(dev, path, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(path)) { if (zfs_append_partition(path, MAXPATHLEN) == -1) error = ENOENT; } if (error || (stat64(path, &statbuf) != 0)) { (void) printf("failed to find device %s, try " "specifying absolute path instead\n", dev); return (1); } } if ((fd = open64(path, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", path, strerror(errno)); zdb_exit(1); } if (fstat64_blk(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", path, strerror(errno)); (void) close(fd); zdb_exit(1); } if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) (void) printf("failed to invalidate cache '%s' : %s\n", path, strerror(errno)); avl_create(&config_tree, cksum_record_compare, sizeof (cksum_record_t), offsetof(cksum_record_t, link)); avl_create(&uberblock_tree, cksum_record_compare, sizeof (cksum_record_t), offsetof(cksum_record_t, link)); psize = statbuf.st_size; psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t); ashift = SPA_MINBLOCKSHIFT; /* * 1. Read the label from disk * 2. Verify label cksum * 3. Unpack the configuration and insert in config tree. * 4. Traverse all uberblocks and insert in uberblock tree. 
*/ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; char *buf = label->label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); nvlist_t *config; cksum_record_t *rec; zio_cksum_t cksum; vdev_t vd; label->label_offset = vdev_label_offset(psize, l, 0); if (pread64(fd, &label->label, sizeof (label->label), label->label_offset) != sizeof (label->label)) { if (!dump_opt['q']) (void) printf("failed to read label %d\n", l); label->read_failed = B_TRUE; error = B_TRUE; continue; } label->read_failed = B_FALSE; label->cksum_valid = label_cksum_valid(&label->label, label->label_offset); if (nvlist_unpack(buf, buflen, &config, 0) == 0) { nvlist_t *vdev_tree = NULL; size_t size; if ((nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) ashift = SPA_MINBLOCKSHIFT; if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) size = buflen; /* If the device is a cache device read the header. */ if (!read_l2arc_header) { if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && l2cache == POOL_STATE_L2CACHE) { read_l2arc_header = B_TRUE; } } fletcher_4_native_varsize(buf, size, &cksum); rec = cksum_record_insert(&config_tree, &cksum, l); label->config = rec; label->config_nv = config; config_found = B_TRUE; } else { error = B_TRUE; } vd.vdev_ashift = ashift; vd.vdev_top = &vd; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); uberblock_t *ub = (void *)((char *)label + uoff); if (uberblock_verify(ub)) continue; fletcher_4_native_varsize(ub, sizeof (*ub), &cksum); rec = cksum_record_insert(&uberblock_tree, &cksum, l); label->uberblocks[i] = rec; } } /* * Dump the label and uberblocks. 
*/ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); if (label->read_failed == B_TRUE) continue; if (label->config_nv) { dump_config_from_label(label, buflen, l); } else { if (!dump_opt['q']) (void) printf("failed to unpack label %d\n", l); } if (dump_opt['u']) dump_label_uberblocks(label, ashift, l); nvlist_free(label->config_nv); } /* * Dump the L2ARC header, if existent. */ if (read_l2arc_header) error |= dump_l2arc_header(fd); cookie = NULL; while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); cookie = NULL; while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); avl_destroy(&config_tree); avl_destroy(&uberblock_tree); (void) close(fd); return (config_found == B_FALSE ? 2 : (error == B_TRUE ? 1 : 0)); } static uint64_t dataset_feature_count[SPA_FEATURES]; static uint64_t global_feature_count[SPA_FEATURES]; static uint64_t remap_deadlist_count = 0; static int dump_one_objset(const char *dsname, void *arg) { (void) arg; int error; objset_t *os; spa_feature_t f; error = open_objset(dsname, FTAG, &os); if (error != 0) return (0); for (f = 0; f < SPA_FEATURES; f++) { if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) continue; ASSERT(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); dataset_feature_count[f]++; } if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { remap_deadlist_count++; } for (dsl_bookmark_node_t *dbn = avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); if (dbn->dbn_phys.zbm_redaction_obj != 0) { global_feature_count[ SPA_FEATURE_REDACTION_BOOKMARKS]++; objset_t *mos = os->os_spa->spa_meta_objset; dnode_t *rl; VERIFY0(dnode_hold(mos, dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl)); if (rl->dn_have_spill) { global_feature_count[ 
SPA_FEATURE_REDACTION_LIST_SPILL]++; } } if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; } if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && !dmu_objset_is_snapshot(os)) { global_feature_count[SPA_FEATURE_LIVELIST]++; } dump_objset(os); close_objset(os, FTAG); fuid_table_destroy(); return (0); } /* * Block statistics. */ #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { uint64_t zb_asize; uint64_t zb_lsize; uint64_t zb_psize; uint64_t zb_count; uint64_t zb_gangs; uint64_t zb_ditto_samevdev; uint64_t zb_ditto_same_ms; uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; /* * Extended object types to report deferred frees and dedup auto-ditto blocks. */ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static const char *zdb_ot_extname[] = { "deferred free", "dedup ditto", "other", "Total", }; #define ZB_TOTAL DN_MAX_LEVELS #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) typedef struct zdb_brt_entry { dva_t zbre_dva; uint64_t zbre_refcount; avl_node_t zbre_node; } zdb_brt_entry_t; typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; uint64_t zcb_clone_asize; uint64_t zcb_clone_blocks; uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; uint64_t zcb_psize_len[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_len[SPA_MAX_FOR_16M]; uint64_t zcb_asize_len[SPA_MAX_FOR_16M]; uint64_t zcb_psize_total; uint64_t zcb_lsize_total; uint64_t zcb_asize_total; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] [BPE_PAYLOAD_SIZE + 1]; uint64_t zcb_start; hrtime_t 
zcb_lastprint;
	uint64_t zcb_totalasize;
	uint64_t zcb_errors[256];
	int zcb_readfails;
	int zcb_haderrors;
	spa_t *zcb_spa;
	uint32_t **zcb_vd_obsolete_counts;
	avl_tree_t zcb_brt;
	boolean_t zcb_brt_is_active;
} zdb_cb_t;

/* test if two DVA offsets from same vdev are within the same metaslab */
static boolean_t
same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
{
	vdev_t *vd = vdev_lookup_top(spa, vdev);
	uint64_t ms_shift = vd->vdev_ms_shift;

	/* Offsets share a metaslab when their metaslab indices are equal. */
	return ((off1 >> ms_shift) == (off2 >> ms_shift));
}

/*
 * Used to simplify reporting of the histogram data.
 */
typedef struct one_histo {
	const char *name;
	uint64_t *count;
	uint64_t *len;
	uint64_t cumulative;
} one_histo_t;

/*
 * The number of separate histograms processed for psize, lsize and asize.
 */
#define	NUM_HISTO 3

/*
 * This routine will create a fixed column size output of three different
 * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M
 * the count, length and cumulative length of the psize, lsize and
 * asize blocks.
 *
 * All three types of blocks are listed on a single line
 *
 * By default the table is printed in nicenumber format (e.g. 123K) but
 * if the '-P' parameter is specified then the full raw number (parseable)
 * is printed out.
 */
static void
dump_size_histograms(zdb_cb_t *zcb)
{
	/*
	 * A temporary buffer that allows us to convert a number into
	 * a string using zdb_nicenumber to allow either raw or human
	 * readable numbers to be output.
	 */
	char numbuf[32];

	/*
	 * Define titles which are used in the headers of the tables
	 * printed by this routine.
	 */
	const char blocksize_title1[] = "block";
	const char blocksize_title2[] = "size";
	const char count_title[] = "Count";
	const char length_title[] = "Size";
	const char cumulative_title[] = "Cum.";

	/*
	 * Setup the histogram arrays (psize, lsize, and asize).
	 */
	one_histo_t parm_histo[NUM_HISTO];

	parm_histo[0].name = "psize";
	parm_histo[0].count = zcb->zcb_psize_count;
	parm_histo[0].len = zcb->zcb_psize_len;
	parm_histo[0].cumulative = 0;

	parm_histo[1].name = "lsize";
	parm_histo[1].count = zcb->zcb_lsize_count;
	parm_histo[1].len = zcb->zcb_lsize_len;
	parm_histo[1].cumulative = 0;

	parm_histo[2].name = "asize";
	parm_histo[2].count = zcb->zcb_asize_count;
	parm_histo[2].len = zcb->zcb_asize_len;
	parm_histo[2].cumulative = 0;


	(void) printf("\nBlock Size Histogram\n");
	/*
	 * Print the first line titles
	 */
	if (dump_opt['P'])
		(void) printf("\n%s\t", blocksize_title1);
	else
		(void) printf("\n%7s ", blocksize_title1);

	for (int j = 0; j < NUM_HISTO; j++) {
		if (dump_opt['P']) {
			if (j < NUM_HISTO - 1) {
				(void) printf("%s\t\t\t", parm_histo[j].name);
			} else {
				/* Don't print trailing spaces */
				(void) printf(" %s", parm_histo[j].name);
			}
		} else {
			if (j < NUM_HISTO - 1) {
				/* Left aligned strings in the output */
				(void) printf("%-7s ",
				    parm_histo[j].name);
			} else {
				/* Don't print trailing spaces */
				(void) printf("%s", parm_histo[j].name);
			}
		}
	}
	(void) printf("\n");

	/*
	 * Print the second line titles
	 */
	if (dump_opt['P']) {
		(void) printf("%s\t", blocksize_title2);
	} else {
		(void) printf("%7s ", blocksize_title2);
	}

	for (int i = 0; i < NUM_HISTO; i++) {
		if (dump_opt['P']) {
			(void) printf("%s\t%s\t%s\t",
			    count_title, length_title, cumulative_title);
		} else {
			(void) printf("%7s%7s%7s",
			    count_title, length_title, cumulative_title);
		}
	}
	(void) printf("\n");

	/*
	 * Print the rows
	 */
	for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) {

		/*
		 * Print the first column showing the blocksize
		 */
		zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf));

		if (dump_opt['P']) {
			printf("%s", numbuf);
		} else {
			printf("%7s:", numbuf);
		}

		/*
		 * Print the remaining set of 3 columns per size:
		 * for psize, lsize and asize
		 */
		for (int j = 0; j < NUM_HISTO; j++) {
			/* Running total of the length column up to this row. */
			parm_histo[j].cumulative += parm_histo[j].len[i];

			zdb_nicenum(parm_histo[j].count[i],
			    numbuf, sizeof (numbuf));
			if (dump_opt['P'])
				(void) printf("\t%s", numbuf);
			else
				(void) printf("%7s", numbuf);

			zdb_nicenum(parm_histo[j].len[i],
			    numbuf, sizeof (numbuf));
			if (dump_opt['P'])
				(void) printf("\t%s", numbuf);
			else
				(void) printf("%7s", numbuf);

			zdb_nicenum(parm_histo[j].cumulative,
			    numbuf, sizeof (numbuf));
			if (dump_opt['P'])
				(void) printf("\t%s", numbuf);
			else
				(void) printf("%7s", numbuf);
		}
		(void) printf("\n");
	}
}

/*
 * Account for one block pointer in the statistics held in `zcb` and,
 * unless leak tracking is disabled (dump_opt['L']) or the block was
 * already claimed through the dedup or clone paths, claim it.
 *
 * zcb   - callback state receiving the counters
 * zilog - if non-NULL, the block is first added to the ZIL bp tree and
 *         ignored when zil_bp_tree_add() returns non-zero
 * bp    - block pointer being counted
 * type  - statistics bucket; must be below ZDB_OT_TOTAL
 */
static void
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
    dmu_object_type_t type)
{
	int i;

	ASSERT(type < ZDB_OT_TOTAL);

	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
		return;

	/*
	 * This flag controls if we will issue a claim for the block while
	 * counting it, to ensure that all blocks are referenced in space maps.
	 * We don't issue claims if we're not doing leak tracking, because it's
	 * expensive if the user isn't interested. We also don't claim the
	 * second or later occurrences of cloned or dedup'd blocks, because we
	 * already claimed them the first time.
	 */
	boolean_t do_claim = !dump_opt['L'];

	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);

	/* Backing storage for the rewritten bp in the DDT_PHYS_FLAT case. */
	blkptr_t tempbp;
	if (BP_GET_DEDUP(bp)) {
		/*
		 * Dedup'd blocks are special. We need to count them, so we can
		 * later uncount them when reporting leaked space, and we must
		 * only claim them once.
		 *
		 * We use the existing dedup system to track what we've seen.
		 * The first time we see a block, we do a ddt_lookup() to see
		 * if it exists in the DDT. If we're doing leak tracking, we
		 * claim the block at this time.
		 *
		 * Each time we see a block, we reduce the refcount in the
		 * entry by one, and add to the size and count of dedup'd
		 * blocks to report at the end.
		 */

		ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);

		ddt_enter(ddt);

		/*
		 * Find the block. This will create the entry in memory, but
		 * we'll know if that happened by its refcount.
		 */
		ddt_entry_t *dde = ddt_lookup(ddt, bp);

		/*
		 * ddt_lookup() can return NULL if this block didn't exist
		 * in the DDT and creating it would take the DDT over its
		 * quota. Since we got the block from disk, it must exist in
		 * the DDT, so this can't happen. However, when unique entries
		 * are pruned, the dedup bit can be set with no corresponding
		 * entry in the DDT.
		 */
		if (dde == NULL) {
			ddt_exit(ddt);
			goto skipped;
		}

		/* Get the phys for this variant */
		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);

		/*
		 * This entry may have multiple sets of DVAs. We must claim
		 * each set the first time we see them in a real block on disk,
		 * or count them on subsequent occurrences. We don't have a
		 * convenient way to track the first time we see each variant,
		 * so we repurpose dde_io as a set of "seen" flag bits. We can
		 * do this safely in zdb because it never writes, so it will
		 * never have a writing zio for this block in that pointer.
		 */
		boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
		if (!seen)
			dde->dde_io =
			    (void *)(((uintptr_t)dde->dde_io) | (1 << v));

		/* Consume a reference for this block. */
		if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
			ddt_phys_decref(dde->dde_phys, v);

		/*
		 * If this entry has a single flat phys, it may have been
		 * extended with additional DVAs at some time in its life.
		 * This block might be from before it was fully extended, and
		 * so have fewer DVAs.
		 *
		 * If this is the first time we've seen this block, and we
		 * claimed it as-is, then we would miss the claim on some
		 * number of DVAs, which would then be seen as leaked.
		 *
		 * In all cases, if we've had fewer DVAs, then the asize would
		 * be too small, and would lead to the pool apparently using
		 * more space than allocated.
		 *
		 * To handle this, we copy the canonical set of DVAs from the
		 * entry back to the block pointer before we claim it.
		 */
		if (v == DDT_PHYS_FLAT) {
			ASSERT3U(BP_GET_BIRTH(bp), ==,
			    ddt_phys_birth(dde->dde_phys, v));
			tempbp = *bp;
			ddt_bp_fill(dde->dde_phys, v, &tempbp,
			    BP_GET_BIRTH(bp));
			bp = &tempbp;
		}

		if (seen) {
			/*
			 * The second or later time we see this block,
			 * it's a duplicate and we count it.
			 */
			zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
			zcb->zcb_dedup_blocks++;

			/* Already claimed, don't do it again. */
			do_claim = B_FALSE;
		}

		ddt_exit(ddt);
	} else if (zcb->zcb_brt_is_active &&
	    brt_maybe_exists(zcb->zcb_spa, bp)) {
		/*
		 * Cloned blocks are special. We need to count them, so we can
		 * later uncount them when reporting leaked space, and we must
		 * only claim them once.
		 *
		 * To do this, we keep our own in-memory BRT. For each block
		 * we haven't seen before, we look it up in the real BRT and
		 * if its there, we note it and its refcount then proceed as
		 * normal. If we see the block again, we count it as a clone
		 * and then give it no further consideration.
		 */
		zdb_brt_entry_t zbre_search, *zbre;
		avl_index_t where;

		/* Entries in the in-memory BRT are looked up by first DVA. */
		zbre_search.zbre_dva = bp->blk_dva[0];
		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
		if (zbre == NULL) {
			/* Not seen before; track it */
			uint64_t refcnt =
			    brt_entry_get_refcount(zcb->zcb_spa, bp);
			if (refcnt > 0) {
				zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
				    UMEM_NOFAIL);
				zbre->zbre_dva = bp->blk_dva[0];
				zbre->zbre_refcount = refcnt;
				avl_insert(&zcb->zcb_brt, zbre, where);
			}
		} else {
			/*
			 * Second or later occurrence, count it and take a
			 * refcount.
			 */
			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
			zcb->zcb_clone_blocks++;

			zbre->zbre_refcount--;
			if (zbre->zbre_refcount == 0) {
				avl_remove(&zcb->zcb_brt, zbre);
				umem_free(zbre, sizeof (zdb_brt_entry_t));
			}

			/* Already claimed, don't do it again. */
			do_claim = B_FALSE;
		}
	}

skipped:
	/*
	 * Update four buckets per block: (level, total), (level, type),
	 * (all levels, total) and (all levels, type).
	 */
	for (i = 0; i < 4; i++) {
		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
		int t = (i & 1) ? type : ZDB_OT_TOTAL;
		int equal;
		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];

		zb->zb_asize += BP_GET_ASIZE(bp);
		zb->zb_lsize += BP_GET_LSIZE(bp);
		zb->zb_psize += BP_GET_PSIZE(bp);
		zb->zb_count++;

		/*
		 * The histogram is only big enough to record blocks up to
		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
		 * "other", bucket.
		 */
		unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
		zb->zb_psize_histogram[idx]++;

		zb->zb_gangs += BP_COUNT_GANG(bp);

		/* Count ditto copies that share a vdev and/or a metaslab. */
		switch (BP_GET_NDVAS(bp)) {
		case 2:
			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) {
				zb->zb_ditto_samevdev++;

				if (same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[1])))
					zb->zb_ditto_same_ms++;
			}
			break;
		case 3:
			/* Number of DVA pairs that live on the same vdev. */
			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) +
			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2])) +
			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2]));
			if (equal != 0) {
				zb->zb_ditto_samevdev++;

				if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
				    DVA_GET_VDEV(&bp->blk_dva[1]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[1])))
					zb->zb_ditto_same_ms++;
				else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[2])))
					zb->zb_ditto_same_ms++;
				else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[1]),
				    DVA_GET_OFFSET(&bp->blk_dva[1]),
				    DVA_GET_OFFSET(&bp->blk_dva[2])))
					zb->zb_ditto_same_ms++;
			}
			break;
		}
	}

	spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);

	/* Embedded blocks only feed the embedded counters and stop here. */
	if (BP_IS_EMBEDDED(bp)) {
		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
		    [BPE_GET_PSIZE(bp)]++;
		return;
	}
	/*
	 * The binning histogram bins by powers of two up to
	 * SPA_MAXBLOCKSIZE rather than creating bins for
	 * every possible blocksize found in the pool.
*/ int bin = highbit64(BP_GET_PSIZE(bp)) - 1; zcb->zcb_psize_count[bin]++; zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); zcb->zcb_psize_total += BP_GET_PSIZE(bp); bin = highbit64(BP_GET_LSIZE(bp)) - 1; zcb->zcb_lsize_count[bin]++; zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); zcb->zcb_lsize_total += BP_GET_LSIZE(bp); bin = highbit64(BP_GET_ASIZE(bp)) - 1; zcb->zcb_asize_count[bin]++; zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); if (!do_claim) return; VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa, spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL))); } static void zdb_blkptr_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; int ioerr = zio->io_error; zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { char blkbuf[BP_SPRINTF_LEN]; zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); else blkbuf[0] = '\0'; (void) printf("zdb_blkptr_cb: " "Got error %d reading " "<%llu, %llu, %lld, %llx> %s -- skipping\n", ioerr, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, blkbuf); } mutex_exit(&spa->spa_scrub_lock); abd_free(zio->io_abd); } static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; dmu_object_type_t type; boolean_t is_metadata; if (zb->zb_level == ZB_DNODE_LEVEL) return (0); if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("objset %llu object %llu " "level %lld offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (longlong_t)zb->zb_level, 
(u_longlong_t)blkid2offset(dnp, bp, zb), blkbuf); } if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return (0); type = BP_GET_TYPE(bp); zdb_count_block(zcb, zilog, bp, (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_bytes > max_inflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; /* only call gethrtime() every 100 blocks */ static int iters; if (++iters > 100) iters = 0; else return (0); if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); char buf[10]; uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; uint64_t kb_per_sec = 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); uint64_t sec_remaining = (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; /* make sure nicenum has enough space */ _Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated"); zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "\r%5s completed (%4"PRIu64"MB/s) " "estimated time remaining: " "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec ", buf, kb_per_sec / 1024, sec_remaining / 60 / 60, sec_remaining / 60 % 60, sec_remaining % 60); zcb->zcb_lastprint = now; } return (0); } static void zdb_leak(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", 
(u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); } static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; static int load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { spa_vdev_removal_t *svr = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; /* skip vdevs we don't care about */ if (sme->sme_vdev != svr->svr_vdev_id) return (0); vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (sme->sme_type == SM_ALLOC) zfs_range_tree_add(svr->svr_allocd_segs, offset, size); else zfs_range_tree_remove(svr->svr_allocd_segs, offset, size); return (0); } static void claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner_offset, (void) arg; /* * This callback was called through a remap from * a device being removed. Therefore, the vdev that * this callback is applied to is a concrete * vdev. */ ASSERT(vdev_is_concrete(vd)); VERIFY0(metaslab_claim_impl(vd, offset, size, spa_min_claim_txg(vd->vdev_spa))); } static void claim_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; vdev_indirect_ops.vdev_op_remap(vd, offset, size, claim_segment_impl_cb, NULL); } /* * After accounting for all allocated blocks that are directly referenced, * we might have missed a reference to a block from a partially complete * (and thus unused) indirect mapping object. We perform a secondary pass * through the metaslabs we have already mapped and claim the destination * blocks. 
 */
static void
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
{
	/* Nothing to claim without leak tracking or an active removal. */
	if (dump_opt['L'])
		return;

	if (spa->spa_vdev_removal == NULL)
		return;

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

	ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));

	/* Gather each metaslab's allocations into svr_allocd_segs. */
	zfs_range_tree_t *allocs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
	    NULL, 0, 0);
	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
		metaslab_t *msp = vd->vdev_ms[msi];

		ASSERT0(zfs_range_tree_space(allocs));
		if (msp->ms_sm != NULL)
			VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
		zfs_range_tree_vacate(allocs, zfs_range_tree_add,
		    svr->svr_allocd_segs);
	}
	zfs_range_tree_destroy(allocs);

	/* Apply the entries still sitting in the space map logs. */
	iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);

	/*
	 * Clear everything past what has been synced,
	 * because we have not allocated mappings for
	 * it yet.
	 */
	zfs_range_tree_clear(svr->svr_allocd_segs,
	    vdev_indirect_mapping_max_offset(vim),
	    vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));

	zcb->zcb_removing_size += zfs_range_tree_space(svr->svr_allocd_segs);
	zfs_range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);

	spa_config_exit(spa, SCL_CONFIG, FTAG);
}

/*
 * Block-pointer callback: add the first DVA of `bp` to the obsolete
 * counts kept in zcb_vd_obsolete_counts for the DVA's top-level vdev.
 * arg is the zdb_cb_t; tx is unused.  Always returns 0.
 */
static int
increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	(void) tx;
	zdb_cb_t *zcb = arg;
	spa_t *spa = zcb->zcb_spa;
	vdev_t *vd;
	const dva_t *dva = &bp->blk_dva[0];

	ASSERT(!bp_freed);
	ASSERT(!dump_opt['L']);
	ASSERT3U(BP_GET_NDVAS(bp), ==, 1);

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);

	vdev_indirect_mapping_increment_obsolete_count(
	    vd->vdev_indirect_mapping,
	    DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);

	return (0);
}

/*
 * Build the obsolete-count array for `vd`: start from the counts loaded
 * from its indirect mapping, then add the vdev's obsolete space map and,
 * when this vdev is the one recorded in spa_condensing_indirect_phys with
 * a previous obsolete space map object, that space map as well.
 *
 * Returns the array obtained from
 * vdev_indirect_mapping_load_obsolete_counts().
 */
static uint32_t *
zdb_load_obsolete_counts(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint64_t obsolete_sm_object;
	uint32_t *counts;

	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
	counts = vdev_indirect_mapping_load_obsolete_counts(vim);
	if (vd->vdev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
		    vd->vdev_obsolete_sm);
	}
	if (scip->scip_vdev == vd->vdev_id &&
	    scip->scip_prev_obsolete_sm_object != 0) {
		space_map_t *prev_obsolete_sm = NULL;
		VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
		    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
		    prev_obsolete_sm);
		space_map_close(prev_obsolete_sm);
	}
	return (counts);
}

/* Argument block for checkpoint_sm_exclude_entry_cb(). */
typedef struct checkpoint_sm_exclude_entry_arg {
	vdev_t *cseea_vd;
	uint64_t cseea_checkpoint_size;
} checkpoint_sm_exclude_entry_arg_t;

/*
 * Space-map callback: remove one checkpoint space map entry from its
 * metaslab's ms_allocatable tree and add its length to the running
 * checkpoint size in arg.  Always returns 0.
 */
static int
checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
{
	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
	vdev_t *vd = cseea->cseea_vd;
	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
	uint64_t end = sme->sme_offset + sme->sme_run;

	ASSERT(sme->sme_type == SM_FREE);

	/*
	 * Since the vdev_checkpoint_sm exists in the vdev level
	 * and the ms_sm space maps exist in the metaslab level,
	 * an entry in the checkpoint space map could theoretically
	 * cross the boundaries of the metaslab that it belongs.
	 *
	 * In reality, because of the way that we populate and
	 * manipulate the checkpoint's space maps currently,
	 * there shouldn't be any entries that cross metaslabs.
	 * Hence the assertion below.
	 *
	 * That said, there is no fundamental requirement that
	 * the checkpoint's space map entries should not cross
	 * metaslab boundaries. So if needed we could add code
	 * that handles metaslab-crossing segments in the future.
	 */
	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);

	/*
	 * By removing the entry from the allocated segments we
	 * also verify that the entry is there to begin with.
	 */
	mutex_enter(&ms->ms_lock);
	zfs_range_tree_remove(ms->ms_allocatable,
	    sme->sme_offset, sme->sme_run);
	mutex_exit(&ms->ms_lock);

	cseea->cseea_checkpoint_size += sme->sme_run;
	return (0);
}

/*
 * Walk the checkpoint space map of top-level vdev `vd`, if it has one,
 * excluding every entry through checkpoint_sm_exclude_entry_cb(), and add
 * the excluded bytes to zcb->zcb_checkpoint_size.
 */
static void
zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
{
	spa_t *spa = vd->vdev_spa;
	space_map_t *checkpoint_sm = NULL;
	uint64_t checkpoint_sm_obj;

	/*
	 * If there is no vdev_top_zap, we are in a pool whose
	 * version predates the pool checkpoint feature.
	 */
	if (vd->vdev_top_zap == 0)
		return;

	/*
	 * If there is no reference of the vdev_checkpoint_sm in
	 * the vdev_top_zap, then one of the following scenarios
	 * is true:
	 *
	 * 1] There is no checkpoint
	 * 2] There is a checkpoint, but no checkpointed blocks
	 *    have been freed yet
	 * 3] The current vdev is indirect
	 *
	 * In these cases we return immediately.
	 */
	if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
		return;

	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
	    &checkpoint_sm_obj));

	checkpoint_sm_exclude_entry_arg_t cseea;
	cseea.cseea_vd = vd;
	cseea.cseea_checkpoint_size = 0;

	VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
	    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));

	VERIFY0(space_map_iterate(checkpoint_sm,
	    space_map_length(checkpoint_sm),
	    checkpoint_sm_exclude_entry_cb, &cseea));
	space_map_close(checkpoint_sm);

	zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
}

/*
 * Run zdb_leak_init_vdev_exclude_checkpoint() on every child of the root
 * vdev.  Must only be called with leak tracking enabled.
 */
static void
zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
{
	ASSERT(!dump_opt['L']);

	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
		zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
	}
}

/*
 * Space-map-log callback: fold one log entry into the signed byte count
 * that arg points to (allocations add, frees subtract).  Entries on
 * non-concrete vdevs, or older than the metaslab's unflushed txg, are
 * ignored.  Always returns 0.
 */
static int
count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg)
{
	int64_t *ualloc_space = arg;

	uint64_t offset = sme->sme_offset;
	uint64_t vdev_id = sme->sme_vdev;

	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	if (!vdev_is_concrete(vd))
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);

	if (txg < metaslab_unflushed_txg(ms))
		return (0);

	if (sme->sme_type == SM_ALLOC)
		*ualloc_space += sme->sme_run;
	else
		*ualloc_space -= sme->sme_run;

	return (0);
}

/*
 * Return the net number of bytes allocated by entries still in the space
 * map logs, as summed by count_unflushed_space_cb(); 0 when leak tracking
 * is disabled (dump_opt['L']).
 */
static int64_t
get_unflushed_alloc_space(spa_t *spa)
{
	if (dump_opt['L'])
		return (0);

	int64_t ualloc_space = 0;
	iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
	    &ualloc_space);
	return (ualloc_space);
}

static int
load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
{
	maptype_t *uic_maptype = arg;

	uint64_t offset = sme->sme_offset;
	uint64_t size = sme->sme_run;
	uint64_t vdev_id = sme->sme_vdev;
	vdev_t *vd = vdev_lookup_top(spa,
vdev_id); /* skip indirect vdevs */ if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (*uic_maptype == sme->sme_type) zfs_range_tree_add(ms->ms_allocatable, offset, size); else zfs_range_tree_remove(ms->ms_allocatable, offset, size); return (0); } static void load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype) { iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); } static void load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; ASSERT3U(i, ==, vd->vdev_id); if (vd->vdev_ops == &vdev_indirect_ops) continue; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; (void) fprintf(stderr, "\rloading concrete vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)msp->ms_id, (longlong_t)vd->vdev_ms_count); mutex_enter(&msp->ms_lock); zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, msp->ms_allocatable, maptype)); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } } load_unflushed_to_ms_allocatables(spa, maptype); } /* * vm_idxp is an in-out parameter which (for indirect vdevs) is the * index in vim_entries that has the first entry in this metaslab. * On return, it will be set to the first entry after this metaslab. 
*/ static void load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, uint64_t *vim_idxp) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; mutex_enter(&msp->ms_lock); zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); (*vim_idxp)++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[*vim_idxp]; uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); ASSERT3U(ent_offset, >=, msp->ms_start); if (ent_offset >= msp->ms_start + msp->ms_size) break; /* * Mappings do not cross metaslab boundaries, * because we create them by walking the metaslabs. */ ASSERT3U(ent_offset + ent_len, <=, msp->ms_start + msp->ms_size); zfs_range_tree_add(msp->ms_allocatable, ent_offset, ent_len); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } static void zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) { ASSERT(!dump_opt['L']); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; ASSERT3U(c, ==, vd->vdev_id); if (vd->vdev_ops != &vdev_indirect_ops) continue; /* * Note: we don't check for mapping leaks on * removing vdevs because their ms_allocatable's * are used to look for leaks in allocated space. */ zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); /* * Normally, indirect vdevs don't have any * metaslabs. We want to set them up for * zio_claim(). 
*/ vdev_metaslab_group_create(vd); VERIFY0(vdev_metaslab_init(vd, 0)); vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; uint64_t vim_idx = 0; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { (void) fprintf(stderr, "\rloading indirect vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vd->vdev_ms[m]->ms_id, (longlong_t)vd->vdev_ms_count); load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], &vim_idx); } ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); } } static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; if (dump_opt['L']) return; dsl_pool_t *dp = spa->spa_dsl_pool; vdev_t *rvd = spa->spa_root_vdev; /* * We are going to be changing the meaning of the metaslab's * ms_allocatable. Ensure that the allocator doesn't try to * use the tree. */ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; zcb->zcb_vd_obsolete_counts = umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), UMEM_NOFAIL); /* * For leak detection, we overload the ms_allocatable trees * to contain allocated segments instead of free segments. * As a result, we can't use the normal metaslab_load/unload * interfaces. */ zdb_leak_init_prepare_indirect_vdevs(spa, zcb); load_concrete_ms_allocatable_trees(spa, SM_ALLOC); /* * On load_concrete_ms_allocatable_trees() we loaded all the * allocated entries from the ms_sm to the ms_allocatable for * each metaslab. If the pool has a checkpoint or is in the * middle of discarding a checkpoint, some of these blocks * may have been freed but their ms_sm may not have been * updated because they are referenced by the checkpoint. In * order to avoid false-positives during leak-detection, we * go through the vdev's checkpoint space map and exclude all * its entries from their relevant ms_allocatable. 
* * We also aggregate the space held by the checkpoint and add * it to zcb_checkpoint_size. * * Note that at this point we are also verifying that all the * entries on the checkpoint_sm are marked as allocated in * the ms_sm of their relevant metaslab. * [see comment in checkpoint_sm_exclude_entry_cb()] */ zdb_leak_init_exclude_checkpoint(spa, zcb); ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); /* for cleaner progress output */ (void) fprintf(stderr, "\n"); if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } } static boolean_t zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) { boolean_t leaks = B_FALSE; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t total_leaked = 0; boolean_t are_precise = B_FALSE; ASSERT(vim != NULL); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; uint64_t obsolete_bytes = 0; uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; /* * This is not very efficient but it's easy to * verify correctness. 
*/ for (uint64_t inner_offset = 0; inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); inner_offset += 1ULL << vd->vdev_ashift) { if (zfs_range_tree_contains(msp->ms_allocatable, offset + inner_offset, 1ULL << vd->vdev_ashift)) { obsolete_bytes += 1ULL << vd->vdev_ashift; } } int64_t bytes_leaked = obsolete_bytes - zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) { (void) printf("obsolete indirect mapping count " "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (u_longlong_t)bytes_leaked); } total_leaked += ABS(bytes_leaked); } VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (!are_precise && total_leaked > 0) { int pct_leaked = total_leaked * 100 / vdev_indirect_mapping_bytes_mapped(vim); (void) printf("cannot verify obsolete indirect mapping " "counts of vdev %llu because precise feature was not " "enabled when it was removed: %d%% (%llx bytes) of mapping" "unreferenced\n", (u_longlong_t)vd->vdev_id, pct_leaked, (u_longlong_t)total_leaked); } else if (total_leaked > 0) { (void) printf("obsolete indirect mapping count mismatch " "for vdev %llu -- %llx total bytes mismatched\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)total_leaked); leaks |= B_TRUE; } vdev_indirect_mapping_free_obsolete_counts(vim, zcb->zcb_vd_obsolete_counts[vd->vdev_id]); zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; return (leaks); } static boolean_t zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) { if (dump_opt['L']) return (B_FALSE); boolean_t leaks = B_FALSE; vdev_t *rvd = spa->spa_root_vdev; for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (zcb->zcb_vd_obsolete_counts[c] != NULL) { leaks |= 
zdb_check_for_obsolete_leaks(vd, zcb); } for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == spa_embedded_log_class(spa)) ? vd->vdev_log_mg : vd->vdev_mg); /* * ms_allocatable has been overloaded * to contain allocated segments. Now that * we finished traversing all blocks, any * block that remains in the ms_allocatable * represents an allocated block that we * did not claim during the traversal. * Claimed blocks would have been removed * from the ms_allocatable. For indirect * vdevs, space remaining in the tree * represents parts of the mapping that are * not referenced, which is not a bug. */ if (vd->vdev_ops == &vdev_indirect_ops) { zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); } else { zfs_range_tree_vacate(msp->ms_allocatable, zdb_leak, vd); } if (msp->ms_loaded) { msp->ms_loaded = B_FALSE; } } } umem_free(zcb->zcb_vd_obsolete_counts, rvd->vdev_children * sizeof (uint32_t *)); zcb->zcb_vd_obsolete_counts = NULL; return (leaks); } static int count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { (void) tx; zdb_cb_t *zcb = arg; if (dump_opt['b'] >= 5) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("[%s] %s\n", "deferred free", blkbuf); } zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); return (0); } /* * Iterate over livelists which have been destroyed by the user but * are still present in the MOS, waiting to be freed */ static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) { objset_t *mos = spa->spa_meta_objset; uint64_t zap_obj; int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); if (err == ENOENT) return; ASSERT0(err); zap_cursor_t zc; zap_attribute_t *attrp = zap_attribute_alloc(); dsl_deadlist_t ll; /* NULL out os prior to dsl_deadlist_open in case it's garbage */ ll.dl_os = NULL; for (zap_cursor_init(&zc, mos, zap_obj); 
zap_cursor_retrieve(&zc, attrp) == 0; (void) zap_cursor_advance(&zc)) { VERIFY0(dsl_deadlist_open(&ll, mos, attrp->za_first_integer)); func(&ll, arg); dsl_deadlist_close(&ll); } zap_cursor_fini(&zc); zap_attribute_free(attrp); } static int bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (count_block_cb(arg, bp, tx)); } static int livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) { zdb_cb_t *zbc = args; bplist_t blks; bplist_create(&blks); /* determine which blocks have been alloc'd but not freed */ VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); /* count those blocks */ (void) bplist_iterate(&blks, count_block_cb, zbc, NULL); bplist_destroy(&blks); return (0); } static void livelist_count_blocks(dsl_deadlist_t *ll, void *arg) { dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); } /* * Count the blocks in the livelists that have been destroyed by the user * but haven't yet been freed. */ static void deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) { iterate_deleted_livelists(spa, livelist_count_blocks, zbc); } static void dump_livelist_cb(dsl_deadlist_t *ll, void *arg) { ASSERT3P(arg, ==, NULL); global_feature_count[SPA_FEATURE_LIVELIST]++; dump_blkptr_list(ll, "Deleted Livelist"); dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); } /* * Print out, register object references to, and increment feature counts for * livelists that have been destroyed by the user but haven't yet been freed. 
*/ static void deleted_livelists_dump_mos(spa_t *spa) { uint64_t zap_obj; objset_t *mos = spa->spa_meta_objset; int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); if (err == ENOENT) return; mos_obj_refd(zap_obj); iterate_deleted_livelists(spa, dump_livelist_cb, NULL); } static int zdb_brt_entry_compare(const void *zcn1, const void *zcn2) { const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva; const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva; int cmp; cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (cmp == 0) cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)); return (cmp); } static int dump_block_stats(spa_t *spa) { zdb_cb_t *zcb; zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT | TRAVERSE_HARD; boolean_t leaks = B_FALSE; int e, c, err; bp_embedded_type_t i; ddt_prefetch_all(spa); zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { avl_create(&zcb->zcb_brt, zdb_brt_entry_compare, sizeof (zdb_brt_entry_t), offsetof(zdb_brt_entry_t, zbre_node)); zcb->zcb_brt_is_active = B_TRUE; } (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", dump_opt['c'] ? "checksums " : "", (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", !dump_opt['L'] ? "nothing leaked " : ""); /* * When leak detection is enabled we load all space maps as SM_ALLOC * maps, then traverse the pool claiming each block we discover. If * the pool is perfectly consistent, the segment trees will be empty * when we're done. Anything left over is a leak; any block we can't * claim (because it's not part of any space map) is a double * allocation, reference to a freed block, or an unclaimed log block. 
* * When leak detection is disabled (-L option) we still traverse the * pool claiming each block we discover, but we skip opening any space * maps. */ zdb_leak_init(spa, zcb); /* * If there's a deferred-free bplist, process that first. */ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, bpobj_count_block_cb, zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, bpobj_count_block_cb, zcb, NULL); } zdb_claim_removing(spa, zcb); if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, zcb, NULL)); } deleted_livelists_count_blocks(spa, zcb); if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_embedded_log_class(spa)); zcb->zcb_start = zcb->zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb); /* * If we've traversed the data blocks then we need to wait for those * I/Os to complete. We leverage "The Godfather" zio to wait on * all async I/Os to complete. 
*/ if (dump_opt['c']) { for (c = 0; c < max_ncpus; c++) { (void) zio_wait(spa->spa_async_zio_root[c]); spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } } ASSERT0(spa->spa_load_verify_bytes); /* * Done after zio_wait() since zcb_haderrors is modified in * zdb_blkptr_done() */ zcb->zcb_haderrors |= err; if (zcb->zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); for (e = 0; e < 256; e++) { if (zcb->zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb->zcb_errors[e]); } } } /* * Report any leaked segments. */ leaks |= zdb_leak_fini(spa, zcb); tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + metaslab_class_get_alloc(spa_embedded_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); total_found = tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize + zcb->zcb_removing_size + zcb->zcb_checkpoint_size; if (total_found == total_alloc && !dump_opt['L']) { (void) printf("\n\tNo leaks (block sum matches space" " maps exactly)\n"); } else if (!dump_opt['L']) { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", (u_longlong_t)total_found, (u_longlong_t)total_alloc, (dump_opt['L']) ? 
"unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); } if (tzb->zb_count == 0) { umem_free(zcb, sizeof (zdb_cb_t)); return (2); } (void) printf("\n"); (void) printf("\t%-16s %14llu\n", "bp count:", (u_longlong_t)tzb->zb_count); (void) printf("\t%-16s %14llu\n", "ganged count:", (longlong_t)tzb->zb_gangs); (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp physical:", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp allocated:", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, (u_longlong_t)zcb->zcb_dedup_blocks, (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); (void) printf("\t%-16s %14llu count: %6llu\n", "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize, (u_longlong_t)zcb->zcb_clone_blocks); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_special_class(spa)); uint64_t space = metaslab_class_get_space( spa_special_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Special class", (u_longlong_t)alloc, 100.0 * alloc / space); } if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_dedup_class(spa)); uint64_t space = metaslab_class_get_space( spa_dedup_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Dedup class", (u_longlong_t)alloc, 100.0 * alloc / space); } if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor 
!= NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_embedded_log_class(spa)); uint64_t space = metaslab_class_get_space( spa_embedded_log_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Embedded log class", (u_longlong_t)alloc, 100.0 * alloc / space); } for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb->zcb_embedded_blocks[i] == 0) continue; (void) printf("\n"); (void) printf("\tadditional, non-pointer bps of type %u: " "%10llu\n", i, (u_longlong_t)zcb->zcb_embedded_blocks[i]); if (dump_opt['b'] >= 3) { (void) printf("\t number of (compressed) bytes: " "number of bps\n"); dump_histogram(zcb->zcb_embedded_histogram[i], sizeof (zcb->zcb_embedded_histogram[i]) / sizeof (zcb->zcb_embedded_histogram[i][0]), 0); } } if (tzb->zb_ditto_samevdev != 0) { (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); } if (tzb->zb_ditto_same_ms != 0) { (void) printf("\tDittoed blocks in same metaslab: %llu\n", (longlong_t)tzb->zb_ditto_same_ms); } for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; if (vim == NULL) { continue; } char mem[32]; zdb_nicenum(vdev_indirect_mapping_num_entries(vim), mem, vdev_indirect_mapping_size(vim)); (void) printf("\tindirect vdev id %llu has %llu segments " "(%s in memory)\n", (longlong_t)vd->vdev_id, (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); } if (dump_opt['b'] >= 2) { int l, t, level; char csize[32], lsize[32], psize[32], asize[32]; char avg[32], gang[32]; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t), UMEM_NOFAIL); for (t = 0; t <= ZDB_OT_TOTAL; t++) { const char *typename; /* make sure nicenum has enough space */ _Static_assert(sizeof (csize) >= NN_NUMBUF_SZ, "csize truncated"); _Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated"); 
_Static_assert(sizeof (psize) >= NN_NUMBUF_SZ, "psize truncated"); _Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated"); _Static_assert(sizeof (avg) >= NN_NUMBUF_SZ, "avg truncated"); _Static_assert(sizeof (gang) >= NN_NUMBUF_SZ, "gang truncated"); if (t < DMU_OT_NUMTYPES) typename = dmu_ot[t].ot_name; else typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" "\t%5s\t%5s\t%6s\t%s\n", "-", "-", "-", "-", "-", "-", "-", typename); continue; } for (l = ZB_TOTAL - 1; l >= -1; l--) { level = (l == -1 ? ZB_TOTAL : l); zb = &zcb->zcb_type[level][t]; if (zb->zb_asize == 0) continue; if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES && (level > 0 || DMU_OT_IS_METADATA(t))) { mdstats->zb_count += zb->zb_count; mdstats->zb_lsize += zb->zb_lsize; mdstats->zb_psize += zb->zb_psize; mdstats->zb_asize += zb->zb_asize; mdstats->zb_gangs += zb->zb_gangs; } if (dump_opt['b'] < 3 && level != ZB_TOTAL) continue; if (level == 0 && zb->zb_asize == zcb->zcb_type[ZB_TOTAL][t].zb_asize) continue; zdb_nicenum(zb->zb_count, csize, sizeof (csize)); zdb_nicenum(zb->zb_lsize, lsize, sizeof (lsize)); zdb_nicenum(zb->zb_psize, psize, sizeof (psize)); zdb_nicenum(zb->zb_asize, asize, sizeof (asize)); zdb_nicenum(zb->zb_asize / zb->zb_count, avg, sizeof (avg)); zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)zb->zb_lsize / zb->zb_psize, 100.0 * zb->zb_asize / tzb->zb_asize); if (level == ZB_TOTAL) (void) printf("%s\n", typename); else (void) printf(" L%d %s\n", level, typename); if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { (void) printf("\t number of ganged " "blocks: %s\n", gang); } if (dump_opt['b'] >= 4) { (void) printf("psize " "(in 512-byte sectors): " "number of blocks\n"); dump_histogram(zb->zb_psize_histogram, PSIZE_HISTO_SIZE, 0); } } } zdb_nicenum(mdstats->zb_count, csize, sizeof (csize)); 
zdb_nicenum(mdstats->zb_lsize, lsize, sizeof (lsize)); zdb_nicenum(mdstats->zb_psize, psize, sizeof (psize)); zdb_nicenum(mdstats->zb_asize, asize, sizeof (asize)); zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg, sizeof (avg)); zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang)); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)mdstats->zb_lsize / mdstats->zb_psize, 100.0 * mdstats->zb_asize / tzb->zb_asize); (void) printf("%s\n", "Metadata Total"); /* Output a table summarizing block sizes in the pool */ if (dump_opt['b'] >= 2) { dump_size_histograms(zcb); } umem_free(mdstats, sizeof (zfs_blkstat_t)); } (void) printf("\n"); if (leaks) { umem_free(zcb, sizeof (zdb_cb_t)); return (2); } if (zcb->zcb_haderrors) { umem_free(zcb, sizeof (zdb_cb_t)); return (3); } umem_free(zcb, sizeof (zdb_cb_t)); return (0); } typedef struct zdb_ddt_entry { /* key must be first for ddt_key_compare */ ddt_key_t zdde_key; uint64_t zdde_ref_blocks; uint64_t zdde_ref_lsize; uint64_t zdde_ref_psize; uint64_t zdde_ref_dsize; avl_node_t zdde_node; } zdb_ddt_entry_t; static int zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { (void) zilog, (void) dnp; avl_tree_t *t = arg; avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { (void) printf("traversing objset %llu, %llu objects, " "%lu blocks so far\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)BP_GET_FILL(bp), avl_numnodes(t)); } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); zdde = avl_find(t, &zdde_search, &where); if (zdde == NULL) { zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); zdde->zdde_key = zdde_search.zdde_key; 
avl_insert(t, zdde, where); } zdde->zdde_ref_blocks += 1; zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); zdde->zdde_ref_psize += BP_GET_PSIZE(bp); zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); return (0); } static void dump_simulated_ddt(spa_t *spa) { avl_tree_t t; void *cookie = NULL; zdb_ddt_entry_t *zdde; ddt_histogram_t ddh_total = {{{0}}}; ddt_stat_t dds_total = {0}; avl_create(&t, ddt_key_compare, sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t); spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1]; dds->dds_blocks += zdde->zdde_ref_blocks / refcnt; dds->dds_lsize += zdde->zdde_ref_lsize / refcnt; dds->dds_psize += zdde->zdde_ref_psize / refcnt; dds->dds_dsize += zdde->zdde_ref_dsize / refcnt; dds->dds_ref_blocks += zdde->zdde_ref_blocks; dds->dds_ref_lsize += zdde->zdde_ref_lsize; dds->dds_ref_psize += zdde->zdde_ref_psize; dds->dds_ref_dsize += zdde->zdde_ref_dsize; umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); ddt_histogram_total(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); zpool_dump_ddt(&dds_total, &ddh_total); dump_dedup_ratio(&dds_total); } static int verify_device_removal_feature_counts(spa_t *spa) { uint64_t dr_feature_refcount = 0; uint64_t oc_feature_refcount = 0; uint64_t indirect_vdev_count = 0; uint64_t precise_vdev_count = 0; uint64_t obsolete_counts_object_count = 0; uint64_t obsolete_sm_count = 0; uint64_t obsolete_counts_count = 0; uint64_t scip_count = 0; uint64_t obsolete_bpobj_count = 0; int ret = 0; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; if (scip->scip_next_mapping_object != 0) { vdev_t *vd = 
spa->spa_root_vdev->vdev_child[scip->scip_vdev]; ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); (void) printf("Condensing indirect vdev %llu: new mapping " "object %llu, prev obsolete sm %llu\n", (u_longlong_t)scip->scip_vdev, (u_longlong_t)scip->scip_next_mapping_object, (u_longlong_t)scip->scip_prev_obsolete_sm_object); if (scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); (void) printf("\n"); space_map_close(prev_obsolete_sm); } scip_count += 2; } for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (vic->vic_mapping_object != 0) { ASSERT(vd->vdev_ops == &vdev_indirect_ops || vd->vdev_removing); indirect_vdev_count++; if (vd->vdev_indirect_mapping->vim_havecounts) { obsolete_counts_count++; } } boolean_t are_precise; VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (are_precise) { ASSERT(vic->vic_mapping_object != 0); precise_vdev_count++; } uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vic->vic_mapping_object != 0); obsolete_sm_count++; } } (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], &dr_feature_refcount); (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], &oc_feature_refcount); if (dr_feature_refcount != indirect_vdev_count) { ret = 1; (void) printf("Number of indirect vdevs (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)indirect_vdev_count, (u_longlong_t)dr_feature_refcount); } else { (void) printf("Verified device_removal feature refcount " \ "of %llu is correct\n", (u_longlong_t)dr_feature_refcount); } 
if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ) == 0) { obsolete_bpobj_count++; } obsolete_counts_object_count = precise_vdev_count; obsolete_counts_object_count += obsolete_sm_count; obsolete_counts_object_count += obsolete_counts_count; obsolete_counts_object_count += scip_count; obsolete_counts_object_count += obsolete_bpobj_count; obsolete_counts_object_count += remap_deadlist_count; if (oc_feature_refcount != obsolete_counts_object_count) { ret = 1; (void) printf("Number of obsolete counts objects (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)obsolete_counts_object_count, (u_longlong_t)oc_feature_refcount); (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " "ob:%llu rd:%llu\n", (u_longlong_t)precise_vdev_count, (u_longlong_t)obsolete_sm_count, (u_longlong_t)obsolete_counts_count, (u_longlong_t)scip_count, (u_longlong_t)obsolete_bpobj_count, (u_longlong_t)remap_deadlist_count); } else { (void) printf("Verified indirect_refcount feature refcount " \ "of %llu is correct\n", (u_longlong_t)oc_feature_refcount); } return (ret); } static void zdb_set_skip_mmp(char *target) { spa_t *spa; /* * Disable the activity check to allow examination of * active pools. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL) { spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; } mutex_exit(&spa_namespace_lock); } #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" /* * Import the checkpointed state of the pool specified by the target * parameter as readonly. The function also accepts a pool config * as an optional parameter, else it attempts to infer the config by * the name of the target pool. * * Note that the checkpointed state's pool name will be the name of * the original pool with the above suffix appended to it. In addition, * if the target is not a pool name (e.g. 
a path to a dataset) then * the new_path parameter is populated with the updated path to * reflect the fact that we are looking into the checkpointed state. * * The function returns a newly-allocated copy of the name of the * pool containing the checkpointed state. When this copy is no * longer needed it should be freed with free(3C). Same thing * applies to the new_path parameter if allocated. */ static char * import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) { int error = 0; char *poolname, *bogus_name = NULL; boolean_t freecfg = B_FALSE; /* If the target is not a pool, the extract the pool name */ char *path_start = strchr(target, '/'); if (path_start != NULL) { size_t poolname_len = path_start - target; poolname = strndup(target, poolname_len); } else { poolname = target; } if (cfg == NULL) { zdb_set_skip_mmp(poolname); error = spa_get_stats(poolname, &cfg, NULL, 0); if (error != 0) { fatal("Tried to read config of pool \"%s\" but " "spa_get_stats() failed with error %d\n", poolname, error); } freecfg = B_TRUE; } if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) { if (target != poolname) free(poolname); return (NULL); } fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); error = spa_import(bogus_name, cfg, NULL, ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | ZFS_IMPORT_SKIP_MMP); if (freecfg) nvlist_free(cfg); if (error != 0) { fatal("Tried to import pool \"%s\" but spa_import() failed " "with error %d\n", bogus_name, error); } if (new_path != NULL && path_start != NULL) { if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) { free(bogus_name); if (path_start != NULL) free(poolname); return (NULL); } } if (target != poolname) free(poolname); return (bogus_name); } typedef struct verify_checkpoint_sm_entry_cb_arg { vdev_t *vcsec_vd; /* the following fields are only used for printing progress */ uint64_t vcsec_entryid; uint64_t vcsec_num_entries; } verify_checkpoint_sm_entry_cb_arg_t; #define 
ENTRIES_PER_PROGRESS_UPDATE 10000 static int verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) { verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; vdev_t *vd = vcsec->vcsec_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { (void) fprintf(stderr, "\rverifying vdev %llu, space map entry %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vcsec->vcsec_entryid, (longlong_t)vcsec->vcsec_num_entries); } vcsec->vcsec_entryid++; /* * See comment in checkpoint_sm_exclude_entry_cb() */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * The entries in the vdev_checkpoint_sm should be marked as * allocated in the checkpointed state of the pool, therefore * their respective ms_allocateable trees should not contain them. */ mutex_enter(&ms->ms_lock); zfs_range_tree_verify_not_present(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); return (0); } /* * Verify that all segments in the vdev_checkpoint_sm are allocated * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's * ms_allocatable). * * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of * each vdev in the current state of the pool to the metaslab space maps * (ms_sm) of the checkpointed state of the pool. * * Note that the function changes the state of the ms_allocatable * trees of the current spa_t. The entries of these ms_allocatable * trees are cleared out and then repopulated from with the free * entries of their respective ms_sm space maps. 
/*
 * Check each top-level vdev's checkpoint space map, as recorded in the
 * current pool, against the metaslabs of the checkpointed pool.
 *
 * checkpoint - the imported checkpointed state; its ms_allocatable trees
 *              are reloaded here with the SM_FREE entries.
 * current    - the live pool, whose per-vdev checkpoint space maps are
 *              opened and iterated.
 *
 * Every entry is handed to verify_checkpoint_sm_entry_cb().  Indirect
 * vdevs, and vdevs without a checkpoint space map, are skipped.  Vdevs
 * that exist only in the current pool must have no checkpoint space map.
 */
static void
verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
{
	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
	vdev_t *current_rvd = current->spa_root_vdev;
	uint64_t idx;

	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);

	for (idx = 0; idx < ckpoint_rvd->vdev_children; idx++) {
		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[idx];
		vdev_t *current_vd = current_rvd->vdev_child[idx];
		space_map_t *checkpoint_sm = NULL;
		uint64_t checkpoint_sm_obj;

		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
			/*
			 * Device removal is not allowed while a checkpoint
			 * exists, so any removed vdev must already have
			 * been removed when the checkpoint was taken.
			 */
			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
			continue;
		}

		/*
		 * Without a checkpoint space map nothing on this vdev is
		 * checkpointed, so there is nothing to verify.
		 */
		if (current_vd->vdev_top_zap == 0)
			continue;
		if (zap_contains(spa_meta_objset(current),
		    current_vd->vdev_top_zap,
		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
			continue;

		VERIFY0(zap_lookup(spa_meta_objset(current),
		    current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
		    sizeof (uint64_t), 1, &checkpoint_sm_obj));

		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
		    checkpoint_sm_obj, 0, current_vd->vdev_asize,
		    current_vd->vdev_ashift));

		verify_checkpoint_sm_entry_cb_arg_t vcsec = {
			.vcsec_vd = ckpoint_vd,
			.vcsec_entryid = 0,
			.vcsec_num_entries =
			    space_map_length(checkpoint_sm) / sizeof (uint64_t),
		};
		VERIFY0(space_map_iterate(checkpoint_sm,
		    space_map_length(checkpoint_sm),
		    verify_checkpoint_sm_entry_cb, &vcsec));

		if (dump_opt['m'] > 3)
			dump_spacemap(current->spa_meta_objset, checkpoint_sm);
		space_map_close(checkpoint_sm);
	}

	/*
	 * Vdevs added after the checkpoint was taken must have empty
	 * checkpoint space maps.  The loop body never runs when the two
	 * pools have the same number of children.
	 */
	for (idx = ckpoint_rvd->vdev_children;
	    idx < current_rvd->vdev_children; idx++) {
		vdev_t *current_vd = current_rvd->vdev_child[idx];

		VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL);
	}

	/* for cleaner progress output */
	(void) fprintf(stderr, "\n");
}
are loaded with the allocated blocks from the * ms_sm spacemaps of the checkpoint. For each * one of these ranges we ensure that none of them * exists in the ms_allocatable trees of the * current state which are loaded with the ranges * that are currently free. * * This way we ensure that none of the blocks that * are part of the checkpoint were freed by mistake. */ zfs_range_tree_walk(ckpoint_msp->ms_allocatable, (zfs_range_tree_func_t *) zfs_range_tree_verify_not_present, current_msp->ms_allocatable); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } static void verify_checkpoint_blocks(spa_t *spa) { ASSERT(!dump_opt['L']); spa_t *checkpoint_spa; char *checkpoint_pool; int error = 0; /* * We import the checkpointed state of the pool (under a different * name) so we can do verification on it against the current state * of the pool. */ checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, NULL); ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but spa_open() failed with " "error %d\n", checkpoint_pool, error); } /* * Ensure that ranges in the checkpoint space maps of each vdev * are allocated according to the checkpointed state's metaslab * space maps. */ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); /* * Ensure that allocated ranges in the checkpoint's metaslab * space maps remain allocated in the metaslab space maps of * the current state. */ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); /* * Once we are done, we get rid of the checkpointed state. 
*/ spa_close(checkpoint_spa, FTAG); free(checkpoint_pool); } static void dump_leftover_checkpoint_blocks(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (vd->vdev_top_zap == 0) continue; if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); dump_spacemap(spa->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } } static int verify_checkpoint(spa_t *spa) { uberblock_t checkpoint; int error; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (0); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT && !dump_opt['L']) { /* * If the feature is active but the uberblock is missing * then we must be in the middle of discarding the * checkpoint. 
*/ (void) printf("\nPartially discarded checkpoint " "state found:\n"); if (dump_opt['m'] > 3) dump_leftover_checkpoint_blocks(spa); return (0); } else if (error != 0) { (void) printf("lookup error %d when looking for " "checkpointed uberblock in MOS\n", error); return (error); } dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); if (checkpoint.ub_checkpoint_txg == 0) { (void) printf("\nub_checkpoint_txg not set in checkpointed " "uberblock\n"); error = 3; } if (error == 0 && !dump_opt['L']) verify_checkpoint_blocks(spa); return (error); } static void mos_leaks_cb(void *arg, uint64_t start, uint64_t size) { (void) arg; for (uint64_t i = start; i < size; i++) { (void) printf("MOS object %llu referenced but not allocated\n", (u_longlong_t)i); } } static void mos_obj_refd(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL) zfs_range_tree_add(mos_refd_objs, obj, 1); } /* * Call on a MOS object that may already have been referenced. */ static void mos_obj_refd_multiple(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL && !zfs_range_tree_contains(mos_refd_objs, obj, 1)) zfs_range_tree_add(mos_refd_objs, obj, 1); } static void mos_leak_vdev_top_zap(vdev_t *vd) { uint64_t ms_flush_data_obj; int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); if (error == ENOENT) return; ASSERT0(error); mos_obj_refd(ms_flush_data_obj); } static void mos_leak_vdev(vdev_t *vd) { mos_obj_refd(vd->vdev_dtl_object); mos_obj_refd(vd->vdev_ms_array); mos_obj_refd(vd->vdev_indirect_config.vic_births_object); mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); mos_obj_refd(vd->vdev_leaf_zap); if (vd->vdev_checkpoint_sm != NULL) mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); if (vd->vdev_indirect_mapping != NULL) { mos_obj_refd(vd->vdev_indirect_mapping-> vim_phys->vimp_counts_object); } if (vd->vdev_obsolete_sm != NULL) 
mos_obj_refd(vd->vdev_obsolete_sm->sm_object); for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *ms = vd->vdev_ms[m]; mos_obj_refd(space_map_object(ms->ms_sm)); } if (vd->vdev_root_zap != 0) mos_obj_refd(vd->vdev_root_zap); if (vd->vdev_top_zap != 0) { mos_obj_refd(vd->vdev_top_zap); mos_leak_vdev_top_zap(vd); } for (uint64_t c = 0; c < vd->vdev_children; c++) { mos_leak_vdev(vd->vdev_child[c]); } } static void mos_leak_log_spacemaps(spa_t *spa) { uint64_t spacemap_zap; int error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); if (error == ENOENT) return; ASSERT0(error); mos_obj_refd(spacemap_zap); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) mos_obj_refd(sls->sls_sm_obj); } static void errorlog_count_refd(objset_t *mos, uint64_t errlog) { zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, mos, errlog); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { mos_obj_refd(za->za_first_integer); } zap_cursor_fini(&zc); zap_attribute_free(za); } static int dump_mos_leaks(spa_t *spa) { int rv = 0; objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; /* Visit and mark all referenced objects in the MOS */ mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); mos_obj_refd(spa->spa_pool_props_object); mos_obj_refd(spa->spa_config_object); mos_obj_refd(spa->spa_ddt_stat_object); mos_obj_refd(spa->spa_feat_desc_obj); mos_obj_refd(spa->spa_feat_enabled_txg_obj); mos_obj_refd(spa->spa_feat_for_read_obj); mos_obj_refd(spa->spa_feat_for_write_obj); mos_obj_refd(spa->spa_history); mos_obj_refd(spa->spa_errlog_last); mos_obj_refd(spa->spa_errlog_scrub); if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { errorlog_count_refd(mos, spa->spa_errlog_last); errorlog_count_refd(mos, spa->spa_errlog_scrub); } mos_obj_refd(spa->spa_all_vdev_zaps); 
mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); bpobj_count_refd(&spa->spa_deferred_bpobj); mos_obj_refd(dp->dp_empty_bpobj); bpobj_count_refd(&dp->dp_obsolete_bpobj); bpobj_count_refd(&dp->dp_free_bpobj); mos_obj_refd(spa->spa_l2cache.sav_object); mos_obj_refd(spa->spa_spares.sav_object); if (spa->spa_syncing_log_sm != NULL) mos_obj_refd(spa->spa_syncing_log_sm->sm_object); mos_leak_log_spacemaps(spa); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_next_mapping_object); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_prev_obsolete_sm_object); if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(mos, spa->spa_condensing_indirect_phys.scip_next_mapping_object); mos_obj_refd(vim->vim_phys->vimp_counts_object); vdev_indirect_mapping_close(vim); } deleted_livelists_dump_mos(spa); if (dp->dp_origin_snap != NULL) { dsl_dataset_t *ds; dsl_pool_config_enter(dp, FTAG); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, FTAG, &ds)); count_ds_mos_objects(ds); dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); count_ds_mos_objects(dp->dp_origin_snap); dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); } count_dir_mos_objects(dp->dp_mos_dir); if (dp->dp_free_dir != NULL) count_dir_mos_objects(dp->dp_free_dir); if (dp->dp_leak_dir != NULL) count_dir_mos_objects(dp->dp_leak_dir); mos_leak_vdev(spa->spa_root_vdev); for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) continue; /* DDT store objects */ for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { mos_obj_refd(ddt->ddt_object[type][class]); } } /* FDT container */ if 
(ddt->ddt_version == DDT_VERSION_FDT) mos_obj_refd(ddt->ddt_dir_object); /* FDT log objects */ if (ddt->ddt_flags & DDT_FLAG_LOG) { mos_obj_refd(ddt->ddt_log[0].ddl_object); mos_obj_refd(ddt->ddt_log[1].ddl_object); } } for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; if (brtvd->bv_initiated) { mos_obj_refd(brtvd->bv_mos_brtvdev); mos_obj_refd(brtvd->bv_mos_entries); } } /* * Visit all allocated objects and make sure they are referenced. */ uint64_t object = 0; while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { if (zfs_range_tree_contains(mos_refd_objs, object, 1)) { zfs_range_tree_remove(mos_refd_objs, object, 1); } else { dmu_object_info_t doi; const char *name; VERIFY0(dmu_object_info(mos, object, &doi)); if (doi.doi_type & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(doi.doi_type); name = dmu_ot_byteswap[bswap].ob_name; } else { name = dmu_ot[doi.doi_type].ot_name; } (void) printf("MOS object %llu (%s) leaked\n", (u_longlong_t)object, name); rv = 2; } } (void) zfs_range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); if (!zfs_range_tree_is_empty(mos_refd_objs)) rv = 2; zfs_range_tree_vacate(mos_refd_objs, NULL, NULL); zfs_range_tree_destroy(mos_refd_objs); return (rv); } typedef struct log_sm_obsolete_stats_arg { uint64_t lsos_current_txg; uint64_t lsos_total_entries; uint64_t lsos_valid_entries; uint64_t lsos_sm_entries; uint64_t lsos_valid_sm_entries; } log_sm_obsolete_stats_arg_t; static int log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { log_sm_obsolete_stats_arg_t *lsos = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; if (lsos->lsos_current_txg == 0) { /* this is the first log */ lsos->lsos_current_txg = txg; } else if (lsos->lsos_current_txg < txg) { /* we just changed log - print stats and reset */ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", 
(u_longlong_t)lsos->lsos_valid_sm_entries, (u_longlong_t)lsos->lsos_sm_entries, (u_longlong_t)lsos->lsos_current_txg); lsos->lsos_valid_sm_entries = 0; lsos->lsos_sm_entries = 0; lsos->lsos_current_txg = txg; } ASSERT3U(lsos->lsos_current_txg, ==, txg); lsos->lsos_sm_entries++; lsos->lsos_total_entries++; vdev_t *vd = vdev_lookup_top(spa, vdev_id); if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); lsos->lsos_valid_sm_entries++; lsos->lsos_valid_entries++; return (0); } static void dump_log_spacemap_obsolete_stats(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; log_sm_obsolete_stats_arg_t lsos = {0}; (void) printf("Log Space Map Obsolete Entry Statistics:\n"); iterate_through_spacemap_logs(spa, log_spacemap_obsolete_stats_cb, &lsos); /* print stats for latest log */ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", (u_longlong_t)lsos.lsos_valid_sm_entries, (u_longlong_t)lsos.lsos_sm_entries, (u_longlong_t)lsos.lsos_current_txg); (void) printf("%-8llu valid entries out of %-8llu - total\n\n", (u_longlong_t)lsos.lsos_valid_entries, (u_longlong_t)lsos.lsos_total_entries); } static void dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; if (dump_opt['y']) { livelist_metaslab_validate(spa); } if (dump_opt['S']) { dump_simulated_ddt(spa); return; } if (!dump_opt['e'] && dump_opt['C'] > 1) { (void) printf("\nCached configuration:\n"); dump_nvlist(spa->spa_config, 8); } if (dump_opt['C']) dump_config(spa); if (dump_opt['u']) dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); if (dump_opt['D']) dump_all_ddts(spa); if (dump_opt['T']) dump_brt(spa); if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa, dump_opt['M'] > 1); if (dump_opt['d'] > 2 || dump_opt['m']) { dump_log_spacemaps(spa); 
dump_log_spacemap_obsolete_stats(spa); } if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; mos_refd_objs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dsl_pool_t *dp = spa->spa_dsl_pool; dump_full_bpobj(&spa->spa_deferred_bpobj, "Deferred frees", 0); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { dump_full_bpobj(&dp->dp_free_bpobj, "Pool snapshot frees", 0); } if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); dump_full_bpobj(&dp->dp_obsolete_bpobj, "Pool obsolete blocks", 0); } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { dump_bptree(spa->spa_meta_objset, dp->dp_bptree_obj, "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); } for (spa_feature_t f = 0; f < SPA_FEATURES; f++) global_feature_count[f] = UINT64_MAX; global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0; global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; global_feature_count[SPA_FEATURE_LIVELIST] = 0; (void) dmu_objset_find(spa_name(spa), dump_one_objset, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); if (rc == 0 && !dump_opt['L']) rc = dump_mos_leaks(spa); for (f = 0; f < SPA_FEATURES; f++) { uint64_t refcount; uint64_t *arr; if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) { if (global_feature_count[f] == UINT64_MAX) continue; if (!spa_feature_is_enabled(spa, f)) { ASSERT0(global_feature_count[f]); continue; } arr = global_feature_count; } else { if (!spa_feature_is_enabled(spa, f)) { ASSERT0(dataset_feature_count[f]); continue; } arr = dataset_feature_count; } if (feature_get_refcount(spa, &spa_feature_table[f], &refcount) == ENOTSUP) continue; if (arr[f] != refcount) { (void) printf("%s feature refcount mismatch: " "%lld consumers != %lld refcount\n", spa_feature_table[f].fi_uname, (longlong_t)arr[f], (longlong_t)refcount); rc = 2; } else { (void) 
printf("Verified %s feature refcount " "of %llu is correct\n", spa_feature_table[f].fi_uname, (longlong_t)refcount); } } if (rc == 0) rc = verify_device_removal_feature_counts(spa); } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); if (rc == 0) rc = verify_spacemap_refcounts(spa); if (dump_opt['s']) show_pool_stats(spa); if (dump_opt['h']) dump_history(spa); if (rc == 0) rc = verify_checkpoint(spa); if (rc != 0) { dump_debug_buffer(); zdb_exit(rc); } } #define ZDB_FLAG_CHECKSUM 0x0001 #define ZDB_FLAG_DECOMPRESS 0x0002 #define ZDB_FLAG_BSWAP 0x0004 #define ZDB_FLAG_GBH 0x0008 #define ZDB_FLAG_INDIRECT 0x0010 #define ZDB_FLAG_RAW 0x0020 #define ZDB_FLAG_PRINT_BLKPTR 0x0040 #define ZDB_FLAG_VERBOSE 0x0080 static int flagbits[256]; static char flagbitstr[16]; static void zdb_print_blkptr(const blkptr_t *bp, int flags) { char blkbuf[BP_SPRINTF_LEN]; if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static void zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) { int i; for (i = 0; i < nbps; i++) zdb_print_blkptr(&bp[i], flags); } static void zdb_dump_gbh(void *buf, int flags) { zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); } static void zdb_dump_block_raw(void *buf, uint64_t size, int flags) { if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array(buf, size); VERIFY(write(fileno(stdout), buf, size) == size); } static void zdb_dump_block(char *label, void *buf, uint64_t size, int flags) { uint64_t *d = (uint64_t *)buf; unsigned nwords = size / sizeof (uint64_t); int do_bswap = !!(flags & ZDB_FLAG_BSWAP); unsigned i, j; const char *hdr; char *c; if (do_bswap) hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; else hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); #ifdef _ZFS_LITTLE_ENDIAN /* correct the endianness */ do_bswap = !do_bswap; #endif for (i = 0; i < nwords; i += 
/*
 * Find a vdev below vdev that matches path.
 *
 * There are two acceptable formats:
 *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
 *	child[.child]*    - For example: 0.1.1
 *
 * The second form can be used to specify arbitrary vdevs anywhere
 * in the hierarchy.  For example, in a pool with a mirror of
 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
 *
 * In the first form a child matches if path equals its full vdev_path,
 * equals the last component of its vdev_path, or, when that last
 * component ends in "s0", agrees with it up to that suffix.  Children
 * without a vdev_path are searched recursively.
 *
 * Returns the matching vdev, or NULL if vdev is NULL or nothing matches.
 */
static vdev_t *
zdb_vdev_lookup(vdev_t *vdev, const char *path)
{
	char *s;
	unsigned long idx;

	if (vdev == NULL)
		return (NULL);

	/*
	 * First, assume the x.x.x.x format.  The index stays in the full
	 * width strtoul() returns, so an out-of-range number is rejected
	 * by the bounds check instead of wrapping to a valid child.
	 */
	idx = strtoul(path, &s, 10);
	if (s == path || (*s != '.' && *s != '\0'))
		goto name;
	if (idx >= vdev->vdev_children)
		return (NULL);

	vdev = vdev->vdev_child[idx];
	if (*s == '\0')
		return (vdev);
	return (zdb_vdev_lookup(vdev, s + 1));

name:
	for (uint64_t c = 0; c < vdev->vdev_children; c++) {
		vdev_t *vc = vdev->vdev_child[c];

		if (vc->vdev_path == NULL) {
			vc = zdb_vdev_lookup(vc, path);
			if (vc == NULL)
				continue;
			return (vc);
		}

		const char *base = strrchr(vc->vdev_path, '/');
		base = (base != NULL) ? base + 1 : vc->vdev_path;

		if (strcmp(vc->vdev_path, path) == 0)
			return (vc);
		if (strcmp(base, path) == 0)
			return (vc);

		/*
		 * Slice-suffix match: compare everything before a trailing
		 * "s0".  The suffix is only looked for inside the last
		 * component, and at least one character must precede it, so
		 * short paths are never indexed out of bounds and a device
		 * named just "s0" does not match every name.
		 */
		size_t baselen = strlen(base);
		if (baselen > 2 &&
		    strcmp(base + baselen - 2, "s0") == 0 &&
		    strncmp(base, path, baselen - 2) == 0)
			return (vc);
	}

	return (NULL);
}
/*
 * Brute-force the compression function and logical size of a block.
 *
 * pabd  - the block's physical (compressed) data.
 * buf   - unused.
 * lbuf  - output buffer for the decompressed data; a second scratch
 *         buffer of SPA_MAXBLOCKSIZE bytes is allocated internally so
 *         try_decompress_block() can compare two decompression results.
 * lsize - logical size to try.  If it equals psize, every size from
 *         psize + SPA_MINBLOCKSIZE up to SPA_MAXBLOCKSIZE is tried in
 *         SPA_MINBLOCKSIZE steps; otherwise only this size is tried.
 * psize - physical size of the data in pabd.
 * flags - ZDB_FLAG_* bits, passed through to try_decompress_block().
 *
 * Returns the logical size at which decompression succeeded, or
 * (uint64_t)-1 if no candidate worked.
 */
static uint64_t
zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
    uint64_t psize, int flags)
{
	(void) buf;
	uint64_t orig_lsize = lsize;
	/* ZLE is tried as a last resort unless ZDB_NO_ZLE is set. */
	boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL));
	boolean_t found = B_FALSE;
	/*
	 * We don't know how the data was compressed, so just try
	 * every decompress function at every inflated blocksize.
	 */
	void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
	/* Zero-terminated list of candidate functions, in trial order. */
	int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };
	int *cfuncp = cfuncs;
	uint64_t maxlsize = SPA_MAXBLOCKSIZE;
	/* Functions in mask are NOT appended by the generic loop below. */
	uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
	    ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
	    ZIO_COMPRESS_MASK(ZLE);
	/* LZ4 and LZJB go first; mask them so they aren't added twice. */
	*cfuncp++ = ZIO_COMPRESS_LZ4;
	*cfuncp++ = ZIO_COMPRESS_LZJB;
	mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
	/*
	 * Every gzip level has the same decompressor, no need to
	 * run it 9 times per bruteforce attempt.
	 */
	mask |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3);
	mask |= ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5);
	mask |= ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7);
	mask |= ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9);
	/* Append every remaining function that is not masked out. */
	for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)
		if (((1ULL << c) & mask) == 0)
			*cfuncp++ = c;

	/*
	 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
	 * could take a while and we should let the user know
	 * we are not stuck.  On the other hand, printing progress
	 * info gets old after a while.  User can specify 'v' flag
	 * to see the progression.
	 */
	if (lsize == psize)
		lsize += SPA_MINBLOCKSIZE;
	else
		maxlsize = lsize;

	for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {
		for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
			if (try_decompress_block(pabd, lsize, psize, flags,
			    *cfuncp, lbuf, lbuf2)) {
				found = B_TRUE;
				break;
			}
		}
		/*
		 * A non-zero *cfuncp means the inner loop stopped on a
		 * match rather than on the list terminator.
		 */
		if (*cfuncp != 0)
			break;
	}
	if (!found && tryzle) {
		/*
		 * Nothing matched, so cfuncp is left on the terminating
		 * zero slot of cfuncs; a ZLE match is recorded there.
		 */
		for (lsize = orig_lsize; lsize <= maxlsize;
		    lsize += SPA_MINBLOCKSIZE) {
			if (try_decompress_block(pabd, lsize, psize, flags,
			    ZIO_COMPRESS_ZLE, lbuf, lbuf2)) {
				*cfuncp = ZIO_COMPRESS_ZLE;
				found = B_TRUE;
				break;
			}
		}
	}
	umem_free(lbuf2, SPA_MAXBLOCKSIZE);

	if (*cfuncp == ZIO_COMPRESS_ZLE) {
		printf("\nZLE decompression was selected. If you "
		    "suspect the results are wrong,\ntry avoiding ZLE "
		    "by setting and exporting ZDB_NO_ZLE=\"true\"\n");
	}

	/* Both search loops leave lsize past maxlsize when they fail. */
	return (lsize > maxlsize ? -1 : lsize);
}
s : "", NULL, 16); sizes = strtok_r(NULL, ":", &tmp); s = strtok_r(NULL, ":", &tmp); flagstr = strdup(s ?: ""); if (!zdb_parse_block_sizes(sizes, &lsize, &psize)) errmsg = "invalid size(s)"; if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE)) errmsg = "size must be a multiple of sector size"; if (!IS_P2ALIGNED(offset, DEV_BSIZE)) errmsg = "offset must be a multiple of sector size"; if (errmsg) { (void) printf("Invalid block specifier: %s - %s\n", thing, errmsg); goto done; } tmp = NULL; for (s = strtok_r(flagstr, ":", &tmp); s != NULL; s = strtok_r(NULL, ":", &tmp)) { len = strlen(flagstr); for (i = 0; i < len; i++) { int bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { (void) printf("***Ignoring flag: %c\n", (uchar_t)flagstr[i]); continue; } found = B_TRUE; flags |= bit; p = &flagstr[i + 1]; if (*p != ':' && *p != '\0') { int j = 0, nextbit = flagbits[(uchar_t)*p]; char *end, offstr[8] = { 0 }; if ((bit == ZDB_FLAG_PRINT_BLKPTR) && (nextbit == 0)) { /* look ahead to isolate the offset */ while (nextbit == 0 && strchr(flagbitstr, *p) == NULL) { offstr[j] = *p; j++; if (i + j > strlen(flagstr)) break; p++; nextbit = flagbits[(uchar_t)*p]; } blkptr_offset = strtoull(offstr, &end, 16); i += j; } else if (nextbit == 0) { (void) printf("***Ignoring flag arg:" " '%c'\n", (uchar_t)*p); } } } } if (blkptr_offset % sizeof (blkptr_t)) { printf("Block pointer offset 0x%llx " "must be divisible by 0x%x\n", (longlong_t)blkptr_offset, (int)sizeof (blkptr_t)); goto done; } if (found == B_FALSE && strlen(flagstr) > 0) { printf("Invalid flag arg: '%s'\n", flagstr); goto done; } vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { (void) printf("***Invalid vdev: %s\n", vdev); goto done; } else { if (vd->vdev_path) (void) fprintf(stderr, "Found vdev: %s\n", vd->vdev_path); else (void) fprintf(stderr, "Found vdev type: %s\n", vd->vdev_ops->vdev_op_type); } pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, 
UMEM_NOFAIL); BP_ZERO(bp); DVA_SET_VDEV(&dva[0], vd->vdev_id); DVA_SET_OFFSET(&dva[0], offset); DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, lsize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, 0); if (vd == vd->vdev_top) { /* * Treat this as a normal block read. */ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); if (error) { (void) printf("Read of %s failed, error: %d\n", thing, error); goto out; } uint64_t orig_lsize = lsize; buf = lbuf; if (flags & ZDB_FLAG_DECOMPRESS) { lsize = zdb_decompress_block(pabd, buf, lbuf, lsize, psize, flags); if (lsize == -1) { (void) printf("Decompress of %s failed\n", thing); goto out; } } else { buf = abd_borrow_buf_copy(pabd, lsize); borrowed = B_TRUE; } /* * Try to detect invalid block pointer. If invalid, try * decompressing. 
*/ if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) && !(flags & ZDB_FLAG_DECOMPRESS)) { const blkptr_t *b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (zfs_blkptr_verify(spa, b, BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) { abd_return_buf_copy(pabd, buf, lsize); borrowed = B_FALSE; buf = lbuf; lsize = zdb_decompress_block(pabd, buf, lbuf, lsize, psize, flags); b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (lsize == -1 || zfs_blkptr_verify(spa, b, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) { printf("invalid block pointer at this DVA\n"); goto out; } } } if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); else if (flags & ZDB_FLAG_RAW) zdb_dump_block_raw(buf, lsize, flags); else if (flags & ZDB_FLAG_INDIRECT) zdb_dump_indirect((blkptr_t *)buf, orig_lsize / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) zdb_dump_gbh(buf, flags); else zdb_dump_block(thing, buf, lsize, flags); /* * If :c was specified, iterate through the checksum table to * calculate and display each checksum for our specified * DVA and length. 
*/ if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) && !(flags & ZDB_FLAG_GBH)) { zio_t *czio; (void) printf("\n"); for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL; ck < ZIO_CHECKSUM_FUNCTIONS; ck++) { if ((zio_checksum_table[ck].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) || ck == ZIO_CHECKSUM_NOPARITY) { continue; } BP_SET_CHECKSUM(bp, ck); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); if (vd == vd->vdev_top) { zio_nowait(zio_read(czio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_DONT_RETRY, NULL)); } else { zio_nowait(zio_vdev_child_io(czio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(czio); if (error == 0 || error == ECKSUM) { zio_t *ck_zio = zio_null(NULL, spa, NULL, NULL, NULL, 0); ck_zio->io_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); ck_zio->io_bp = bp; zio_checksum_compute(ck_zio, ck, pabd, lsize); printf( "%12s\t" "cksum=%016llx:%016llx:%016llx:%016llx\n", zio_checksum_table[ck].ci_name, (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], (u_longlong_t)bp->blk_cksum.zc_word[2], (u_longlong_t)bp->blk_cksum.zc_word[3]); zio_wait(ck_zio); } else { printf("error %d reading block\n", error); } spa_config_exit(spa, SCL_STATE, FTAG); } } if (borrowed) abd_return_buf_copy(pabd, buf, lsize); out: abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); done: free(flagstr); free(dup); } static void zdb_embedded_block(char *thing) { blkptr_t bp = {{{{0}}}}; unsigned long long *words = (void *)&bp; char *buf; int err; err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", words + 0, words + 1, words + 2, words + 3, words + 4, words + 5, words + 6, words + 7, words + 8, words + 9, words + 10, words + 11, 
words + 12, words + 13, words + 14, words + 15); if (err != 16) { (void) fprintf(stderr, "invalid input format\n"); zdb_exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); zdb_exit(1); } err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { (void) fprintf(stderr, "decode failed: %u\n", err); zdb_exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); free(buf); } /* check for valid hex or decimal numeric string */ static boolean_t zdb_numeric(char *str) { int i = 0, len; len = strlen(str); if (len == 0) return (B_FALSE); if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0) i = 2; for (; i < len; i++) { if (!isxdigit(str[i])) return (B_FALSE); } return (B_TRUE); } static int dummy_get_file_info(dmu_object_type_t bonustype, const void *data, zfs_file_info_t *zoi) { (void) data, (void) zoi; if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (ENOENT); (void) fprintf(stderr, "dummy_get_file_info: not implemented"); abort(); } int main(int argc, char **argv) { int c; int dump_all = 1; int verbose = 0; int error = 0; char **searchdirs = NULL; int nsearch = 0; char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *policy = NULL; uint64_t max_txg = UINT64_MAX; int64_t objset_id = -1; uint64_t object; int flags = ZFS_IMPORT_MISSING_LOG; int rewind = ZPOOL_NEVER_REWIND; char *spa_config_path_env, *objset_str; boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE; nvlist_t *cfg = NULL; struct sigaction action; boolean_t force_import = B_FALSE; boolean_t config_path_console = B_FALSE; char pbuf[MAXPATHLEN]; dprintf_setup(&argc, argv); /* * Set up signal handlers, so if we crash due to bad on-disk data we * can get more info. Unlike ztest, we don't bail out if we can't set * up signal handlers, because zdb is very useful without them. 
*/ action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); action.sa_flags = 0; if (sigaction(SIGSEGV, &action, NULL) < 0) { (void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n", strerror(errno)); } if (sigaction(SIGABRT, &action, NULL) < 0) { (void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n", strerror(errno)); } /* * If there is an environment variable SPA_CONFIG_PATH it overrides * default spa_config_path setting. If -U flag is specified it will * override this environment variable settings once again. */ spa_config_path_env = getenv("SPA_CONFIG_PATH"); if (spa_config_path_env != NULL) spa_config_path = spa_config_path_env; /* * For performance reasons, we set this tunable down. We do so before * the arg parsing section so that the user can override this value if * they choose. */ zfs_btree_verify_intensity = 3; struct option long_options[] = { {"ignore-assertions", no_argument, NULL, 'A'}, {"block-stats", no_argument, NULL, 'b'}, {"backup", no_argument, NULL, 'B'}, {"checksum", no_argument, NULL, 'c'}, {"config", no_argument, NULL, 'C'}, {"datasets", no_argument, NULL, 'd'}, {"dedup-stats", no_argument, NULL, 'D'}, {"exported", no_argument, NULL, 'e'}, {"embedded-block-pointer", no_argument, NULL, 'E'}, {"automatic-rewind", no_argument, NULL, 'F'}, {"dump-debug-msg", no_argument, NULL, 'G'}, {"history", no_argument, NULL, 'h'}, {"intent-logs", no_argument, NULL, 'i'}, {"inflight", required_argument, NULL, 'I'}, {"checkpointed-state", no_argument, NULL, 'k'}, {"key", required_argument, NULL, 'K'}, {"label", no_argument, NULL, 'l'}, {"disable-leak-tracking", no_argument, NULL, 'L'}, {"metaslabs", no_argument, NULL, 'm'}, {"metaslab-groups", no_argument, NULL, 'M'}, {"numeric", no_argument, NULL, 'N'}, {"option", required_argument, NULL, 'o'}, {"object-lookups", no_argument, NULL, 'O'}, {"path", required_argument, NULL, 'p'}, {"parseable", no_argument, NULL, 'P'}, {"skip-label", no_argument, NULL, 'q'}, {"copy-object", no_argument, NULL, 'r'}, 
{"read-block", no_argument, NULL, 'R'}, {"io-stats", no_argument, NULL, 's'}, {"simulate-dedup", no_argument, NULL, 'S'}, {"txg", required_argument, NULL, 't'}, {"brt-stats", no_argument, NULL, 'T'}, {"uberblock", no_argument, NULL, 'u'}, {"cachefile", required_argument, NULL, 'U'}, {"verbose", no_argument, NULL, 'v'}, {"verbatim", no_argument, NULL, 'V'}, {"dump-blocks", required_argument, NULL, 'x'}, {"extreme-rewind", no_argument, NULL, 'X'}, {"all-reconstruction", no_argument, NULL, 'Y'}, {"livelist", no_argument, NULL, 'y'}, {"zstd-headers", no_argument, NULL, 'Z'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ", long_options, NULL)) != -1) { switch (c) { case 'b': case 'B': case 'c': case 'C': case 'd': case 'D': case 'E': case 'G': case 'h': case 'i': case 'l': case 'm': case 'M': case 'N': case 'O': case 'r': case 'R': case 's': case 'S': case 'T': case 'u': case 'y': case 'Z': dump_opt[c]++; dump_all = 0; break; case 'A': case 'e': case 'F': case 'k': case 'L': case 'P': case 'q': case 'X': dump_opt[c]++; break; case 'Y': zfs_reconstruct_indirect_combinations_max = INT_MAX; zfs_deadman_enabled = 0; break; /* NB: Sort single match options below. 
*/ case 'I': max_inflight_bytes = strtoull(optarg, NULL, 0); if (max_inflight_bytes == 0) { (void) fprintf(stderr, "maximum number " "of inflight bytes must be greater " "than 0\n"); usage(); } break; case 'K': dump_opt[c]++; key_material = strdup(optarg); /* redact key material in process table */ while (*optarg != '\0') { *optarg++ = '*'; } break; case 'o': error = set_global_var(optarg); if (error != 0) usage(); break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); } else { char **tmp = umem_alloc((nsearch + 1) * sizeof (char *), UMEM_NOFAIL); memcpy(tmp, searchdirs, nsearch * sizeof (char *)); umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 't': max_txg = strtoull(optarg, NULL, 0); if (max_txg < TXG_INITIAL) { (void) fprintf(stderr, "incorrect txg " "specified: %s\n", optarg); usage(); } break; case 'U': config_path_console = B_TRUE; spa_config_path = optarg; if (spa_config_path[0] != '/') { (void) fprintf(stderr, "cachefile must be an absolute path " "(i.e. start with a slash)\n"); usage(); } break; case 'v': verbose++; break; case 'V': flags = ZFS_IMPORT_VERBATIM; break; case 'x': vn_dumpdir = optarg; break; default: usage(); break; } } if (!dump_opt['e'] && searchdirs != NULL) { (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } #if defined(_LP64) /* * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT; zfs_arc_max = 256 * 1024 * 1024; #endif /* * "zdb -c" uses checksum-verifying scrub i/os which are async reads. * "zdb -b" uses traversal prefetch which uses async reads. * For good performance, let several of them be active at once. */ zfs_vdev_async_read_max_active = 10; /* * Disable reference tracking for better performance. */ reference_tracking_enable = B_FALSE; /* * Do not fail spa_load when spa_load_verify fails. 
This is needed * to load non-idle pools. */ spa_load_verify_dryrun = B_TRUE; /* * ZDB should have ability to read spacemaps. */ spa_mode_readable_spacemaps = B_TRUE; if (dump_all) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2)); zfs_recover = (dump_opt['A'] > 1); argc -= optind; argv += optind; if (argc < 2 && dump_opt['R']) usage(); target = argv[0]; /* * Automate cachefile */ if (!spa_config_path_env && !config_path_console && target && libzfs_core_init() == 0) { char *pname = strdup(target); const char *value; nvlist_t *pnvl = NULL; nvlist_t *vnvl = NULL; if (strpbrk(pname, "/@") != NULL) *strpbrk(pname, "/@") = '\0'; if (pname && lzc_get_props(pname, &pnvl) == 0) { if (nvlist_lookup_nvlist(pnvl, "cachefile", &vnvl) == 0) { value = fnvlist_lookup_string(vnvl, ZPROP_VALUE); } else { value = "-"; } strlcpy(pbuf, value, sizeof (pbuf)); if (pbuf[0] != '\0') { if (pbuf[0] == '/') { if (access(pbuf, F_OK) == 0) spa_config_path = pbuf; else force_import = B_TRUE; } else if ((strcmp(pbuf, "-") == 0 && access(ZPOOL_CACHE, F_OK) != 0) || strcmp(pbuf, "none") == 0) { force_import = B_TRUE; } } nvlist_free(vnvl); } free(pname); nvlist_free(pnvl); libzfs_core_fini(); } dmu_objset_register_type(DMU_OST_ZFS, dummy_get_file_info); kernel_init(SPA_MODE_READ); kernel_init_done = B_TRUE; if (dump_opt['E']) { if (argc != 1) usage(); zdb_embedded_block(argv[0]); error = 0; goto fini; } if (argc < 1) { if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); error = 0; goto fini; } usage(); } if (dump_opt['l']) { error = dump_label(argv[0]); goto fini; } if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | (dump_opt['X'] ? 
ZPOOL_EXTREME_REWIND : 0); /* -N implies -d */ if (dump_opt['N'] && dump_opt['d'] == 0) dump_opt['d'] = dump_opt['N']; if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) fatal("internal error: %s", strerror(ENOMEM)); error = 0; if (strpbrk(target, "/@") != NULL) { size_t targetlen; target_pool = strdup(target); *strpbrk(target_pool, "/@") = '\0'; target_is_spa = B_FALSE; targetlen = strlen(target); if (targetlen && target[targetlen - 1] == '/') target[targetlen - 1] = '\0'; /* * See if an objset ID was supplied (-d /). * To disambiguate tank/100, consider the 100 as objsetID * if -N was given, otherwise 100 is an objsetID iff * tank/100 as a named dataset fails on lookup. */ objset_str = strchr(target, '/'); if (objset_str && strlen(objset_str) > 1 && zdb_numeric(objset_str + 1)) { char *endptr; errno = 0; objset_str++; objset_id = strtoull(objset_str, &endptr, 0); /* dataset 0 is the same as opening the pool */ if (errno == 0 && endptr != objset_str && objset_id != 0) { if (dump_opt['N']) dataset_lookup = B_TRUE; } /* normal dataset name not an objset ID */ if (endptr == objset_str) { objset_id = -1; } } else if (objset_str && !zdb_numeric(objset_str + 1) && dump_opt['N']) { printf("Supply a numeric objset ID with -N\n"); error = 1; goto fini; } } else { target_pool = target; } if (dump_opt['e'] || force_import) { importargs_t args = { 0 }; /* * If path is not provided, search in /dev */ if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); searchdirs[nsearch++] = (char *)ZFS_DEVDIR; } args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_TRUE; libpc_handle_t lpch = { .lpc_lib_handle = NULL, .lpc_ops = &libzpool_config_ops, .lpc_printerr = B_TRUE }; error = zpool_find_config(&lpch, target_pool, &cfg, &args); if (error == 0) { if (nvlist_add_nvlist(cfg, ZPOOL_LOAD_POLICY, policy) != 0) 
{ fatal("can't open '%s': %s", target, strerror(ENOMEM)); } if (dump_opt['C'] > 1) { (void) printf("\nConfiguration for import:\n"); dump_nvlist(cfg, 8); } /* * Disable the activity check to allow examination of * active pools. */ error = spa_import(target_pool, cfg, NULL, flags | ZFS_IMPORT_SKIP_MMP); } } if (searchdirs != NULL) { umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = NULL; } /* * We need to make sure to process -O option or call * dump_path after the -e option has been processed, * which imports the pool to the namespace if it's * not in the cachefile. */ if (dump_opt['O']) { if (argc != 2) usage(); dump_opt['v'] = verbose + 3; error = dump_path(argv[0], argv[1], NULL); goto fini; } if (dump_opt['r']) { target_is_spa = B_FALSE; if (argc != 3) usage(); dump_opt['v'] = verbose; error = dump_path(argv[0], argv[1], &object); if (error != 0) fatal("internal error: %s", strerror(error)); } /* * import_checkpointed_state makes the assumption that the * target pool that we pass it is already part of the spa * namespace. Because of that we need to make sure to call * it always after the -e option has been processed, which * imports the pool to the namespace if it's not in the * cachefile. 
*/ char *checkpoint_pool = NULL; char *checkpoint_target = NULL; if (dump_opt['k']) { checkpoint_pool = import_checkpointed_state(target, cfg, &checkpoint_target); if (checkpoint_target != NULL) target = checkpoint_target; } if (cfg != NULL) { nvlist_free(cfg); cfg = NULL; } if (target_pool != target) free(target_pool); if (error == 0) { if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { ASSERT(checkpoint_pool != NULL); ASSERT(checkpoint_target == NULL); error = spa_open(checkpoint_pool, &spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but " "spa_open() failed with error %d\n", checkpoint_pool, error); } } else if (target_is_spa || dump_opt['R'] || dump_opt['B'] || objset_id == 0) { zdb_set_skip_mmp(target); error = spa_open_rewind(target, &spa, FTAG, policy, NULL); if (error) { /* * If we're missing the log device then * try opening the pool after clearing the * log state. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL && spa->spa_log_state == SPA_LOG_MISSING) { spa->spa_log_state = SPA_LOG_CLEAR; error = 0; } mutex_exit(&spa_namespace_lock); if (!error) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); } } } else if (strpbrk(target, "#") != NULL) { dsl_pool_t *dp; error = dsl_pool_hold(target, FTAG, &dp); if (error != 0) { fatal("can't dump '%s': %s", target, strerror(error)); } error = dump_bookmark(dp, target, B_TRUE, verbose > 1); dsl_pool_rele(dp, FTAG); if (error != 0) { fatal("can't dump '%s': %s", target, strerror(error)); } goto fini; } else { target_pool = strdup(target); if (strpbrk(target, "/@") != NULL) *strpbrk(target_pool, "/@") = '\0'; zdb_set_skip_mmp(target); /* * If -N was supplied, the user has indicated that * zdb -d / is in effect. Otherwise * we first assume that the dataset string is the * dataset name. If dmu_objset_hold fails with the * dataset string, and we have an objset_id, retry the * lookup with the objsetID. 
*/ boolean_t retry = B_TRUE; retry_lookup: if (dataset_lookup == B_TRUE) { /* * Use the supplied id to get the name * for open_objset. */ error = spa_open(target_pool, &spa, FTAG); if (error == 0) { error = name_from_objset_id(spa, objset_id, dsname); spa_close(spa, FTAG); if (error == 0) target = dsname; } } if (error == 0) { if (objset_id > 0 && retry) { int err = dmu_objset_hold(target, FTAG, &os); if (err) { dataset_lookup = B_TRUE; retry = B_FALSE; goto retry_lookup; } else { dmu_objset_rele(os, FTAG); } } error = open_objset(target, FTAG, &os); } if (error == 0) spa = dmu_objset_spa(os); free(target_pool); } } nvlist_free(policy); if (error) fatal("can't open '%s': %s", target, strerror(error)); /* * Set the pool failure mode to panic in order to prevent the pool * from suspending. A suspended I/O will have no way to resume and * can prevent the zdb(8) command from terminating as expected. */ if (spa != NULL) spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; argv++; argc--; if (dump_opt['r']) { error = zdb_copy_object(os, object, argv[1]); } else if (!dump_opt['R']) { flagbits['d'] = ZOR_FLAG_DIRECTORY; flagbits['f'] = ZOR_FLAG_PLAIN_FILE; flagbits['m'] = ZOR_FLAG_SPACE_MAP; flagbits['z'] = ZOR_FLAG_ZAP; flagbits['A'] = ZOR_FLAG_ALL_TYPES; if (argc > 0 && dump_opt['d']) { zopt_object_args = argc; zopt_object_ranges = calloc(zopt_object_args, sizeof (zopt_object_range_t)); for (unsigned i = 0; i < zopt_object_args; i++) { int err; const char *msg = NULL; err = parse_object_range(argv[i], &zopt_object_ranges[i], &msg); if (err != 0) fatal("Bad object or range: '%s': %s\n", argv[i], msg ?: ""); } } else if (argc > 0 && dump_opt['m']) { zopt_metaslab_args = argc; zopt_metaslab = calloc(zopt_metaslab_args, sizeof (uint64_t)); for (unsigned i = 0; i < zopt_metaslab_args; i++) { errno = 0; zopt_metaslab[i] = strtoull(argv[i], NULL, 0); if (zopt_metaslab[i] == 0 && errno != 0) fatal("bad number %s: %s", argv[i], strerror(errno)); } } if (dump_opt['B']) { 
dump_backup(target, objset_id, argc > 0 ? argv[0] : NULL); } else if (os != NULL) { dump_objset(os); } else if (zopt_object_args > 0 && !dump_opt['m']) { dump_objset(spa->spa_meta_objset); } else { dump_zpool(spa); } } else { flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; flagbits['c'] = ZDB_FLAG_CHECKSUM; flagbits['d'] = ZDB_FLAG_DECOMPRESS; flagbits['e'] = ZDB_FLAG_BSWAP; flagbits['g'] = ZDB_FLAG_GBH; flagbits['i'] = ZDB_FLAG_INDIRECT; flagbits['r'] = ZDB_FLAG_RAW; flagbits['v'] = ZDB_FLAG_VERBOSE; for (int i = 0; i < argc; i++) zdb_read_block(argv[i], spa); } if (dump_opt['k']) { free(checkpoint_pool); if (!target_is_spa) free(checkpoint_target); } fini: if (spa != NULL) zdb_ddt_cleanup(spa); if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { spa_close(spa, FTAG); } fuid_table_destroy(); dump_debug_buffer(); if (kernel_init_done) kernel_fini(); return (error); } diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index eae543731224..9c35f27ff0b4 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -1,574 +1,574 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
/*
 * Metaslab allocation tracing record.
 *
 * One record describes a single traced allocation attempt.  The meaning of
 * the individual members is only partly visible from this header; notes
 * marked NOTE(review) are assumptions to be confirmed against the code that
 * fills these records in.
 */
typedef struct metaslab_alloc_trace {
	/* linkage for keeping records on a list */
	list_node_t			mat_list_node;
	/* NOTE(review): presumably the group the attempt was made in */
	metaslab_group_t		*mat_mg;
	/* NOTE(review): presumably the metaslab that was tried, if any */
	metaslab_t			*mat_msp;
	uint64_t			mat_size;
	uint64_t			mat_weight;
	uint32_t			mat_dva_id;
	/*
	 * Besides an offset, this member is also where the
	 * trace_alloc_type_t error values are stored.
	 */
	uint64_t			mat_offset;
	/* NOTE(review): presumably the index of the allocator used */
	int				mat_allocator;
} metaslab_alloc_trace_t;
/*
 * Per-allocator data structure.
 *
 * A metaslab class carries an array of these (mc_allocator[]), one entry
 * per allocator.
 */
typedef struct metaslab_class_allocator {
	/*
	 * The next metaslab group where allocations will be attempted;
	 * the groups of the class are traversed starting from here.
	 */
	metaslab_group_t	*mca_rotor;
	/*
	 * NOTE(review): presumably the amount allocated from the current
	 * rotor group so far -- confirm against the allocator code.
	 */
	uint64_t		mca_aliquot;

	/*
	 * The allocation throttle works on a reservation system. Whenever
	 * an asynchronous zio wants to perform an allocation it must
	 * first reserve the number of blocks that it wants to allocate.
	 * If there aren't sufficient slots available for the pending zio
	 * then that I/O is throttled until more slots free up. The current
	 * number of reserved allocations is maintained by the mca_alloc_slots
	 * refcount. The mca_alloc_max_slots value determines the maximum
	 * number of allocations that the system allows. Gang blocks are
	 * allowed to reserve slots even if we've reached the maximum
	 * number of allocations allowed.
	 */
	uint64_t		mca_alloc_max_slots;
	zfs_refcount_t		mca_alloc_slots;
} ____cacheline_aligned metaslab_class_allocator_t;
An initialized metaslab group is * one has been completely added to the config (i.e. we have * updated the MOS config and the space has been added to the pool). */ uint64_t mc_groups; /* * Toggle to enable/disable the allocation throttle. */ boolean_t mc_alloc_throttle_enabled; uint64_t mc_alloc_groups; /* # of allocatable groups */ uint64_t mc_alloc; /* total allocated space */ uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ - uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + uint64_t mc_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE]; /* * List of all loaded metaslabs in the class, sorted in order of most * recent use. */ multilist_t mc_metaslab_txg_list; metaslab_class_allocator_t mc_allocator[]; }; /* * Per-allocator data structure. */ typedef struct metaslab_group_allocator { uint64_t mga_cur_max_alloc_queue_depth; zfs_refcount_t mga_alloc_queue_depth; metaslab_t *mga_primary; metaslab_t *mga_secondary; } metaslab_group_allocator_t; /* * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) * of a top-level vdev. They are linked together to form a circular linked * list and can belong to only one metaslab class. Metaslab groups may become * ineligible for allocations for a number of reasons such as limited free * space, fragmentation, or going offline. When this happens the allocator will * simply find the next metaslab group in the linked list and attempt * to allocate from that group instead. */ struct metaslab_group { kmutex_t mg_lock; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; boolean_t mg_allocatable; /* can we allocate? */ uint64_t mg_ms_ready; /* * A metaslab group is considered to be initialized only after * we have updated the MOS config and added the space to the pool. * We only allow allocation attempts to a metaslab group if it * has been initialized. 
*/ boolean_t mg_initialized; uint64_t mg_free_capacity; /* percentage free */ int64_t mg_bias; int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; /* * In order for the allocation throttle to function properly, we cannot * have too many IOs going to each disk by default; the throttle * operates by allocating more work to disks that finish quickly, so * allocating larger chunks to each disk reduces its effectiveness. * However, if the number of IOs going to each allocator is too small, * we will not perform proper aggregation at the vdev_queue layer, * also resulting in decreased performance. Therefore, we will use a * ramp-up strategy. * * Each allocator in each metaslab group has a current queue depth * (mga_alloc_queue_depth[allocator]) and a current max queue depth * (mga_cur_max_alloc_queue_depth[allocator]), and each metaslab group * has an absolute max queue depth (mg_max_alloc_queue_depth). We * add IOs to an allocator until the mga_alloc_queue_depth for that * allocator hits the cur_max. Every time an IO completes for a given * allocator on a given metaslab group, we increment its cur_max until * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to * help protect against disks that decrease in performance over time. * * It's possible for an allocator to handle more allocations than * its max. This can occur when gang blocks are required or when other * groups are unable to handle their share of allocations. */ uint64_t mg_max_alloc_queue_depth; /* * A metaslab group that can no longer allocate the minimum block * size will set mg_no_free_space. Once a metaslab group is out * of space then its share of work must be distributed to other * groups.
*/ boolean_t mg_no_free_space; uint64_t mg_allocations; uint64_t mg_failed_allocations; uint64_t mg_fragmentation; - uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + uint64_t mg_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE]; int mg_ms_disabled; boolean_t mg_disabled_updating; kmutex_t mg_ms_disabled_lock; kcondvar_t mg_ms_disabled_cv; int mg_allocators; metaslab_group_allocator_t mg_allocator[]; }; /* * This value defines the number of elements in the ms_lbas array. The value * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. * This is the equivalent of highbit(UINT64_MAX). */ #define MAX_LBAS 64 /* * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. As blocks are * allocated, the allocated segments are removed from the ms_allocatable and * added to a per txg allocation tree (ms_allocating). As blocks are * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context * where it is safe to update the on-disk space maps. An additional set * of in-core trees is maintained to track deferred frees * (ms_defer). Once a block is freed it will move from the * ms_freed to the ms_defer tree. A deferred free means that a block * has been freed but cannot be used by the pool until TXG_DEFER_SIZE * transaction groups later. For example, a block that is freed in txg * 50 will not be available for reallocation until txg 52 (50 + * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback. * A pool could be safely rolled back TXG_DEFER_SIZE transaction * groups and ensure that no block has been reallocated.
* * The simplified transition diagram looks like this: * * * ALLOCATE * | * V * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map) * ^ * | ms_freeing <--- FREE * | | * | v * | ms_freed * | | * +-------- ms_defer[2] <-------+-------> (write to space map) * * * Each metaslab's space is tracked in a single space map in the MOS, * which is only updated in syncing context. Each time we sync a txg, * we append the allocs and frees from that txg to the space map. The * pool space is only updated once all metaslabs have finished syncing. * * To load the in-core free tree we read the space map from disk. This * object contains a series of alloc and free records that are combined * to make up the list of all free segments in this metaslab. These * segments are represented in-core by the ms_allocatable and are stored * in a range tree. * * As the space map grows (as a result of the appends) it will * eventually become space-inefficient. When the metaslab's in-core * free tree is zfs_condense_pct/100 times the size of the minimal * on-disk representation, we rewrite it in its minimized form. If a * metaslab needs to condense then we must set the ms_condensing flag to * ensure that allocations are not performed on the metaslab that is * being written. */ struct metaslab { /* * This is the main lock of the metaslab and its purpose is to * coordinate our allocations and frees [e.g., metaslab_block_alloc(), * metaslab_free_concrete(), ..etc] with our various syncing * procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc]. * * The lock is also used during some miscellaneous operations like * using the metaslab's histogram for the metaslab group's histogram * aggregation, or marking the metaslab for initialization. */ kmutex_t ms_lock; /* * Acquired together with the ms_lock whenever we expect to * write to metaslab data on-disk (i.e flushing entries to * the metaslab's space map).
It helps coordinate readers of * the metaslab's space map [see spa_vdev_remove_thread()] * with writers [see metaslab_sync() or metaslab_flush()]. * * Note that metaslab_load(), even though a reader, uses * a completely different mechanism to deal with the reading * of the metaslab's space map based on ms_synced_length. That * said, the function still uses the ms_sync_lock after it * has read the ms_sm [see relevant comment in metaslab_load() * as to why]. */ kmutex_t ms_sync_lock; kcondvar_t ms_load_cv; space_map_t *ms_sm; uint64_t ms_id; uint64_t ms_start; uint64_t ms_size; uint64_t ms_fragmentation; zfs_range_tree_t *ms_allocating[TXG_SIZE]; zfs_range_tree_t *ms_allocatable; uint64_t ms_allocated_this_txg; uint64_t ms_allocating_total; /* * The following range trees are accessed only from syncing context. * ms_free*tree only have entries while syncing, and are empty * between syncs. */ zfs_range_tree_t *ms_freeing; /* to free this syncing txg */ /* already freed this syncing txg */ zfs_range_tree_t *ms_freed; zfs_range_tree_t *ms_defer[TXG_DEFER_SIZE]; /* to add to the checkpoint */ zfs_range_tree_t *ms_checkpointing; /* * The ms_trim tree is the set of allocatable segments which are * eligible for trimming. (When the metaslab is loaded, it's a * subset of ms_allocatable.) It's kept in-core as long as the * autotrim property is set and is not vacated when the metaslab * is unloaded. Its purpose is to aggregate freed ranges to * facilitate efficient trimming. */ zfs_range_tree_t *ms_trim; boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; /* * The number of consumers which have disabled the metaslab. */ uint64_t ms_disabled; /* * We must always hold the ms_lock when modifying ms_loaded * and ms_loading. 
*/ boolean_t ms_loaded; boolean_t ms_loading; kcondvar_t ms_flush_cv; boolean_t ms_flushing; /* * The following histograms count entries that are in the * metaslab's space map (and its histogram) but are not in * ms_allocatable yet, because they are in ms_freed, ms_freeing, * or ms_defer[]. * * When the metaslab is not loaded, its ms_weight needs to * reflect what is allocatable (i.e. what will be part of * ms_allocatable if it is loaded). The weight is computed from * the spacemap histogram, but that includes ranges that are * not yet allocatable (because they are in ms_freed, * ms_freeing, or ms_defer[]). Therefore, when calculating the * weight, we need to remove those ranges. * * The ranges in the ms_freed and ms_defer[] range trees are all * present in the spacemap. However, the spacemap may have * multiple entries to represent a contiguous range, because it * is written across multiple sync passes, but the changes of * all sync passes are consolidated into the range trees. * Adjacent ranges that are freed in different sync passes of * one txg will be represented separately (as 2 or more entries) * in the space map (and its histogram), but these adjacent * ranges will be consolidated (represented as one entry) in the * ms_freed/ms_defer[] range trees (and their histograms). * * When calculating the weight, we can not simply subtract the * range trees' histograms from the spacemap's histogram, * because the range trees' histograms may have entries in * higher buckets than the spacemap, due to consolidation. * Instead we must subtract the exact entries that were added to * the spacemap's histogram. ms_synchist and ms_deferhist[] * represent these exact entries, so we can subtract them from * the spacemap's histogram when calculating ms_weight. * * ms_synchist represents the same ranges as ms_freeing + * ms_freed, but without consolidation across sync passes. 
* * ms_deferhist[i] represents the same ranges as ms_defer[i], * but without consolidation across sync passes. */ uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE]; uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE]; /* * Tracks the exact amount of allocated space of this metaslab * (and specifically the metaslab's space map) up to the most * recently completed sync pass [see usage in metaslab_sync()]. */ uint64_t ms_allocated_space; int64_t ms_deferspace; /* sum of ms_defer[] space */ uint64_t ms_weight; /* weight vs. others in group */ uint64_t ms_activation_weight; /* activation weight */ /* * Tracks when a metaslab is selected for loading or allocation. * We use this value to determine how long the metaslab should * stay cached. */ uint64_t ms_selected_txg; /* * ms_load/unload_time can be used for performance monitoring * (e.g. by dtrace or mdb). */ hrtime_t ms_load_time; /* time last loaded */ hrtime_t ms_unload_time; /* time last unloaded */ hrtime_t ms_selected_time; /* time last allocated from */ uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ /* * -1 if it's not active in an allocator, otherwise set to the allocator * this metaslab is active for. */ int ms_allocator; boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ /* * The metaslab block allocators can optionally use a size-ordered * range tree and/or an array of LBAs. Not all allocators use * this functionality. The ms_allocatable_by_size should always * contain the same number of segments as the ms_allocatable. The * only difference is that the ms_allocatable_by_size is ordered by * segment sizes.
*/ zfs_btree_t ms_allocatable_by_size; zfs_btree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */ /* * Node in metaslab class's selected txg list */ multilist_node_t ms_class_txg_node; /* * Allocs and frees that are committed to the vdev log spacemap but * not yet to this metaslab's spacemap. */ zfs_range_tree_t *ms_unflushed_allocs; zfs_range_tree_t *ms_unflushed_frees; /* * We have flushed entries up to but not including this TXG. In * other words, all changes from this TXG and onward should not * be in this metaslab's space map and must be read from the * log space maps. */ uint64_t ms_unflushed_txg; boolean_t ms_unflushed_dirty; /* updated every time we are done syncing the metaslab's space map */ uint64_t ms_synced_length; boolean_t ms_new; }; typedef struct metaslab_unflushed_phys { /* on-disk counterpart of ms_unflushed_txg */ uint64_t msp_unflushed_txg; } metaslab_unflushed_phys_t; #ifdef __cplusplus } #endif #endif /* _SYS_METASLAB_IMPL_H */ diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index 4b0a3f2bfbb1..23eea3210c98 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -1,326 +1,326 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_RANGE_TREE_H #define _SYS_RANGE_TREE_H #include #include #ifdef __cplusplus extern "C" { #endif -#define RANGE_TREE_HISTOGRAM_SIZE 64 +#define ZFS_RANGE_TREE_HISTOGRAM_SIZE 64 typedef struct zfs_range_tree_ops zfs_range_tree_ops_t; typedef enum zfs_range_seg_type { ZFS_RANGE_SEG32, ZFS_RANGE_SEG64, ZFS_RANGE_SEG_GAP, ZFS_RANGE_SEG_NUM_TYPES, } zfs_range_seg_type_t; /* * Note: the range_tree may not be accessed concurrently; consumers * must provide external locking if required. */ typedef struct zfs_range_tree { zfs_btree_t rt_root; /* offset-ordered segment b-tree */ uint64_t rt_space; /* sum of all segments in the map */ zfs_range_seg_type_t rt_type; /* type of zfs_range_seg_t in use */ /* * All data that is stored in the range tree must have a start higher * than or equal to rt_start, and all sizes and offsets must be * multiples of 1 << rt_shift. */ uint8_t rt_shift; uint64_t rt_start; const zfs_range_tree_ops_t *rt_ops; void *rt_arg; uint64_t rt_gap; /* allowable inter-segment gap */ /* * The rt_histogram maintains a histogram of ranges. 
Each bucket, * rt_histogram[i], contains the number of ranges whose size is: * 2^i <= size of range in bytes < 2^(i+1) */ - uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + uint64_t rt_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE]; } zfs_range_tree_t; -typedef struct range_seg32 { +typedef struct zfs_range_seg32 { uint32_t rs_start; /* starting offset of this segment */ uint32_t rs_end; /* ending offset (non-inclusive) */ -} range_seg32_t; +} zfs_range_seg32_t; /* * Extremely large metaslabs, vdev-wide trees, and dnode-wide trees may * require 64-bit integers for ranges. */ -typedef struct range_seg64 { +typedef struct zfs_range_seg64 { uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ -} range_seg64_t; +} zfs_range_seg64_t; -typedef struct range_seg_gap { +typedef struct zfs_range_seg_gap { uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ uint64_t rs_fill; /* actual fill if gap mode is on */ -} range_seg_gap_t; +} zfs_range_seg_gap_t; /* * This type needs to be the largest of the range segs, since it will be stack * allocated and then cast the actual type to do tree operations. */ -typedef range_seg_gap_t range_seg_max_t; +typedef zfs_range_seg_gap_t zfs_range_seg_max_t; /* * This is just for clarity of code purposes, so we can make it clear that a * pointer is to a range seg of some type; when we need to do the actual math, * we'll figure out the real type. 
*/ typedef void zfs_range_seg_t; struct zfs_range_tree_ops { void (*rtop_create)(zfs_range_tree_t *rt, void *arg); void (*rtop_destroy)(zfs_range_tree_t *rt, void *arg); void (*rtop_add)(zfs_range_tree_t *rt, void *rs, void *arg); void (*rtop_remove)(zfs_range_tree_t *rt, void *rs, void *arg); void (*rtop_vacate)(zfs_range_tree_t *rt, void *arg); }; static inline uint64_t zfs_rs_get_start_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { case ZFS_RANGE_SEG32: - return (((const range_seg32_t *)rs)->rs_start); + return (((const zfs_range_seg32_t *)rs)->rs_start); case ZFS_RANGE_SEG64: - return (((const range_seg64_t *)rs)->rs_start); + return (((const zfs_range_seg64_t *)rs)->rs_start); case ZFS_RANGE_SEG_GAP: - return (((const range_seg_gap_t *)rs)->rs_start); + return (((const zfs_range_seg_gap_t *)rs)->rs_start); default: VERIFY(0); return (0); } } static inline uint64_t zfs_rs_get_end_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { case ZFS_RANGE_SEG32: - return (((const range_seg32_t *)rs)->rs_end); + return (((const zfs_range_seg32_t *)rs)->rs_end); case ZFS_RANGE_SEG64: - return (((const range_seg64_t *)rs)->rs_end); + return (((const zfs_range_seg64_t *)rs)->rs_end); case ZFS_RANGE_SEG_GAP: - return (((const range_seg_gap_t *)rs)->rs_end); + return (((const zfs_range_seg_gap_t *)rs)->rs_end); default: VERIFY(0); return (0); } } static inline uint64_t zfs_rs_get_fill_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { case ZFS_RANGE_SEG32: { - const range_seg32_t *r32 = (const range_seg32_t *)rs; + const zfs_range_seg32_t *r32 = (const zfs_range_seg32_t *)rs; return (r32->rs_end - r32->rs_start); } case ZFS_RANGE_SEG64: { - const range_seg64_t *r64 = (const range_seg64_t *)rs; + const zfs_range_seg64_t *r64 
= (const zfs_range_seg64_t *)rs; return (r64->rs_end - r64->rs_start); } case ZFS_RANGE_SEG_GAP: - return (((const range_seg_gap_t *)rs)->rs_fill); + return (((const zfs_range_seg_gap_t *)rs)->rs_fill); default: VERIFY(0); return (0); } } static inline uint64_t zfs_rs_get_start(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { return ((zfs_rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start); } static inline uint64_t zfs_rs_get_end(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { return ((zfs_rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start); } static inline uint64_t zfs_rs_get_fill(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { return (zfs_rs_get_fill_raw(rs, rt) << rt->rt_shift); } static inline void zfs_rs_set_start_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t start) { ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { case ZFS_RANGE_SEG32: ASSERT3U(start, <=, UINT32_MAX); - ((range_seg32_t *)rs)->rs_start = (uint32_t)start; + ((zfs_range_seg32_t *)rs)->rs_start = (uint32_t)start; break; case ZFS_RANGE_SEG64: - ((range_seg64_t *)rs)->rs_start = start; + ((zfs_range_seg64_t *)rs)->rs_start = start; break; case ZFS_RANGE_SEG_GAP: - ((range_seg_gap_t *)rs)->rs_start = start; + ((zfs_range_seg_gap_t *)rs)->rs_start = start; break; default: VERIFY(0); } } static inline void zfs_rs_set_end_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t end) { ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { case ZFS_RANGE_SEG32: ASSERT3U(end, <=, UINT32_MAX); - ((range_seg32_t *)rs)->rs_end = (uint32_t)end; + ((zfs_range_seg32_t *)rs)->rs_end = (uint32_t)end; break; case ZFS_RANGE_SEG64: - ((range_seg64_t *)rs)->rs_end = end; + ((zfs_range_seg64_t *)rs)->rs_end = end; break; case ZFS_RANGE_SEG_GAP: - ((range_seg_gap_t *)rs)->rs_end = end; + ((zfs_range_seg_gap_t *)rs)->rs_end = end; break; default: VERIFY(0); } } /* * Store the raw (already shifted) fill of a segment. * * NOTE(review): the doubled "zfs_zfs_" prefix looks like an artifact of * the range_* -> zfs_range_* rename; the sibling setters are named * zfs_rs_set_start_raw() and zfs_rs_set_end_raw(). The only caller * visible in this header is zfs_rs_set_fill() -- confirm there are no * other users before renaming. */ static inline void zfs_zfs_rs_set_fill_raw(zfs_range_seg_t *rs,
zfs_range_tree_t *rt, uint64_t fill) { ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { case ZFS_RANGE_SEG32: /* fall through */ case ZFS_RANGE_SEG64: /* no rs_fill field in these types: fill is implied by end - start, so only verify it */ ASSERT3U(fill, ==, zfs_rs_get_end_raw(rs, rt) - zfs_rs_get_start_raw(rs, rt)); break; case ZFS_RANGE_SEG_GAP: - ((range_seg_gap_t *)rs)->rs_fill = fill; + ((zfs_range_seg_gap_t *)rs)->rs_fill = fill; break; default: VERIFY(0); } } static inline void zfs_rs_set_start(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t start) { ASSERT3U(start, >=, rt->rt_start); ASSERT(IS_P2ALIGNED(start, 1ULL << rt->rt_shift)); zfs_rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift); } static inline void zfs_rs_set_end(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t end) { ASSERT3U(end, >=, rt->rt_start); ASSERT(IS_P2ALIGNED(end, 1ULL << rt->rt_shift)); zfs_rs_set_end_raw(rs, rt, (end - rt->rt_start) >> rt->rt_shift); } static inline void zfs_rs_set_fill(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t fill) { ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift)); zfs_zfs_rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); } typedef void zfs_range_tree_func_t(void *arg, uint64_t start, uint64_t size); zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, uint64_t gap); zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops, zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift); void zfs_range_tree_destroy(zfs_range_tree_t *rt); boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size); zfs_range_seg_t *zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size); boolean_t zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size, uint64_t *ostart, uint64_t *osize); void zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t start, uint64_t size); void zfs_range_tree_resize_segment(zfs_range_tree_t *rt,
zfs_range_seg_t *rs, uint64_t newstart, uint64_t newsize); uint64_t zfs_range_tree_space(zfs_range_tree_t *rt); uint64_t zfs_range_tree_numsegs(zfs_range_tree_t *rt); boolean_t zfs_range_tree_is_empty(zfs_range_tree_t *rt); void zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst); void zfs_range_tree_stat_verify(zfs_range_tree_t *rt); uint64_t zfs_range_tree_min(zfs_range_tree_t *rt); uint64_t zfs_range_tree_max(zfs_range_tree_t *rt); uint64_t zfs_range_tree_span(zfs_range_tree_t *rt); void zfs_range_tree_add(void *arg, uint64_t start, uint64_t size); void zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size); void zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size); void zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs, int64_t delta); void zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size); void zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func, void *arg); void zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func, void *arg); zfs_range_seg_t *zfs_range_tree_first(zfs_range_tree_t *rt); void zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, zfs_range_tree_t *removefrom, zfs_range_tree_t *addto); void zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt, zfs_range_tree_t *removefrom, zfs_range_tree_t *addto); #ifdef __cplusplus } #endif #endif /* _SYS_RANGE_TREE_H */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 38f62b07dc59..6ab7ac40bb07 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -1,232 +1,232 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. */ #ifndef _SYS_VDEV_H #define _SYS_VDEV_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef enum vdev_dtl_type { DTL_MISSING, /* 0% replication: no copies of the data */ DTL_PARTIAL, /* less than 100% replication: some copies missing */ DTL_SCRUB, /* unable to fully repair during scrub/resilver */ DTL_OUTAGE, /* temporarily missing (used to attempt detach) */ DTL_TYPES } vdev_dtl_type_t; extern int zfs_nocacheflush; typedef boolean_t vdev_open_children_func_t(vdev_t *vd); extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) 
__attribute__((format(printf, 2, 3))); extern void vdev_dbgmsg_print_tree(vdev_t *, int); extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); extern void vdev_open_children_subset(vdev_t *, vdev_open_children_func_t *); extern int vdev_validate(vdev_t *); extern int vdev_copy_path_strict(vdev_t *, vdev_t *); extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern void vdev_reopen(vdev_t *); extern int vdev_validate_aux(vdev_t *vd); extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); extern boolean_t vdev_is_concrete(vdev_t *vd); extern boolean_t vdev_is_bootable(vdev_t *vd); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); extern int vdev_count_leaves(spa_t *spa); extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); extern boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth); extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done); extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx); extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx); extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx); extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx); extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size); extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, 
uint64_t vdev, uint64_t offset, uint64_t size, dmu_tx_t *tx); extern boolean_t vdev_replace_in_progress(vdev_t *vdev); extern void vdev_hold(vdev_t *); extern void vdev_rele(vdev_t *); extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd, const char *tag); -typedef void vdev_xlate_func_t(void *arg, range_seg64_t *physical_rs); +typedef void vdev_xlate_func_t(void *arg, zfs_range_seg64_t *physical_rs); -extern boolean_t vdev_xlate_is_empty(range_seg64_t *rs); -extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs, range_seg64_t *remain_rs); -extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, +extern boolean_t vdev_xlate_is_empty(zfs_range_seg64_t *rs); +extern void vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs); +extern void vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg); extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); extern metaslab_group_t *vdev_get_mg(vdev_t *vd, metaslab_class_t *mc); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); extern void vdev_scan_stat_init(vdev_t *vd); extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); extern boolean_t vdev_children_are_offline(vdev_t *vd); extern void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg); 
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); /* * Return the amount of space allocated for a gang block header. Note that * since the physical birth txg is not provided, this must be constant for * a given vdev. (e.g. raidz expansion can't change this) */ static inline uint64_t vdev_gang_header_asize(vdev_t *vd) { return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0)); } extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *); extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); extern int vdev_remove_wanted(spa_t *spa, uint64_t guid); extern void vdev_clear(spa_t *spa, vdev_t *vd); extern boolean_t vdev_is_dead(vdev_t *vd); extern boolean_t vdev_readable(vdev_t *vd); extern boolean_t vdev_writeable(vdev_t *vd); extern boolean_t vdev_allocatable(vdev_t *vd); extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd); extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); extern uint32_t vdev_queue_length(vdev_t *vd); extern uint64_t vdev_queue_last_offset(vdev_t *vd); extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); extern void vdev_defer_resilver(vdev_t *vd); extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx); typedef enum vdev_config_flag { VDEV_CONFIG_SPARE = 1 << 0, VDEV_CONFIG_L2CACHE = 1 << 1, VDEV_CONFIG_MOS = 1 << 2, VDEV_CONFIG_MISSING = 
1 << 3 } vdev_config_flag_t; extern void vdev_post_kobj_evt(vdev_t *vd); extern void vdev_clear_kobj_evt(vdev_t *vd); extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags); /* * Label routines */ struct uberblock; extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); extern int vdev_label_number(uint64_t psise, uint64_t offset); extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg); extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv); extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, int flags); extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *); extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int); extern int vdev_check_boot_reserve(spa_t *, vdev_t *); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ VDEV_LABEL_REPLACE, /* replace an existing device */ VDEV_LABEL_SPARE, /* add a new hot spare */ VDEV_LABEL_REMOVE, /* remove an existing device */ VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */ VDEV_LABEL_SPLIT /* generating new label for split-off dev */ } vdev_labeltype_t; extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); extern int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl); extern int vdev_prop_get(vdev_t *vd, nvlist_t *nvprops, nvlist_t *outnvl); #ifdef __cplusplus } #endif #endif /* _SYS_VDEV_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 6840ee78915e..315e2fc88410 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -1,671 +1,671 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development 
and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2023, Klara Inc. */ #ifndef _SYS_VDEV_IMPL_H #define _SYS_VDEV_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Virtual device descriptors. * * All storage pool operations go through the virtual device framework, * which provides data replication and I/O scheduling. */ /* * Forward declarations that lots of things need. 
*/ typedef struct vdev_queue vdev_queue_t; struct abd; extern uint_t zfs_vdev_queue_depth_pct; extern uint_t zfs_vdev_def_queue_depth; extern uint_t zfs_vdev_async_write_max_active; /* * Virtual device operations */ typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd); typedef void vdev_kobj_post_evt_func_t(vdev_t *vd); typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg); typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg); typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, vdev_remap_cb_t callback, void *arg); /* * Given a target vdev, translates the logical range "in" to the physical * range "res" */ -typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *logical, - range_seg64_t *physical, range_seg64_t *remain); +typedef void vdev_xlation_func_t(vdev_t *cvd, const zfs_range_seg64_t *logical, + zfs_range_seg64_t *physical, zfs_range_seg64_t *remain); typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start, uint64_t size, uint64_t max_segment); typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp, uint64_t *sizep); typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv); typedef uint64_t vdev_nparity_func_t(vdev_t *vd); typedef uint64_t 
vdev_ndisks_func_t(vdev_t *vd); typedef const struct vdev_ops { vdev_init_func_t *vdev_op_init; vdev_fini_func_t *vdev_op_fini; vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; vdev_asize_func_t *vdev_op_asize; vdev_min_asize_func_t *vdev_op_min_asize; vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; vdev_need_resilver_func_t *vdev_op_need_resilver; vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; vdev_xlation_func_t *vdev_op_xlate; vdev_rebuild_asize_func_t *vdev_op_rebuild_asize; vdev_metaslab_init_func_t *vdev_op_metaslab_init; vdev_config_generate_func_t *vdev_op_config_generate; vdev_nparity_func_t *vdev_op_nparity; vdev_ndisks_func_t *vdev_op_ndisks; vdev_kobj_post_evt_func_t *vdev_op_kobj_evt_post; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; /* * Virtual device properties */ typedef union vdev_queue_class { struct { ulong_t vqc_list_numnodes; list_t vqc_list; }; avl_tree_t vqc_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; zio_priority_t vq_last_prio; /* Last sent I/O priority. */ uint32_t vq_cqueued; /* Classes with queued I/Os. */ uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE]; uint32_t vq_active; /* Number of active I/Os. */ uint32_t vq_ia_active; /* Active interactive I/Os. */ uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ list_t vq_active_list; /* List of active I/Os. 
*/ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ kmutex_t vq_lock; }; typedef enum vdev_alloc_bias { VDEV_BIAS_NONE, VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ } vdev_alloc_bias_t; /* * On-disk indirect vdev state. * * An indirect vdev is described exclusively in the MOS config of a pool. * The config for an indirect vdev includes several fields, which are * accessed in memory by a vdev_indirect_config_t. */ typedef struct vdev_indirect_config { /* * Object (in MOS) which contains the indirect mapping. This object * contains an array of vdev_indirect_mapping_entry_phys_t ordered by * vimep_src. The bonus buffer for this object is a * vdev_indirect_mapping_phys_t. This object is allocated when a vdev * removal is initiated. * * Note that this object can be empty if none of the data on the vdev * has been copied yet. */ uint64_t vic_mapping_object; /* * Object (in MOS) which contains the birth times for the mapping * entries. This object contains an array of * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus * buffer for this object is a vdev_indirect_birth_phys_t. This object * is allocated when a vdev removal is initiated. * * Note that this object can be empty if none of the vdev has yet been * copied. */ uint64_t vic_births_object; /* * This is the vdev ID which was removed previous to this vdev, or * UINT64_MAX if there are no previously removed vdevs. */ uint64_t vic_prev_indirect_vdev; } vdev_indirect_config_t; /* * Virtual device descriptor */ struct vdev { /* * Common to all vdev types. */ uint64_t vdev_id; /* child number in vdev parent */ uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ uint64_t vdev_orig_guid; /* orig. 
guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_max_asize; /* max acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ /* * Logical block alignment shift * * The smallest sized/aligned I/O supported by the device. */ uint64_t vdev_logical_ashift; /* * Physical block alignment shift * * The device supports logical I/Os with vdev_logical_ashift * size/alignment, but optimum performance will be achieved by * aligning/sizing requests to vdev_physical_ashift. Smaller * requests may be inflated or incur device level read-modify-write * operations. * * May be 0 to indicate no preference (i.e. use vdev_logical_ashift). */ uint64_t vdev_physical_ashift; uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ vdev_ops_t *vdev_ops; /* vdev operations */ spa_t *vdev_spa; /* spa for this vdev */ void *vdev_tsd; /* type-specific data */ vdev_t *vdev_top; /* top-level vdev */ vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ vdev_stat_t vdev_stat; /* virtual device statistics */ vdev_stat_ex_t vdev_stat_ex; /* extended statistics */ boolean_t vdev_expanding; /* expand the vdev? */ boolean_t vdev_reopening; /* reopen in progress? */ boolean_t vdev_nonrot; /* true if solid state */ int vdev_load_error; /* error on last load */ int vdev_open_error; /* error on last open */ int vdev_validate_error; /* error on last validate */ kthread_t *vdev_open_thread; /* thread opening children */ kthread_t *vdev_validate_thread; /* thread validating children */ uint64_t vdev_crtxg; /* txg when top-level was added */ uint64_t vdev_root_zap; /* * Top-level vdev state. 
*/ uint64_t vdev_ms_array; /* metaslab array object */ uint64_t vdev_ms_shift; /* metaslab size shift */ uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_fault_wanted; /* async faulted wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ uint64_t vdev_failfast; /* device failfast setting */ boolean_t vdev_rz_expanding; /* raidz is being expanded? */ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ /* Initialize related */ boolean_t vdev_initialize_exit_wanted; vdev_initializing_state_t vdev_initialize_state; list_node_t vdev_initialize_node; kthread_t *vdev_initialize_thread; /* Protects vdev_initialize_thread and vdev_initialize_state. 
*/ kmutex_t vdev_initialize_lock; kcondvar_t vdev_initialize_cv; uint64_t vdev_initialize_offset[TXG_SIZE]; uint64_t vdev_initialize_last_offset; /* valid while initializing */ zfs_range_tree_t *vdev_initialize_tree; uint64_t vdev_initialize_bytes_est; uint64_t vdev_initialize_bytes_done; uint64_t vdev_initialize_action_time; /* start and end time */ /* TRIM related */ boolean_t vdev_trim_exit_wanted; boolean_t vdev_autotrim_exit_wanted; vdev_trim_state_t vdev_trim_state; list_node_t vdev_trim_node; kmutex_t vdev_autotrim_lock; kcondvar_t vdev_autotrim_cv; kcondvar_t vdev_autotrim_kick_cv; kthread_t *vdev_autotrim_thread; /* Protects vdev_trim_thread and vdev_trim_state. */ kmutex_t vdev_trim_lock; kcondvar_t vdev_trim_cv; kthread_t *vdev_trim_thread; uint64_t vdev_trim_offset[TXG_SIZE]; uint64_t vdev_trim_last_offset; uint64_t vdev_trim_bytes_est; uint64_t vdev_trim_bytes_done; uint64_t vdev_trim_rate; /* requested rate (bytes/sec) */ uint64_t vdev_trim_partial; /* requested partial TRIM */ uint64_t vdev_trim_secure; /* requested secure TRIM */ uint64_t vdev_trim_action_time; /* start and end time */ /* Rebuild related */ boolean_t vdev_rebuilding; boolean_t vdev_rebuild_exit_wanted; boolean_t vdev_rebuild_cancel_wanted; boolean_t vdev_rebuild_reset_wanted; kmutex_t vdev_rebuild_lock; kcondvar_t vdev_rebuild_cv; kthread_t *vdev_rebuild_thread; vdev_rebuild_t vdev_rebuild_config; /* For limiting outstanding I/Os (initialize, TRIM) */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; uint64_t vdev_trim_inflight[3]; /* * Values stored in the config for an indirect or removing vdev. */ vdev_indirect_config_t vdev_indirect_config; /* * The vdev_indirect_rwlock protects the vdev_indirect_mapping * pointer from changing on indirect vdevs (when it is condensed). 
* Note that removing (not yet indirect) vdevs have different * access patterns (the mapping is not accessed from open context, * e.g. from zio_read) and locking strategy (e.g. svr_lock). */ krwlock_t vdev_indirect_rwlock; vdev_indirect_mapping_t *vdev_indirect_mapping; vdev_indirect_births_t *vdev_indirect_births; /* * In memory data structures used to manage the obsolete sm, for * indirect or removing vdevs. * * The vdev_obsolete_segments is the in-core record of the segments * that are no longer referenced anywhere in the pool (due to * being freed or remapped and not referenced by any snapshots). * During a sync, segments are added to vdev_obsolete_segments * via vdev_indirect_mark_obsolete(); at the end of each sync * pass, this is appended to vdev_obsolete_sm via * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock * protects against concurrent modifications of vdev_obsolete_segments * from multiple zio threads. */ kmutex_t vdev_obsolete_lock; zfs_range_tree_t *vdev_obsolete_segments; space_map_t *vdev_obsolete_sm; /* * Protects the vdev_scan_io_queue field itself as well as the * structure's contents (when present). */ kmutex_t vdev_scan_io_queue_lock; struct dsl_scan_io_queue *vdev_scan_io_queue; /* * Leaf vdev state. 
*/ zfs_range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */ space_map_t *vdev_dtl_sm; /* dirty time log space map */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_dtl_object; /* DTL object */ uint64_t vdev_psize; /* physical device capacity */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ uint64_t vdev_offline; /* persistent offline state */ uint64_t vdev_faulted; /* persistent faulted state */ uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ char *vdev_enc_sysfs_path; /* enclosure sysfs path */ char *vdev_fru; /* physical FRU location */ uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_unspare; /* unspare when resilvering done */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ boolean_t vdev_has_trim; /* TRIM is supported */ boolean_t vdev_has_securetrim; /* secure TRIM is supported */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ boolean_t vdev_splitting; /* split or repair in progress */ boolean_t vdev_delayed_close; /* delayed device close? */ boolean_t vdev_tmpoffline; /* device taken offline temporarily? */ boolean_t vdev_detached; /* device detached? 
*/ boolean_t vdev_cant_read; /* vdev is failing all reads */ boolean_t vdev_cant_write; /* vdev is failing all writes */ boolean_t vdev_isspare; /* was a hot spare */ boolean_t vdev_isl2cache; /* was a l2cache device */ boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */ boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ boolean_t vdev_attaching; /* vdev attach ashift handling */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ uint64_t vdev_leaf_zap; hrtime_t vdev_mmp_pending; /* 0 if write finished */ uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ uint64_t vdev_expansion_time; /* vdev's last expansion time */ list_node_t vdev_leaf_node; /* leaf vdev list */ /* * For DTrace to work in userland (libzpool) context, these fields must * remain at the end of the structure. DTrace will use the kernel's * CTF definition for 'struct vdev', and since the size of a kmutex_t is * larger in userland, the offsets for the rest of the fields would be * incorrect. */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ kmutex_t vdev_stat_lock; /* vdev_stat */ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ /* * We rate limit ZIO delay, deadman, and checksum events, since they * can flood ZED with tons of events when a drive is acting up. * * We also rate limit Direct I/O write verify errors, since a user might * be continually manipulating a buffer that can flood ZED with tons of * events. 
*/ zfs_ratelimit_t vdev_delay_rl; zfs_ratelimit_t vdev_deadman_rl; zfs_ratelimit_t vdev_dio_verify_rl; zfs_ratelimit_t vdev_checksum_rl; /* * Vdev properties for tuning ZED or zfsd */ uint64_t vdev_checksum_n; uint64_t vdev_checksum_t; uint64_t vdev_io_n; uint64_t vdev_io_t; uint64_t vdev_slow_io_n; uint64_t vdev_slow_io_t; }; #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_be) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) /* * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock * ring when MMP is enabled. */ #define MMP_BLOCKS_PER_LABEL 1 /* The largest uberblock we support is 8k. */ #define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ MAX_UBERBLOCK_SHIFT) #define VDEV_UBERBLOCK_COUNT(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_OFFSET(vd, n) \ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) typedef struct vdev_phys { char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; zio_eck_t vp_zbt; } vdev_phys_t; typedef enum vbe_vers { /* * The bootenv file is stored as ascii text in the envblock. * It is used by the GRUB bootloader used on Linux to store the * contents of the grubenv file. The file is stored as raw ASCII, * and is protected by an embedded checksum. By default, GRUB will * check if the boot filesystem supports storing the environment data * in a special location, and if so, will invoke filesystem specific * logic to retrieve it. This can be overridden by a variable, should * the user so desire. */ VB_RAW = 0, /* * The bootenv file is converted to an nvlist and then packed into the * envblock. 
*/ VB_NVLIST = 1 } vbe_vers_t; typedef struct vdev_boot_envblock { uint64_t vbe_version; char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) - sizeof (zio_eck_t)]; zio_eck_t vbe_zbt; } vdev_boot_envblock_t; _Static_assert(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE, "vdev_boot_envblock_t wrong size"); typedef struct vdev_label { char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ vdev_boot_envblock_t vl_be; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ /* * vdev_dirty() flags */ #define VDD_METASLAB 0x01 #define VDD_DTL 0x02 /* Offset of embedded boot loader region on each label */ #define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) /* * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. * On RAIDZ, this space is overwritten during RAIDZ expansion. */ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ /* * Size of label regions at the start and end of each leaf device. 
*/
#define	VDEV_LABEL_START_SIZE	(2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
#define	VDEV_LABEL_END_SIZE	(2 * sizeof (vdev_label_t))
#define	VDEV_LABELS		4
#define	VDEV_BEST_LABEL		VDEV_LABELS

/*
 * True when byte offset 'off' on vdev 'vd' lies inside a label region:
 * either below VDEV_LABEL_START_SIZE, or within the last
 * VDEV_LABEL_END_SIZE bytes of the device's physical size (vdev_psize).
 */
#define	VDEV_OFFSET_IS_LABEL(vd, off)                           \
	(((off) < VDEV_LABEL_START_SIZE) ||                     \
	((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE)))

/*
 * NOTE(review): presumably the 'alloctype' values accepted by vdev_alloc()
 * below -- confirm against the callers.
 */
#define	VDEV_ALLOC_LOAD		0
#define	VDEV_ALLOC_ADD		1
#define	VDEV_ALLOC_SPARE	2
#define	VDEV_ALLOC_L2CACHE	3
#define	VDEV_ALLOC_ROOTPOOL	4
#define	VDEV_ALLOC_SPLIT	5
#define	VDEV_ALLOC_ATTACH	6

/*
 * Allocate or free a vdev
 */
extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
    vdev_ops_t *ops);
extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
    vdev_t *parent, uint_t id, int alloctype);
extern void vdev_free(vdev_t *vd);

/*
 * Add or remove children and parents
 */
extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
extern void vdev_compact_children(vdev_t *pvd);
extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
extern void vdev_remove_parent(vdev_t *cvd);

/*
 * vdev load and sync
 */
extern boolean_t vdev_log_state_valid(vdev_t *vd);
extern int vdev_load(vdev_t *vd);
extern int vdev_dtl_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);

/*
 * Available vdev types.
*/ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; extern vdev_ops_t vdev_draid_ops; extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ -extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs, range_seg64_t *remain_rs); +extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_alloc(vdev_t *vd); extern uint64_t vdev_get_nparity(vdev_t *vd); extern uint64_t vdev_get_ndisks(vdev_t *vd); /* * Global variables */ extern int zfs_vdev_standard_sm_blksz; /* * Functions from vdev_indirect.c */ extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); extern boolean_t vdev_indirect_should_condense(vdev_t *vd); extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); extern int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj); extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); /* * Other miscellaneous functions */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); #if defined(__linux__) int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp); #endif int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS); /* * Vdev ashift optimization tunables */ extern uint_t zfs_vdev_min_auto_ashift; extern uint_t 
zfs_vdev_max_auto_ashift; int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); /* * VDEV checksum verification for Direct I/O writes */ extern uint_t zfs_vdev_direct_write_verify; #ifdef __cplusplus } #endif #endif /* _SYS_VDEV_IMPL_H */ diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index bc5c3cb9a670..5977f8c82b45 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1,5352 +1,5352 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2016 Gary Mills * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright 2019 Joyent, Inc. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * Grand theory statement on scan queue sorting * * Scanning is implemented by recursively traversing all indirection levels * in an object and reading all blocks referenced from said objects. This * results in us approximately traversing the object from lowest logical * offset to the highest. For best performance, we would want the logical * blocks to be physically contiguous. However, this is frequently not the * case with pools given the allocation patterns of copy-on-write filesystems. * So instead, we put the I/Os into a reordering queue and issue them in a * way that will most benefit physical disks (LBA-order). * * Queue management: * * Ideally, we would want to scan all metadata and queue up all block I/O * prior to starting to issue it, because that allows us to do an optimal * sorting job. This can however consume large amounts of memory. Therefore * we continuously monitor the size of the queues and constrain them to 5% * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this * limit, we clear out a few of the largest extents at the head of the queues * to make room for more scanning. Hopefully, these extents will be fairly * large and contiguous, allowing us to approach sequential I/O throughput * even without a fully sorted tree. * * Metadata scanning takes place in dsl_scan_visit(), which is called from * dsl_scan_sync() every spa_sync(). If we have either fully scanned all * metadata on the pool, or we need to make room in memory because our * queues are too large, dsl_scan_visit() is postponed and * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies * that metadata scanning and queued I/O issuing are mutually exclusive. 
This * allows us to provide maximum sequential I/O throughput for the majority of * I/O's issued since sequential I/O performance is significantly negatively * impacted if it is interleaved with random I/O. * * Implementation Notes * * One side effect of the queued scanning algorithm is that the scanning code * needs to be notified whenever a block is freed. This is needed to allow * the scanning code to remove these I/Os from the issuing queue. Additionally, * we do not attempt to queue gang blocks to be issued sequentially since this * is very hard to do and would have an extremely limited performance benefit. * Instead, we simply issue gang I/Os as soon as we find them using the legacy * algorithm. * * Backwards compatibility * * This new algorithm is backwards compatible with the legacy on-disk data * structures (and therefore does not require a new feature flag). * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan * will stop scanning metadata (in logical order) and wait for all outstanding * sorted I/O to complete. Once this is done, we write out a checkpoint * bookmark, indicating that we have scanned everything logically before it. * If the pool is imported on a machine without the new sorting algorithm, * the scan simply resumes from the last checkpoint using the legacy algorithm. 
*/ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; static int scan_ds_queue_compare(const void *a, const void *b); static int scan_prefetch_queue_compare(const void *a, const void *b); static void scan_ds_queue_clear(dsl_scan_t *scn); static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn); static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg); static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); static uint64_t dsl_scan_count_data_disks(spa_t *spa); static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb); extern uint_t zfs_vdev_async_write_active_min_dirty_percent; static int zfs_scan_blkstats = 0; /* * 'zpool status' uses bytes processed per pass to report throughput and * estimate time remaining. We define a pass to start when the scanning * phase completes for a sequential resilver. Optionally, this value * may be used to reset the pass statistics every N txgs to provide an * estimated completion time based on currently observed performance. */ static uint_t zfs_scan_report_txgs = 0; /* * By default zfs will check to ensure it is not over the hard memory * limit before each txg. If finer-grained control of this is needed * this value can be set to 1 to enable checking before scanning each * block. */ static int zfs_scan_strict_mem_lim = B_FALSE; /* * Maximum number of parallelly executed bytes per leaf vdev. We attempt * to strike a balance here between keeping the vdev queues full of I/Os * at all times and not overflowing the queues to cause long latency, * which would cause long txg sync times. No matter what, we will not * overload the drives with I/O, since that is protected by * zfs_vdev_scrub_max_active. 
*/ static uint64_t zfs_scan_vdev_limit = 16 << 20; static uint_t zfs_scan_issue_strategy = 0; /* don't queue & sort zios, go direct */ static int zfs_scan_legacy = B_FALSE; static uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ /* * fill_weight is non-tunable at runtime, so we copy it at module init from * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would * break queue sorting. */ static uint_t zfs_scan_fill_weight = 3; static uint64_t fill_weight; /* See dsl_scan_should_clear() for details on the memory limit tunables */ static const uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ static const uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ /* fraction of physmem */ static uint_t zfs_scan_mem_lim_fact = 20; /* fraction of mem lim above */ static uint_t zfs_scan_mem_lim_soft_fact = 20; /* minimum milliseconds to scrub per txg */ static uint_t zfs_scrub_min_time_ms = 1000; /* minimum milliseconds to obsolete per txg */ static uint_t zfs_obsolete_min_time_ms = 500; /* minimum milliseconds to free per txg */ static uint_t zfs_free_min_time_ms = 1000; /* minimum milliseconds to resilver per txg */ static uint_t zfs_resilver_min_time_ms = 3000; static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */ int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */ static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ static uint64_t zfs_async_block_max_blocks = UINT64_MAX; /* max number of dedup blocks to free in a single TXG */ static uint64_t zfs_max_async_dedup_frees = 100000; /* set to disable resilver deferring */ static int zfs_resilver_disable_defer = B_FALSE; /* Don't defer a resilver if the one in progress only got this far: */ static uint_t zfs_resilver_defer_percent = 10; /* 
* We wait a few txgs after importing a pool to begin scanning so that * the import / mounting code isn't held up by scrub / resilver IO. * Unfortunately, it is a bit difficult to determine exactly how long * this will take since userspace will trigger fs mounts asynchronously * and the kernel will create zvol minors asynchronously. As a result, * the value provided here is a bit arbitrary, but represents a * reasonable estimate of how many txgs it will take to finish fully * importing a pool */ #define SCAN_IMPORT_WAIT_TXGS 5 #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) #define DSL_SCAN_IS_SCRUB(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB) /* * Enable/disable the processing of the free_bpobj object. */ static int zfs_free_bpobj_enabled = 1; /* Error blocks to be scrubbed in one txg. */ static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; /* In core node for the scn->scn_queue. Represents a dataset to be scanned */ typedef struct { uint64_t sds_dsobj; uint64_t sds_txg; avl_node_t sds_node; } scan_ds_t; /* * This controls what conditions are placed on dsl_scan_sync_state(): * SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0 * SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0. * SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise * write out the scn_phys_cached version. * See dsl_scan_sync_state for details. */ typedef enum { SYNC_OPTIONAL, SYNC_MANDATORY, SYNC_CACHED } state_sync_type_t; /* * This struct represents the minimum information needed to reconstruct a * zio for sequential scanning. 
This is useful because many of these will * accumulate in the sequential IO queues before being issued, so saving * memory matters here. */ typedef struct scan_io { /* fields from blkptr_t */ uint64_t sio_blk_prop; uint64_t sio_phys_birth; uint64_t sio_birth; zio_cksum_t sio_cksum; uint32_t sio_nr_dvas; /* fields from zio_t */ uint32_t sio_flags; zbookmark_phys_t sio_zb; /* members for queue sorting */ union { avl_node_t sio_addr_node; /* link into issuing queue */ list_node_t sio_list_node; /* link for issuing to disk */ } sio_nodes; /* * There may be up to SPA_DVAS_PER_BP DVAs here from the bp, * depending on how many were in the original bp. Only the * first DVA is really used for sorting and issuing purposes. * The other DVAs (if provided) simply exist so that the zio * layer can find additional copies to repair from in the * event of an error. This array must go at the end of the * struct to allow this for the variable number of elements. */ dva_t sio_dva[]; } scan_io_t; #define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x) #define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x) #define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0]) #define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0]) #define SIO_GET_END_OFFSET(sio) \ (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio)) #define SIO_GET_MUSED(sio) \ (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t))) struct dsl_scan_io_queue { dsl_scan_t *q_scn; /* associated dsl_scan_t */ vdev_t *q_vd; /* top-level vdev that this queue represents */ zio_t *q_zio; /* scn_zio_root child for waiting on IO */ /* trees used for sorting I/Os and extents of I/Os */ zfs_range_tree_t *q_exts_by_addr; zfs_btree_t q_exts_by_size; avl_tree_t q_sios_by_addr; uint64_t q_sio_memused; uint64_t q_last_ext_addr; /* members for zio rate limiting */ uint64_t q_maxinflight_bytes; uint64_t q_inflight_bytes; kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ /* per txg statistics */ uint64_t 
q_total_seg_size_this_txg; uint64_t q_segs_this_txg; uint64_t q_total_zio_size_this_txg; uint64_t q_zios_this_txg; }; /* private data for dsl_scan_prefetch_cb() */ typedef struct scan_prefetch_ctx { zfs_refcount_t spc_refcnt; /* refcount for memory management */ dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ boolean_t spc_root; /* is this prefetch for an objset? */ uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */ } scan_prefetch_ctx_t; /* private data for dsl_scan_prefetch() */ typedef struct scan_prefetch_issue_ctx { avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ blkptr_t spic_bp; /* bp to prefetch */ zbookmark_phys_t spic_zb; /* bookmark to prefetch */ } scan_prefetch_issue_ctx_t; static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio); static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); static void scan_io_queues_destroy(dsl_scan_t *scn); static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP]; /* sio->sio_nr_dvas must be set so we know which cache to free from */ static void sio_free(scan_io_t *sio) { ASSERT3U(sio->sio_nr_dvas, >, 0); ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio); } /* It is up to the caller to set sio->sio_nr_dvas for freeing */ static scan_io_t * sio_alloc(unsigned short nr_dvas) { ASSERT3U(nr_dvas, >, 0); ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP); return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP)); } void scan_init(void) { /* * This is used in ext_size_compare() to weight segments * based on how sparse they are. 
This cannot be changed * mid-scan and the tree comparison functions don't currently * have a mechanism for passing additional context to the * compare functions. Thus we store this value globally and * we only allow it to be set at module initialization time */ fill_weight = zfs_scan_fill_weight; for (int i = 0; i < SPA_DVAS_PER_BP; i++) { char name[36]; (void) snprintf(name, sizeof (name), "sio_cache_%d", i); sio_cache[i] = kmem_cache_create(name, (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))), 0, NULL, NULL, NULL, NULL, NULL, 0); } } void scan_fini(void) { for (int i = 0; i < SPA_DVAS_PER_BP; i++) { kmem_cache_destroy(sio_cache[i]); } } static inline boolean_t dsl_scan_is_running(const dsl_scan_t *scn) { return (scn->scn_phys.scn_state == DSS_SCANNING); } boolean_t dsl_scan_resilvering(dsl_pool_t *dp) { return (dsl_scan_is_running(dp->dp_scan) && dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); } static inline void sio2bp(const scan_io_t *sio, blkptr_t *bp) { memset(bp, 0, sizeof (*bp)); bp->blk_prop = sio->sio_blk_prop; BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth); BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth); bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; ASSERT3U(sio->sio_nr_dvas, >, 0); ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); memcpy(bp->blk_dva, sio->sio_dva, sio->sio_nr_dvas * sizeof (dva_t)); } static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { sio->sio_blk_prop = bp->blk_prop; sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp); sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp); sio->sio_cksum = bp->blk_cksum; sio->sio_nr_dvas = BP_GET_NDVAS(bp); /* * Copy the DVAs to the sio. We need all copies of the block so * that the self healing code can use the alternate copies if the * first is corrupted. We want the DVA at index dva_i to be first * in the sio since this is the primary one that we want to issue. 
*/ for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) { sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas]; } } int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { int err; dsl_scan_t *scn; spa_t *spa = dp->dp_spa; uint64_t f; scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); scn->scn_dp = dp; /* * It's possible that we're resuming a scan after a reboot so * make sure that the scan_async_destroying flag is initialized * appropriately. */ ASSERT(!scn->scn_async_destroying); scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY); /* * Calculate the max number of in-flight bytes for pool-wide * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). * Limits for the issuing phase are done per top-level vdev and * are handled separately. */ scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, sizeof (scan_prefetch_issue_ctx_t), offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { /* * There was an old-style scrub in progress. Restart a * new-style scrub from the beginning. */ scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress for %s; " "restarting new-style scrub in txg %llu", spa->spa_name, (longlong_t)scn->scn_restart_txg); /* * Load the queue obj from the old location so that it * can be freed by dsl_scan_done(). 
*/ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_queue", sizeof (uint64_t), 1, &scn->scn_phys.scn_queue_obj); } else { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys); if (err != 0 && err != ENOENT) return (err); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); /* * Detect if the pool contains the signature of #2094. If it * does properly update the scn->scn_phys structure and notify * the administrator by setting an errata for the pool. */ if (err == EOVERFLOW) { uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1]; VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24); VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==, (23 * sizeof (uint64_t))); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp); if (err == 0) { uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS]; if (overflow & ~DSL_SCAN_FLAGS_MASK || scn->scn_async_destroying) { spa->spa_errata = ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY; return (EOVERFLOW); } memcpy(&scn->scn_phys, zaptmp, SCAN_PHYS_NUMINTS * sizeof (uint64_t)); scn->scn_phys.scn_flags = overflow; /* Required scrub already in progress. */ if (scn->scn_phys.scn_state == DSS_FINISHED || scn->scn_phys.scn_state == DSS_CANCELED) spa->spa_errata = ZPOOL_ERRATA_ZOL_2094_SCRUB; } } if (err == ENOENT) return (0); else if (err) return (err); /* * We might be restarting after a reboot, so jump the issued * counter to how far we've scanned. We know we're consistent * up to here. */ scn->scn_issued_before_pass = scn->scn_phys.scn_examined - scn->scn_phys.scn_skipped; if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old * pool, and the pool was accessed by old * software. 
Restart from the beginning, since * the old software may have changed the pool in * the meantime. */ scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub for %s was modified " "by old software; restarting in txg %llu", spa->spa_name, (longlong_t)scn->scn_restart_txg); } else if (dsl_scan_resilvering(dp)) { /* * If a resilver is in progress and there are already * errors, restart it instead of finishing this scan and * then restarting it. If there haven't been any errors * then remember that the incore DTL is valid. */ if (scn->scn_phys.scn_errors > 0) { scn->scn_restart_txg = txg; zfs_dbgmsg("resilver can't excise DTL_MISSING " "when finished; restarting on %s in txg " "%llu", spa->spa_name, (u_longlong_t)scn->scn_restart_txg); } else { /* it's safe to excise DTL when finished */ spa->spa_scrub_started = B_TRUE; } } } memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); /* reload the queue into the in-core state */ if (scn->scn_phys.scn_queue_obj != 0) { zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, dp->dp_meta_objset, scn->scn_phys.scn_queue_obj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { scan_ds_queue_insert(scn, zfs_strtonum(za->za_name, NULL), za->za_first_integer); } zap_cursor_fini(&zc); zap_attribute_free(za); } ddt_walk_init(spa, scn->scn_phys.scn_max_txg); spa_scan_stat_init(spa); vdev_scan_stat_init(spa->spa_root_vdev); return (0); } void dsl_scan_fini(dsl_pool_t *dp) { if (dp->dp_scan != NULL) { dsl_scan_t *scn = dp->dp_scan; if (scn->scn_taskq != NULL) taskq_destroy(scn->scn_taskq); scan_ds_queue_clear(scn); avl_destroy(&scn->scn_queue); mutex_destroy(&scn->scn_queue_lock); scan_ds_prefetch_queue_clear(scn); avl_destroy(&scn->scn_prefetch_queue); kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); dp->dp_scan = NULL; } } static boolean_t dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) { return (scn->scn_restart_txg != 0 && scn->scn_restart_txg <= tx->tx_txg); } 
boolean_t dsl_scan_resilver_scheduled(dsl_pool_t *dp) { return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) || (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER)); } boolean_t dsl_scan_scrubbing(const dsl_pool_t *dp) { dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; return (scn_phys->scn_state == DSS_SCANNING && scn_phys->scn_func == POOL_SCAN_SCRUB); } boolean_t dsl_errorscrubbing(const dsl_pool_t *dp) { dsl_errorscrub_phys_t *errorscrub_phys = &dp->dp_scan->errorscrub_phys; return (errorscrub_phys->dep_state == DSS_ERRORSCRUBBING && errorscrub_phys->dep_func == POOL_SCAN_ERRORSCRUB); } boolean_t dsl_errorscrub_is_paused(const dsl_scan_t *scn) { return (dsl_errorscrubbing(scn->scn_dp) && scn->errorscrub_phys.dep_paused_flags); } boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn) { return (dsl_scan_scrubbing(scn->scn_dp) && scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); } static void dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) { scn->errorscrub_phys.dep_cursor = zap_cursor_serialize(&scn->errorscrub_cursor); VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys, tx)); } static void dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; ASSERT(!dsl_scan_is_running(scn)); ASSERT(!dsl_errorscrubbing(scn->scn_dp)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); scn->errorscrub_phys.dep_func = *funcp; scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING; scn->errorscrub_phys.dep_start_time = gethrestime_sec(); scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa); scn->errorscrub_phys.dep_examined = 0; scn->errorscrub_phys.dep_errors = 0; scn->errorscrub_phys.dep_cursor = 0; 
zap_cursor_init_serialized(&scn->errorscrub_cursor, spa->spa_meta_objset, spa->spa_errlog_last, scn->errorscrub_phys.dep_cursor); vdev_config_dirty(spa->spa_root_vdev); spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START); dsl_errorscrub_sync_state(scn, tx); spa_history_log_internal(spa, "error scrub setup", tx, "func=%u mintxg=%u maxtxg=%llu", *funcp, 0, (u_longlong_t)tx->tx_txg); } static int dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (dsl_scan_is_running(scn) || (dsl_errorscrubbing(scn->scn_dp))) { return (SET_ERROR(EBUSY)); } if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0) { return (ECANCELED); } return (0); } /* * Writes out a persistent dsl_scan_phys_t record to the pool directory. * Because we can be running in the block sorting algorithm, we do not always * want to write out the record, only when it is "safe" to do so. This safety * condition is achieved by making sure that the sorting queues are empty * (scn_queues_pending == 0). When this condition is not true, the sync'd state * is inconsistent with how much actual scanning progress has been made. The * kind of sync to be performed is specified by the sync_type argument. If the * sync is optional, we only sync if the queues are empty. If the sync is * mandatory, we do a hard ASSERT to make sure that the queues are empty. The * third possible state is a "cached" sync. This is done in response to: * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been * destroyed, so we wouldn't be able to restart scanning from it. * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been * superseded by a newer snapshot. * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been * swapped with its clone. * In all cases, a cached sync simply rewrites the last record we've written, * just slightly modified. 
For the modifications that are performed to the * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. */ static void dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) { int i; spa_t *spa = scn->scn_dp->dp_spa; ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0); if (scn->scn_queues_pending == 0) { for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; if (q == NULL) continue; mutex_enter(&vd->vdev_scan_io_queue_lock); ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==, NULL); ASSERT3P(zfs_range_tree_first(q->q_exts_by_addr), ==, NULL); mutex_exit(&vd->vdev_scan_io_queue_lock); } if (scn->scn_phys.scn_queue_obj != 0) scan_ds_queue_sync(scn, tx); VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); if (scn->scn_checkpointing) zfs_dbgmsg("finish scan checkpoint for %s", spa->spa_name); scn->scn_checkpointing = B_FALSE; scn->scn_last_checkpoint = ddi_get_lbolt(); } else if (sync_type == SYNC_CACHED) { VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys_cached, tx)); } } int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) || dsl_errorscrubbing(scn->scn_dp)) return (SET_ERROR(EBUSY)); return (0); } void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { setup_sync_arg_t *setup_sync_arg = (setup_sync_arg_t *)arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dmu_object_type_t ot = 0; dsl_pool_t *dp = 
scn->scn_dp; spa_t *spa = dp->dp_spa; ASSERT(!dsl_scan_is_running(scn)); ASSERT3U(setup_sync_arg->func, >, POOL_SCAN_NONE); ASSERT3U(setup_sync_arg->func, <, POOL_SCAN_FUNCS); memset(&scn->scn_phys, 0, sizeof (scn->scn_phys)); /* * If we are starting a fresh scrub, we erase the error scrub * information from disk. */ memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); dsl_errorscrub_sync_state(scn, tx); scn->scn_phys.scn_func = setup_sync_arg->func; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = setup_sync_arg->txgstart; if (setup_sync_arg->txgend == 0) { scn->scn_phys.scn_max_txg = tx->tx_txg; } else { scn->scn_phys.scn_max_txg = setup_sync_arg->txgend; } scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; scn->scn_issued_before_pass = 0; scn->scn_restart_txg = 0; scn->scn_done_txg = 0; scn->scn_last_checkpoint = 0; scn->scn_checkpointing = B_FALSE; spa_scan_stat_init(spa); vdev_scan_stat_init(spa->spa_root_vdev); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; /* rewrite all disk labels */ vdev_config_dirty(spa->spa_root_vdev); if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { nvlist_t *aux = fnvlist_alloc(); fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "healing"); spa_event_notify(spa, NULL, aux, ESC_ZFS_RESILVER_START); nvlist_free(aux); } else { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); } spa->spa_scrub_started = B_TRUE; /* * If this is an incremental scrub, limit the DDT scrub phase * to just the auto-ditto class (for correctness); the rest * of the scrub should go faster using top-down pruning. 
*/ if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; /* * When starting a resilver clear any existing rebuild state. * This is required to prevent stale rebuild status from * being reported when a rebuild is run, then a resilver and * finally a scrub. In which case only the scrub status * should be reported by 'zpool status'. */ if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; vdev_rebuild_clear_sync( (void *)(uintptr_t)vd->vdev_id, tx); } } } /* back to the generic stuff */ if (zfs_scan_blkstats) { if (dp->dp_blkstats == NULL) { dp->dp_blkstats = vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); } memset(&dp->dp_blkstats->zab_type, 0, sizeof (dp->dp_blkstats->zab_type)); } else { if (dp->dp_blkstats) { vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); dp->dp_blkstats = NULL; } } if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); ddt_walk_init(spa, scn->scn_phys.scn_max_txg); dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", setup_sync_arg->func, (u_longlong_t)scn->scn_phys.scn_min_txg, (u_longlong_t)scn->scn_phys.scn_max_txg); } /* * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub, * error scrub or resilver. Can also be called to resume a paused scrub or * error scrub. */ int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func, uint64_t txgstart, uint64_t txgend) { spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; setup_sync_arg_t setup_sync_arg; if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) { return (EINVAL); } /* * Purge all vdev caches and probe all devices. 
We do this here * rather than in sync context because this requires a writer lock * on the spa_config lock, which we can't do from sync context. The * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ spa_vdev_state_enter(spa, SCL_NONE); spa->spa_scrub_reopen = B_TRUE; vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); if (func == POOL_SCAN_RESILVER) { dsl_scan_restart_resilver(spa->spa_dsl_pool, 0); return (0); } if (func == POOL_SCAN_ERRORSCRUB) { if (dsl_errorscrub_is_paused(dp->dp_scan)) { /* * got error scrub start cmd, resume paused error scrub. */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, POOL_SCRUB_NORMAL); if (err == 0) { spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_RESUME); return (ECANCELED); } return (SET_ERROR(err)); } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync, &func, 0, ZFS_SPACE_CHECK_RESERVED)); } if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { /* got scrub start cmd, resume paused scrub */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, POOL_SCRUB_NORMAL); if (err == 0) { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); return (SET_ERROR(ECANCELED)); } return (SET_ERROR(err)); } setup_sync_arg.func = func; setup_sync_arg.txgstart = txgstart; setup_sync_arg.txgend = txgend; return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, dsl_scan_setup_sync, &setup_sync_arg, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static void dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; if (complete) { spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH); spa_history_log_internal(spa, "error scrub done", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); } else { spa_history_log_internal(spa, "error scrub canceled", tx, "errors=%llu", 
(u_longlong_t)spa_approx_errlog_size(spa)); } scn->errorscrub_phys.dep_state = complete ? DSS_FINISHED : DSS_CANCELED; spa->spa_scrub_active = B_FALSE; spa_errlog_rotate(spa); scn->errorscrub_phys.dep_end_time = gethrestime_sec(); zap_cursor_fini(&scn->errorscrub_cursor); if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) spa->spa_errata = 0; ASSERT(!dsl_errorscrubbing(scn->scn_dp)); } static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { static const char *old_names[] = { "scrub_bookmark", "scrub_ddt_bookmark", "scrub_ddt_class_max", "scrub_queue", "scrub_min_txg", "scrub_max_txg", "scrub_func", "scrub_errors", NULL }; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; int i; /* Remove any remnants of an old-style scrub. */ for (i = 0; old_names[i]; i++) { (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); } if (scn->scn_phys.scn_queue_obj != 0) { VERIFY0(dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = 0; } scan_ds_queue_clear(scn); scan_ds_prefetch_queue_clear(scn); scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; /* * If we were "restarted" from a stopped state, don't bother * with anything else. */ if (!dsl_scan_is_running(scn)) { ASSERT(!scn->scn_is_sorted); return; } if (scn->scn_is_sorted) { scan_io_queues_destroy(scn); scn->scn_is_sorted = B_FALSE; if (scn->scn_taskq != NULL) { taskq_destroy(scn->scn_taskq); scn->scn_taskq = NULL; } } scn->scn_phys.scn_state = complete ? 
DSS_FINISHED : DSS_CANCELED; spa_notify_waiters(spa); if (dsl_scan_restarting(scn, tx)) { spa_history_log_internal(spa, "scan aborted, restarting", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); } else if (!complete) { spa_history_log_internal(spa, "scan cancelled", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); } else { spa_history_log_internal(spa, "scan done", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB(scn)) { VERIFY0(zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LAST_SCRUBBED_TXG, sizeof (uint64_t), 1, &scn->scn_phys.scn_max_txg, tx)); spa->spa_scrubbed_last_txg = scn->scn_phys.scn_max_txg; } } if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { spa->spa_scrub_active = B_FALSE; /* * If the scrub/resilver completed, update all DTLs to * reflect this. Whether it succeeded or not, vacate * all temporary scrub DTLs. * * As the scrub does not currently support traversing * data that have been freed but are part of a checkpoint, * we don't mark the scrub as done in the DTLs as faults * may still exist in those vdevs. */ if (complete && !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE); if (scn->scn_phys.scn_min_txg) { nvlist_t *aux = fnvlist_alloc(); fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "healing"); spa_event_notify(spa, NULL, aux, ESC_ZFS_RESILVER_FINISH); nvlist_free(aux); } else { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_FINISH); } } else { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 0, B_TRUE, B_FALSE); } spa_errlog_rotate(spa); /* * Don't clear flag until after vdev_dtl_reassess to ensure that * DTL_MISSING will get updated when possible. */ spa->spa_scrub_started = B_FALSE; /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. 
*/ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); /* * Clear any resilver_deferred flags in the config. * If there are drives that need resilvering, kick * off an asynchronous request to start resilver. * vdev_clear_resilver_deferred() may update the config * before the resilver can restart. In the event of * a crash during this period, the spa loading code * will find the drives that need to be resilvered * and start the resilver then. */ if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) && vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) { spa_history_log_internal(spa, "starting deferred resilver", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); spa_async_request(spa, SPA_ASYNC_RESILVER); } /* Clear recent error events (i.e. duplicate events tracking) */ if (complete) zfs_ereport_clear(spa, NULL); } scn->scn_phys.scn_end_time = gethrestime_sec(); if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) spa->spa_errata = 0; ASSERT(!dsl_scan_is_running(scn)); } static int dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* * can't pause a error scrub when there is no in-progress * error scrub. 
*/ if (!dsl_errorscrubbing(dp)) return (SET_ERROR(ENOENT)); /* can't pause a paused error scrub */ if (dsl_errorscrub_is_paused(scn)) return (SET_ERROR(EBUSY)); } else if (*cmd != POOL_SCRUB_NORMAL) { return (SET_ERROR(ENOTSUP)); } return (0); } static void dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { spa->spa_scan_pass_errorscrub_pause = gethrestime_sec(); scn->errorscrub_phys.dep_paused_flags = B_TRUE; dsl_errorscrub_sync_state(scn, tx); spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); if (dsl_errorscrub_is_paused(scn)) { /* * We need to keep track of how much time we spend * paused per pass so that we can adjust the error scrub * rate shown in the output of 'zpool status'. */ spa->spa_scan_pass_errorscrub_spent_paused += gethrestime_sec() - spa->spa_scan_pass_errorscrub_pause; spa->spa_scan_pass_errorscrub_pause = 0; scn->errorscrub_phys.dep_paused_flags = B_FALSE; zap_cursor_init_serialized( &scn->errorscrub_cursor, spa->spa_meta_objset, spa->spa_errlog_last, scn->errorscrub_phys.dep_cursor); dsl_errorscrub_sync_state(scn, tx); } } } static int dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; /* can't cancel a error scrub when there is no one in-progress */ if (!dsl_errorscrubbing(scn->scn_dp)) return (SET_ERROR(ENOENT)); return (0); } static void dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_errorscrub_done(scn, B_FALSE, tx); dsl_errorscrub_sync_state(scn, tx); spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_ABORT); } static int dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (!dsl_scan_is_running(scn)) return (SET_ERROR(ENOENT)); 
return (0); } static void dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); } int dsl_scan_cancel(dsl_pool_t *dp) { if (dsl_errorscrubbing(dp)) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } static int dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* can't pause a scrub when there is no in-progress scrub */ if (!dsl_scan_scrubbing(dp)) return (SET_ERROR(ENOENT)); /* can't pause a paused scrub */ if (dsl_scan_is_paused_scrub(scn)) return (SET_ERROR(EBUSY)); } else if (*cmd != POOL_SCRUB_NORMAL) { return (SET_ERROR(ENOTSUP)); } return (0); } static void dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* can't pause a scrub when there is no in-progress scrub */ spa->spa_scan_pass_scrub_pause = gethrestime_sec(); scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED; dsl_scan_sync_state(scn, tx, SYNC_CACHED); spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); spa_notify_waiters(spa); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); if (dsl_scan_is_paused_scrub(scn)) { /* * We need to keep track of how much time we spend * paused per pass so that we can adjust the scrub rate * shown in the output of 'zpool status' */ spa->spa_scan_pass_scrub_spent_paused += gethrestime_sec() - spa->spa_scan_pass_scrub_pause; 
spa->spa_scan_pass_scrub_pause = 0; scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED; dsl_scan_sync_state(scn, tx, SYNC_CACHED); } } } /* * Set scrub pause/resume state if it makes sense to do so */ int dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) { if (dsl_errorscrubbing(dp)) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_errorscrub_pause_resume_check, dsl_errorscrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); } /* start a new scan, or restart an existing one. */ void dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) { if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); txg = dmu_tx_get_txg(tx); dp->dp_scan->scn_restart_txg = txg; dmu_tx_commit(tx); } else { dp->dp_scan->scn_restart_txg = txg; } zfs_dbgmsg("restarting resilver for %s at txg=%llu", dp->dp_spa->spa_name, (longlong_t)txg); } void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) { zio_free(dp->dp_spa, txg, bp); } void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { ASSERT(dsl_pool_sync_context(dp)); zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); } static int scan_ds_queue_compare(const void *a, const void *b) { const scan_ds_t *sds_a = a, *sds_b = b; if (sds_a->sds_dsobj < sds_b->sds_dsobj) return (-1); if (sds_a->sds_dsobj == sds_b->sds_dsobj) return (0); return (1); } static void scan_ds_queue_clear(dsl_scan_t *scn) { void *cookie = NULL; scan_ds_t *sds; while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { kmem_free(sds, sizeof (*sds)); } } static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) { scan_ds_t srch, *sds; srch.sds_dsobj = dsobj; sds = avl_find(&scn->scn_queue, &srch, NULL); if (sds != 
NULL && txg != NULL) *txg = sds->sds_txg; return (sds != NULL); } static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) { scan_ds_t *sds; avl_index_t where; sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); sds->sds_dsobj = dsobj; sds->sds_txg = txg; VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); avl_insert(&scn->scn_queue, sds, where); } static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) { scan_ds_t srch, *sds; srch.sds_dsobj = dsobj; sds = avl_find(&scn->scn_queue, &srch, NULL); VERIFY(sds != NULL); avl_remove(&scn->scn_queue, sds); kmem_free(sds, sizeof (*sds)); } static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; ASSERT0(scn->scn_queues_pending); ASSERT(scn->scn_phys.scn_queue_obj != 0); VERIFY0(dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, DMU_OT_NONE, 0, tx); for (scan_ds_t *sds = avl_first(&scn->scn_queue); sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { VERIFY0(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, sds->sds_dsobj, sds->sds_txg, tx)); } } /* * Computes the memory limit state that we're currently in. A sorted scan * needs quite a bit of memory to hold the sorting queue, so we need to * reasonably constrain the size so it doesn't impact overall system * performance. We compute two limits: * 1) Hard memory limit: if the amount of memory used by the sorting * queues on a pool gets above this value, we stop the metadata * scanning portion and start issuing the queued up and sorted * I/Os to reduce memory usage. * This limit is calculated as a fraction of physmem (by default 5%). * We constrain the lower bound of the hard limit to an absolute * minimum of zfs_scan_mem_lim_min (default: 16 MiB). 
We also constrain
 *    the upper bound to 5% of the space currently allocated in the pool
 *    (normal, special and dedup classes) - no chance we'll ever need
 *    that much memory, but just to keep the value in check.
 * 2) Soft memory limit: once we hit the hard memory limit, we start
 *    issuing I/O to reduce queue memory usage, but we don't want to
 *    completely empty out the queues, since we might be able to find I/Os
 *    that will fill in the gaps of our non-sequential IOs at some point
 *    in the future. So once the amount of memory used drops below the
 *    soft limit, we stop issuing I/O and start scanning metadata again.
 *
 *    This limit is calculated by subtracting a fraction of the hard
 *    limit from the hard limit. By default this fraction is 5%, so
 *    the soft limit is 95% of the hard limit. We cap the size of the
 *    difference between the hard and soft limits at an absolute
 *    maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
 *    sufficient to not cause too frequent switching between the
 *    metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
 *    worth of queues is about 1.2 GiB of on-pool data, so scanning
 *    that should take at least a decent fraction of a second).
*/ static boolean_t dsl_scan_should_clear(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; uint64_t alloc, mlim_hard, mlim_soft, mused; alloc = metaslab_class_get_alloc(spa_normal_class(spa)); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, zfs_scan_mem_lim_min); mlim_hard = MIN(mlim_hard, alloc / 20); mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, zfs_scan_mem_lim_soft_max); mused = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *tvd = rvd->vdev_child[i]; dsl_scan_io_queue_t *queue; mutex_enter(&tvd->vdev_scan_io_queue_lock); queue = tvd->vdev_scan_io_queue; if (queue != NULL) { /* * # of extents in exts_by_addr = # in exts_by_size. * B-tree efficiency is ~75%, but can be as low as 50%. */ - mused += zfs_btree_numnodes(&queue->q_exts_by_size) * - ((sizeof (range_seg_gap_t) + sizeof (uint64_t)) * + mused += zfs_btree_numnodes(&queue->q_exts_by_size) * (( + sizeof (zfs_range_seg_gap_t) + sizeof (uint64_t)) * 3 / 2) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); } dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); if (mused == 0) ASSERT0(scn->scn_queues_pending); /* * If we are above our hard limit, we need to clear out memory. * If we are below our soft limit, we need to accumulate sequential IOs. * Otherwise, we should keep doing whatever we are currently doing. 
*/ if (mused >= mlim_hard) return (B_TRUE); else if (mused < mlim_soft) return (B_FALSE); else return (scn->scn_clearing); } static boolean_t dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) { /* we never skip user/group accounting objects */ if (zb && (int64_t)zb->zb_object < 0) return (B_FALSE); if (scn->scn_suspending) return (B_TRUE); /* we're already suspending */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 and objset blocks. */ if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL)) return (B_FALSE); /* * We suspend if: * - we have scanned for at least the minimum time (default 1 sec * for scrub, 3 sec for resilver), and either we have sufficient * dirty data that we are starting to write more quickly * (default 30%), someone is explicitly waiting for this txg * to complete, or we have used up all of the time in the txg * timeout (default 5 sec). * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. * or * - the scan queue has reached its memory use limit */ uint64_t curr_time_ns = gethrtime(); uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 
zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; if ((NSEC2MSEC(scan_time_ns) > mintime && (scn->scn_dp->dp_dirty_total >= dirty_min_bytes || txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa) || (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) || !ddt_walk_ready(scn->scn_dp->dp_spa)) { if (zb && zb->zb_level == ZB_ROOT_LEVEL) { dprintf("suspending at first available bookmark " "%llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); SET_BOOKMARK(&scn->scn_phys.scn_bookmark, zb->zb_objset, 0, 0, 0); } else if (zb != NULL) { dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); scn->scn_phys.scn_bookmark = *zb; } else { #ifdef ZFS_DEBUG dsl_scan_phys_t *scnp = &scn->scn_phys; dprintf("suspending at at DDT bookmark " "%llx/%llx/%llx/%llx\n", (longlong_t)scnp->scn_ddt_bookmark.ddb_class, (longlong_t)scnp->scn_ddt_bookmark.ddb_type, (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); #endif } scn->scn_suspending = B_TRUE; return (B_TRUE); } return (B_FALSE); } static boolean_t dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) { /* * We suspend if: * - we have scrubbed for at least the minimum time (default 1 sec * for error scrub), someone is explicitly waiting for this txg * to complete, or we have used up all of the time in the txg * timeout (default 5 sec). * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. 
*/ uint64_t curr_time_ns = gethrtime(); uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; int mintime = zfs_scrub_min_time_ms; if ((NSEC2MSEC(error_scrub_time_ns) > mintime && (txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa)) { if (zb) { dprintf("error scrub suspending at bookmark " "%llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); } return (B_TRUE); } return (B_FALSE); } typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; } zil_scan_arg_t; static int dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { (void) zilog; zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; zbookmark_phys_t zb; ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* * One block ("stubby") can be allocated a long time ago; we * want to visit that one because it has been allocated * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). 
*/ if (claim_txg == 0 && BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); return (0); } static int dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, uint64_t claim_txg) { (void) zilog; if (lrc->lrc_txtype == TX_WRITE) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; const lr_write_t *lr = (const lr_write_t *)lrc; const blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* * birth can be < claim_txg if this record's txg is * already txg sync'ed (but this log block contains * other records that are not synced) */ if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) return (0); ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); } return (0); } static void dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zil_scan_arg_t zsa = { dp, zh }; zilog_t *zilog; ASSERT(spa_writeable(dp->dp_spa)); /* * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). */ if (claim_txg == 0) return; zilog = zil_alloc(dp->dp_meta_objset, zh); (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, claim_txg, B_FALSE); zil_free(zilog); } /* * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea * here is to sort the AVL tree by the order each block will be needed. 
*/ static int scan_prefetch_queue_compare(const void *a, const void *b) { const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; return (zbookmark_compare(spc_a->spc_datablkszsec, spc_a->spc_indblkshift, spc_b->spc_datablkszsec, spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); } static void scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, const void *tag) { if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) { zfs_refcount_destroy(&spc->spc_refcnt); kmem_free(spc, sizeof (scan_prefetch_ctx_t)); } } static scan_prefetch_ctx_t * scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, const void *tag) { scan_prefetch_ctx_t *spc; spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); zfs_refcount_create(&spc->spc_refcnt); zfs_refcount_add(&spc->spc_refcnt, tag); spc->spc_scn = scn; if (dnp != NULL) { spc->spc_datablkszsec = dnp->dn_datablkszsec; spc->spc_indblkshift = dnp->dn_indblkshift; spc->spc_root = B_FALSE; } else { spc->spc_datablkszsec = 0; spc->spc_indblkshift = 0; spc->spc_root = B_TRUE; } return (spc); } static void scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, const void *tag) { zfs_refcount_add(&spc->spc_refcnt, tag); } static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; void *cookie = NULL; scan_prefetch_issue_ctx_t *spic = NULL; mutex_enter(&spa->spa_scrub_lock); while ((spic = avl_destroy_nodes(&scn->scn_prefetch_queue, &cookie)) != NULL) { scan_prefetch_ctx_rele(spic->spic_spc, scn); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } mutex_exit(&spa->spa_scrub_lock); } static boolean_t dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, const zbookmark_phys_t *zb) { zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; dnode_phys_t tmp_dnp; dnode_phys_t *dnp = (spc->spc_root) ? 
NULL : &tmp_dnp; if (zb->zb_objset != last_zb->zb_objset) return (B_TRUE); if ((int64_t)zb->zb_object < 0) return (B_FALSE); tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; tmp_dnp.dn_indblkshift = spc->spc_indblkshift; if (zbookmark_subtree_completed(dnp, zb, last_zb)) return (B_TRUE); return (B_FALSE); } static void dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) { avl_index_t idx; dsl_scan_t *scn = spc->spc_scn; spa_t *spa = scn->scn_dp->dp_spa; scan_prefetch_issue_ctx_t *spic; if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp)) return; if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) return; if (dsl_scan_check_prefetch_resume(spc, zb)) return; scan_prefetch_ctx_add_ref(spc, scn); spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); spic->spic_spc = spc; spic->spic_bp = *bp; spic->spic_zb = *zb; /* * Add the IO to the queue of blocks to prefetch. This allows us to * prioritize blocks that we will need first for the main traversal * thread. 
*/ mutex_enter(&spa->spa_scrub_lock); if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { /* this block is already queued for prefetch */ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); scan_prefetch_ctx_rele(spc, scn); mutex_exit(&spa->spa_scrub_lock); return; } avl_insert(&scn->scn_prefetch_queue, spic, idx); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } static void dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int i; zbookmark_phys_t zb; scan_prefetch_ctx_t *spc; if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) return; SET_BOOKMARK(&zb, objset, object, 0, 0); spc = scan_prefetch_ctx_create(scn, dnp, FTAG); for (i = 0; i < dnp->dn_nblkptr; i++) { zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); zb.zb_blkid = i; dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zb.zb_level = 0; zb.zb_blkid = DMU_SPILL_BLKID; dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb); } scan_prefetch_ctx_rele(spc, FTAG); } static void dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *private) { (void) zio; scan_prefetch_ctx_t *spc = private; dsl_scan_t *scn = spc->spc_scn; spa_t *spa = scn->scn_dp->dp_spa; /* broadcast that the IO has completed for rate limiting purposes */ mutex_enter(&spa->spa_scrub_lock); ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); /* if there was an error or we are done prefetching, just cleanup */ if (buf == NULL || scn->scn_prefetch_stop) goto out; if (BP_GET_LEVEL(bp) > 0) { int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; zbookmark_phys_t czb; for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_prefetch(spc, 
cbp, &czb); } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { dnode_phys_t *cdnp; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; for (i = 0, cdnp = buf->b_data; i < epb; i += cdnp->dn_extra_slots + 1, cdnp += cdnp->dn_extra_slots + 1) { dsl_scan_prefetch_dnode(scn, cdnp, zb->zb_objset, zb->zb_blkid * epb + i); } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { objset_phys_t *osp = buf->b_data; dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if (OBJSET_BUF_HAS_USERUSED(buf)) { if (OBJSET_BUF_HAS_PROJECTUSED(buf)) { dsl_scan_prefetch_dnode(scn, &osp->os_projectused_dnode, zb->zb_objset, DMU_PROJECTUSED_OBJECT); } dsl_scan_prefetch_dnode(scn, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); dsl_scan_prefetch_dnode(scn, &osp->os_userused_dnode, zb->zb_objset, DMU_USERUSED_OBJECT); } } out: if (buf != NULL) arc_buf_destroy(buf, private); scan_prefetch_ctx_rele(spc, scn); } static void dsl_scan_prefetch_thread(void *arg) { dsl_scan_t *scn = arg; spa_t *spa = scn->scn_dp->dp_spa; scan_prefetch_issue_ctx_t *spic; /* loop until we are told to stop */ while (!scn->scn_prefetch_stop) { arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; mutex_enter(&spa->spa_scrub_lock); /* * Wait until we have an IO to issue and are not above our * maximum in flight limit. 
*/ while (!scn->scn_prefetch_stop && (avl_numnodes(&scn->scn_prefetch_queue) == 0 || spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } /* recheck if we should stop since we waited for the cv */ if (scn->scn_prefetch_stop) { mutex_exit(&spa->spa_scrub_lock); break; } /* remove the prefetch IO from the tree */ spic = avl_first(&scn->scn_prefetch_queue); spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); avl_remove(&scn->scn_prefetch_queue, spic); mutex_exit(&spa->spa_scrub_lock); if (BP_IS_PROTECTED(&spic->spic_bp)) { ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE || BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET); ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0); zio_flags |= ZIO_FLAG_RAW; } /* We don't need data L1 buffer since we do not prefetch L0. */ blkptr_t *bp = &spic->spic_bp; if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE && BP_GET_TYPE(bp) != DMU_OT_OBJSET) flags |= ARC_FLAG_NO_BUF; /* issue the prefetch asynchronously */ (void) arc_read(scn->scn_zio_root, spa, bp, dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } ASSERT(scn->scn_prefetch_stop); /* free any prefetches we didn't get to complete */ mutex_enter(&spa->spa_scrub_lock); while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { avl_remove(&scn->scn_prefetch_queue, spic); scan_prefetch_ctx_rele(spic->spic_spc, scn); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); mutex_exit(&spa->spa_scrub_lock); } static boolean_t dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { /* * We never skip over user/group accounting objects (obj<0) */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. 
*/ if (zbookmark_subtree_completed(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* * If we found the block we're trying to resume from, or * we went past it, zero it out to indicate that it's OK * to start checking for suspending again. */ if (zbookmark_subtree_tbd(dnp, zb, &scn->scn_phys.scn_bookmark)) { dprintf("resuming at %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); memset(&scn->scn_phys.scn_bookmark, 0, sizeof (*zb)); } } return (B_FALSE); } static void dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx); inline __attribute__((always_inline)) static void dsl_scan_visitdnode( dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); /* * Return nonzero on i/o error. * Return new buf to write out in *bufp. */ inline __attribute__((always_inline)) static int dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; ASSERT(!BP_IS_REDACTED(bp)); /* * There is an unlikely case of encountering dnodes with contradicting * dn_bonuslen and DNODE_FLAG_SPILL_BLKPTR flag before in files created * or modified before commit 4254acb was merged. As it is not possible * to know which of the two is correct, report an error. 
*/ if (dnp != NULL && dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) { scn->scn_phys.scn_errors++; spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EINVAL)); } if (BP_GET_LEVEL(bp) > 0) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_buf_t *buf; if (BP_IS_PROTECTED(bp)) { ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); zio_flags |= ZIO_FLAG_RAW; } err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cdnp = buf->b_data; i < epb; i += cdnp->dn_extra_slots + 1, cdnp += cdnp->dn_extra_slots + 1) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } osp = buf->b_data; dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); if (OBJSET_BUF_HAS_USERUSED(buf)) { /* * We also always visit user/group/project accounting * objects, and never skip them, even if we are * suspending. This is necessary so that the * space deltas from this txg get integrated. 
*/ if (OBJSET_BUF_HAS_PROJECTUSED(buf)) dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_projectused_dnode, DMU_PROJECTUSED_OBJECT, tx); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_groupused_dnode, DMU_GROUPUSED_OBJECT, tx); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); } else if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { /* * Sanity check the block pointer contents, this is handled * by arc_read() for the cases above. */ scn->scn_phys.scn_errors++; spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EINVAL)); } return (0); } inline __attribute__((always_inline)) static void dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx) { int j; for (j = 0; j < dnp->dn_nblkptr; j++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, dnp->dn_nlevels - 1, j); dsl_scan_visitbp(&dnp->dn_blkptr[j], &czb, dnp, ds, scn, ostype, tx); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 0, DMU_SPILL_BLKID); dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp), &czb, dnp, ds, scn, ostype, tx); } } /* * The arguments are in this order because mdb can only print the * first 5; we want them to be useful. */ static void dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; if (dsl_scan_check_suspend(scn, zb)) return; if (dsl_scan_check_resume(scn, dnp, zb)) return; scn->scn_visited_this_txg++; if (BP_IS_HOLE(bp)) { scn->scn_holes_this_txg++; return; } if (BP_IS_REDACTED(bp)) { ASSERT(dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)); return; } /* * Check if this block contradicts any filesystem flags. 
*/ spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS; if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) ASSERT(dsl_dataset_feature_is_active(ds, f)); f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); if (f != SPA_FEATURE_NONE) ASSERT(dsl_dataset_feature_is_active(ds, f)); f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); if (f != SPA_FEATURE_NONE) ASSERT(dsl_dataset_feature_is_active(ds, f)); if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; } if (dsl_scan_recurse(scn, ds, ostype, dnp, bp, zb, tx) != 0) return; /* * If dsl_scan_ddt() has already visited this block, it will have * already done any translations or scrubbing, so don't call the * callback again. */ if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { scn->scn_ddt_contained_this_txg++; return; } /* * If this block is from the future (after cur_max_txg), then we * are doing this on behalf of a deleted snapshot, and we will * revisit the future block on the next pass of this dataset. * Don't scan it now unless we need to because something * under it was modified. */ if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { scn->scn_gt_max_this_txg++; return; } scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); } static void dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { zbookmark_phys_t zb; scan_prefetch_ctx_t *spc; SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { SET_BOOKMARK(&scn->scn_prefetch_bookmark, zb.zb_objset, 0, 0, 0); } else { scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; } scn->scn_objsets_visited_this_txg++; spc = scan_prefetch_ctx_create(scn, NULL, FTAG); dsl_scan_prefetch(spc, bp, &zb); scan_prefetch_ctx_rele(spc, FTAG); dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); dprintf_ds(ds, "finished scan%s", ""); } static void ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) { if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) { if (ds->ds_is_snapshot) { /* * Note: * - scn_cur_{min,max}_txg stays the same. * - Setting the flag is not really necessary if * scn_cur_max_txg == scn_max_txg, because there * is nothing after this snapshot that we care * about. However, we set it anyway and then * ignore it when we retraverse it in * dsl_scan_visitds(). */ scn_phys->scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu on %s; currently " "traversing; reset zb_objset to %llu", (u_longlong_t)ds->ds_object, ds->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; } else { SET_BOOKMARK(&scn_phys->scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); zfs_dbgmsg("destroying ds %llu on %s; currently " "traversing; reset bookmark to -1,0,0,0", (u_longlong_t)ds->ds_object, ds->ds_dir->dd_pool->dp_spa->spa_name); } } } /* * Invoked when a dataset is destroyed. We need to make sure that: * * 1) If it is the dataset that was currently being scanned, we write * a new dsl_scan_phys_t and marking the objset reference in it * as destroyed. * 2) Remove it from the work queue, if it was present. * * If the dataset was actually a snapshot, instead of marking the dataset * as destroyed, we instead substitute the next snapshot in line. 
*/ void dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (!dsl_scan_is_running(scn)) return; ds_destroyed_scn_phys(ds, &scn->scn_phys); ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { scan_ds_queue_remove(scn, ds->ds_object); if (ds->ds_is_snapshot) scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); if (ds->ds_is_snapshot) { /* * We keep the same mintxg; it could be > * ds_creation_txg if the previous snapshot was * deleted too. */ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("destroying ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, dp->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); } else { zfs_dbgmsg("destroying ds %llu on %s; in queue; " "removing", (u_longlong_t)ds->ds_object, dp->dp_spa->spa_name); } } /* * dsl_scan_sync() should be called after this, and should sync * out our changed state, but just to be safe, do it here. */ dsl_scan_sync_state(scn, tx, SYNC_CACHED); } static void ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) { if (scn_bookmark->zb_objset == ds->ds_object) { scn_bookmark->zb_objset = dsl_dataset_phys(ds)->ds_prev_snap_obj; zfs_dbgmsg("snapshotting ds %llu on %s; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, ds->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } } /* * Called when a dataset is snapshotted. 
If we were currently traversing * this snapshot, we reset our bookmark to point at the newly created * snapshot. We also modify our work queue to remove the old snapshot and * replace with the new one. */ void dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (!dsl_scan_is_running(scn)) return; ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { scan_ds_queue_remove(scn, ds->ds_object); scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("snapshotting ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, dp->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } dsl_scan_sync_state(scn, tx, SYNC_CACHED); } static void ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, zbookmark_phys_t *scn_bookmark) { if (scn_bookmark->zb_objset == ds1->ds_object) { scn_bookmark->zb_objset = ds2->ds_object; zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, ds1->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)ds2->ds_object); } else if (scn_bookmark->zb_objset == ds2->ds_object) { scn_bookmark->zb_objset = ds1->ds_object; zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, ds2->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)ds1->ds_object); } } /* 
* Called when an origin dataset and its clone are swapped. If we were * currently traversing the dataset, we need to switch to traversing the * newly promoted clone. */ void dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) { dsl_pool_t *dp = ds1->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg1, mintxg2; boolean_t ds1_queued, ds2_queued; if (!dsl_scan_is_running(scn)) return; ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); /* * Handle the in-memory scan queue. */ ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1); ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2); /* Sanity checking. */ if (ds1_queued) { ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds2_queued) { ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds1_queued && ds2_queued) { /* * If both are queued, we don't need to do anything. * The swapping code below would not handle this case correctly, * since we can't insert ds2 if it is already there. That's * because scan_ds_queue_insert() prohibits a duplicate insert * and panics. */ } else if (ds1_queued) { scan_ds_queue_remove(scn, ds1->ds_object); scan_ds_queue_insert(scn, ds2->ds_object, mintxg1); } else if (ds2_queued) { scan_ds_queue_remove(scn, ds2->ds_object); scan_ds_queue_insert(scn, ds1->ds_object, mintxg2); } /* * Handle the on-disk scan queue. * The on-disk state is an out-of-date version of the in-memory state, * so the in-memory and on-disk values for ds1_queued and ds2_queued may * be different. Therefore we need to apply the swap logic to the * on-disk state independently of the in-memory state. 
*/ ds1_queued = zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0; ds2_queued = zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0; /* Sanity checking. */ if (ds1_queued) { ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds2_queued) { ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds1_queued && ds2_queued) { /* * If both are queued, we don't need to do anything. * Alternatively, we could check for EEXIST from * zap_add_int_key() and back out to the original state, but * that would be more work than checking for this case upfront. */ } else if (ds1_queued) { VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx)); zfs_dbgmsg("clone_swap ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds1->ds_object, dp->dp_spa->spa_name, (u_longlong_t)ds2->ds_object); } else if (ds2_queued) { VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx)); zfs_dbgmsg("clone_swap ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds2->ds_object, dp->dp_spa->spa_name, (u_longlong_t)ds1->ds_object); } dsl_scan_sync_state(scn, tx, SYNC_CACHED); } static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { uint64_t originobj = *(uint64_t *)arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) return (0); err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while 
(dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); dsl_dataset_rele(ds, FTAG); if (err) return (err); ds = prev; } mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (scn->scn_phys.scn_cur_min_txg >= scn->scn_phys.scn_max_txg) { /* * This can happen if this snapshot was created after the * scan started, and we already completed a previous snapshot * that was created after the scan started. This snapshot * only references blocks with: * * birth < our ds_creation_txg * cur_min_txg is no less than ds_creation_txg. * We have already visited these blocks. * or * birth > scn_max_txg * The scan requested not to visit these blocks. * * Subsequent snapshots (and clones) can reference our * blocks, or blocks with even higher birth times. * Therefore we do not need to visit them either, * so we do not add them to the work queue. * * Note that checking for cur_min_txg >= cur_max_txg * is not sufficient, because in that case we may need to * visit subsequent snapshots. This happens when min_txg > 0, * which raises cur_min_txg. In this case we will visit * this dataset but skip all of its blocks, because the * rootbp's birth time is < cur_min_txg. Then we will * add the next snapshots/clones to the work queue. 
*/ char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " "cur_min_txg (%llu) >= max_txg (%llu)", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_max_txg); kmem_free(dsname, MAXNAMELEN); goto out; } /* * Only the ZIL in the head (non-snapshot) is valid. Even though * snapshots can have ZIL block pointers (which may be the same * BP as in the head), they must be ignored. In addition, $ORIGIN * doesn't have a objset (i.e. its ds_bp is a hole) so we don't * need to look for a ZIL in it either. So we traverse the ZIL here, * rather than in scan_recurse(), because the regular snapshot * block-sharing rules don't apply to it. */ if (!dsl_dataset_is_snapshot(ds) && (dp->dp_origin_snap == NULL || ds->ds_dir != dp->dp_origin_snap->ds_dir)) { objset_t *os; if (dmu_objset_from_ds(ds, &os) != 0) { goto out; } dsl_scan_zil(dp, &os->os_zil_header); } /* * Iterate over the bps in this ds. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " "suspending=%u", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_cur_max_txg, (int)scn->scn_suspending); kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); if (scn->scn_suspending) goto out; /* * We've finished this pass over this dataset. */ /* * If we did not completely visit this dataset, do another pass. 
*/ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { zfs_dbgmsg("incomplete pass on %s; visiting again", dp->dp_spa->spa_name); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; scan_ds_queue_insert(scn, ds->ds_object, scn->scn_phys.scn_cur_max_txg); goto out; } /* * Add descendant datasets to work queue. */ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_next_snap_obj, dsl_dataset_phys(ds)->ds_creation_txg); } if (dsl_dataset_phys(ds)->ds_num_children > 1) { boolean_t usenext = B_FALSE; if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { uint64_t count; /* * A bug in a previous version of the code could * cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a * missing entry. Therefore we can only use the * next_clones_obj when its count is correct. */ int err = zap_count(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj, &count); if (err == 0 && count == dsl_dataset_phys(ds)->ds_num_children - 1) usenext = B_TRUE; } if (usenext) { zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { scan_ds_queue_insert(scn, zfs_strtonum(za->za_name, NULL), dsl_dataset_phys(ds)->ds_creation_txg); } zap_cursor_fini(&zc); zap_attribute_free(za); } else { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_clones_cb, &ds->ds_object, DS_FIND_CHILDREN)); } } out: dsl_dataset_rele(ds, FTAG); } static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { (void) arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return 
(err); } /* * If this is a clone, we don't need to worry about it for now. */ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(prev, FTAG); return (0); } dsl_dataset_rele(ds, FTAG); ds = prev; } mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { (void) tx; const ddt_key_t *ddk = &ddlwe->ddlwe_key; blkptr_t bp; zbookmark_phys_t zb = { 0 }; if (!dsl_scan_is_running(scn)) return; /* * This function is special because it is the only thing * that can add scan_io_t's to the vdev scan queues from * outside dsl_scan_sync(). For the most part this is ok * as long as it is called from within syncing context. * However, dsl_scan_sync() expects that no new sio's will * be added between when all the work for a scan is done * and the next txg when the scan is actually marked as * completed. This check ensures we do not issue new sio's * during this period. */ if (scn->scn_done_txg != 0) return; for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v); if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg) continue; ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); } } /* * Scrub/dedup interaction. * * If there are N references to a deduped block, we don't want to scrub it * N times -- ideally, we should scrub it exactly once. * * We leverage the fact that the dde's replication class (ddt_class_t) * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. 
*
 * To prevent excess scrubbing, the scrub begins by walking the DDT
 * to find all blocks with refcnt > 1, and scrubs each of these once.
 * Since there are two replication classes which contain blocks with
 * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
 * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
 *
 * There would be nothing more to say if a block's refcnt couldn't change
 * during a scrub, but of course it can, so we must account for changes
 * in a block's replication class.
 *
 * Here are two examples of what can occur:
 *
 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
 * when visited during the top-down scrub phase, it will be scrubbed twice.
 * This negates our scrub optimization, but is otherwise harmless.
 *
 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
 * on each visit during the top-down scrub phase, it will never be scrubbed.
 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
 * replication class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
 * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
 * while a scrub is in progress, it scrubs the block right then.
*/
/*
 * Walk the DDT from the bookmark stored in scn_phys.scn_ddt_bookmark and
 * hand each entry to dsl_scan_ddt_entry(). The walk stops when the bookmark
 * moves past scn_ddt_class_max, when ddt_walk() returns an error, or when
 * dsl_scan_check_suspend() asks us to stop.
 */
static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
	ddt_lightweight_entry_t ddlwe = {0};
	int error;
	uint64_t n = 0;

	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) {
		ddt_t *ddt;

		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
			break;
		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
		    (longlong_t)ddb->ddb_class,
		    (longlong_t)ddb->ddb_type,
		    (longlong_t)ddb->ddb_checksum,
		    (longlong_t)ddb->ddb_cursor);

		/* There should be no pending changes to the dedup table */
		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
		ASSERT(avl_first(&ddt->ddt_tree) == NULL);

		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
		n++;

		if (dsl_scan_check_suspend(scn, NULL))
			break;
	}

	if (error == EAGAIN) {
		/* EAGAIN is not treated as a failure: run the suspend check and clear it. */
		dsl_scan_check_suspend(scn, NULL);
		error = 0;

		zfs_dbgmsg("waiting for ddt to become ready for scan "
		    "on %s with class_max = %u; suspending=%u",
		    scn->scn_dp->dp_spa->spa_name,
		    (int)scn->scn_phys.scn_ddt_class_max,
		    (int)scn->scn_suspending);
	} else
		zfs_dbgmsg("scanned %llu ddt entries on %s with "
		    "class_max = %u; suspending=%u",
		    (longlong_t)n, scn->scn_dp->dp_spa->spa_name,
		    (int)scn->scn_phys.scn_ddt_class_max,
		    (int)scn->scn_suspending);

	ASSERT(error == 0 || error == ENOENT);
	ASSERT(error != ENOENT ||
	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
}

/*
 * Return the highest txg the scan should consider for this dataset:
 * the scan's scn_max_txg, capped at ds_creation_txg for snapshots.
 */
static uint64_t
dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
{
	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
	if (ds->ds_is_snapshot)
		return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
	return (smt);
}

/*
 * Drive one pass of the scan: first the DDT (while its bookmark is within
 * scn_ddt_class_max), then either the MOS and origin snapshot or the
 * dataset recorded in the bookmark, and finally every dataset in the
 * in-memory queue. Returns early whenever scn_suspending is set; when the
 * queue is drained the bookmark is set to ZB_DESTROYED_OBJSET.
 */
static void
dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
{
	scan_ds_t *sds;
	dsl_pool_t *dp = scn->scn_dp;

	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
	    scn->scn_phys.scn_ddt_class_max) {
		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
		dsl_scan_ddt(scn, tx);
		if (scn->scn_suspending)
			return;
	}

	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
		/* First do the MOS & ORIGIN */

		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
		dsl_scan_visit_rootbp(scn, NULL,
		    &dp->dp_meta_rootbp, tx);
		if (scn->scn_suspending)
			return;

		/* Pools older than SPA_VERSION_DSL_SCRUB enqueue datasets via enqueue_cb. */
		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
			    enqueue_cb, NULL, DS_FIND_CHILDREN));
		} else {
			dsl_scan_visitds(scn,
			    dp->dp_origin_snap->ds_object, tx);
		}
		ASSERT(!scn->scn_suspending);
	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
	    ZB_DESTROYED_OBJSET) {
		uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
		/*
		 * If we were suspended, continue from here. Note if the
		 * ds we were suspended on was deleted, the zb_objset may
		 * be -1, so we will skip this and find a new objset
		 * below.
		 */
		dsl_scan_visitds(scn, dsobj, tx);
		if (scn->scn_suspending)
			return;
	}

	/*
	 * In case we suspended right at the end of the ds, zero the
	 * bookmark so we don't think that we're still trying to resume.
	 */
	memset(&scn->scn_phys.scn_bookmark, 0, sizeof (zbookmark_phys_t));

	/*
	 * Keep pulling things out of the dataset avl queue. Updates to the
	 * persistent zap-object-as-queue happen only at checkpoints.
	 */
	while ((sds = avl_first(&scn->scn_queue)) != NULL) {
		dsl_dataset_t *ds;
		/* Copy the fields out before the entry is removed from the queue. */
		uint64_t dsobj = sds->sds_dsobj;
		uint64_t txg = sds->sds_txg;

		/* dequeue and free the ds from the queue */
		scan_ds_queue_remove(scn, dsobj);
		sds = NULL;

		/* set up min / max txg */
		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		if (txg != 0) {
			scn->scn_phys.scn_cur_min_txg =
			    MAX(scn->scn_phys.scn_min_txg, txg);
		} else {
			scn->scn_phys.scn_cur_min_txg =
			    MAX(scn->scn_phys.scn_min_txg,
			    dsl_dataset_phys(ds)->ds_prev_snap_txg);
		}
		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
		dsl_dataset_rele(ds, FTAG);

		dsl_scan_visitds(scn, dsobj, tx);
		if (scn->scn_suspending)
			return;
	}

	/* No more objsets to fetch, we're done */
	scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
	ASSERT0(scn->scn_suspending);
}

/*
 * Sum (ndisks - nparity) over the root vdev's children, skipping log,
 * spare and l2cache vdevs.
 */
static uint64_t
dsl_scan_count_data_disks(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t i, leaves = 0;

	for (i = 0; i < rvd->vdev_children; i++) {
		vdev_t *vd = rvd->vdev_child[i];
		if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache)
			continue;
		leaves += vdev_get_ndisks(vd) - vdev_get_nparity(vd);
	}
	return (leaves);
}

/*
 * Account one issued zio in the queue's per-txg statistics: add the
 * allocated size of every DVA in bp and bump the zio count.
 */
static void
scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
{
	int i;
	uint64_t cur_size = 0;

	for (i = 0; i < BP_GET_NDVAS(bp); i++) {
		cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]);
	}

	q->q_total_zio_size_this_txg += cur_size;
	q->q_zios_this_txg++;
}

/*
 * Account one processed segment [start, end) in the queue's per-txg
 * statistics.
 */
static void
scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
    uint64_t end)
{
	q->q_total_seg_size_this_txg += end - start;
	q->q_segs_this_txg++;
}

/*
 * Decide whether queue issuing should stop for this txg. Returns true if
 * the pool is shutting down, or if the scan has run longer than the
 * applicable minimum time (resilver or scrub) and at least one of these
 * holds: dirty data has reached dirty_min_bytes, a txg sync is waiting, or
 * the sync has lasted zfs_txg_timeout seconds.
 */
static boolean_t
scan_io_queue_check_suspend(dsl_scan_t *scn)
{
	/* See comment in dsl_scan_check_suspend() */
	uint64_t curr_time_ns = gethrtime();
	uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
	uint64_t sync_time_ns = curr_time_ns -
	    scn->scn_dp->dp_spa->spa_sync_starttime;
	uint64_t dirty_min_bytes = zfs_dirty_data_max *
	    zfs_vdev_async_write_active_min_dirty_percent / 100;
	uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
	    zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;

	return ((NSEC2MSEC(scan_time_ns) > mintime &&
	    (scn->scn_dp->dp_dirty_total >= dirty_min_bytes ||
	    txg_sync_waiting(scn->scn_dp) ||
	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
	    spa_shutting_down(scn->scn_dp->dp_spa));
}

/*
 * Given a list of scan_io_t's in io_list, this issues the I/Os out to
 * disk. This consumes the io_list and frees the scan_io_t's. This is
 * called when emptying queues, either when we're up against the memory
 * limit or when we have finished scanning. Returns B_TRUE if we stopped
 * processing the list before we finished. Any sios that were not issued
 * will remain in the io_list.
 */
static boolean_t
scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
{
	dsl_scan_t *scn = queue->q_scn;
	scan_io_t *sio;
	boolean_t suspended = B_FALSE;

	while ((sio = list_head(io_list)) != NULL) {
		blkptr_t bp;

		/* Check before issuing so an unissued sio stays on the list. */
		if (scan_io_queue_check_suspend(scn)) {
			suspended = B_TRUE;
			break;
		}

		sio2bp(sio, &bp);
		scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
		    &sio->sio_zb, queue);
		(void) list_remove_head(io_list);
		scan_io_queues_update_zio_stats(queue, &bp);
		sio_free(sio);
	}
	return (suspended);
}

/*
 * This function removes sios from an IO queue which reside within a given
 * zfs_range_seg_t and inserts them (in offset order) into a list. Note that
 * we only ever return a maximum of 32 sios at once. If there are more sios
 * to process within this segment that did not make it onto the list we
 * return B_TRUE and otherwise B_FALSE.
*/ static boolean_t scan_io_queue_gather(dsl_scan_io_queue_t *queue, zfs_range_seg_t *rs, list_t *list) { scan_io_t *srch_sio, *sio, *next_sio; avl_index_t idx; uint_t num_sios = 0; int64_t bytes_issued = 0; ASSERT(rs != NULL); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); srch_sio = sio_alloc(1); srch_sio->sio_nr_dvas = 1; SIO_SET_OFFSET(srch_sio, zfs_rs_get_start(rs, queue->q_exts_by_addr)); /* * The exact start of the extent might not contain any matching zios, * so if that's the case, examine the next one in the tree. */ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); sio_free(srch_sio); if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); while (sio != NULL && SIO_GET_OFFSET(sio) < zfs_rs_get_end(rs, queue->q_exts_by_addr) && num_sios <= 32) { ASSERT3U(SIO_GET_OFFSET(sio), >=, zfs_rs_get_start(rs, queue->q_exts_by_addr)); ASSERT3U(SIO_GET_END_OFFSET(sio), <=, zfs_rs_get_end(rs, queue->q_exts_by_addr)); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); if (avl_is_empty(&queue->q_sios_by_addr)) atomic_add_64(&queue->q_scn->scn_queues_pending, -1); queue->q_sio_memused -= SIO_GET_MUSED(sio); bytes_issued += SIO_GET_ASIZE(sio); num_sios++; list_insert_tail(list, sio); sio = next_sio; } /* * We limit the number of sios we process at once to 32 to avoid * biting off more than we can chew. If we didn't take everything * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. 
*/ if (sio != NULL && SIO_GET_OFFSET(sio) < zfs_rs_get_end(rs, queue->q_exts_by_addr)) { zfs_range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); zfs_range_tree_resize_segment(queue->q_exts_by_addr, rs, SIO_GET_OFFSET(sio), zfs_rs_get_end(rs, queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); queue->q_last_ext_addr = SIO_GET_OFFSET(sio); return (B_TRUE); } else { uint64_t rstart = zfs_rs_get_start(rs, queue->q_exts_by_addr); uint64_t rend = zfs_rs_get_end(rs, queue->q_exts_by_addr); zfs_range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); queue->q_last_ext_addr = -1; return (B_FALSE); } } /* * This is called from the queue emptying thread and selects the next * extent from which we are to issue I/Os. The behavior of this function * depends on the state of the scan, the current memory consumption and * whether or not we are performing a scan shutdown. * 1) We select extents in an elevator algorithm (LBA-order) if the scan * needs to perform a checkpoint * 2) We select the largest available extent if we are up against the * memory limit. * 3) Otherwise we don't select any extents. */ static zfs_range_seg_t * scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; zfs_range_tree_t *rt = queue->q_exts_by_addr; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); ASSERT(scn->scn_is_sorted); if (!scn->scn_checkpointing && !scn->scn_clearing) return (NULL); /* * During normal clearing, we want to issue our largest segments * first, keeping IO as sequential as possible, and leaving the * smaller extents for later with the hope that they might eventually * grow to larger sequential segments. However, when the scan is * checkpointing, no new extents will be added to the sorting queue, * so the way we are sorted now is as good as it will ever get. * In this case, we instead switch to issuing extents in LBA order. 
*/ if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) || zfs_scan_issue_strategy == 1) return (zfs_range_tree_first(rt)); /* * Try to continue previous extent if it is not completed yet. After * shrink in scan_io_queue_gather() it may no longer be the best, but * otherwise we leave shorter remnant every txg. */ uint64_t start; uint64_t size = 1ULL << rt->rt_shift; zfs_range_seg_t *addr_rs; if (queue->q_last_ext_addr != -1) { start = queue->q_last_ext_addr; addr_rs = zfs_range_tree_find(rt, start, size); if (addr_rs != NULL) return (addr_rs); } /* * Nothing to continue, so find new best extent. */ uint64_t *v = zfs_btree_first(&queue->q_exts_by_size, NULL); if (v == NULL) return (NULL); queue->q_last_ext_addr = start = *v << rt->rt_shift; /* * We need to get the original entry in the by_addr tree so we can * modify it. */ addr_rs = zfs_range_tree_find(rt, start, size); ASSERT3P(addr_rs, !=, NULL); ASSERT3U(zfs_rs_get_start(addr_rs, rt), ==, start); ASSERT3U(zfs_rs_get_end(addr_rs, rt), >, start); return (addr_rs); } static void scan_io_queues_run_one(void *arg) { dsl_scan_io_queue_t *queue = arg; kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; boolean_t suspended = B_FALSE; zfs_range_seg_t *rs; scan_io_t *sio; zio_t *zio; list_t sio_list; ASSERT(queue->q_scn->scn_is_sorted); list_create(&sio_list, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_list_node)); zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa, NULL, NULL, NULL, ZIO_FLAG_CANFAIL); mutex_enter(q_lock); queue->q_zio = zio; /* Calculate maximum in-flight bytes for this vdev. 
*/ queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit * (vdev_get_ndisks(queue->q_vd) - vdev_get_nparity(queue->q_vd))); /* reset per-queue scan statistics for this txg */ queue->q_total_seg_size_this_txg = 0; queue->q_segs_this_txg = 0; queue->q_total_zio_size_this_txg = 0; queue->q_zios_this_txg = 0; /* loop until we run out of time or sios */ while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) { uint64_t seg_start = 0, seg_end = 0; boolean_t more_left; ASSERT(list_is_empty(&sio_list)); /* loop while we still have sios left to process in this rs */ do { scan_io_t *first_sio, *last_sio; /* * We have selected which extent needs to be * processed next. Gather up the corresponding sios. */ more_left = scan_io_queue_gather(queue, rs, &sio_list); ASSERT(!list_is_empty(&sio_list)); first_sio = list_head(&sio_list); last_sio = list_tail(&sio_list); seg_end = SIO_GET_END_OFFSET(last_sio); if (seg_start == 0) seg_start = SIO_GET_OFFSET(first_sio); /* * Issuing sios can take a long time so drop the * queue lock. The sio queue won't be updated by * other threads since we're in syncing context so * we can be sure that our trees will remain exactly * as we left them. */ mutex_exit(q_lock); suspended = scan_io_queue_issue(queue, &sio_list); mutex_enter(q_lock); if (suspended) break; } while (more_left); /* update statistics for debugging purposes */ scan_io_queues_update_seg_stats(queue, seg_start, seg_end); if (suspended) break; } /* * If we were suspended in the middle of processing, * requeue any unfinished sios and exit. */ while ((sio = list_remove_head(&sio_list)) != NULL) scan_io_queue_insert_impl(queue, sio); queue->q_zio = NULL; mutex_exit(q_lock); zio_nowait(zio); list_destroy(&sio_list); } /* * Performs an emptying run on all scan queues in the pool. This just * punches out one thread per top-level vdev, each of which processes * only that vdev's scan queue. 
We can parallelize the I/O here because * we know that each queue's I/Os only affect its own top-level vdev. * * This function waits for the queue runs to complete, and must be * called from dsl_scan_sync (or in general, syncing context). */ static void scan_io_queues_run(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; ASSERT(scn->scn_is_sorted); ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (scn->scn_queues_pending == 0) return; if (scn->scn_taskq == NULL) { int nthreads = spa->spa_root_vdev->vdev_children; /* * We need to make this taskq *always* execute as many * threads in parallel as we have top-level vdevs and no * less, otherwise strange serialization of the calls to * scan_io_queues_run_one can occur during spa_sync runs * and that significantly impacts performance. */ scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads, minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE); } for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; mutex_enter(&vd->vdev_scan_io_queue_lock); if (vd->vdev_scan_io_queue != NULL) { VERIFY(taskq_dispatch(scn->scn_taskq, scan_io_queues_run_one, vd->vdev_scan_io_queue, TQ_SLEEP) != TASKQID_INVALID); } mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * Wait for the queues to finish issuing their IOs for this run * before we return. There may still be IOs in flight at this * point. 
*/ taskq_wait(scn->scn_taskq); } static boolean_t dsl_scan_async_block_should_pause(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; if (zfs_recover) return (B_FALSE); if (zfs_async_block_max_blocks != 0 && scn->scn_visited_this_txg >= zfs_async_block_max_blocks) { return (B_TRUE); } if (zfs_max_async_dedup_frees != 0 && scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) { return (B_TRUE); } elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); } static int dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_scan_t *scn = arg; if (!scn->scn_is_bptree || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { if (dsl_scan_async_block_should_pause(scn)) return (SET_ERROR(ERESTART)); } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); scn->scn_visited_this_txg++; if (BP_GET_DEDUP(bp)) scn->scn_dedup_frees_this_txg++; return (0); } static void dsl_scan_update_stats(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t i; uint64_t seg_size_total = 0, zio_size_total = 0; uint64_t seg_count_total = 0, zio_count_total = 0; for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; if (queue == NULL) continue; seg_size_total += queue->q_total_seg_size_this_txg; zio_size_total += queue->q_total_zio_size_this_txg; seg_count_total += queue->q_segs_this_txg; zio_count_total += queue->q_zios_this_txg; } if (seg_count_total == 0 || zio_count_total == 0) { scn->scn_avg_seg_size_this_txg = 0; scn->scn_avg_zio_size_this_txg = 0; scn->scn_segs_this_txg = 
0; scn->scn_zios_this_txg = 0; return; } scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total; scn->scn_segs_this_txg = seg_count_total; scn->scn_zios_this_txg = zio_count_total; } static int bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (dsl_scan_free_block_cb(arg, bp, tx)); } static int dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); dsl_scan_t *scn = arg; const dva_t *dva = &bp->blk_dva[0]; if (dsl_scan_async_block_should_pause(scn)) return (SET_ERROR(ERESTART)); spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), tx); scn->scn_visited_this_txg++; return (0); } boolean_t dsl_scan_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t used = 0, comp, uncomp; boolean_t clones_left; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) || (scn->scn_async_destroying && !scn->scn_async_stalled)) return (B_TRUE); if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } clones_left = spa_livelist_delete_check(spa); return ((used != 0) || (clones_left)); } boolean_t dsl_errorscrub_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); if (dsl_errorscrubbing(scn->scn_dp)) return (B_TRUE); return (B_FALSE); } static boolean_t dsl_scan_check_deferred(vdev_t *vd) { boolean_t need_resilver = B_FALSE; for (int c = 0; c < vd->vdev_children; c++) { need_resilver |= dsl_scan_check_deferred(vd->vdev_child[c]); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) 
return (need_resilver); if (!vd->vdev_resilver_deferred) need_resilver = B_TRUE; return (need_resilver); } static boolean_t dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_t *vd; vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd->vdev_ops == &vdev_indirect_ops) { /* * The indirect vdev can point to multiple * vdevs. For simplicity, always create * the resilver zio_t. zio_vdev_io_start() * will bypass the child resilver i/o's if * they are on vdevs that don't have DTL's. */ return (B_TRUE); } if (DVA_GET_GANG(dva)) { /* * Gang members may be spread across multiple * vdevs, so the best estimate we have is the * scrub range, which has already been checked. * XXX -- it would be better to change our * allocation policy to ensure that all * gang members reside on the same vdev. */ return (B_TRUE); } /* * Check if the top-level vdev must resilver this offset. * When the offset does not intersect with a dirty leaf DTL * then it may be possible to skip the resilver IO. The psize * is provided instead of asize to simplify the check for RAIDZ. */ if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth)) return (B_FALSE); /* * Check that this top-level vdev has a device under it which * is resilvering and is not deferred. 
*/ if (!dsl_scan_check_deferred(vd)) return (B_FALSE); return (B_TRUE); } static int dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; int err = 0; if (spa_suspend_async_destroy(spa)) return (0); if (zfs_free_bpobj_enabled && spa_version(spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, bpobj_dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); scn->scn_zio_root = NULL; if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); } if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { ASSERT(scn->scn_async_destroying); scn->scn_is_bptree = B_TRUE; scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); scn->scn_zio_root = NULL; if (err == EIO || err == ECKSUM) { err = 0; } else if (err != 0 && err != ERESTART) { zfs_panic_recover("error %u from " "traverse_dataset_destroyed()", err); } if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { /* finished; deactivate async destroy feature */ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); ASSERT(!spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)); VERIFY0(zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, tx)); VERIFY0(bptree_free(dp->dp_meta_objset, dp->dp_bptree_obj, tx)); dp->dp_bptree_obj = 0; scn->scn_async_destroying = B_FALSE; scn->scn_async_stalled = B_FALSE; } else { /* * If we didn't make progress, mark the async * destroy as stalled, so that we will not initiate * a spa_sync() on its behalf. 
Note that we only * check this if we are not finished, because if the * bptree had no blocks for us to visit, we can * finish without "making progress". */ scn->scn_async_stalled = (scn->scn_visited_this_txg == 0); } } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " "free_bpobj/bptree on %s in txg %llu; err=%u", (longlong_t)scn->scn_visited_this_txg, (longlong_t) NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), spa->spa_name, (longlong_t)tx->tx_txg, err); scn->scn_visited_this_txg = 0; scn->scn_dedup_frees_this_txg = 0; /* * Write out changes to the DDT and the BRT that may be required * as a result of the blocks freed. This ensures that the DDT * and the BRT are clean when a scrub/resilver runs. */ ddt_sync(spa, tx->tx_txg); brt_sync(spa, tx->tx_txg); } if (err != 0) return (err); if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && zfs_free_leak_on_eio && (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { /* * We have finished background destroying, but there is still * some space left in the dp_free_dir. Transfer this leaked * space to the dp_leak_dir. 
*/ if (dp->dp_leak_dir == NULL) { rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); (void) dsl_dir_create_sync(dp, dp->dp_root_dir, LEAK_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir)); rrw_exit(&dp->dp_config_rwlock, FTAG); } dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && !spa_livelist_delete_check(spa)) { /* finished; verify that space accounting went to zero */ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } spa_notify_waiters(spa); EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ)); if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)); scn->scn_is_bptree = B_FALSE; scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; err = bpobj_iterate(&dp->dp_obsolete_bpobj, dsl_scan_obsolete_block_cb, scn, tx); if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) dsl_pool_destroy_obsolete_bpobj(dp, tx); } return (0); } static void name_to_bookmark(char *buf, zbookmark_phys_t *zb) { zb->zb_objset = zfs_strtonum(buf, &buf); ASSERT(*buf == ':'); zb->zb_object = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zb->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf 
== '\0'); } static void name_to_object(char *buf, uint64_t *obj) { *obj = zfs_strtonum(buf, &buf); ASSERT(*buf == '\0'); } static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; objset_t *os; if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0) return; if (dmu_objset_from_ds(ds, &os) != 0) { dsl_dataset_rele(ds, FTAG); return; } /* * If the key is not loaded dbuf_dnode_findbp() will error out with * EACCES. However in that case dnode_hold() will eventually call * dbuf_read()->zio_wait() which may call spa_log_error(). This will * lead to a deadlock due to us holding the mutex spa_errlist_lock. * Avoid this by checking here if the keys are loaded, if not return. * If the keys are not loaded the head_errlog feature is meaningless * as we cannot figure out the birth txg of the block pointer. */ if (dsl_dataset_get_keystatus(ds->ds_dir) == ZFS_KEYSTATUS_UNAVAILABLE) { dsl_dataset_rele(ds, FTAG); return; } dnode_t *dn; blkptr_t bp; if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0) { dsl_dataset_rele(ds, FTAG); return; } rw_enter(&dn->dn_struct_rwlock, RW_READER); int error = dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL, NULL); if (error) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); dsl_dataset_rele(ds, FTAG); return; } if (!error && BP_IS_HOLE(&bp)) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); dsl_dataset_rele(ds, FTAG); return; } int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB; /* If it's an intent log block, failure is expected. */ if (zb.zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; ASSERT(!BP_IS_EMBEDDED(&bp)); scan_exec_io(dp, &bp, zio_flags, &zb, NULL); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); dsl_dataset_rele(ds, FTAG); } /* * We keep track of the scrubbed error blocks in "count". This will be used * when deciding whether we exceeded zfs_scrub_error_blocks_per_txg. 
This * function is modelled after check_filesystem(). */ static int scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep, int *count) { dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds); if (error != 0) return (error); uint64_t latest_txg; uint64_t txg_to_consider = spa->spa_syncing_txg; boolean_t check_snapshot = B_TRUE; error = find_birth_txg(ds, zep, &latest_txg); /* * If find_birth_txg() errors out, then err on the side of caution and * proceed. In worst case scenario scrub all objects. If zep->zb_birth * is 0 (e.g. in case of encryption with unloaded keys) also proceed to * scrub all objects. */ if (error == 0 && zep->zb_birth == latest_txg) { /* Block neither free nor re written. */ zbookmark_phys_t zb; zep_to_zb(fs, zep, &zb); scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* We have already acquired the config lock for spa */ read_by_block_level(scn, zb); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; scn->errorscrub_phys.dep_examined++; scn->errorscrub_phys.dep_to_examine--; (*count)++; if ((*count) == zfs_scrub_error_blocks_per_txg || dsl_error_scrub_check_suspend(scn, &zb)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EFAULT)); } check_snapshot = B_FALSE; } else if (error == 0) { txg_to_consider = latest_txg; } /* * Retrieve the number of snapshots if the dataset is not a snapshot. */ uint64_t snap_count = 0; if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { error = zap_count(spa->spa_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } } if (snap_count == 0) { /* Filesystem without snapshots. 
*/ dsl_dataset_rele(ds, FTAG); return (0); } uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsl_dataset_rele(ds, FTAG); /* Check only snapshots created from this file system. */ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && snap_obj_txg <= txg_to_consider) { error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); if (error != 0) return (error); if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) { snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsl_dataset_rele(ds, FTAG); continue; } boolean_t affected = B_TRUE; if (check_snapshot) { uint64_t blk_txg; error = find_birth_txg(ds, zep, &blk_txg); /* * Scrub the snapshot also when zb_birth == 0 or when * find_birth_txg() returns an error. */ affected = (error == 0 && zep->zb_birth == blk_txg) || (error != 0) || (zep->zb_birth == 0); } /* Scrub snapshots. */ if (affected) { zbookmark_phys_t zb; zep_to_zb(snap_obj, zep, &zb); scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* We have already acquired the config lock for spa */ read_by_block_level(scn, zb); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; scn->errorscrub_phys.dep_examined++; scn->errorscrub_phys.dep_to_examine--; (*count)++; if ((*count) == zfs_scrub_error_blocks_per_txg || dsl_error_scrub_check_suspend(scn, &zb)) { dsl_dataset_rele(ds, FTAG); return (EFAULT); } } snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; dsl_dataset_rele(ds, FTAG); } return (0); } void dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) { spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; /* * Only process scans in sync pass 1. */ if (spa_sync_pass(spa) > 1) return; /* * If the spa is shutting down, then stop scanning. This will * ensure that the scan does not dirty any new data during the * shutdown phase. 
*/ if (spa_shutting_down(spa)) return; if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) { return; } if (dsl_scan_resilvering(scn->scn_dp)) { /* cancel the error scrub if resilver started */ dsl_scan_cancel(scn->scn_dp); return; } spa->spa_scrub_active = B_TRUE; scn->scn_sync_start_time = gethrtime(); /* * zfs_scan_suspend_progress can be set to disable scrub progress. * See more detailed comment in dsl_scan_sync(). */ if (zfs_scan_suspend_progress) { uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; int mintime = zfs_scrub_min_time_ms; while (zfs_scan_suspend_progress && !txg_sync_waiting(scn->scn_dp) && !spa_shutting_down(scn->scn_dp->dp_spa) && NSEC2MSEC(scan_time_ns) < mintime) { delay(hz); scan_time_ns = gethrtime() - scn->scn_sync_start_time; } return; } int i = 0; zap_attribute_t *za; zbookmark_phys_t *zb; boolean_t limit_exceeded = B_FALSE; za = zap_attribute_alloc(); zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP); if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; zap_cursor_advance(&scn->errorscrub_cursor)) { name_to_bookmark(za->za_name, zb); scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); dsl_pool_config_enter(dp, FTAG); read_by_block_level(scn, *zb); dsl_pool_config_exit(dp, FTAG); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; scn->errorscrub_phys.dep_examined += 1; scn->errorscrub_phys.dep_to_examine -= 1; i++; if (i == zfs_scrub_error_blocks_per_txg || dsl_error_scrub_check_suspend(scn, zb)) { limit_exceeded = B_TRUE; break; } } if (!limit_exceeded) dsl_errorscrub_done(scn, B_TRUE, tx); dsl_errorscrub_sync_state(scn, tx); zap_attribute_free(za); kmem_free(zb, sizeof (*zb)); return; } int error = 0; for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; zap_cursor_advance(&scn->errorscrub_cursor)) { zap_cursor_t *head_ds_cursor; zap_attribute_t *head_ds_attr; zbookmark_err_phys_t head_ds_block; 
head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); head_ds_attr = zap_attribute_alloc(); uint64_t head_ds_err_obj = za->za_first_integer; uint64_t head_ds; name_to_object(za->za_name, &head_ds); boolean_t config_held = B_FALSE; uint64_t top_affected_fs; for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { name_to_errphys(head_ds_attr->za_name, &head_ds_block); /* * In case we are called from spa_sync the pool * config is already held. */ if (!dsl_pool_config_held(dp)) { dsl_pool_config_enter(dp, FTAG); config_held = B_TRUE; } error = find_top_affected_fs(spa, head_ds, &head_ds_block, &top_affected_fs); if (error) break; error = scrub_filesystem(spa, top_affected_fs, &head_ds_block, &i); if (error == SET_ERROR(EFAULT)) { limit_exceeded = B_TRUE; break; } } zap_cursor_fini(head_ds_cursor); kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); zap_attribute_free(head_ds_attr); if (config_held) dsl_pool_config_exit(dp, FTAG); } zap_attribute_free(za); kmem_free(zb, sizeof (*zb)); if (!limit_exceeded) dsl_errorscrub_done(scn, B_TRUE, tx); dsl_errorscrub_sync_state(scn, tx); } /* * This is the primary entry point for scans that is called from syncing * context. Scans must happen entirely during syncing context so that we * can guarantee that blocks we are currently scanning will not change out * from under us. While a scan is active, this function controls how quickly * transaction groups proceed, instead of the normal handling provided by * txg_sync_thread(). 
*/
void
dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
	int err = 0;
	dsl_scan_t *scn = dp->dp_scan;
	spa_t *spa = dp->dp_spa;
	state_sync_type_t sync_type = SYNC_OPTIONAL;
	int restart_early = 0;

	/*
	 * With a deferred resilver pending, decide whether the current scan
	 * should be abandoned and restarted right away.
	 */
	if (spa->spa_resilver_deferred) {
		uint64_t to_issue, issued;

		if (!spa_feature_is_active(dp->dp_spa,
		    SPA_FEATURE_RESILVER_DEFER))
			spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);

		/*
		 * See print_scan_scrub_resilver_status() issued/total_i
		 * @ cmd/zpool/zpool_main.c
		 */
		to_issue =
		    scn->scn_phys.scn_to_examine - scn->scn_phys.scn_skipped;
		issued =
		    scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
		restart_early =
		    zfs_resilver_disable_defer ||
		    (issued < (to_issue * zfs_resilver_defer_percent / 100));
	}

	/*
	 * Only process scans in sync pass 1.
	 */
	if (spa_sync_pass(spa) > 1)
		return;

	/*
	 * Check for scn_restart_txg before checking spa_load_state, so
	 * that we can restart an old-style scan while the pool is being
	 * imported (see dsl_scan_init). We also restart scans if there
	 * is a deferred resilver and the user has manually disabled
	 * deferred resilvers via zfs_resilver_disable_defer, or if the
	 * current scan progress is below zfs_resilver_defer_percent.
	 */
	if (dsl_scan_restarting(scn, tx) || restart_early) {
		setup_sync_arg_t setup_sync_arg = {
			.func = POOL_SCAN_SCRUB,
			.txgstart = 0,
			.txgend = 0,
		};
		dsl_scan_done(scn, B_FALSE, tx);
		/* Restart as a resilver rather than a scrub if one is needed. */
		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
			setup_sync_arg.func = POOL_SCAN_RESILVER;
		zfs_dbgmsg("restarting scan func=%u on %s txg=%llu early=%d",
		    setup_sync_arg.func, dp->dp_spa->spa_name,
		    (longlong_t)tx->tx_txg, restart_early);
		dsl_scan_setup_sync(&setup_sync_arg, tx);
	}

	/*
	 * If the spa is shutting down, then stop scanning. This will
	 * ensure that the scan does not dirty any new data during the
	 * shutdown phase.
	 */
	if (spa_shutting_down(spa))
		return;

	/*
	 * If the scan is inactive due to a stalled async destroy, try again.
	 */
	if (!scn->scn_async_stalled && !dsl_scan_active(scn))
		return;

	/* reset scan statistics */
	scn->scn_visited_this_txg = 0;
	scn->scn_dedup_frees_this_txg = 0;
	scn->scn_holes_this_txg = 0;
	scn->scn_lt_min_this_txg = 0;
	scn->scn_gt_max_this_txg = 0;
	scn->scn_ddt_contained_this_txg = 0;
	scn->scn_objsets_visited_this_txg = 0;
	scn->scn_avg_seg_size_this_txg = 0;
	scn->scn_segs_this_txg = 0;
	scn->scn_avg_zio_size_this_txg = 0;
	scn->scn_zios_this_txg = 0;
	scn->scn_suspending = B_FALSE;
	scn->scn_sync_start_time = gethrtime();
	spa->spa_scrub_active = B_TRUE;

	/*
	 * First process the async destroys.  If we suspend, don't do
	 * any scrubbing or resilvering.  This ensures that there are no
	 * async destroys while we are scanning, so the scan code doesn't
	 * have to worry about traversing it.  It is also faster to free the
	 * blocks than to scrub them.
	 */
	err = dsl_process_async_destroys(dp, tx);
	if (err != 0)
		return;

	if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
		return;

	/*
	 * Wait a few txgs after importing to begin scanning so that
	 * we can get the pool imported quickly.
	 */
	if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
		return;

	/*
	 * zfs_scan_suspend_progress can be set to disable scan progress.
	 * We don't want to spin the txg_sync thread, so we add a delay
	 * here to simulate the time spent doing a scan. This is mostly
	 * useful for testing and debugging.
	 */
	if (zfs_scan_suspend_progress) {
		uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
		uint_t mintime = (scn->scn_phys.scn_func ==
		    POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms :
		    zfs_scrub_min_time_ms;

		while (zfs_scan_suspend_progress &&
		    !txg_sync_waiting(scn->scn_dp) &&
		    !spa_shutting_down(scn->scn_dp->dp_spa) &&
		    NSEC2MSEC(scan_time_ns) < mintime) {
			delay(hz);
			scan_time_ns = gethrtime() - scn->scn_sync_start_time;
		}
		return;
	}

	/*
	 * Disabled by default, set zfs_scan_report_txgs to report
	 * average performance over the last zfs_scan_report_txgs TXGs.
	 */
	if (zfs_scan_report_txgs != 0 &&
	    tx->tx_txg % zfs_scan_report_txgs == 0) {
		scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
		spa_scan_stat_init(spa);
	}

	/*
	 * It is possible to switch from unsorted to sorted at any time,
	 * but afterwards the scan will remain sorted unless reloaded from
	 * a checkpoint after a reboot.
	 */
	if (!zfs_scan_legacy) {
		scn->scn_is_sorted = B_TRUE;
		if (scn->scn_last_checkpoint == 0)
			scn->scn_last_checkpoint = ddi_get_lbolt();
	}

	/*
	 * For sorted scans, determine what kind of work we will be doing
	 * this txg based on our memory limitations and whether or not we
	 * need to perform a checkpoint.
	 */
	if (scn->scn_is_sorted) {
		/*
		 * If we are over our checkpoint interval, set scn_clearing
		 * so that we can begin checkpointing immediately. The
		 * checkpoint allows us to save a consistent bookmark
		 * representing how much data we have scrubbed so far.
		 * Otherwise, use the memory limit to determine if we should
		 * scan for metadata or start issue scrub IOs. We accumulate
		 * metadata until we hit our hard memory limit at which point
		 * we issue scrub IOs until we are at our soft memory limit.
		 */
		if (scn->scn_checkpointing ||
		    ddi_get_lbolt() - scn->scn_last_checkpoint >
		    SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
			if (!scn->scn_checkpointing)
				zfs_dbgmsg("begin scan checkpoint for %s",
				    spa->spa_name);

			scn->scn_checkpointing = B_TRUE;
			scn->scn_clearing = B_TRUE;
		} else {
			boolean_t should_clear = dsl_scan_should_clear(scn);
			if (should_clear && !scn->scn_clearing) {
				zfs_dbgmsg("begin scan clearing for %s",
				    spa->spa_name);
				scn->scn_clearing = B_TRUE;
			} else if (!should_clear && scn->scn_clearing) {
				zfs_dbgmsg("finish scan clearing for %s",
				    spa->spa_name);
				scn->scn_clearing = B_FALSE;
			}
		}
	} else {
		ASSERT0(scn->scn_checkpointing);
		ASSERT0(scn->scn_clearing);
	}

	/*
	 * Exactly one of three kinds of work is done per txg: traverse
	 * metadata, issue queued I/O, or finish the scan.
	 */
	if (!scn->scn_clearing && scn->scn_done_txg == 0) {
		/* Need to scan metadata for more blocks to scrub */
		dsl_scan_phys_t *scnp = &scn->scn_phys;
		taskqid_t prefetch_tqid;

		/*
		 * Calculate the max number of in-flight bytes for pool-wide
		 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
		 * Limits for the issuing phase are done per top-level vdev and
		 * are handled separately.
		 */
		scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
		    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));

		if (scnp->scn_ddt_bookmark.ddb_class <=
		    scnp->scn_ddt_class_max) {
			ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
			zfs_dbgmsg("doing scan sync for %s txg %llu; "
			    "ddt bm=%llu/%llu/%llu/%llx",
			    spa->spa_name,
			    (longlong_t)tx->tx_txg,
			    (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
			    (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
			    (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
			    (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
		} else {
			zfs_dbgmsg("doing scan sync for %s txg %llu; "
			    "bm=%llu/%llu/%llu/%llu",
			    spa->spa_name,
			    (longlong_t)tx->tx_txg,
			    (longlong_t)scnp->scn_bookmark.zb_objset,
			    (longlong_t)scnp->scn_bookmark.zb_object,
			    (longlong_t)scnp->scn_bookmark.zb_level,
			    (longlong_t)scnp->scn_bookmark.zb_blkid);
		}

		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
		    NULL, ZIO_FLAG_CANFAIL);

		/* Run the prefetcher on the sync taskq during the visit. */
		scn->scn_prefetch_stop = B_FALSE;
		prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
		    dsl_scan_prefetch_thread, scn, TQ_SLEEP);
		ASSERT(prefetch_tqid != TASKQID_INVALID);

		dsl_pool_config_enter(dp, FTAG);
		dsl_scan_visit(scn, tx);
		dsl_pool_config_exit(dp, FTAG);

		/* Tell the prefetcher to stop, then wait for it and the I/O. */
		mutex_enter(&dp->dp_spa->spa_scrub_lock);
		scn->scn_prefetch_stop = B_TRUE;
		cv_broadcast(&spa->spa_scrub_io_cv);
		mutex_exit(&dp->dp_spa->spa_scrub_lock);

		taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
		(void) zio_wait(scn->scn_zio_root);
		scn->scn_zio_root = NULL;

		zfs_dbgmsg("scan visited %llu blocks of %s in %llums "
		    "(%llu os's, %llu holes, %llu < mintxg, "
		    "%llu in ddt, %llu > maxtxg)",
		    (longlong_t)scn->scn_visited_this_txg,
		    spa->spa_name,
		    (longlong_t)NSEC2MSEC(gethrtime() -
		    scn->scn_sync_start_time),
		    (longlong_t)scn->scn_objsets_visited_this_txg,
		    (longlong_t)scn->scn_holes_this_txg,
		    (longlong_t)scn->scn_lt_min_this_txg,
		    (longlong_t)scn->scn_ddt_contained_this_txg,
		    (longlong_t)scn->scn_gt_max_this_txg);

		/* The visit ran to completion rather than suspending. */
		if (!scn->scn_suspending) {
			ASSERT0(avl_numnodes(&scn->scn_queue));
			scn->scn_done_txg = tx->tx_txg + 1;
			if (scn->scn_is_sorted) {
				scn->scn_checkpointing = B_TRUE;
				scn->scn_clearing = B_TRUE;
				scn->scn_issued_before_pass +=
				    spa->spa_scan_pass_issued;
				spa_scan_stat_init(spa);
			}
			zfs_dbgmsg("scan complete for %s txg %llu",
			    spa->spa_name,
			    (longlong_t)tx->tx_txg);
		}
	} else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) {
		ASSERT(scn->scn_clearing);

		/* need to issue scrubbing IOs from per-vdev queues */
		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
		    NULL, ZIO_FLAG_CANFAIL);
		scan_io_queues_run(scn);
		(void) zio_wait(scn->scn_zio_root);
		scn->scn_zio_root = NULL;

		/* calculate and dprintf the current memory usage */
		(void) dsl_scan_should_clear(scn);
		dsl_scan_update_stats(scn);

		zfs_dbgmsg("scan issued %llu blocks for %s (%llu segs) "
		    "in %llums (avg_block_size = %llu, avg_seg_size = %llu)",
		    (longlong_t)scn->scn_zios_this_txg,
		    spa->spa_name,
		    (longlong_t)scn->scn_segs_this_txg,
		    (longlong_t)NSEC2MSEC(gethrtime() -
		    scn->scn_sync_start_time),
		    (longlong_t)scn->scn_avg_zio_size_this_txg,
		    (longlong_t)scn->scn_avg_seg_size_this_txg);
	} else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
		/* Finished with everything. Mark the scrub as complete */
		zfs_dbgmsg("scan issuing complete txg %llu for %s",
		    (longlong_t)tx->tx_txg,
		    spa->spa_name);
		ASSERT3U(scn->scn_done_txg, !=, 0);
		ASSERT0(spa->spa_scrub_inflight);
		ASSERT0(scn->scn_queues_pending);
		dsl_scan_done(scn, B_TRUE, tx);
		sync_type = SYNC_MANDATORY;
	}

	dsl_scan_sync_state(scn, tx, sync_type);
}

static void
count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all)
{
	/*
	 * Don't count embedded bp's, since we already did the work of
	 * scanning these when we scanned the containing block.
	 */
	if (BP_IS_EMBEDDED(bp))
		return;

	/*
	 * Update the spa's stats on how many bytes we have issued.
	 * Sequential scrubs create a zio for each DVA of the bp. Each
	 * of these will include all DVAs for repair purposes, but the
	 * zio code will only try the first one unless there is an issue.
	 * Therefore, we should only count the first DVA for these IOs.
*/ atomic_add_64(&spa->spa_scan_pass_issued, all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); } static void count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all) { if (BP_IS_EMBEDDED(bp)) return; atomic_add_64(&scn->scn_phys.scn_skipped, all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); } static void count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) { /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. */ if (zab == NULL) return; for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; if (t & DMU_OT_NEWTYPE) t = DMU_OT_OTHER; zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; zb->zb_count++; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) zb->zb_ditto_2_of_2_samevdev++; break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal == 1) zb->zb_ditto_2_of_3_samevdev++; else if (equal == 3) zb->zb_ditto_3_of_3_samevdev++; break; } } } static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { avl_index_t idx; dsl_scan_t *scn = queue->q_scn; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); if (unlikely(avl_is_empty(&queue->q_sios_by_addr))) atomic_add_64(&scn->scn_queues_pending, 1); if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { /* block is already scheduled for reading */ sio_free(sio); return; } avl_insert(&queue->q_sios_by_addr, sio, idx); queue->q_sio_memused += SIO_GET_MUSED(sio); zfs_range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)); } /* * Given all the info we got 
from our metadata scanning process, we * construct a scan_io_t and insert it into the scan sorting queue. The * I/O must already be suitable for us to process. This is controlled * by dsl_scan_enqueue(). */ static void scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, int zio_flags, const zbookmark_phys_t *zb) { scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp)); ASSERT0(BP_IS_GANG(bp)); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); bp2sio(bp, sio, dva_i); sio->sio_flags = zio_flags; sio->sio_zb = *zb; queue->q_last_ext_addr = -1; scan_io_queue_insert_impl(queue, sio); } /* * Given a set of I/O parameters as discovered by the metadata traversal * process, attempts to place the I/O into the sorted queues (if allowed), * or immediately executes the I/O. */ static void dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, const zbookmark_phys_t *zb) { spa_t *spa = dp->dp_spa; ASSERT(!BP_IS_EMBEDDED(bp)); /* * Gang blocks are hard to issue sequentially, so we just issue them * here immediately instead of queuing them. 
*/ if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { scan_exec_io(dp, bp, zio_flags, zb, NULL); return; } for (int i = 0; i < BP_GET_NDVAS(bp); i++) { dva_t dva; vdev_t *vdev; dva = bp->blk_dva[i]; vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); ASSERT(vdev != NULL); mutex_enter(&vdev->vdev_scan_io_queue_lock); if (vdev->vdev_scan_io_queue == NULL) vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); ASSERT(dp->dp_scan != NULL); scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, i, zio_flags, zb); mutex_exit(&vdev->vdev_scan_io_queue_lock); } } static int dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_GET_BIRTH(bp); size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io = B_FALSE; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; count_block(dp->dp_blkstats, bp); if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) { count_block_skipped(scn, bp, B_TRUE); return (0); } /* Embedded BP's have phys_birth==0, so we reject them above. */ ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; needs_io = B_TRUE; } else { ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; needs_io = B_FALSE; } /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; for (int d = 0; d < BP_GET_NDVAS(bp); d++) { const dva_t *dva = &bp->blk_dva[d]; /* * Keep track of how much data we've examined so that * zpool(8) status can make useful progress reports. 
*/ uint64_t asize = DVA_GET_ASIZE(dva); scn->scn_phys.scn_examined += asize; spa->spa_scan_pass_exam += asize; /* if it's a resilver, this may not be in the target range */ if (!needs_io) needs_io = dsl_scan_need_resilver(spa, dva, psize, phys_birth); } if (needs_io && !zfs_no_scrub_io) { dsl_scan_enqueue(dp, bp, zio_flags, zb); } else { count_block_skipped(scn, bp, B_TRUE); } /* do not relocate this block */ return (0); } static void dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; dsl_scan_io_queue_t *queue = zio->io_private; abd_free(zio->io_abd); if (queue == NULL) { mutex_enter(&spa->spa_scrub_lock); ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } else { mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock); ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp)); queue->q_inflight_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&queue->q_zio_cv); mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock); } if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { if (dsl_errorscrubbing(spa->spa_dsl_pool) && !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) { atomic_inc_64(&spa->spa_dsl_pool->dp_scan ->errorscrub_phys.dep_errors); } else { atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys .scn_errors); } } } /* * Given a scanning zio's information, executes the zio. The zio need * not necessarily be only sortable, this function simply executes the * zio, no matter what it is. The optional queue argument allows the * caller to specify that they want per top level vdev IO rate limiting * instead of the legacy global limiting. 
*/
static void
scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
    const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
{
	spa_t *spa = dp->dp_spa;
	dsl_scan_t *scn = dp->dp_scan;
	size_t size = BP_GET_PSIZE(bp);
	abd_t *data = abd_alloc_for_io(size, B_FALSE);
	zio_t *parent;

	if (queue != NULL) {
		/* Per-vdev limiting: wait for room in this queue's budget. */
		kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;

		ASSERT3U(queue->q_maxinflight_bytes, >, 0);
		mutex_enter(q_lock);
		while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
			cv_wait(&queue->q_zio_cv, q_lock);
		queue->q_inflight_bytes += BP_GET_PSIZE(bp);
		parent = queue->q_zio;
		mutex_exit(q_lock);
	} else {
		/* Global limiting: wait for room in the pool-wide budget. */
		ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
		spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
		mutex_exit(&spa->spa_scrub_lock);
		parent = scn->scn_zio_root;
	}

	ASSERT(parent != NULL);

	/* Unqueued I/Os count all DVAs; queued ones only the first. */
	count_block_issued(spa, bp, queue == NULL);
	zio_nowait(zio_read(parent, spa, bp, data, size, dsl_scan_scrub_done,
	    queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
}

/*
 * This is the primary extent sorting algorithm. We balance two parameters:
 * 1) how many bytes of I/O are in an extent
 * 2) how well the extent is filled with I/O (as a fraction of its total size)
 * Since we allow extents to have gaps between their constituent I/Os, it's
 * possible to have a fairly large extent that contains the same amount of
 * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
 * The algorithm sorts based on a score calculated from the extent's size,
 * the relative fill volume (in %) and a "fill weight" parameter that controls
 * the split between whether we prefer larger extents or more well populated
 * extents:
 *
 * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
 *
 * Example:
 * 1) assume extsz = 64 MiB
 * 2) assume fill = 32 MiB (extent is half full)
 * 3) assume fill_weight = 3
 * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
 *    SCORE = 32M + (50 * 3 * 32M) / 100
 *    SCORE = 32M + (4800M / 100)
 *    SCORE = 32M + 48M
 *            ^     ^
 *            |     +--- final total relative fill-based score
 *            +--------- final total fill-based score
 *    SCORE = 80M
 *
 * As can be seen, at fill_weight=3, the algorithm is slightly biased towards
 * extents that are more completely filled (in a 3:2 ratio) vs just larger.
 * Note that as an optimization, we replace multiplication and division by
 * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
 *
 * Since we do not care if one extent is only a few percent better than
 * another, we compress the score into 6 bits via a binary logarithm (AKA
 * highbit64()) and store it in the high bits of the offset, which are
 * otherwise unused because of ashift. This allows us to reduce the
 * q_exts_by_size B-tree elements to only 64 bits and to compare them with a
 * single operation. It also makes scrubs more sequential and reduces the
 * chance that a minor change to an extent moves it within the B-tree.
*/ __attribute__((always_inline)) inline static int ext_size_compare(const void *x, const void *y) { const uint64_t *a = x, *b = y; return (TREE_CMP(*a, *b)); } ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t, ext_size_compare) static void ext_size_create(zfs_range_tree_t *rt, void *arg) { (void) rt; zfs_btree_t *size_tree = arg; zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf, sizeof (uint64_t)); } static void ext_size_destroy(zfs_range_tree_t *rt, void *arg) { (void) rt; zfs_btree_t *size_tree = arg; ASSERT0(zfs_btree_numnodes(size_tree)); zfs_btree_destroy(size_tree); } static uint64_t -ext_size_value(zfs_range_tree_t *rt, range_seg_gap_t *rsg) +ext_size_value(zfs_range_tree_t *rt, zfs_range_seg_gap_t *rsg) { (void) rt; uint64_t size = rsg->rs_end - rsg->rs_start; uint64_t score = rsg->rs_fill + ((((rsg->rs_fill << 7) / size) * fill_weight * rsg->rs_fill) >> 7); ASSERT3U(rt->rt_shift, >=, 8); return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start); } static void ext_size_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { zfs_btree_t *size_tree = arg; ASSERT3U(rt->rt_type, ==, ZFS_RANGE_SEG_GAP); - uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); + uint64_t v = ext_size_value(rt, (zfs_range_seg_gap_t *)rs); zfs_btree_add(size_tree, &v); } static void ext_size_remove(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { zfs_btree_t *size_tree = arg; ASSERT3U(rt->rt_type, ==, ZFS_RANGE_SEG_GAP); - uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); + uint64_t v = ext_size_value(rt, (zfs_range_seg_gap_t *)rs); zfs_btree_remove(size_tree, &v); } static void ext_size_vacate(zfs_range_tree_t *rt, void *arg) { zfs_btree_t *size_tree = arg; zfs_btree_clear(size_tree); zfs_btree_destroy(size_tree); ext_size_create(rt, arg); } static const zfs_range_tree_ops_t ext_size_ops = { .rtop_create = ext_size_create, .rtop_destroy = ext_size_destroy, .rtop_add = ext_size_add, .rtop_remove = ext_size_remove, .rtop_vacate 
= ext_size_vacate }; /* * Comparator for the q_sios_by_addr tree. Sorting is simply performed * based on LBA-order (from lowest to highest). */ static int sio_addr_compare(const void *x, const void *y) { const scan_io_t *a = x, *b = y; return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); } /* IO queues are created on demand when they are needed. */ static dsl_scan_io_queue_t * scan_io_queue_create(vdev_t *vd) { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); q->q_scn = scn; q->q_vd = vd; q->q_sio_memused = 0; q->q_last_ext_addr = -1; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); q->q_exts_by_addr = zfs_range_tree_create_gap(&ext_size_ops, ZFS_RANGE_SEG_GAP, &q->q_exts_by_size, 0, vd->vdev_ashift, zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); return (q); } /* * Destroys a scan queue and all segments and scan_io_t's contained in it. * No further execution of I/O occurs, anything pending in the queue is * simply freed without being executed. */ void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; scan_io_t *sio; void *cookie = NULL; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); if (!avl_is_empty(&queue->q_sios_by_addr)) atomic_add_64(&scn->scn_queues_pending, -1); while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != NULL) { ASSERT(zfs_range_tree_contains(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); queue->q_sio_memused -= SIO_GET_MUSED(sio); sio_free(sio); } ASSERT0(queue->q_sio_memused); zfs_range_tree_vacate(queue->q_exts_by_addr, NULL, queue); zfs_range_tree_destroy(queue->q_exts_by_addr); avl_destroy(&queue->q_sios_by_addr); cv_destroy(&queue->q_zio_cv); kmem_free(queue, sizeof (*queue)); } /* * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. 
This is
 * called on behalf of vdev_top_transfer when creating or destroying
 * a mirror vdev due to zpool attach/detach.
 */
void
dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
{
	/* Both queue locks are held for the hand-off: source first, then target. */
	mutex_enter(&svd->vdev_scan_io_queue_lock);
	mutex_enter(&tvd->vdev_scan_io_queue_lock);

	/* The target must not already own a queue. */
	VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
	tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
	svd->vdev_scan_io_queue = NULL;
	/* Re-point the queue's back-reference at its new owner, if any. */
	if (tvd->vdev_scan_io_queue != NULL)
		tvd->vdev_scan_io_queue->q_vd = tvd;

	/* Release in the reverse order of acquisition. */
	mutex_exit(&tvd->vdev_scan_io_queue_lock);
	mutex_exit(&svd->vdev_scan_io_queue_lock);
}

/*
 * Destroys the scan I/O queue of every child of the pool's root vdev
 * and clears each child's queue pointer. Each child's queue lock is held
 * across its own teardown.
 */
static void
scan_io_queues_destroy(dsl_scan_t *scn)
{
	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;

	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
		vdev_t *tvd = rvd->vdev_child[i];

		mutex_enter(&tvd->vdev_scan_io_queue_lock);
		if (tvd->vdev_scan_io_queue != NULL)
			dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
		tvd->vdev_scan_io_queue = NULL;
		mutex_exit(&tvd->vdev_scan_io_queue_lock);
	}
}

/*
 * Handles the freeing of a single DVA (index `dva_i') of `bp': if an I/O
 * for that DVA is still sitting in the owning top-level vdev's scan queue,
 * it is removed from the queue, its fill is subtracted from the containing
 * extent, and the block is counted as skipped. Does nothing when the vdev
 * has no scan queue or no matching queued I/O is found.
 */
static void
dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	dsl_scan_t *scn = dp->dp_scan;
	vdev_t *vdev;
	kmutex_t *q_lock;
	dsl_scan_io_queue_t *queue;
	scan_io_t *srch_sio, *sio;
	avl_index_t idx;
	uint64_t start, size;

	vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
	ASSERT(vdev != NULL);
	q_lock = &vdev->vdev_scan_io_queue_lock;
	/*
	 * NOTE(review): the queue pointer is sampled here, before q_lock is
	 * taken below -- confirm that callers rule out a concurrent
	 * transfer or destroy of the queue.
	 */
	queue = vdev->vdev_scan_io_queue;

	mutex_enter(q_lock);
	if (queue == NULL) {
		mutex_exit(q_lock);
		return;
	}

	/* Build a temporary search key describing this DVA. */
	srch_sio = sio_alloc(BP_GET_NDVAS(bp));
	bp2sio(bp, srch_sio, dva_i);
	start = SIO_GET_OFFSET(srch_sio);
	size = SIO_GET_ASIZE(srch_sio);

	/*
	 * We can find the zio in two states:
	 * 1) Cold, just sitting in the queue of zio's to be issued at
	 *	some point in the future. In this case, all we do is
	 *	remove the zio from the q_sios_by_addr tree, decrement
	 *	its data volume from the containing zfs_range_seg_t and
	 *	resort the q_exts_by_size tree to reflect that the
	 *	zfs_range_seg_t has lost some of its 'fill'. We don't shorten
	 *	the zfs_range_seg_t - this is usually rare enough not to be
	 *	worth the extra hassle of trying to keep track of precise
	 *	extent boundaries.
	 * 2) Hot, where the zio is currently in-flight in
	 *	dsl_scan_issue_ios. In this case, we can't simply
	 *	reach in and stop the in-flight zio's, so we instead
	 *	block the caller. Eventually, dsl_scan_issue_ios will
	 *	be done with issuing the zio's it gathered and will
	 *	signal us.
	 *
	 * NOTE(review): only case 1 is handled by the code below; when no
	 * queued sio matches, this function just drops the lock and returns
	 * without waiting. Presumably the blocking described in case 2
	 * happens elsewhere -- confirm.
	 */
	sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
	sio_free(srch_sio);

	if (sio != NULL) {
		blkptr_t tmpbp;

		/* Got it while it was cold in the queue */
		ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
		ASSERT3U(size, ==, SIO_GET_ASIZE(sio));
		avl_remove(&queue->q_sios_by_addr, sio);
		/* Removing the last sio makes this queue no longer pending. */
		if (avl_is_empty(&queue->q_sios_by_addr))
			atomic_add_64(&scn->scn_queues_pending, -1);
		queue->q_sio_memused -= SIO_GET_MUSED(sio);

		ASSERT(zfs_range_tree_contains(queue->q_exts_by_addr, start,
		    size));
		zfs_range_tree_remove_fill(queue->q_exts_by_addr, start, size);

		/* count the block as though we skipped it */
		sio2bp(sio, &tmpbp);
		count_block_skipped(scn, &tmpbp, B_FALSE);

		sio_free(sio);
	}
	mutex_exit(q_lock);
}

/*
 * Callback invoked when a zio_free() zio is executing. This needs to be
 * intercepted to prevent the zio from deallocating a particular portion
 * of disk space and it then getting reallocated and written to, while we
 * still have it queued up for processing.
 */
void
dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	dsl_scan_t *scn = dp->dp_scan;

	ASSERT(!BP_IS_EMBEDDED(bp));
	ASSERT(scn != NULL);
	/* Nothing can be queued unless a scan is running. */
	if (!dsl_scan_is_running(scn))
		return;

	/* Each DVA of the block is handled independently. */
	for (int i = 0; i < BP_GET_NDVAS(bp); i++)
		dsl_scan_freed_dva(spa, bp, i);
}

/*
 * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has
 * not started, start it. Otherwise, only restart if max txg in DTL range is
 * greater than the max txg in the current scan.
If the DTL max is less than * the scan max, then the vdev has not missed any new data since the resilver * started, so a restart is not needed. */ void dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd) { uint64_t min, max; if (!vdev_resilver_needed(vd, &min, &max)) return; if (!dsl_scan_resilvering(dp)) { spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); return; } if (max <= dp->dp_scan->scn_phys.scn_max_txg) return; /* restart is needed, check if it can be deferred */ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) vdev_defer_resilver(vd); else spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); } ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, U64, ZMOD_RW, "Max bytes in flight per leaf vdev for scrubs and resilvers"); ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, UINT, ZMOD_RW, "Min millisecs to scrub per txg"); ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, UINT, ZMOD_RW, "Min millisecs to obsolete per txg"); ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, UINT, ZMOD_RW, "Min millisecs to free per txg"); ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, UINT, ZMOD_RW, "Min millisecs to resilver per txg"); ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW, "Set to prevent scans from progressing"); ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW, "Set to disable scrub I/O"); ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW, "Set to disable scrub prefetching"); ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW, "Max number of blocks freed in one txg"); ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW, "Max number of dedup blocks freed in one txg"); ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW, "Enable processing of the free_bpobj"); ZFS_MODULE_PARAM(zfs, zfs_, scan_blkstats, INT, ZMOD_RW, "Enable block statistics calculation during scrub"); ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, UINT, ZMOD_RW, "Fraction of RAM for scan hard limit"); ZFS_MODULE_PARAM(zfs, zfs_, 
scan_issue_strategy, UINT, ZMOD_RW, "IO issuing strategy during scrubbing. 0 = default, 1 = LBA, 2 = size"); ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW, "Scrub using legacy non-sequential method"); ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW, "Scan progress on-disk checkpointing interval"); ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, U64, ZMOD_RW, "Max gap in bytes between sequential scrub / resilver I/Os"); ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, UINT, ZMOD_RW, "Fraction of hard limit used as soft limit"); ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW, "Tunable to attempt to reduce lock contention"); ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW, "Tunable to adjust bias towards more filled segments during scans"); ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, "Tunable to report resilver performance over the last N txgs"); ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); ZFS_MODULE_PARAM(zfs, zfs_, resilver_defer_percent, UINT, ZMOD_RW, "Issued IO percent complete after which resilvers are deferred"); ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW, "Error blocks to be scrubbed in one txg"); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 10546798824a..e3c9afbd6e41 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1,6302 +1,6302 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) /* * Metaslab granularity, in bytes. This is roughly similar to what would be * referred to as the "stripe size" in traditional RAID arrays. In normal * operation, we will try to write this amount of data to each disk before * moving on to the next top-level vdev. */ static uint64_t metaslab_aliquot = 1024 * 1024; /* * For testing, make some blocks above a certain size be gang blocks. */ uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* * Of blocks of size >= metaslab_force_ganging, actually gang them this often. */ uint_t metaslab_force_ganging_pct = 3; /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered * around the disk. So a sane default for the space map block size * is 8~16K. 
*/
int zfs_metaslab_sm_blksz_no_log = (1 << 14);	/* 16K */

/*
 * When the log space map feature is enabled, we accumulate a lot of
 * changes per metaslab that are flushed once in a while so we benefit
 * from a bigger block size like 128K for the metaslab space maps.
 */
int zfs_metaslab_sm_blksz_with_log = (1 << 17);	/* 128K */

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
uint_t zfs_condense_pct = 200;

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a space map unless the
 * uncondensed size consumes more than zfs_metaslab_condense_block_threshold
 * blocks.
 */
static const int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
static uint_t zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or
 * equal to zfs_mg_fragmentation_threshold. If a metaslab group
 * exceeds this threshold then it will be skipped unless all metaslab
 * groups within the metaslab class have also crossed this threshold.
 *
 * This tunable was introduced to avoid edge cases where we continue
 * allocating from very fragmented disks in our pool while other, less
 * fragmented disks exist. On the other hand, if all disks in the
 * pool are uniformly approaching the threshold, the threshold can
 * be a speed bump in performance, where we keep switching the disks
 * that we allocate from (e.g. we allocate some segments from disk A,
 * pushing it past the threshold, while freeing segments from disk
 * B brings its fragmentation below the threshold).
 *
 * Empirically, we've seen that our vdev selection for allocations is
 * good enough that fragmentation increases uniformly across all vdevs
 * the majority of the time. Thus we set the threshold percentage high
 * enough to avoid hitting the speed bump on pools that are being pushed
 * to the edge.
 */
static uint_t zfs_mg_fragmentation_threshold = 95;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status, allowing better metaslabs to be selected.
 */
static uint_t zfs_metaslab_fragmentation_threshold = 77;

/*
 * When set will load all metaslabs when pool is first opened.
*/ int metaslab_debug_load = B_FALSE; /* * When set will prevent metaslabs from being unloaded. */ static int metaslab_debug_unload = B_FALSE; /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ uint_t metaslab_df_free_pct = 4; /* * Maximum distance to search forward from the last offset. Without this * limit, fragmented pools can see >100,000 iterations and * metaslab_block_picker() becomes the performance limiting factor on * high-performance storage. * * With the default setting of 16MB, we typically see less than 500 * iterations, even with very fragmented, ashift=9 pools. The maximum number * of iterations possible is: * metaslab_df_max_search / (2 * (1<60KB (but fewer segments in this * bucket, and therefore a lower weight). 
*/
static uint_t zfs_metaslab_find_max_tries = 100;

/* Forward declarations for file-local helpers defined later in this file. */
static uint64_t metaslab_weight(metaslab_t *, boolean_t);
static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
static unsigned int metaslab_idx_func(multilist_t *, void *);
static void metaslab_evict(metaslab_t *, uint64_t);
static void metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    void *arg);

/* kmem cache for metaslab_alloc_trace_t entries; see metaslab_stat_init(). */
kmem_cache_t *metaslab_alloc_trace_cache;

/* Named kstat counters published as "zfs" / "metaslab_stats". */
typedef struct metaslab_stats {
	kstat_named_t metaslabstat_trace_over_limit;
	kstat_named_t metaslabstat_reload_tree;
	kstat_named_t metaslabstat_too_many_tries;
	kstat_named_t metaslabstat_try_hard;
} metaslab_stats_t;

/* Initializers are listed in the same order as the struct fields above. */
static metaslab_stats_t metaslab_stats = {
	{ "trace_over_limit",		KSTAT_DATA_UINT64 },
	{ "reload_tree",		KSTAT_DATA_UINT64 },
	{ "too_many_tries",		KSTAT_DATA_UINT64 },
	{ "try_hard",			KSTAT_DATA_UINT64 },
};

/*
 * Atomically increments one counter of metaslab_stats.
 * NOTE(review): the expansion already ends in a semicolon, so
 * "METASLABSTAT_BUMP(x);" yields an extra empty statement -- keep that in
 * mind when using it in an unbraced if/else.
 */
#define	METASLABSTAT_BUMP(stat) \
	atomic_inc_64(&metaslab_stats.stat.value.ui64);

/* Handle of the installed kstat, or NULL when none is installed. */
static kstat_t *metaslab_ksp;

/*
 * Creates the allocation-trace kmem cache and the "metaslab_stats" kstat.
 * A failed kstat_create() is tolerated: the kstat is simply not installed.
 */
void
metaslab_stat_init(void)
{
	ASSERT(metaslab_alloc_trace_cache == NULL);
	metaslab_alloc_trace_cache = kmem_cache_create(
	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);
	/* The entry count is derived from the struct size. */
	metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
	    "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (metaslab_ksp != NULL) {
		metaslab_ksp->ks_data = &metaslab_stats;
		kstat_install(metaslab_ksp);
	}
}

/*
 * Undoes metaslab_stat_init(): deletes the kstat if one was installed and
 * destroys the allocation-trace cache, resetting both globals to NULL.
 */
void
metaslab_stat_fini(void)
{
	if (metaslab_ksp != NULL) {
		kstat_delete(metaslab_ksp);
		metaslab_ksp = NULL;
	}

	kmem_cache_destroy(metaslab_alloc_trace_cache);
	metaslab_alloc_trace_cache = NULL;
}

/*
 *
========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops) { metaslab_class_t *mc; mc = kmem_zalloc(offsetof(metaslab_class_t, mc_allocator[spa->spa_alloc_count]), KM_SLEEP); mc->mc_spa = spa; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t), offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; mca->mca_rotor = NULL; zfs_refcount_create_tracked(&mca->mca_alloc_slots); } return (mc); } void metaslab_class_destroy(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; ASSERT(mca->mca_rotor == NULL); zfs_refcount_destroy(&mca->mca_alloc_slots); } mutex_destroy(&mc->mc_lock); multilist_destroy(&mc->mc_metaslab_txg_list); kmem_free(mc, offsetof(metaslab_class_t, mc_allocator[spa->spa_alloc_count])); } int metaslab_class_validate(metaslab_class_t *mc) { metaslab_group_t *mg; vdev_t *vd; /* * Must hold one of the spa_config locks. 
*/ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); if ((mg = mc->mc_allocator[0].mca_rotor) == NULL) return (0); do { vd = mg->mg_vd; ASSERT(vd->vdev_mg != NULL); ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor); return (0); } static void metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { atomic_add_64(&mc->mc_alloc, alloc_delta); atomic_add_64(&mc->mc_deferred, defer_delta); atomic_add_64(&mc->mc_space, space_delta); atomic_add_64(&mc->mc_dspace, dspace_delta); } uint64_t metaslab_class_get_alloc(metaslab_class_t *mc) { return (mc->mc_alloc); } uint64_t metaslab_class_get_deferred(metaslab_class_t *mc) { return (mc->mc_deferred); } uint64_t metaslab_class_get_space(metaslab_class_t *mc) { return (mc->mc_space); } uint64_t metaslab_class_get_dspace(metaslab_class_t *mc) { return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } void metaslab_class_histogram_verify(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t *mc_hist; int i; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; - mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + mc_hist = kmem_zalloc(sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); mutex_enter(&mc->mc_lock); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = vdev_get_mg(tvd, mc); /* * Skip any holes, uninitialized top-levels, or * vdevs that are not in this metalab class. 
*/ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); } mutex_exit(&mc->mc_lock); - kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); + kmem_free(mc_hist, sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE); } /* * Calculate the metaslab class's fragmentation metric. The metric * is weighted based on the space contribution of each metaslab group. * The return value will be a number between 0 and 100 (inclusive), or * ZFS_FRAG_INVALID if the metric has not been set. See comment above the * zfs_frag_table for more information about the metric. */ uint64_t metaslab_class_fragmentation(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t fragmentation = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; /* * Skip any holes, uninitialized top-levels, * or vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * If a metaslab group does not contain a fragmentation * metric then just bail out. */ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (ZFS_FRAG_INVALID); } /* * Determine how much this metaslab_group is contributing * to the overall pool fragmentation metric. 
*/ fragmentation += mg->mg_fragmentation * metaslab_group_get_space(mg); } fragmentation /= metaslab_class_get_space(mc); ASSERT3U(fragmentation, <=, 100); spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (fragmentation); } /* * Calculate the amount of expandable space that is available in * this metaslab class. If a device is expanded then its expandable * space will be the amount of allocatable space that is currently not * part of this metaslab class. */ uint64_t metaslab_class_expandable_space(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t space = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * Calculate if we have enough space to add additional * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. */ space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, 1ULL << tvd->vdev_ms_shift, uint64_t); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); } void metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; hrtime_t now = gethrtime(); for (int i = 0; i < multilist_get_num_sublists(ml); i++) { multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { mutex_enter(&msp->ms_lock); /* * If the metaslab has been removed from the list * (which could happen if we were at the memory limit * and it was evicted during this loop), then we can't * proceed and we should restart the sublist. 
*/ if (!multilist_link_active(&msp->ms_class_txg_node)) { mutex_exit(&msp->ms_lock); i--; break; } mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > msp->ms_selected_txg + metaslab_unload_delay && now > msp->ms_selected_time + MSEC2NSEC(metaslab_unload_delay_ms) && (msp->ms_allocator == -1 || !metaslab_preload_enabled)) { metaslab_evict(msp, txg); } else { /* * Once we've hit a metaslab selected too * recently to evict, we're done evicting for * now. */ mutex_exit(&msp->ms_lock); break; } mutex_exit(&msp->ms_lock); msp = next_msp; } } } static int metaslab_compare(const void *x1, const void *x2) { const metaslab_t *m1 = (const metaslab_t *)x1; const metaslab_t *m2 = (const metaslab_t *)x2; int sort1 = 0; int sort2 = 0; if (m1->ms_allocator != -1 && m1->ms_primary) sort1 = 1; else if (m1->ms_allocator != -1 && !m1->ms_primary) sort1 = 2; if (m2->ms_allocator != -1 && m2->ms_primary) sort2 = 1; else if (m2->ms_allocator != -1 && !m2->ms_primary) sort2 = 2; /* * Sort inactive metaslabs first, then primaries, then secondaries. When * selecting a metaslab to allocate from, an allocator first tries its * primary, then secondary active metaslab. If it doesn't have active * metaslabs, or can't allocate from them, it searches for an inactive * metaslab to activate. If it can't find a suitable one, it will steal * a primary or secondary metaslab from another allocator. */ if (sort1 < sort2) return (-1); if (sort1 > sort2) return (1); int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); return (TREE_CMP(m1->ms_start, m2->ms_start)); } /* * ========================================================================== * Metaslab groups * ========================================================================== */ /* * Update the allocatable flag and the metaslab group's capacity. 
* The allocatable flag is set to true if the capacity is below
 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
 * transitions from allocatable to non-allocatable or vice versa then the
 * metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);
	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
	    SCL_ALLOC);

	mutex_enter(&mg->mg_lock);
	/* Remember the previous state so transitions can be detected below. */
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	/*
	 * Free space as a percentage of total space; the "+ 1" keeps the
	 * divisor non-zero when vs_space is 0.
	 */
	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	/* Lock order: mg_lock first, then the class lock. */
	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations.  We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it is activated,
	 * has plenty of free space and is not heavily fragmented. We only
	 * take fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 *
	 * Note that the summary in this function's header comment states
	 * the free-space and fragmentation conditions the other way around;
	 * the expression below is authoritative: free capacity must be
	 * strictly above zfs_mg_noalloc_threshold and fragmentation must be
	 * at or below zfs_mg_fragmentation_threshold.
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}

/*
 * Comparator ordering metaslabs by ms_unflushed_txg, then by the vdev id of
 * their group's vdev, and finally by ms_id. Returns the TREE_CMP() result of
 * the first key that differs.
 */
int
metaslab_sort_by_flushed(const void *va, const void *vb)
{
	const metaslab_t *a = va;
	const metaslab_t *b = vb;

	int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
	if (likely(cmp))
		return (cmp);

	/* Same unflushed txg: fall back to the owning vdev's id. */
	uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
	uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
	cmp = TREE_CMP(a_vdev_id, b_vdev_id);
	if (cmp)
		return (cmp);

	return (TREE_CMP(a->ms_id, b->ms_id));
}

/*
 * Allocates and initializes a metaslab group for vdev `vd' in class `mc'.
 * The allocation is zeroed and sized to hold `allocators' trailing
 * mg_allocator[] entries. The group starts out with an activation count of
 * 0, not initialized, and flagged as having no free space.
 */
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
	metaslab_group_t *mg;

	/* Size covers the struct up to mg_allocator[allocators]. */
	mg = kmem_zalloc(offsetof(metaslab_group_t,
	    mg_allocator[allocators]), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	mg->mg_allocators = allocators;

	for (int i = 0; i < allocators; i++) {
		metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
		zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
	}

	return (mg);
}

/*
 * Tears down a metaslab group created by metaslab_group_create(). The group
 * must already be unlinked (mg_prev and mg_next NULL) and must not be
 * active.
 */
void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	mutex_destroy(&mg->mg_ms_disabled_lock);
	cv_destroy(&mg->mg_ms_disabled_cv);

	for (int i = 0; i < mg->mg_allocators; i++) {
		metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
		zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
	}
	/* The free size must mirror the allocation size used at creation. */
	kmem_free(mg, offsetof(metaslab_group_t,
	    mg_allocator[mg->mg_allocators]));
}

/*
 * Bumps the group's activation count and, once it becomes positive, links
 * the group into its class's circular list of groups and points every
 * allocator's rotor into that list.
 */
void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	spa_t *spa = mc->mc_spa;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);

	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	/* A count that is still <= 0 after the increment stays inactive. */
	if (++mg->mg_activation_count <= 0)
		return;

	/* Scale the aliquot by (ndisks - nparity), but by at least 1. */
	mg->mg_aliquot = metaslab_aliquot * MAX(1,
	    vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd));
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
		/* First group in the class: a one-element circular list. */
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		/* Insert right after allocator 0's current rotor. */
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	/*
	 * Allocator i's rotor is set to the i-th group following (and
	 * including) the newly activated one, wrapping around the list.
	 */
	for (int i = 0; i < spa->spa_alloc_count; i++) {
		mc->mc_allocator[i].mca_rotor = mg;
		mg = mg->mg_next;
	}
}

/*
 * Passivate a metaslab group and remove it from the
allocation rotor. * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating * a metaslab group. This function will momentarily drop spa_config_locks * that are lower than the SCL_ALLOC lock (see comment below). */ void metaslab_group_passivate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { /* * The count did not drop to exactly zero, so the group * must not be linked into any rotor; there is nothing to * unlink. */ for (int i = 0; i < spa->spa_alloc_count; i++) ASSERT(mc->mc_allocator[i].mca_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count < 0); return; } /* * The spa_config_lock is an array of rwlocks, ordered as * follows (from highest to lowest): * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > * SCL_ZIO > SCL_FREE > SCL_VDEV * (For more information about the spa_config_lock see spa_misc.c) * The higher the lock, the broader its coverage. When we passivate * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO * config locks. However, the metaslab group's taskq might be trying * to preload metaslabs so we must drop the SCL_ZIO lock and any * lower locks to allow the I/O to complete. At a minimum, * we continue to hold the SCL_ALLOC lock, which prevents any future * allocations from taking place and any changes to the vdev tree. 
*/ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); /* Passivate each allocator's active primary and secondary metaslab. */ for (int i = 0; i < mg->mg_allocators; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; metaslab_t *msp = mga->mga_primary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } msp = mga->mga_secondary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } } /* * Unlink mg from the circular list. If it was the only member, * any rotor that pointed at it becomes NULL. */ mgprev = mg->mg_prev; mgnext = mg->mg_next; if (mg == mgnext) { mgnext = NULL; } else { mgprev->mg_next = mgnext; mgnext->mg_prev = mgprev; } for (int i = 0; i < spa->spa_alloc_count; i++) { if (mc->mc_allocator[i].mca_rotor == mg) mc->mc_allocator[i].mca_rotor = mgnext; } mg->mg_prev = NULL; mg->mg_next = NULL; } /* * A group counts as initialized once its vdev reports non-zero space and * the group has a positive activation count. */ boolean_t metaslab_group_initialized(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; vdev_stat_t *vs = &vd->vdev_stat; return (vs->vs_space != 0 && mg->mg_activation_count > 0); } /* * Return the number of metaslabs currently in the group's tree multiplied * by the metaslab size (1 << vdev_ms_shift). */ uint64_t metaslab_group_get_space(metaslab_group_t *mg) { /* * Note that the number of nodes in mg_metaslab_tree may be one less * than vdev_ms_count, due to the embedded log metaslab. 
*/ mutex_enter(&mg->mg_lock); uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); mutex_exit(&mg->mg_lock); return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); } /* * Debug-only consistency check, active when ZFS_DEBUG_HISTOGRAM_VERIFY is * set in zfs_flags: rebuild the group histogram from the space map * histograms of every metaslab in the group (shifted by the vdev ashift) * and verify that it equals mg_histogram. */ void metaslab_group_histogram_verify(metaslab_group_t *mg) { uint64_t *mg_hist; avl_tree_t *t = &mg->mg_metaslab_tree; uint64_t ashift = mg->mg_vd->vdev_ashift; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; - mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + mg_hist = kmem_zalloc(sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); - ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, + ASSERT3U(ZFS_RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); mutex_enter(&mg->mg_lock); for (metaslab_t *msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { VERIFY3P(msp->ms_group, ==, mg); /* skip metaslabs that have no space map */ if (msp->ms_sm == NULL) continue; for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { mg_hist[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } } - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) + for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); mutex_exit(&mg->mg_lock); - kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); + kmem_free(mg_hist, sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE); } /* * Add the metaslab's space map histogram to both the group and the class * histograms. The caller must hold ms_lock; a metaslab without a space * map contributes nothing. */ static void metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } /* * Subtract the metaslab's space map histogram from both the group and the * class histograms; the inverse of metaslab_group_histogram_add(). The * caller must hold ms_lock. */ void metaslab_group_histogram_remove(metaslab_group_t *mg, 
metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { ASSERT3U(mg->mg_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } /* * Attach a metaslab to a group: record the group, reset ms_weight to 0, * insert it into the group's tree and then account for its histogram. */ static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { ASSERT(msp->ms_group == NULL); mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); mutex_enter(&msp->ms_lock); metaslab_group_histogram_add(mg, msp); mutex_exit(&msp->ms_lock); } /* * Detach a metaslab from its group: drop its histogram contribution, * remove it from the group's tree and, if linked, from the class's * mc_metaslab_txg_list, then clear ms_group. */ static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { mutex_enter(&msp->ms_lock); metaslab_group_histogram_remove(mg, msp); mutex_exit(&msp->ms_lock); mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); multilist_sublist_unlock(mls); msp->ms_group = NULL; mutex_exit(&mg->mg_lock); } /* * Change a metaslab's weight with both ms_lock and mg_lock already held. * The metaslab is removed from the tree and re-added because ms_weight is * one of the keys used by metaslab_compare(). */ static void metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&mg->mg_lock)); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_weight = weight; avl_add(&mg->mg_metaslab_tree, msp); } /* * Locking wrapper around metaslab_group_sort_impl(): takes mg_lock for the * duration of the re-sort. The caller must hold ms_lock. */ static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, 
uint64_t weight) { /* * Although in principle the weight can be any value, in * practice we do not use values in the range [1, 511]. */ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } /* * Calculate the fragmentation for a given metaslab group. We can use * a simple average here since all metaslabs within the group must have * the same size. The return value will be a value between 0 and 100 * (inclusive), or ZFS_FRAG_INVALID unless more than half of the vdev's * metaslabs both belong to this group and have a valid fragmentation * metric. */ uint64_t metaslab_group_fragmentation(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; uint64_t fragmentation = 0; uint64_t valid_ms = 0; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp->ms_fragmentation == ZFS_FRAG_INVALID) continue; if (msp->ms_group != mg) continue; valid_ms++; fragmentation += msp->ms_fragmentation; } if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) return (ZFS_FRAG_INVALID); fragmentation /= valid_ms; ASSERT3U(fragmentation, <=, 100); return (fragmentation); } /* * Determine if a given metaslab group should skip allocations. A metaslab * group should avoid allocations if its free capacity is less than the * zfs_mg_noalloc_threshold or its fragmentation metric is greater than * zfs_mg_fragmentation_threshold and there is at least one metaslab group * that can still handle allocations. If the allocation throttle is enabled * then we skip allocations to devices that have reached their maximum * allocation queue depth unless the selected metaslab group is the only * eligible group remaining. 
*/ /* * Arguments as used below: rotor is the group at which the scan of the * following groups stops, flags is tested for METASLAB_DONT_THROTTLE, * psize is compared against SPA_MINBLOCKSIZE, allocator indexes * mg_allocator[], and d scales the queue depth limit by (4 + d) / 4. * Returns B_TRUE if an allocation may be attempted from mg. */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, int flags, uint64_t psize, int allocator, int d) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; /* * We can only consider skipping this metaslab group if it's * in the normal, special or dedup metaslab class and there are * other metaslab groups to select from. Otherwise, we always * consider it eligible for allocations. */ if ((mc != spa_normal_class(spa) && mc != spa_special_class(spa) && mc != spa_dedup_class(spa)) || mc->mc_groups <= 1) return (B_TRUE); /* * If the metaslab group's mg_allocatable flag is set (see comments * in metaslab_group_alloc_update() for more information) and * the allocation throttle is disabled then allow allocations to this * device. However, if the allocation throttle is enabled then * check if we have reached our allocation limit (mga_alloc_queue_depth) * to determine if we should allow allocations to this metaslab group. * If all metaslab groups are no longer considered allocatable * (mc_alloc_groups == 0) or we're trying to allocate the smallest * gang block size then we allow allocations on this metaslab group * regardless of the mg_allocatable or throttle settings. */ if (mg->mg_allocatable) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; int64_t qdepth; uint64_t qmax = mga->mga_cur_max_alloc_queue_depth; if (!mc->mc_alloc_throttle_enabled) return (B_TRUE); /* * If this metaslab group does not have any free space, then * there is no point in looking further. */ if (mg->mg_no_free_space) return (B_FALSE); /* * Some allocations (e.g., those coming from device removal * where the allocations are not even counted in the * metaslab allocation queues) are allowed to bypass * the throttle. */ if (flags & METASLAB_DONT_THROTTLE) return (B_TRUE); /* * Relax allocation throttling for ditto blocks. Due to * random imbalances in allocation it tends to push copies * to one vdev, that looks a bit better at the moment. 
*/ qmax = qmax * (4 + d) / 4; qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth); /* * If this metaslab group is below its qmax or it's * the only allocatable metaslab group, then attempt * to allocate from it. * * NOTE(review): qdepth is int64_t while qmax is uint64_t, * so this comparison is carried out in unsigned arithmetic. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) return (B_TRUE); ASSERT3U(mc->mc_alloc_groups, >, 1); /* * Since this metaslab group is at or over its qmax, we * need to determine if there are metaslab groups after this * one that might be able to handle this allocation. This is * racy since we can't hold the locks for all metaslab * groups at the same time when we make this check. */ for (metaslab_group_t *mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { metaslab_group_allocator_t *mgap = &mgp->mg_allocator[allocator]; qmax = mgap->mga_cur_max_alloc_queue_depth; qmax = qmax * (4 + d) / 4; qdepth = zfs_refcount_count(&mgap->mga_alloc_queue_depth); /* * If there is another metaslab group that * might be able to handle the allocation, then * we return false so that we skip this group. */ if (qdepth < qmax && !mgp->mg_no_free_space) return (B_FALSE); } /* * We didn't find another group to handle the allocation * so we can't skip this metaslab group even though * we are at or over our qmax. */ return (B_TRUE); } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { return (B_TRUE); } return (B_FALSE); } /* * ========================================================================== * Range tree callbacks * ========================================================================== */ /* * Comparison function for the private size-ordered tree using 32-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. 
*/ __attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { - const range_seg32_t *r1 = x1; - const range_seg32_t *r2 = x2; + const zfs_range_seg32_t *r1 = x1; + const zfs_range_seg32_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); /* Branch-free tie-break: when sizes are equal, compare rs_start. */ return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Comparison function for the private size-ordered tree using 64-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ __attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { - const range_seg64_t *r1 = x1; - const range_seg64_t *r2 = x2; + const zfs_range_seg64_t *r1 = x1; + const zfs_range_seg64_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); /* Branch-free tie-break: when sizes are equal, compare rs_start. */ return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Argument passed to the metaslab_rt_* callbacks: the size-sorted B-tree * and the shift of the smallest segment size that is kept in it (see the * check in metaslab_rt_add() and metaslab_rt_remove()). */ typedef struct metaslab_rt_arg { zfs_btree_t *mra_bt; uint32_t mra_floor_shift; } metaslab_rt_arg_t; /* Walker argument bundle used by metaslab_size_sorted_add(). */ struct mssa_arg { zfs_range_tree_t *rt; metaslab_rt_arg_t *mra; }; /* * Range tree walk callback: build a segment for [start, start + size) and * hand it to metaslab_rt_add() so it lands in the size-sorted tree. */ static void metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) { struct mssa_arg *mssap = arg; zfs_range_tree_t *rt = mssap->rt; metaslab_rt_arg_t *mrap = mssap->mra; - range_seg_max_t seg = {0}; + zfs_range_seg_max_t seg = {0}; zfs_rs_set_start(&seg, rt, start); zfs_rs_set_end(&seg, rt, start + size); metaslab_rt_add(rt, &seg, mrap); } /* * Populate an empty size-sorted tree with every segment of rt. The floor * shift is reset to 0 first so that no segment is filtered out by size. */ static void metaslab_size_tree_full_load(zfs_range_tree_t *rt) { metaslab_rt_arg_t *mrap = rt->rt_arg; METASLABSTAT_BUMP(metaslabstat_reload_tree); ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); mrap->mra_floor_shift = 0; struct mssa_arg arg = {0}; arg.rt = rt; arg.mra = mrap; zfs_range_tree_walk(rt, metaslab_size_sorted_add, &arg); } ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, - range_seg32_t, 
metaslab_rangesize32_compare) + zfs_range_seg32_t, metaslab_rangesize32_compare) ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, - range_seg64_t, metaslab_rangesize64_compare) + zfs_range_seg64_t, metaslab_rangesize64_compare) /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered zfs_range_tree_t and an array of * uint64_t's. */ static void metaslab_rt_create(zfs_range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; size_t size; int (*compare) (const void *, const void *); bt_find_in_buf_f bt_find; /* Pick element size, comparator and finder to match the segment type. */ switch (rt->rt_type) { case ZFS_RANGE_SEG32: - size = sizeof (range_seg32_t); + size = sizeof (zfs_range_seg32_t); compare = metaslab_rangesize32_compare; bt_find = metaslab_rt_find_rangesize32_in_buf; break; case ZFS_RANGE_SEG64: - size = sizeof (range_seg64_t); + size = sizeof (zfs_range_seg64_t); compare = metaslab_rangesize64_compare; bt_find = metaslab_rt_find_rangesize64_in_buf; break; default: panic("Invalid range seg type %d", rt->rt_type); } zfs_btree_create(size_tree, compare, bt_find, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } /* * Destroy the size-sorted tree and free the callback argument itself. */ static void metaslab_rt_destroy(zfs_range_tree_t *rt, void *arg) { (void) rt; metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; zfs_btree_destroy(size_tree); kmem_free(mrap, sizeof (*mrap)); } /* * Mirror a newly added range tree segment into the size-sorted tree, * unless it is smaller than 1 << mra_floor_shift. */ static void metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_add(size_tree, rs); } /* * Remove a segment from the size-sorted tree. Segments below the floor * size are skipped because metaslab_rt_add() never inserted them. */ static void metaslab_rt_remove(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_remove(size_tree, rs); } /* * Empty the size-sorted tree by clearing and destroying it, then recreate * it through metaslab_rt_create(), which also restores the default floor * shift. */ static void 
metaslab_rt_vacate(zfs_range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; zfs_btree_clear(size_tree); zfs_btree_destroy(size_tree); metaslab_rt_create(rt, arg); } /* Range tree callback table wiring up the size-sorted tree maintenance. */ static const zfs_range_tree_ops_t metaslab_rt_ops = { .rtop_create = metaslab_rt_create, .rtop_destroy = metaslab_rt_destroy, .rtop_add = metaslab_rt_add, .rtop_remove = metaslab_rt_remove, .rtop_vacate = metaslab_rt_vacate }; /* * ========================================================================== * Common allocator routines * ========================================================================== */ /* * Return the maximum contiguous segment within the metaslab. */ uint64_t metaslab_largest_allocatable(metaslab_t *msp) { zfs_btree_t *t = &msp->ms_allocatable_by_size; zfs_range_seg_t *rs; /* * NOTE(review): t is the address of a field embedded in *msp, so * this check is not expected to ever be true for a valid msp. */ if (t == NULL) return (0); /* An empty size tree may just be unloaded; rebuild it before looking. */ if (zfs_btree_numnodes(t) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); rs = zfs_btree_last(t, NULL); if (rs == NULL) return (0); return (zfs_rs_get_end(rs, msp->ms_allocatable) - zfs_rs_get_start(rs, msp->ms_allocatable)); } /* * Return the maximum contiguous segment within the unflushed frees of this * metaslab. */ static uint64_t metaslab_largest_unflushed_free(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_unflushed_frees == NULL) return (0); if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) metaslab_size_tree_full_load(msp->ms_unflushed_frees); zfs_range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, NULL); if (rs == NULL) return (0); /* * When a range is freed from the metaslab, that range is added to * both the unflushed frees and the deferred frees. While the block * will eventually be usable, if the metaslab were loaded the range * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE * txgs had passed. 
As a result, when attempting to estimate an upper * bound for the largest currently-usable free segment in the * metaslab, we need to not consider any ranges currently in the defer * trees. This algorithm approximates the largest available chunk in * the largest range in the unflushed_frees tree by taking the first * chunk. While this may be a poor estimate, it should only remain so * briefly and should eventually self-correct as frees are no longer * deferred. Similar logic applies to the ms_freed tree. See * metaslab_load() for more details. * * There are two primary sources of inaccuracy in this estimate. Both * are tolerated for performance reasons. The first source is that we * only check the largest segment for overlaps. Smaller segments may * have more favorable overlaps with the other trees, resulting in * larger usable chunks. Second, we only look at the first chunk in * the largest segment; there may be other usable chunks in the * largest segment, but we ignore them. */ uint64_t rstart = zfs_rs_get_start(rs, msp->ms_unflushed_frees); uint64_t rsize = zfs_rs_get_end(rs, msp->ms_unflushed_frees) - rstart; /* Trim the candidate at the first overlap with each defer tree. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { uint64_t start = 0; uint64_t size = 0; boolean_t found = zfs_range_tree_find_in(msp->ms_defer[t], rstart, rsize, &start, &size); if (found) { if (rstart == start) return (0); rsize = start - rstart; } } /* Apply the same trimming against the ms_freed tree. */ uint64_t start = 0; uint64_t size = 0; boolean_t found = zfs_range_tree_find_in(msp->ms_freed, rstart, rsize, &start, &size); if (found) rsize = start - rstart; return (rsize); } /* * Search tree t for the segment [start, start + size). If there is no * match, fall back to zfs_btree_next() from the position reported by the * failed lookup (presumably the next larger element, if any). */ static zfs_range_seg_t * metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start, uint64_t size, zfs_btree_index_t *where) { zfs_range_seg_t *rs; - range_seg_max_t rsearch; + zfs_range_seg_max_t rsearch; zfs_rs_set_start(&rsearch, rt, start); zfs_rs_set_end(&rsearch, rt, start + size); rs = zfs_btree_find(t, &rsearch, where); if (rs == NULL) { rs = zfs_btree_next(t, where, where); } return (rs); } /* * This is a helper 
function that can be used by the allocator to find a * suitable block to allocate. This will search the specified B-tree looking * for a block that matches the specified criteria. */ static uint64_t metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size, uint64_t max_search) { /* A cursor of 0 means: begin at the start of the range tree. */ if (*cursor == 0) *cursor = rt->rt_start; zfs_btree_t *bt = &rt->rt_root; zfs_btree_index_t where; zfs_range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); uint64_t first_found; int count_searched = 0; /* first_found is only read below while rs != NULL. */ if (rs != NULL) first_found = zfs_rs_get_start(rs, rt); /* * Walk forward until a segment fits. The walk continues while we * are within max_search bytes of the first segment found or have * examined fewer than metaslab_min_search_count segments. */ while (rs != NULL && (zfs_rs_get_start(rs, rt) - first_found <= max_search || count_searched < metaslab_min_search_count)) { uint64_t offset = zfs_rs_get_start(rs, rt); if (offset + size <= zfs_rs_get_end(rs, rt)) { *cursor = offset + size; return (offset); } rs = zfs_btree_next(bt, &where, &where); count_searched++; } /* Nothing suitable found: reset the cursor and report failure. */ *cursor = 0; return (-1ULL); } static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size); static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size); static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size); metaslab_ops_t *metaslab_allocator(spa_t *spa); /* Table of selectable allocators; index 0 is the fallback default. */ static metaslab_ops_t metaslab_allocators[] = { { "dynamic", metaslab_df_alloc }, { "cursor", metaslab_cf_alloc }, { "new-dynamic", metaslab_ndf_alloc }, }; /* * Map an allocator name to its index in metaslab_allocators[]. Returns -1 * for unknown names, and also for new-dynamic, which is deliberately * rejected for now. */ static int spa_find_allocator_byname(const char *val) { int a = ARRAY_SIZE(metaslab_allocators) - 1; if (strcmp("new-dynamic", val) == 0) return (-1); /* remove when ndf is working */ for (; a >= 0; a--) { if (strcmp(val, metaslab_allocators[a].msop_name) == 0) return (a); } return (-1); } /* * Select the pool's active allocator by name. Names that are not accepted * by spa_find_allocator_byname() silently fall back to index 0. */ void spa_set_allocator(spa_t *spa, const char *allocator) { int a = spa_find_allocator_byname(allocator); if (a < 0) a = 0; spa->spa_active_allocator = a; zfs_dbgmsg("spa allocator: %s", metaslab_allocators[a].msop_name); } /* Return the index of the pool's active allocator. */ int spa_get_allocator(spa_t *spa) { return (spa->spa_active_allocator); } /* * Validate a new value for the active allocator tunable and, if it names * an accepted allocator, point zfs_active_allocator at the canonical name * string from the table. Returns EINVAL for NULL or unknown names. */ #if defined(_KERNEL) int param_set_active_allocator_common(const char *val) { 
char *p; if (val == NULL) return (SET_ERROR(EINVAL)); /* * Strip a newline in place. Note that this writes through the * const-qualified val via the pointer returned by strchr(). */ if ((p = strchr(val, '\n')) != NULL) *p = '\0'; int a = spa_find_allocator_byname(val); if (a < 0) return (SET_ERROR(EINVAL)); zfs_active_allocator = metaslab_allocators[a].msop_name; return (0); } #endif /* Return the ops entry for the pool's active allocator. */ metaslab_ops_t * metaslab_allocator(spa_t *spa) { int allocator = spa_get_allocator(spa); return (&metaslab_allocators[allocator]); } /* * ========================================================================== * Dynamic Fit (df) block allocator * * Search for a free chunk of at least this size, starting from the last * offset (for this alignment of block) looking for up to * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not * found within 16MB, then return a free chunk of exactly the requested size (or * larger). * * If it seems like searching from the last offset will be unproductive, skip * that and just return a free chunk of exactly the requested size (or larger). * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This * mechanism is probably not very useful and may be removed in the future. * * The behavior when not searching can be changed to return the largest free * chunk, instead of a free chunk of exactly the requested size, by setting * metaslab_df_use_largest_segment. * ========================================================================== */ static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size) { /* * Find the largest power of 2 block size that evenly divides the * requested size. This is used to try to allocate blocks with similar * alignment from the same area of the metaslab (i.e. same cursor * bucket) but it does not guarantee that allocations of other sizes * will not exist in the same region. 
*/ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; zfs_range_tree_t *rt = msp->ms_allocatable; uint_t free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size; uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If we're running low on space, find a segment based on size, * rather than iterating based on offset. */ if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { offset = -1; } else { offset = metaslab_block_picker(rt, cursor, size, metaslab_df_max_search); } /* * Either the offset-based search was skipped or it failed: fall * back to the size-sorted tree. */ if (offset == -1) { zfs_range_seg_t *rs; if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); if (metaslab_df_use_largest_segment) { /* use largest free segment */ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); } else { zfs_btree_index_t where; /* use segment of this size, or next largest */ rs = metaslab_block_find(&msp->ms_allocatable_by_size, rt, msp->ms_start, size, &where); } if (rs != NULL && zfs_rs_get_start(rs, rt) + size <= zfs_rs_get_end(rs, rt)) { offset = zfs_rs_get_start(rs, rt); *cursor = offset + size; } } return (offset); } /* * ========================================================================== * Cursor fit block allocator - * Select the largest region in the metaslab, set the cursor to the beginning * of the range and the cursor_end to the end of the range. As allocations * are made advance the cursor. Continue allocating from the cursor until * the range is exhausted and then find a new range. 
* ========================================================================== */ static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *t = &msp->ms_allocatable_by_size; /* ms_lbas[0] is the cursor and ms_lbas[1] the end of the current region. */ uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(*cursor_end, >=, *cursor); /* * The current region cannot satisfy this request: switch to the * largest free segment, or fail with -1ULL if even that segment * is smaller than size. */ if ((*cursor + size) > *cursor_end) { zfs_range_seg_t *rs; if (zfs_btree_numnodes(t) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); rs = zfs_btree_last(t, NULL); if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < size) return (-1ULL); *cursor = zfs_rs_get_start(rs, rt); *cursor_end = zfs_rs_get_end(rs, rt); } /* Hand out the space at the cursor and advance past it. */ offset = *cursor; *cursor += size; return (offset); } /* * ========================================================================== * New dynamic fit allocator - * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift * contiguous blocks. If no region is found then just use the largest segment * that remains. * ========================================================================== */ /* * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) * to request from the allocator. 
*/ uint64_t metaslab_ndf_clump_shift = 4; /* * Allocate size bytes using the new dynamic fit policy. Returns the chosen * offset, or -1ULL if the largest allocatable segment is smaller than size * or the segment that was found cannot hold the request. */ static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { zfs_btree_t *t = &msp->ms_allocatable->rt_root; zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_index_t where; zfs_range_seg_t *rs; - range_seg_max_t rsearch; + zfs_range_seg_max_t rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); if (max_size < size) return (-1ULL); /* First try the offset-sorted tree at this size class's cursor. */ zfs_rs_set_start(&rsearch, rt, *cursor); zfs_rs_set_end(&rsearch, rt, *cursor + size); rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < size) { /* * Fall back to the size-sorted tree, searching with a key * of length 2^(hbit + metaslab_ndf_clump_shift) capped at * max_size. */ t = &msp->ms_allocatable_by_size; zfs_rs_set_start(&rsearch, rt, 0); zfs_rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + metaslab_ndf_clump_shift))); rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL) rs = zfs_btree_next(t, &where, &where); ASSERT(rs != NULL); } if ((zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) >= size) { *cursor = zfs_rs_get_start(rs, rt) + size; return (zfs_rs_get_start(rs, rt)); } return (-1ULL); } /* * ========================================================================== * Metaslabs * ========================================================================== */ /* * Wait for any in-progress metaslab loads to complete. */ static void metaslab_load_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_loading) { ASSERT(!msp->ms_loaded); cv_wait(&msp->ms_load_cv, &msp->ms_lock); } } /* * Wait for any in-progress flushing to complete. */ static void metaslab_flush_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_flushing) cv_wait(&msp->ms_flush_cv, &msp->ms_lock); } /* * Multilist index function: choose a sublist from the metaslab's ms_id. */ static unsigned int metaslab_idx_func(multilist_t *ml, void *arg) { metaslab_t *msp = arg; /* * ms_id values are allocated sequentially, so full 64bit * division would be a waste of time, so limit it to 32 bits. 
*/ return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml)); } /* Accessor for the metaslab's ms_allocated_space field. */ uint64_t metaslab_allocated_space(metaslab_t *msp) { return (msp->ms_allocated_space); } /* * Verify that the space accounting on disk matches the in-core range_trees. */ static void metaslab_verify_space(metaslab_t *msp, uint64_t txg) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t allocating = 0; uint64_t sm_free_space, msp_free_space; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(!msp->ms_condensing); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can only verify the metaslab space when we're called * from syncing context with a loaded metaslab that has an * allocated space map. Calling this in non-syncing context * does not provide a consistent view of the metaslab since * we're performing allocations in the future. */ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || !msp->ms_loaded) return; /* * Even though the smp_alloc field can get negative, * when it comes to a metaslab's space map, that should * never be the case. */ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); ASSERT3U(space_map_allocated(msp->ms_sm), >=, zfs_range_tree_space(msp->ms_unflushed_frees)); ASSERT3U(metaslab_allocated_space(msp), ==, space_map_allocated(msp->ms_sm) + zfs_range_tree_space(msp->ms_unflushed_allocs) - zfs_range_tree_space(msp->ms_unflushed_frees)); sm_free_space = msp->ms_size - metaslab_allocated_space(msp); /* * Account for future allocations since we would have * already deducted that space from the ms_allocatable. 
*/ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { allocating += zfs_range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, msp->ms_allocating_total); /* * NOTE(review): only ms_defer[0] and ms_defer[1] are summed here, * which assumes TXG_DEFER_SIZE is 2 -- confirm. */ ASSERT3U(msp->ms_deferspace, ==, zfs_range_tree_space(msp->ms_defer[0]) + zfs_range_tree_space(msp->ms_defer[1])); msp_free_space = zfs_range_tree_space(msp->ms_allocatable) + allocating + msp->ms_deferspace + zfs_range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); } /* Zero the sync histogram and every per-txg defer histogram. */ static void metaslab_aux_histograms_clear(metaslab_t *msp) { /* * Auxiliary histograms are only cleared when resetting them, * which can only happen while the metaslab is loaded. */ ASSERT(msp->ms_loaded); memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); for (int t = 0; t < TXG_DEFER_SIZE; t++) memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t])); } /* * Fold the range tree histogram of rt into the given space-map-style * histogram, starting at bucket index shift of the range tree histogram. */ static void metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, zfs_range_tree_t *rt) { /* * This is modeled after space_map_histogram_add(), so refer to that * function for implementation details. We want this to work like * the space map histogram, and not the range tree histogram, as we * are essentially constructing a delta that will be later subtracted * from the space map histogram. */ int idx = 0; - for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + for (int i = shift; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT3U(i, >=, idx + shift); /* * Once idx stops at the last bucket, larger range tree * buckets are scaled by the left shift and accumulated * there. */ histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { ASSERT3U(idx + shift, ==, i); idx++; ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); } } } /* * Called at every sync pass that the metaslab gets synced. * * The reason is that we want our auxiliary histograms to be updated * wherever the metaslab's space map histogram is updated. 
This way
 * we stay consistent on which parts of the metaslab space map's
 * histogram are currently not available for allocations (e.g because
 * they are in the defer, freed, and freeing trees).
 */
static void
metaslab_aux_histograms_update(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_sm;
	ASSERT(sm != NULL);

	/*
	 * This is similar to the metaslab's space map histogram updates
	 * that take place in metaslab_sync(). The only difference is that
	 * we only care about segments that haven't made it into the
	 * ms_allocatable tree yet.
	 */
	if (msp->ms_loaded) {
		/* Rebuild from scratch: ms_freed and each ms_defer tree. */
		metaslab_aux_histograms_clear(msp);

		metaslab_aux_histogram_add(msp->ms_synchist,
		    sm->sm_shift, msp->ms_freed);

		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			metaslab_aux_histogram_add(msp->ms_deferhist[t],
			    sm->sm_shift, msp->ms_defer[t]);
		}
	}

	/* ms_freeing is added to ms_synchist whether or not we are loaded. */
	metaslab_aux_histogram_add(msp->ms_synchist,
	    sm->sm_shift, msp->ms_freeing);
}

/*
 * Called every time we are done syncing (writing to) the metaslab,
 * i.e. at the end of each sync pass.
 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
 *
 * Moves ms_synchist into the ms_deferhist[] slot selected by the syncing
 * txg when `defer_allowed` is set (otherwise that slot is zeroed), then
 * clears ms_synchist.
 */
static void
metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;

	if (sm == NULL) {
		/*
		 * We came here from metaslab_init() when creating/opening a
		 * pool, looking at a metaslab that hasn't had any allocations
		 * yet.
		 */
		return;
	}

	/*
	 * This is similar to the actions that we take for the ms_freed
	 * and ms_defer trees in metaslab_sync_done().
	 */
	uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
	if (defer_allowed) {
		memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist,
		    sizeof (msp->ms_synchist));
	} else {
		memset(msp->ms_deferhist[hist_index], 0,
		    sizeof (msp->ms_deferhist[hist_index]));
	}
	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
}

/*
 * Ensure that the metaslab's weight and fragmentation are consistent
 * with the contents of the histogram (either the range tree's histogram
 * or the space map's depending whether the metaslab is loaded).
 *
 * Does nothing unless ZFS_DEBUG_METASLAB_VERIFY is set in zfs_flags.  The
 * check recomputes the weight with metaslab_weight(msp, B_TRUE) and
 * VERIFYs that ms_max_size, ms_fragmentation and ms_weight come out the
 * same as the values held on entry.  The caller must hold ms_lock.
 */
static void
metaslab_verify_weight_and_frag(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can end up here from vdev_remove_complete(), in which case we
	 * cannot do these assertions because we hold spa config locks and
	 * thus we are not allowed to read from the DMU.
	 *
	 * We check if the metaslab group has been removed and if that's
	 * the case we return immediately as that would mean that we are
	 * here from the aforementioned code path.
	 */
	if (msp->ms_group == NULL)
		return;

	/*
	 * Devices being removed always return a weight of 0 and leave
	 * fragmentation and ms_max_size as is - there is nothing for
	 * us to verify here.
	 */
	vdev_t *vd = msp->ms_group->mg_vd;
	if (vd->vdev_removing)
		return;

	/*
	 * If the metaslab is dirty it probably means that we've done
	 * some allocations or frees that have changed our histograms
	 * and thus the weight.
	 */
	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&vd->vdev_ms_list, msp, t))
			return;
	}

	/*
	 * This verification checks that our in-memory state is consistent
	 * with what's on disk. If the pool is read-only then there aren't
	 * any changes and we just have the initially-loaded state.
	 */
	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
		return;

	/* some extra verification for in-core tree if you can */
	if (msp->ms_loaded) {
		zfs_range_tree_stat_verify(msp->ms_allocatable);
		VERIFY(space_map_histogram_verify(msp->ms_sm,
		    msp->ms_allocatable));
	}

	/* Snapshot the current values so they can be compared or restored. */
	uint64_t weight = msp->ms_weight;
	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
	uint64_t frag = msp->ms_fragmentation;
	uint64_t max_segsize = msp->ms_max_size;

	msp->ms_weight = 0;
	msp->ms_fragmentation = 0;

	/*
	 * This function is used for verification purposes and thus should
	 * not introduce any side-effects/mutations on the system's state.
	 *
	 * Regardless of whether metaslab_weight() thinks this metaslab
	 * should be active or not, we want to ensure that the actual weight
	 * (and therefore the value of ms_weight) would be the same if it
	 * was to be recalculated at this point.
	 *
	 * In addition we set the nodirty flag so metaslab_weight() does
	 * not dirty the metaslab for future TXGs (e.g. when trying to
	 * force condensing to upgrade the metaslab spacemaps).
	 */
	msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;

	VERIFY3U(max_segsize, ==, msp->ms_max_size);

	/*
	 * If the weight type changed then there is no point in doing
	 * verification. Revert fields to their original values.
	 */
	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
		msp->ms_fragmentation = frag;
		msp->ms_weight = weight;
		return;
	}

	VERIFY3U(msp->ms_fragmentation, ==, frag);
	VERIFY3U(msp->ms_weight, ==, weight);
}

/*
 * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
 * this class that was used longest ago, and attempt to unload it.  We don't
 * want to spend too much time in this loop to prevent performance
 * degradation, and we expect that most of the time this operation will
 * succeed.
Between that and the normal unloading processing during txg sync,
 * we expect this to keep the metaslab memory usage under control.
 */
static void
metaslab_potentially_evict(metaslab_class_t *mc)
{
#ifdef _KERNEL
	uint64_t allmem = arc_all_memory();
	uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
	uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
	uint_t tries = 0;
	/*
	 * Keep going while btree leaf memory (inuse * size) exceeds
	 * zfs_metaslab_mem_limit percent of all memory, for at most twice
	 * the number of sublists.
	 */
	for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
	    tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
	    tries++) {
		unsigned int idx = multilist_get_random_index(
		    &mc->mc_metaslab_txg_list);
		multilist_sublist_t *mls =
		    multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
		metaslab_t *msp = multilist_sublist_head(mls);
		multilist_sublist_unlock(mls);
		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
		    inuse * size) {
			/*
			 * The sublist lock was dropped above, so retake it
			 * and make sure msp is still linked before walking
			 * to its successor.
			 */
			VERIFY3P(mls, ==, multilist_sublist_lock_idx(
			    &mc->mc_metaslab_txg_list, idx));
			ASSERT3U(idx, ==,
			    metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));

			if (!multilist_link_active(&msp->ms_class_txg_node)) {
				multilist_sublist_unlock(mls);
				break;
			}
			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
			multilist_sublist_unlock(mls);
			/*
			 * If the metaslab is currently loading there are two
			 * cases. If it's the metaslab we're evicting, we
			 * can't continue on or we'll panic when we attempt to
			 * recursively lock the mutex. If it's another
			 * metaslab that's loading, it can be safely skipped,
			 * since we know it's very new and therefore not a
			 * good eviction candidate. We check later once the
			 * lock is held that the metaslab is fully loaded
			 * before actually unloading it.
			 */
			if (msp->ms_loading) {
				msp = next_msp;
				inuse =
				    spl_kmem_cache_inuse(zfs_btree_leaf_cache);
				continue;
			}
			/*
			 * We can't unload metaslabs with no spacemap because
			 * they're not ready to be unloaded yet. We can't
			 * unload metaslabs with outstanding allocations
			 * because doing so could cause the metaslab's weight
			 * to decrease while it's unloaded, which violates an
			 * invariant that we use to prevent unnecessary
			 * loading. We also don't unload metaslabs that are
			 * currently active because they are high-weight
			 * metaslabs that are likely to be used in the near
			 * future.
			 */
			mutex_enter(&msp->ms_lock);
			if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
			    msp->ms_allocating_total == 0) {
				metaslab_unload(msp);
			}
			mutex_exit(&msp->ms_lock);
			msp = next_msp;
			/* Refresh the usage figure for the loop conditions. */
			inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
		}
	}
#else
	/* Outside _KERNEL builds this function only silences unused warnings. */
	(void) mc, (void) zfs_metaslab_mem_limit;
#endif
}

/*
 * Do the actual work of loading a metaslab: populate ms_allocatable (and
 * its size-sorted tree) from the space map, or from the whole metaslab
 * range when there is no space map, then reconcile it with the unflushed,
 * freed and deferred trees and recompute the weight and ms_max_size.
 *
 * Called with ms_lock held and ms_loading set.  ms_lock is dropped while
 * the space map is read and retaken (after ms_sync_lock) afterwards, so it
 * is held again on return.  Returns 0 on success or the error from
 * space_map_load_length(); on error ms_loaded is left unset.
 */
static int
metaslab_load_impl(metaslab_t *msp)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loading);
	ASSERT(!msp->ms_condensing);

	/*
	 * We temporarily drop the lock to unblock other operations while we
	 * are reading the space map. Therefore, metaslab_sync() and
	 * metaslab_sync_done() can run at the same time as we do.
	 *
	 * If we are using the log space maps, metaslab_sync() can't write to
	 * the metaslab's space map while we are loading as we only write to
	 * it when we are flushing the metaslab, and that can't happen while
	 * we are loading it.
	 *
	 * If we are not using log space maps though, metaslab_sync() can
	 * append to the space map while we are loading. Therefore we load
	 * only entries that existed when we started the load. Additionally,
	 * metaslab_sync_done() has to wait for the load to complete because
	 * there are potential races like metaslab_load() loading parts of the
	 * space map that are currently being appended by metaslab_sync(). If
	 * we didn't, the ms_allocatable would have entries that
	 * metaslab_sync_done() would try to re-add later.
	 *
	 * That's why before dropping the lock we remember the synced length
	 * of the metaslab and read up to that point of the space map,
	 * ignoring entries appended by metaslab_sync() that happen after we
	 * drop the lock.
	 */
	uint64_t length = msp->ms_synced_length;
	mutex_exit(&msp->ms_lock);

	hrtime_t load_start = gethrtime();
	metaslab_rt_arg_t *mrap;
	if (msp->ms_allocatable->rt_arg == NULL) {
		mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
	} else {
		/*
		 * Reuse the existing argument but detach the ops for now;
		 * they are reattached below once the tree is populated.
		 */
		mrap = msp->ms_allocatable->rt_arg;
		msp->ms_allocatable->rt_ops = NULL;
		msp->ms_allocatable->rt_arg = NULL;
	}
	mrap->mra_bt = &msp->ms_allocatable_by_size;
	mrap->mra_floor_shift = metaslab_by_size_min_shift;

	if (msp->ms_sm != NULL) {
		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
		    SM_FREE, length);

		/* Now, populate the size-sorted tree. */
		metaslab_rt_create(msp->ms_allocatable, mrap);
		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
		msp->ms_allocatable->rt_arg = mrap;

		struct mssa_arg arg = {0};
		arg.rt = msp->ms_allocatable;
		arg.mra = mrap;
		zfs_range_tree_walk(msp->ms_allocatable,
		    metaslab_size_sorted_add, &arg);
	} else {
		/*
		 * Add the size-sorted tree first, since we don't need to load
		 * the metaslab from the spacemap.
		 */
		metaslab_rt_create(msp->ms_allocatable, mrap);
		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
		msp->ms_allocatable->rt_arg = mrap;
		/*
		 * The space map has not been allocated yet, so treat
		 * all the space in the metaslab as free and add it to the
		 * ms_allocatable tree.
		 */
		zfs_range_tree_add(msp->ms_allocatable,
		    msp->ms_start, msp->ms_size);

		if (msp->ms_new) {
			/*
			 * If the ms_sm doesn't exist, this means that this
			 * metaslab hasn't gone through metaslab_sync() and
			 * thus has never been dirtied. So we shouldn't
			 * expect any unflushed allocs or frees from previous
			 * TXGs.
			 */
			ASSERT(zfs_range_tree_is_empty(
			    msp->ms_unflushed_allocs));
			ASSERT(zfs_range_tree_is_empty(
			    msp->ms_unflushed_frees));
		}
	}

	/*
	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
	 * changing the ms_sm (or log_sm) and the metaslab's range trees
	 * while we are about to use them and populate the ms_allocatable.
	 * The ms_lock is insufficient for this because metaslab_sync() doesn't
	 * hold the ms_lock while writing the ms_checkpointing tree to disk.
	 */
	mutex_enter(&msp->ms_sync_lock);
	mutex_enter(&msp->ms_lock);

	ASSERT(!msp->ms_condensing);
	ASSERT(!msp->ms_flushing);

	/* On failure only ms_sync_lock is released; ms_lock stays held. */
	if (error != 0) {
		mutex_exit(&msp->ms_sync_lock);
		return (error);
	}

	ASSERT3P(msp->ms_group, !=, NULL);
	msp->ms_loaded = B_TRUE;

	/*
	 * Apply all the unflushed changes to ms_allocatable right
	 * away so any manipulations we do below have a clear view
	 * of what is allocated and what is free.
	 */
	zfs_range_tree_walk(msp->ms_unflushed_allocs,
	    zfs_range_tree_remove, msp->ms_allocatable);
	zfs_range_tree_walk(msp->ms_unflushed_frees,
	    zfs_range_tree_add, msp->ms_allocatable);

	ASSERT3P(msp->ms_group, !=, NULL);
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	if (spa_syncing_log_sm(spa) != NULL) {
		ASSERT(spa_feature_is_enabled(spa,
		    SPA_FEATURE_LOG_SPACEMAP));

		/*
		 * If we use a log space map we add all the segments
		 * that are in ms_unflushed_frees so they are available
		 * for allocation.
		 *
		 * ms_allocatable needs to contain all free segments
		 * that are ready for allocations (thus not segments
		 * from ms_freeing, ms_freed, and the ms_defer trees).
		 * But if we grab the lock in this code path at a sync
		 * pass later than 1, then it also contains the
		 * segments of ms_freed (they were added to it earlier
		 * in this path through ms_unflushed_frees). So we
		 * need to remove all the segments that exist in
		 * ms_freed from ms_allocatable as they will be added
		 * later in metaslab_sync_done().
		 *
		 * When there's no log space map, the ms_allocatable
		 * correctly doesn't contain any segments that exist
		 * in ms_freed [see ms_synced_length].
		 */
		zfs_range_tree_walk(msp->ms_freed,
		    zfs_range_tree_remove, msp->ms_allocatable);
	}

	/*
	 * If we are not using the log space map, ms_allocatable
	 * contains the segments that exist in the ms_defer trees
	 * [see ms_synced_length]. Thus we need to remove them
	 * from ms_allocatable as they will be added again in
	 * metaslab_sync_done().
	 *
	 * If we are using the log space map, ms_allocatable still
	 * contains the segments that exist in the ms_defer trees.
	 * Not because it read them through the ms_sm though. But
	 * because these segments are part of ms_unflushed_frees
	 * whose segments we add to ms_allocatable earlier in this
	 * code path.
	 */
	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		zfs_range_tree_walk(msp->ms_defer[t],
		    zfs_range_tree_remove, msp->ms_allocatable);
	}

	/*
	 * Call metaslab_recalculate_weight_and_sort() now that the
	 * metaslab is loaded so we get the metaslab's real weight.
	 *
	 * Unless this metaslab was created with older software and
	 * has not yet been converted to use segment-based weight, we
	 * expect the new weight to be better or equal to the weight
	 * that the metaslab had while it was not loaded. This is
	 * because the old weight does not take into account the
	 * consolidation of adjacent segments between TXGs. [see
	 * comment for ms_synchist and ms_deferhist[] for more info]
	 */
	uint64_t weight = msp->ms_weight;
	uint64_t max_size = msp->ms_max_size;
	metaslab_recalculate_weight_and_sort(msp);
	if (!WEIGHT_IS_SPACEBASED(weight))
		ASSERT3U(weight, <=, msp->ms_weight);
	msp->ms_max_size = metaslab_largest_allocatable(msp);
	ASSERT3U(max_size, <=, msp->ms_max_size);
	hrtime_t load_end = gethrtime();
	msp->ms_load_time = load_end;
	zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
	    "ms_id %llu, smp_length %llu, "
	    "unflushed_allocs %llu, unflushed_frees %llu, "
	    "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
	    "loading_time %lld ms, ms_max_size %llu, "
	    "max size error %lld, "
	    "old_weight %llx, new_weight %llx",
	    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
	    (u_longlong_t)msp->ms_id,
	    (u_longlong_t)space_map_length(msp->ms_sm),
	    (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_allocs),
	    (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_frees),
	    (u_longlong_t)zfs_range_tree_space(msp->ms_freed),
	    (u_longlong_t)zfs_range_tree_space(msp->ms_defer[0]),
	    (u_longlong_t)zfs_range_tree_space(msp->ms_defer[1]),
	    (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
	    (longlong_t)((load_end - load_start) / 1000000),
	    (u_longlong_t)msp->ms_max_size,
	    (u_longlong_t)msp->ms_max_size - max_size,
	    (u_longlong_t)weight, (u_longlong_t)msp->ms_weight);

	metaslab_verify_space(msp, spa_syncing_txg(spa));
	mutex_exit(&msp->ms_sync_lock);
	return (0);
}

/*
 * Load the metaslab if it is not loaded already.
 *
 * Waits for a concurrent loader to finish, marks the metaslab as loading,
 * waits out any in-progress flush, optionally evicts another metaslab of
 * the normal class, and then calls metaslab_load_impl().  Waiters on
 * ms_load_cv are woken when the attempt finishes.  The caller must hold
 * ms_lock, which is held again on return.  Returns 0 if the metaslab was
 * already loaded, otherwise the result of metaslab_load_impl().
 */
int
metaslab_load(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * There may be another thread loading the same metaslab, if that's
	 * the case just wait until the other thread is done and return.
	 */
	metaslab_load_wait(msp);
	if (msp->ms_loaded)
		return (0);
	VERIFY(!msp->ms_loading);
	ASSERT(!msp->ms_condensing);

	/*
	 * We set the loading flag BEFORE potentially dropping the lock to
	 * wait for an ongoing flush (see ms_flushing below). This way other
	 * threads know that there is already a thread that is loading this
	 * metaslab.
	 */
	msp->ms_loading = B_TRUE;

	/*
	 * Wait for any in-progress flushing to finish as we drop the ms_lock
	 * both here (during space_map_load()) and in metaslab_flush() (when
	 * we flush our changes to the ms_sm).
	 */
	if (msp->ms_flushing)
		metaslab_flush_wait(msp);

	/*
	 * In the possibility that we were waiting for the metaslab to be
	 * flushed (where we temporarily dropped the ms_lock), ensure that
	 * no one else loaded the metaslab somehow.
	 */
	ASSERT(!msp->ms_loaded);

	/*
	 * If we're loading a metaslab in the normal class, consider evicting
	 * another one to keep our memory usage under the limit defined by the
	 * zfs_metaslab_mem_limit tunable.
	 */
	if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
	    msp->ms_group->mg_class) {
		metaslab_potentially_evict(msp->ms_group->mg_class);
	}

	int error = metaslab_load_impl(msp);

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	msp->ms_loading = B_FALSE;
	cv_broadcast(&msp->ms_load_cv);

	return (error);
}

/*
 * Unload a metaslab: empty ms_allocatable, clear the loaded/active state,
 * record the unload time and, while the metaslab still belongs to a group,
 * take it off the class's txg multilist and recompute its weight.  A no-op
 * when the metaslab is not loaded.  The caller must hold ms_lock.
 */
void
metaslab_unload(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * This can happen if a metaslab is selected for eviction (in
	 * metaslab_potentially_evict) and then unloaded during spa_sync (via
	 * metaslab_class_evict_old).
	 */
	if (!msp->ms_loaded)
		return;

	zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL);
	msp->ms_loaded = B_FALSE;
	msp->ms_unload_time = gethrtime();

	msp->ms_activation_weight = 0;
	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;

	if (msp->ms_group != NULL) {
		metaslab_class_t *mc = msp->ms_group->mg_class;
		multilist_sublist_t *mls =
		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
		if (multilist_link_active(&msp->ms_class_txg_node))
			multilist_sublist_remove(mls, msp);
		multilist_sublist_unlock(mls);

		spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
		zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
		    "ms_id %llu, weight %llx, "
		    "selected txg %llu (%llu ms ago), alloc_txg %llu, "
		    "loaded %llu ms ago, max_size %llu",
		    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
		    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
		    (u_longlong_t)msp->ms_id,
		    (u_longlong_t)msp->ms_weight,
		    (u_longlong_t)msp->ms_selected_txg,
		    (u_longlong_t)(msp->ms_unload_time -
		    msp->ms_selected_time) / 1000 / 1000,
		    (u_longlong_t)msp->ms_alloc_txg,
		    (u_longlong_t)(msp->ms_unload_time -
		    msp->ms_load_time) / 1000 / 1000,
		    (u_longlong_t)msp->ms_max_size);
	}

	/*
	 * We explicitly recalculate the metaslab's weight based on its space
	 * map (as it is now not loaded). We want unloaded metaslabs to always
	 * have their weights calculated from the space map histograms, while
	 * loaded ones have it calculated from their in-core range tree
	 * [see metaslab_load()]. This way, the weight reflects the information
	 * available in-core, whether it is loaded or not.
	 *
	 * If ms_group == NULL means that we came here from metaslab_fini(),
	 * at which point it doesn't make sense for us to do the recalculation
	 * and the sorting.
	 */
	if (msp->ms_group != NULL)
		metaslab_recalculate_weight_and_sort(msp);
}

/*
 * We want to optimize the memory use of the per-metaslab range
 * trees. To do this, we store the segments in the range trees in
 * units of sectors, zero-indexing from the start of the metaslab.
If * the vdev_ms_shift - the vdev_ashift is less than 32, we can store * the ranges using two uint32_ts, rather than two uint64_ts. */ zfs_range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, uint64_t *start, uint64_t *shift) { if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && !zfs_metaslab_force_large_segs) { *shift = vdev->vdev_ashift; *start = msp->ms_start; return (ZFS_RANGE_SEG32); } else { *shift = 0; *start = 0; return (ZFS_RANGE_SEG64); } } void metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) { ASSERT(MUTEX_HELD(&msp->ms_lock)); metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); msp->ms_selected_txg = txg; msp->ms_selected_time = gethrtime(); multilist_sublist_insert_tail(mls, msp); multilist_sublist_unlock(mls); } void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { vdev_space_update(vd, alloc_delta, defer_delta, space_delta); ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, vdev_deflated_space(vd, space_delta)); } int metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; metaslab_t *ms; int error; ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); multilist_link_init(&ms->ms_class_txg_node); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; ms->ms_allocator = -1; ms->ms_new = 
B_TRUE; vdev_ops_t *ops = vd->vdev_ops; if (ops->vdev_op_metaslab_init != NULL) ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. For * readonly pools there is no need to open the space map object. * * Note: * When called from vdev_expand(), we can't call into the DMU as * we are holding the spa_config_lock as a writer and we would * deadlock [see relevant comment in vdev_metaslab_init()]. in * that case, the object parameter is zero though, so we won't * call into the DMU. */ if (object != 0 && !(spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, ms->ms_size, vd->vdev_ashift); if (error != 0) { kmem_free(ms, sizeof (metaslab_t)); return (error); } ASSERT(ms->ms_sm != NULL); ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } uint64_t shift, start; zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_SIZE; t++) { ms->ms_allocating[t] = zfs_range_tree_create(NULL, type, NULL, start, shift); } ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift); ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL, start, shift); } ms->ms_checkpointing = zfs_range_tree_create(NULL, type, NULL, start, shift); ms->ms_unflushed_allocs = zfs_range_tree_create(NULL, type, NULL, start, shift); metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); mrap->mra_bt = &ms->ms_unflushed_frees_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops, type, mrap, start, shift); ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, 
shift); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms, B_FALSE); /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. * If we're adding space to an existing pool, the new space * does not become available until after this txg has synced. * The metaslab's weight will also be initialized when we sync * out this txg. This ensures that we don't attempt to allocate * from it before we have initialized it completely. */ if (txg <= TXG_INITIAL) { metaslab_sync_done(ms, 0); metaslab_space_update(vd, mg->mg_class, metaslab_allocated_space(ms), 0, 0); } if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); } *msp = ms; return (0); } static void metaslab_fini_flush_data(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (metaslab_unflushed_txg(msp) == 0) { ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, NULL); return; } ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp), metaslab_unflushed_dirty(msp)); } uint64_t metaslab_unflushed_changes_memused(metaslab_t *ms) { return ((zfs_range_tree_numsegs(ms->ms_unflushed_allocs) + zfs_range_tree_numsegs(ms->ms_unflushed_frees)) * ms->ms_unflushed_allocs->rt_root.bt_elem_size); } void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; metaslab_fini_flush_data(msp); metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); /* * If this metaslab hasn't been through metaslab_sync_done() yet its * space hasn't been accounted for in its vdev and doesn't need to be * subtracted. 
*/ if (!msp->ms_new) { metaslab_space_update(vd, mg->mg_class, -metaslab_allocated_space(msp), 0, -msp->ms_size); } space_map_close(msp->ms_sm); msp->ms_sm = NULL; metaslab_unload(msp); zfs_range_tree_destroy(msp->ms_allocatable); zfs_range_tree_destroy(msp->ms_freeing); zfs_range_tree_destroy(msp->ms_freed); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); zfs_range_tree_destroy(msp->ms_unflushed_allocs); zfs_range_tree_destroy(msp->ms_checkpointing); zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); zfs_range_tree_destroy(msp->ms_unflushed_frees); for (int t = 0; t < TXG_SIZE; t++) { zfs_range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { zfs_range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); for (int t = 0; t < TXG_SIZE; t++) ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); zfs_range_tree_destroy(msp->ms_trim); mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); cv_destroy(&msp->ms_flush_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } /* * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done * by calculating the space in each bucket of the spacemap histogram and * multiplying that by the fragmentation metric in this table. Doing * this for all buckets and dividing it by the total amount of free * space in this metaslab (i.e. the total free space in all buckets) gives * us the fragmentation metric. This means that a high fragmentation metric * equates to most of the free space being comprised of small segments. 
* Conversely, if the metric is low, then most of the free space is in
 * large segments.
 *
 * This table defines 0% fragmented space using 512M segments. Using this value,
 * we derive the rest of the table. This table originally went up to 16MB, but
 * with larger recordsizes, larger ashifts, and use of raidz3, it is possible
 * to have significantly larger allocations than were previously possible.
 * Since the fragmentation value is never stored on disk, it is possible to
 * change these calculations in the future.
 */
static const int zfs_frag_table[] = {
	100,	/* 512B	*/
	99,	/* 1K	*/
	97,	/* 2K	*/
	93,	/* 4K	*/
	88,	/* 8K	*/
	83,	/* 16K	*/
	77,	/* 32K	*/
	71,	/* 64K	*/
	64,	/* 128K	*/
	57,	/* 256K	*/
	50,	/* 512K	*/
	43,	/* 1M	*/
	36,	/* 2M	*/
	29,	/* 4M	*/
	23,	/* 8M	*/
	17,	/* 16M	*/
	12,	/* 32M	*/
	7,	/* 64M	*/
	3,	/* 128M	*/
	1,	/* 256M	*/
	0,	/* 512M	*/
};
/* Number of entries in zfs_frag_table. */
#define	FRAGMENTATION_TABLE_SIZE \
	(sizeof (zfs_frag_table)/(sizeof (zfs_frag_table[0])))

/*
 * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
 * been upgraded and does not support this metric. Otherwise, the stored
 * value should be in the range [0, 100].
 *
 * When `nodirty` is set, a metaslab whose space map still needs upgrading
 * is not marked for condensing and not dirtied.
 */
static void
metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t fragmentation = 0;
	uint64_t total = 0;
	boolean_t feature_enabled = spa_feature_is_enabled(spa,
	    SPA_FEATURE_SPACEMAP_HISTOGRAM);

	if (!feature_enabled) {
		msp->ms_fragmentation = ZFS_FRAG_INVALID;
		return;
	}

	/*
	 * A null space map means that the entire metaslab is free
	 * and thus is not fragmented.
	 */
	if (msp->ms_sm == NULL) {
		msp->ms_fragmentation = 0;
		return;
	}

	/*
	 * If this metaslab's space map has not been upgraded, flag it
	 * so that we upgrade next time we encounter it.
	 */
	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
		uint64_t txg = spa_syncing_txg(spa);
		vdev_t *vd = msp->ms_group->mg_vd;

		/*
		 * If we've reached the final dirty txg, then we must
		 * be shutting down the pool. We don't want to dirty
		 * any data past this point so skip setting the condense
		 * flag. We can retry this action the next time the pool
		 * is imported. We also skip marking this metaslab for
		 * condensing if the caller has explicitly set nodirty.
		 */
		if (!nodirty &&
		    spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
			msp->ms_condense_wanted = B_TRUE;
			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
			zfs_dbgmsg("txg %llu, requesting force condense: "
			    "ms_id %llu, vdev_id %llu", (u_longlong_t)txg,
			    (u_longlong_t)msp->ms_id,
			    (u_longlong_t)vd->vdev_id);
		}
		msp->ms_fragmentation = ZFS_FRAG_INVALID;
		return;
	}

	/*
	 * Space-weighted average of zfs_frag_table[] over the non-empty
	 * buckets of the on-disk histogram.
	 */
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		uint64_t space = 0;
		uint8_t shift = msp->ms_sm->sm_shift;

		/* Buckets past the end of the table use its last entry. */
		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
		    FRAGMENTATION_TABLE_SIZE - 1);

		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
			continue;

		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
		total += space;

		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
		fragmentation += space * zfs_frag_table[idx];
	}

	if (total > 0)
		fragmentation /= total;
	ASSERT3U(fragmentation, <=, 100);

	msp->ms_fragmentation = fragmentation;
}

/*
 * Compute a weight -- a selection preference value -- for the given metaslab.
 * This is based on the amount of free space, the level of fragmentation,
 * the LBA range, and whether the metaslab is loaded.
 *
 * The returned weight is tagged as space-based.  The caller must hold
 * ms_lock.
 */
static uint64_t
metaslab_space_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = msp->ms_size - metaslab_allocated_space(msp);

	if (metaslab_fragmentation_factor_enabled &&
	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
		/*
		 * Use the fragmentation information to inversely scale
		 * down the baseline weight. We need to ensure that we
		 * don't exclude this metaslab completely when it's 100%
		 * fragmented. To avoid this we reduce the fragmented value
		 * by 1.
		 */
		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;

		/*
		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
		 * this metaslab again. The fragmentation metric may have
		 * decreased the space to something smaller than
		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
		 * so that we can consume any remaining space.
		 */
		if (space > 0 && space < SPA_MINBLOCKSIZE)
			space = SPA_MINBLOCKSIZE;
	}
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1.  We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
		ASSERT(weight >= space && weight <= 2 * space);
	}

	/*
	 * If this metaslab is one we're actively using, adjust its
	 * weight to make it preferable to any inactive metaslab so
	 * we'll polish it off. If the fragmentation on this metaslab
	 * has exceeded our threshold, then don't mark it active.
	 */
	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
	}

	WEIGHT_SET_SPACEBASED(weight);
	return (weight);
}

/*
 * Return the weight of the specified metaslab, according to the segment-based
 * weighting algorithm. The metaslab must be loaded. This function can
 * be called within a sync pass since it relies only on the metaslab's
 * range tree which is always accurate when the metaslab is loaded.
 *
 * The histogram of ms_allocatable is scanned from the largest bucket down;
 * the weight encodes the index of the first bucket (at or below max_idx)
 * with a non-zero running segment count, together with that count.  Zero
 * is returned when no such bucket exists.
 *
 * NOTE(review): the two `for` headers prefixed with '-' and '+' below are
 * diff hunk lines carried in this text (RANGE_TREE_HISTOGRAM_SIZE is being
 * renamed to ZFS_RANGE_TREE_HISTOGRAM_SIZE); only the '+' line belongs to
 * the patched file.
 */
static uint64_t
metaslab_weight_from_range_tree(metaslab_t *msp)
{
	uint64_t weight = 0;
	uint32_t segments = 0;

	ASSERT(msp->ms_loaded);

-	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
+	for (int i = ZFS_RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
	    i--) {
		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;

		/*
		 * Carry segments from larger buckets down, doubling the
		 * count for each bucket stepped.
		 */
		segments <<= 1;
		segments += msp->ms_allocatable->rt_histogram[i];

		/*
		 * The range tree provides more precision than the space map
		 * and must be downgraded so that all values fit within the
		 * space map's histogram. This allows us to compare loaded
		 * vs. unloaded metaslabs to determine which metaslab is
		 * considered "best".
		 */
		if (i > max_idx)
			continue;

		if (segments != 0) {
			WEIGHT_SET_COUNT(weight, segments);
			WEIGHT_SET_INDEX(weight, i);
			WEIGHT_SET_ACTIVE(weight, 0);
			break;
		}
	}
	return (weight);
}

/*
 * Calculate the weight based on the on-disk histogram.
Should be applied
 * only to unloaded metaslabs (i.e no incoming allocations) in-order to
 * give results consistent with the on-disk state
 */
static uint64_t
metaslab_weight_from_spacemap(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_sm;
	ASSERT(!msp->ms_loaded);
	ASSERT(sm != NULL);
	ASSERT3U(space_map_object(sm), !=, 0);
	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));

	/*
	 * Create a joint histogram from all the segments that have made
	 * it to the metaslab's space map histogram, that are not yet
	 * available for allocation because they are still in the freeing
	 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
	 * these segments from the space map's histogram to get a more
	 * accurate weight.
	 */
	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
		deferspace_histogram[i] += msp->ms_synchist[i];
	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
			deferspace_histogram[i] += msp->ms_deferhist[t][i];
		}
	}

	/*
	 * Scan from the largest bucket downwards; the first bucket that
	 * still has segments after the subtraction defines the weight.
	 * If every bucket is empty the weight stays 0.
	 */
	uint64_t weight = 0;
	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
		ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
		    deferspace_histogram[i]);
		uint64_t count =
		    sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
		if (count != 0) {
			WEIGHT_SET_COUNT(weight, count);
			/* Bucket i is relative to the space map's sm_shift. */
			WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
			WEIGHT_SET_ACTIVE(weight, 0);
			break;
		}
	}
	return (weight);
}

/*
 * Compute a segment-based weight for the specified metaslab. The weight
 * is determined by highest bucket in the histogram. The information
 * for the highest bucket is encoded into the weight value.
 *
 * The caller must hold ms_lock.
 */
static uint64_t
metaslab_segment_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	uint64_t weight = 0;
	uint8_t shift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The metaslab is completely free.
	 */
	if (metaslab_allocated_space(msp) == 0) {
		int idx = highbit64(msp->ms_size) - 1;
		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;

		/*
		 * If the whole-metaslab bucket is at or above max_idx,
		 * express it as 2^(idx - max_idx) segments in bucket
		 * max_idx instead.
		 */
		if (idx < max_idx) {
			WEIGHT_SET_COUNT(weight, 1ULL);
			WEIGHT_SET_INDEX(weight, idx);
		} else {
			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
			WEIGHT_SET_INDEX(weight, max_idx);
		}
		WEIGHT_SET_ACTIVE(weight, 0);
		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
		return (weight);
	}

	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));

	/*
	 * If the metaslab is fully allocated then just make the weight 0.
	 */
	if (metaslab_allocated_space(msp) == msp->ms_size)
		return (0);
	/*
	 * If the metaslab is already loaded, then use the range tree to
	 * determine the weight. Otherwise, we rely on the space map information
	 * to generate the weight.
	 */
	if (msp->ms_loaded) {
		weight = metaslab_weight_from_range_tree(msp);
	} else {
		weight = metaslab_weight_from_spacemap(msp);
	}

	/*
	 * If the metaslab was active the last time we calculated its weight
	 * then keep it active. We want to consume the entire region that
	 * is associated with this weight.
	 */
	if (msp->ms_activation_weight != 0 && weight != 0)
		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
	return (weight);
}

/*
 * Determine if we should attempt to allocate from this metaslab. If the
 * metaslab is loaded, then we can determine if the desired allocation
 * can be satisfied by looking at the size of the maximum free segment
 * on that metaslab. Otherwise, we make our decision based on the metaslab's
 * weight. For segment-based weighting we can determine the maximum
 * allocation based on the index encoded in its value. For space-based
 * weights we rely on the entire weight (excluding the weight-type bit).
 */
static boolean_t
metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
{
	/*
	 * This case will usually but not always get caught by the checks below;
	 * metaslabs can be loaded by various means, including the trim and
	 * initialize code. Once that happens, without this check they are
	 * allocatable even before they finish their first txg sync.
	 */
	if (unlikely(msp->ms_new))
		return (B_FALSE);

	/*
	 * If the metaslab is loaded, ms_max_size is definitive and we can use
	 * the fast check. If it's not, the ms_max_size is a lower bound (once
	 * set), and we should use the fast check as long as we're not in
	 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
	 * seconds since the metaslab was unloaded.
	 */
	if (msp->ms_loaded ||
	    (msp->ms_max_size != 0 && !try_hard && gethrtime() <
	    msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
		return (msp->ms_max_size >= asize);

	boolean_t should_allocate;
	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
		/*
		 * The metaslab segment weight indicates segments in the
		 * range [2^i, 2^(i+1)), where i is the index in the weight.
		 * Since the asize might be in the middle of the range, we
		 * should attempt the allocation if asize < 2^(i+1).
		 */
		should_allocate = (asize <
		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
	} else {
		should_allocate = (asize <=
		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
	}

	return (should_allocate);
}

/*
 * Refresh the metaslab's fragmentation metric and ms_max_size, then return
 * a freshly computed weight: segment-based when that mode is enabled and
 * the space map supports histograms, space-based otherwise. nodirty is
 * passed through to metaslab_set_fragmentation(). The caller must hold
 * ms_lock.
 */
static uint64_t
metaslab_weight(metaslab_t *msp, boolean_t nodirty)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	uint64_t weight;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	metaslab_set_fragmentation(msp, nodirty);

	/*
	 * Update the maximum size. If the metaslab is loaded, this will
	 * ensure that we get an accurate maximum size if newly freed space
	 * has been added back into the free tree. If the metaslab is
	 * unloaded, we check if there's a larger free segment in the
	 * unflushed frees. This is a lower bound on the largest allocatable
	 * segment size. Coalescing of adjacent entries may reveal larger
	 * allocatable segments, but we aren't aware of those until loading
	 * the space map into a range tree.
	 */
	if (msp->ms_loaded) {
		msp->ms_max_size = metaslab_largest_allocatable(msp);
	} else {
		msp->ms_max_size = MAX(msp->ms_max_size,
		    metaslab_largest_unflushed_free(msp));
	}

	/*
	 * Segment-based weighting requires space map histogram support.
	 */
	if (zfs_metaslab_segment_weight_enabled &&
	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
	    sizeof (space_map_phys_t))) {
		weight = metaslab_segment_weight(msp);
	} else {
		weight = metaslab_space_weight(msp);
	}
	return (weight);
}

/*
 * Recompute the metaslab's weight and re-sort it within its group, carrying
 * over whatever METASLAB_ACTIVE_MASK bits were set in the old weight. The
 * caller must hold ms_lock.
 */
void
metaslab_recalculate_weight_and_sort(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/* note: we preserve the mask (e.g. indication of primary, etc..) */
	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
	metaslab_group_sort(msp->ms_group, msp,
	    metaslab_weight(msp, B_FALSE) | was_active);
}

/*
 * Record msp as the active metaslab of the given allocator in mg, using the
 * primary or secondary slot selected by activation_weight, and re-sort it
 * with the activation bit set. The pre-activation weight is saved in
 * ms_activation_weight. The caller must hold ms_lock; mg_lock is taken here
 * for the slot update.
 *
 * Returns 0 on success, or EEXIST if the chosen slot is already occupied.
 */
static int
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
    int allocator, uint64_t activation_weight)
{
	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * If we're activating for the claim code, we don't want to actually
	 * set the metaslab up for a specific allocator.
	 */
	if (activation_weight == METASLAB_WEIGHT_CLAIM) {
		ASSERT0(msp->ms_activation_weight);
		msp->ms_activation_weight = msp->ms_weight;
		metaslab_group_sort(mg, msp, msp->ms_weight |
		    activation_weight);
		return (0);
	}

	metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
	    &mga->mga_primary : &mga->mga_secondary);

	mutex_enter(&mg->mg_lock);
	/* The slot already holds a metaslab; leave it untouched. */
	if (*mspp != NULL) {
		mutex_exit(&mg->mg_lock);
		return (EEXIST);
	}

	*mspp = msp;
	ASSERT3S(msp->ms_allocator, ==, -1);
	msp->ms_allocator = allocator;
	msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);

	ASSERT0(msp->ms_activation_weight);
	msp->ms_activation_weight = msp->ms_weight;
	/* mg_lock is already held, so call the _impl sort variant. */
	metaslab_group_sort_impl(mg, msp,
	    msp->ms_weight | activation_weight);
	mutex_exit(&mg->mg_lock);

	return (0);
}

/*
 * Load the metaslab if necessary and activate it for the given allocator
 * with the given activation weight. The caller must hold ms_lock.
 *
 * Returns 0 if the metaslab is active on return, the metaslab_load() error
 * if loading failed, EBUSY if it was concurrently activated for a different
 * allocator or activation weight, ENOSPC if its weight is 0, or the error
 * from metaslab_activate_allocator().
 */
static int
metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The current metaslab is already activated for us so there
	 * is nothing to do. Already activated though, doesn't mean
	 * that this metaslab is activated for our allocator nor our
	 * requested activation weight. The metaslab could have started
	 * as an active one for our allocator but changed allocators
	 * while we were waiting to grab its ms_lock or we stole it
	 * [see find_valid_metaslab()]. This means that there is a
	 * possibility of passivating a metaslab of another allocator
	 * or from a different activation mask, from this thread.
	 */
	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
		ASSERT(msp->ms_loaded);
		return (0);
	}

	int error = metaslab_load(msp);
	if (error != 0) {
		/* Re-sort the unloadable metaslab with weight 0. */
		metaslab_group_sort(msp->ms_group, msp, 0);
		return (error);
	}

	/*
	 * When entering metaslab_load() we may have dropped the
	 * ms_lock because we were loading this metaslab, or we
	 * were waiting for another thread to load it for us. In
	 * that scenario, we recheck the weight of the metaslab
	 * to see if it was activated by another thread.
	 *
	 * If the metaslab was activated for another allocator or
	 * it was activated with a different activation weight (e.g.
	 * we wanted to make it a primary but it was activated as
	 * secondary) we return error (EBUSY).
	 *
	 * If the metaslab was activated for the same allocator
	 * and requested activation mask, skip activating it.
	 */
	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
		/*
		 * NOTE(review): this path returns a bare EBUSY while the
		 * next one wraps it in SET_ERROR(); confirm the difference
		 * is intentional.
		 */
		if (msp->ms_allocator != allocator)
			return (EBUSY);

		if ((msp->ms_weight & activation_weight) == 0)
			return (SET_ERROR(EBUSY));

		EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
		    msp->ms_primary);
		return (0);
	}

	/*
	 * If the metaslab has literally 0 space, it will have weight 0. In
	 * that case, don't bother activating it. This can happen if the
	 * metaslab had space during find_valid_metaslab, but another thread
	 * loaded it and used all that space while we were waiting to grab the
	 * lock.
	 */
	if (msp->ms_weight == 0) {
		ASSERT0(zfs_range_tree_space(msp->ms_allocatable));
		return (SET_ERROR(ENOSPC));
	}

	if ((error = metaslab_activate_allocator(msp->ms_group, msp,
	    allocator, activation_weight)) != 0) {
		return (error);
	}

	ASSERT(msp->ms_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

/*
 * Undo metaslab_activate_allocator(): clear the allocator slot that points
 * at msp, reset ms_allocator to -1 and re-sort the metaslab with the given
 * weight. A metaslab activated for the claim code has no slot and is only
 * re-sorted. The caller must hold ms_lock and the metaslab must be loaded.
 */
static void
metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
    uint64_t weight)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loaded);

	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
		metaslab_group_sort(mg, msp, weight);
		return;
	}

	mutex_enter(&mg->mg_lock);
	ASSERT3P(msp->ms_group, ==, mg);
	ASSERT3S(0, <=, msp->ms_allocator);
	ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);

	metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
	if (msp->ms_primary) {
		ASSERT3P(mga->mga_primary, ==, msp);
		ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
		mga->mga_primary = NULL;
	} else {
		ASSERT3P(mga->mga_secondary, ==, msp);
		ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
		mga->mga_secondary = NULL;
	}
	msp->ms_allocator = -1;
	/* mg_lock is already held, so call the _impl sort variant. */
	metaslab_group_sort_impl(mg, msp, weight);
	mutex_exit(&mg->mg_lock);
}

/*
 * Deactivate the metaslab and give it the supplied weight, which must not
 * carry any METASLAB_ACTIVE_MASK bits. ms_activation_weight is reset to 0.
 */
static void
metaslab_passivate(metaslab_t *msp, uint64_t weight)
{
	/* Only read by the ASSERT below, hence __maybe_unused. */
	uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE;

	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again. In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
	    size >= SPA_MINBLOCKSIZE ||
	    zfs_range_tree_space(msp->ms_allocatable) == 0);
	ASSERT0(weight & METASLAB_ACTIVE_MASK);

	ASSERT(msp->ms_activation_weight != 0);
	msp->ms_activation_weight = 0;
	metaslab_passivate_allocator(msp->ms_group, msp, weight);
	ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
}

/*
 * Segment-based metaslabs are activated once and remain active until
 * we either fail an allocation attempt (similar to space-based metaslabs)
 * or have exhausted the free space in zfs_metaslab_switch_threshold
 * buckets since the metaslab was activated. This function checks to see
 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
 * metaslab and passivates it proactively. This will allow us to select a
 * metaslab with a larger contiguous region, if any, remaining within this
 * metaslab group. If we're in sync pass > 1, then we continue using this
 * metaslab so that we don't dirty more block and cause more sync passes.
 */
static void
metaslab_segment_may_passivate(metaslab_t *msp)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;

	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
		return;

	/*
	 * Since we are in the middle of a sync pass, the most accurate
	 * information that is accessible to us is the in-core range tree
	 * histogram; calculate the new weight based on that information.
	 */
	uint64_t weight = metaslab_weight_from_range_tree(msp);
	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
	int current_idx = WEIGHT_GET_INDEX(weight);

	/* Passivate once the top bucket has dropped by the threshold. */
	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
		metaslab_passivate(msp, weight);
}

/*
 * Taskq callback (dispatched by metaslab_group_preload()): load the
 * metaslab passed in arg under its ms_lock and record the currently
 * syncing txg as its selected txg. The result of metaslab_load() is
 * deliberately ignored.
 */
static void
metaslab_preload(void *arg)
{
	metaslab_t *msp = arg;
	metaslab_class_t *mc = msp->ms_group->mg_class;
	spa_t *spa = mc->mc_spa;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));

	mutex_enter(&msp->ms_lock);
	(void) metaslab_load(msp);
	metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
	mutex_exit(&msp->ms_lock);
	spl_fstrans_unmark(cookie);
}

/*
 * Dispatch preload tasks for the metaslabs of this group, walking
 * mg_metaslab_tree from its first entry while holding mg_lock. Does nothing
 * if the pool is shutting down or preloading is disabled.
 */
static void
metaslab_group_preload(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	int m = 0;

	if (spa_shutting_down(spa) || !metaslab_preload_enabled)
		return;

	mutex_enter(&mg->mg_lock);

	/*
	 * Load the next potential metaslabs
	 */
	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
		ASSERT3P(msp->ms_group, ==, mg);

		/*
		 * We preload only the maximum number of metaslabs specified
		 * by metaslab_preload_limit. If a metaslab is being forced
		 * to condense then we preload it too. This will ensure
		 * that force condensing happens in the next txg.
		 */
		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
			continue;
		}

		/*
		 * The first mg_allocators metaslabs visited are dispatched
		 * with TQ_FRONT; the rest use plain TQ_SLEEP.
		 */
		VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload,
		    msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0))
		    != TASKQID_INVALID);
	}
	mutex_exit(&mg->mg_lock);
}

/*
 * Determine if the space map's on-disk footprint is past our tolerance for
 * inefficiency. We would like to use the following criteria to make our
 * decision:
 *
 * 1. Do not condense if the size of the space map object would dramatically
 *    increase as a result of writing out the free space range tree.
 *
 * 2. Condense if the on on-disk space map representation is at least
 *    zfs_condense_pct/100 times the size of the optimal representation
 *    (i.e.
zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
 *
 * 3. Do not condense if the on-disk size of the space map does not actually
 *    decrease.
 *
 * Unfortunately, we cannot compute the on-disk size of the space map in this
 * context because we cannot accurately compute the effects of compression, etc.
 * Instead, we apply the heuristic described in the block comment for
 * zfs_metaslab_condense_block_threshold - we only condense if the space used
 * is greater than a threshold number of blocks.
 */
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_sm;
	vdev_t *vd = msp->ms_group->mg_vd;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loaded);
	ASSERT(sm != NULL);
	ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);

	/* A metaslab with no allocatable segments is always condensed. */
	if (zfs_range_tree_numsegs(msp->ms_allocatable) == 0)
		return (B_TRUE);

	/* So is one for which a condense has been requested. */
	if (msp->ms_condense_wanted)
		return (B_TRUE);

	/*
	 * The block-count threshold is measured in units of whichever is
	 * larger: the space map's block size or the vdev's 2^ashift.
	 */
	uint64_t ashift_blksz = 1ULL << vd->vdev_ashift;
	uint64_t blksz = MAX(sm->sm_blksz, ashift_blksz);
	uint64_t cur_len = space_map_length(sm);
	uint64_t best_len = space_map_estimate_optimal_size(sm,
	    msp->ms_allocatable, SM_NO_VDEVID);

	/* Not bloated enough relative to the estimated optimal size. */
	if (cur_len < (best_len * zfs_condense_pct / 100))
		return (B_FALSE);

	/* Bloated; condense only if it also exceeds the block threshold. */
	return (cur_len > zfs_metaslab_condense_block_threshold * blksz);
}

/*
 * Condense the on-disk space map representation to its minimized form.
 * The minimized form consists of a small number of allocations followed
 * by the entries of the free range tree (ms_allocatable). The condensed
 * spacemap contains all the entries of previous TXGs (including those in
 * the pool-wide log spacemaps; thus this is effectively a superset of
 * metaslab_flush()), but this TXG's entries still need to be written.
*/ static void metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) { zfs_range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); ASSERT(msp->ms_sm != NULL); /* * In order to condense the space map, we need to change it so it * only describes which segments are currently allocated and free. * * All the current free space resides in the ms_allocatable, all * the ms_defer trees, and all the ms_allocating trees. We ignore * ms_freed because it is empty because we're in sync pass 1. We * ignore ms_freeing because these changes are not yet reflected * in the spacemap (they will be written later this txg). * * So to truncate the space map to represent all the entries of * previous TXGs we do the following: * * 1] We create a range tree (condense tree) that is 100% empty. * 2] We add to it all segments found in the ms_defer trees * as those segments are marked as free in the original space * map. We do the same with the ms_allocating trees for the same * reason. Adding these segments should be a relatively * inexpensive operation since we expect these trees to have a * small number of nodes. * 3] We vacate any unflushed allocs, since they are not frees we * need to add to the condense tree. Then we vacate any * unflushed frees as they should already be part of ms_allocatable. * 4] At this point, we would ideally like to add all segments * in the ms_allocatable tree from the condense tree. This way * we would write all the entries of the condense tree as the * condensed space map, which would only contain freed * segments with everything else assumed to be allocated. * * Doing so can be prohibitively expensive as ms_allocatable can * be large, and therefore computationally expensive to add to * the condense_tree. 
Instead we first sync out an entry marking * everything as allocated, then the condense_tree and then the * ms_allocatable, in the condensed space map. While this is not * optimal, it is typically close to optimal and more importantly * much cheaper to compute. * * 5] Finally, as both of the unflushed trees were written to our * new and condensed metaslab space map, we basically flushed * all the unflushed changes to disk, thus we call * metaslab_flush_update(). */ ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, " "spa %s, smp size %llu, segments %llu, forcing condense=%s", (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp, (u_longlong_t)msp->ms_group->mg_vd->vdev_id, spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm), (u_longlong_t)zfs_range_tree_numsegs(msp->ms_allocatable), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; zfs_range_seg_type_t type; uint64_t shift, start; type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, &start, &shift); condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { zfs_range_tree_walk(msp->ms_defer[t], zfs_range_tree_add, condense_tree); } for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { zfs_range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], zfs_range_tree_add, condense_tree); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); /* * We're about to drop the metaslab's lock thus allowing other * consumers to change it's content. 
Set the metaslab's ms_condensing * flag to ensure that allocations on this metaslab do not occur * while we're in the middle of committing it to disk. This is only * critical for ms_allocatable as all other range trees use per TXG * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); uint64_t object = space_map_object(msp->ms_sm); space_map_truncate(sm, spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); /* * space_map_truncate() may have reallocated the spacemap object. * If so, update the vdev_ms_array. */ if (space_map_object(msp->ms_sm) != object) { object = space_map_object(msp->ms_sm); dmu_write(spa->spa_meta_objset, msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &object, tx); } /* * Note: * When the log space map feature is enabled, each space map will * always have ALLOCS followed by FREES for each sync pass. This is * typically true even when the log space map feature is disabled, * except from the case where a metaslab goes through metaslab_sync() * and gets condensed. In that case the metaslab's space map will have * ALLOCS followed by FREES (due to condensing) followed by ALLOCS * followed by FREES (due to space_map_write() in metaslab_sync()) for * sync pass 1. 
*/ zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL, start, shift); zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); zfs_range_tree_vacate(condense_tree, NULL, NULL); zfs_range_tree_destroy(condense_tree); zfs_range_tree_vacate(tmp_tree, NULL, NULL); zfs_range_tree_destroy(tmp_tree); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; metaslab_flush_update(msp, tx); } static void metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); mutex_enter(&spa->spa_flushed_ms_lock); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); metaslab_set_unflushed_dirty(msp, B_TRUE); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_increment_current_mscount(spa); spa_log_summary_add_flushed_metaslab(spa, B_TRUE); } void metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); /* update metaslab's position in our flushing tree */ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); metaslab_set_unflushed_txg(msp, 
spa_syncing_txg(spa), tx); metaslab_set_unflushed_dirty(msp, dirty); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); /* update metaslab counts of spa_log_sm_t nodes */ spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); spa_log_sm_increment_current_mscount(spa); /* update log space map summary */ spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg, ms_prev_flushed_dirty); spa_log_summary_add_flushed_metaslab(spa, dirty); /* cleanup obsolete logs if any */ spa_cleanup_old_sm_logs(spa, tx); } /* * Called when the metaslab has been flushed (its own spacemap now reflects * all the contents of the pool-wide spacemap log). Updates the metaslab's * metadata and any pool-wide related log space map data (e.g. summary, * obsolete logs, etc..) to reflect that. */ static void metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(spa_sync_pass(spa), ==, 1); /* * Just because a metaslab got flushed, that doesn't mean that * it will pass through metaslab_sync_done(). Thus, make sure to * update ms_synced_length here in case it doesn't. */ msp->ms_synced_length = space_map_length(msp->ms_sm); /* * We may end up here from metaslab_condense() without the * feature being active. In that case this is a no-op. 
*/ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) || metaslab_unflushed_txg(msp) == 0) return; metaslab_unflushed_bump(msp, tx, B_FALSE); } boolean_t metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); /* * There is nothing wrong with flushing the same metaslab twice, as * this codepath should work on that case. However, the current * flushing scheme makes sure to avoid this situation as we would be * making all these calls without having anything meaningful to write * to disk. We assert this behavior here. */ ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); /* * We can not flush while loading, because then we would * not load the ms_unflushed_{allocs,frees}. */ if (msp->ms_loading) return (B_FALSE); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); /* * Metaslab condensing is effectively flushing. Therefore if the * metaslab can be condensed we can just condense it instead of * flushing it. * * Note that metaslab_condense() does call metaslab_flush_update() * so we can just return immediately after condensing. We also * don't need to care about setting ms_flushing or broadcasting * ms_flush_cv, even if we temporarily drop the ms_lock in * metaslab_condense(), as the metaslab is already loaded. */ if (msp->ms_loaded && metaslab_should_condense(msp)) { metaslab_group_t *mg = msp->ms_group; /* * For all histogram operations below refer to the * comments of metaslab_sync() where we follow a * similar procedure. 
*/ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); metaslab_condense(msp, tx); space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); /* * Since we recreated the histogram (and potentially * the ms_sm too while condensing) ensure that the * weight is updated too because we are not guaranteed * that this metaslab is dirty and will go through * metaslab_sync_done(). */ metaslab_recalculate_weight_and_sort(msp); return (B_TRUE); } msp->ms_flushing = B_TRUE; uint64_t sm_len_before = space_map_length(msp->ms_sm); mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); uint64_t sm_len_after = space_map_length(msp->ms_sm); if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx), spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)zfs_range_tree_space( msp->ms_unflushed_allocs), (u_longlong_t)zfs_range_tree_space( msp->ms_unflushed_frees), (u_longlong_t)(sm_len_after - sm_len_before)); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 
zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); metaslab_flush_update(msp, tx); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); msp->ms_flushing = B_FALSE; cv_broadcast(&msp->ms_flush_cv); return (B_TRUE); } /* * Write a metaslab to disk in the context of the specified transaction group. */ void metaslab_sync(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); zfs_range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); /* * This metaslab has just been added so there's no work to do now. */ if (msp->ms_new) { ASSERT0(zfs_range_tree_space(alloctree)); ASSERT0(zfs_range_tree_space(msp->ms_freeing)); ASSERT0(zfs_range_tree_space(msp->ms_freed)); ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); ASSERT0(zfs_range_tree_space(msp->ms_trim)); return; } /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being * forced to condense, it's loaded and we're not beyond the final * dirty txg, we need to let it through. Not condensing beyond the * final dirty txg prevents an issue where metaslabs that need to be * condensed but were loaded for other reasons could cause a panic * here. By only checking the txg in that branch of the conditional, * we preserve the utility of the VERIFY statements in all other * cases. */ if (zfs_range_tree_is_empty(alloctree) && zfs_range_tree_is_empty(msp->ms_freeing) && zfs_range_tree_is_empty(msp->ms_checkpointing) && !(msp->ms_loaded && msp->ms_condense_wanted && txg <= spa_final_dirty_txg(spa))) return; VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); /* * The only state that can actually be changing concurrently * with metaslab_sync() is the metaslab's ms_allocatable. 
No * other thread can be modifying this txg's alloc, freeing, * freed, or space_map_phys_t. We drop ms_lock whenever we * could call into the DMU, because the DMU can call down to * us (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state * concurrently, and it is locked out by the ms_sync_lock. * Note that the ms_lock is insufficient for this, because it * is dropped by space_map_write(). */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); /* * Generate a log space map if one doesn't exist already. */ spa_generate_syncing_log_sm(spa, tx); if (msp->ms_sm == NULL) { uint64_t new_object = space_map_alloc(mos, spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); VERIFY3U(new_object, !=, 0); dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &new_object, tx); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); ASSERT(msp->ms_sm != NULL); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); ASSERT0(metaslab_allocated_space(msp)); } if (!zfs_range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, zfs_vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * We save the space map object as an entry in vdev_top_zap * so it can be retrieved when the pool is reopened after an * export or through zdb. */ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (new_object), 1, &new_object, tx)); } mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * Note: metaslab_condense() clears the space map's histogram. 
* Therefore we must verify and remove this histogram before * condensing. */ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); if (spa->spa_sync_pass == 1 && msp->ms_loaded && metaslab_should_condense(msp)) metaslab_condense(msp, tx); /* * We'll be going to disk to sync our space accounting, thus we * drop the ms_lock during that time so allocations coming from * open-context (ZIL) for future TXGs do not block. */ mutex_exit(&msp->ms_lock); space_map_t *log_sm = spa_syncing_log_sm(spa); if (log_sm != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); if (metaslab_unflushed_txg(msp) == 0) metaslab_unflushed_add(msp, tx); else if (!metaslab_unflushed_dirty(msp)) metaslab_unflushed_bump(msp, tx, B_TRUE); space_map_write(log_sm, alloctree, SM_ALLOC, vd->vdev_id, tx); space_map_write(log_sm, msp->ms_freeing, SM_FREE, vd->vdev_id, tx); mutex_enter(&msp->ms_lock); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); zfs_range_tree_remove_xor_add(alloctree, msp->ms_unflushed_frees, msp->ms_unflushed_allocs); zfs_range_tree_remove_xor_add(msp->ms_freeing, msp->ms_unflushed_allocs, msp->ms_unflushed_frees); spa->spa_unflushed_stats.sus_memused += metaslab_unflushed_changes_memused(msp); } else { ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); } msp->ms_allocated_space += zfs_range_tree_space(alloctree); ASSERT3U(msp->ms_allocated_space, >=, zfs_range_tree_space(msp->ms_freeing)); msp->ms_allocated_space -= zfs_range_tree_space(msp->ms_freeing); if (!zfs_range_tree_is_empty(msp->ms_checkpointing)) { ASSERT(spa_has_checkpoint(spa)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * 
Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the * ms_lock while writing to the checkpoint space map, for the * same reason mentioned above. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); spa->spa_checkpoint_info.sci_dspace += zfs_range_tree_space(msp->ms_checkpointing); vd->vdev_stat.vs_checkpoint_space += zfs_range_tree_space(msp->ms_checkpointing); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, -space_map_allocated(vd->vdev_checkpoint_sm)); zfs_range_tree_vacate(msp->ms_checkpointing, NULL, NULL); } if (msp->ms_loaded) { /* * When the space map is loaded, we have an accurate * histogram in the range tree. This gives us an opportunity * to bring the space map's histogram up-to-date so we clear * it first before updating it. */ space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); /* * Since we've cleared the histogram we need to add back * any free space that has already been processed, plus * any deferred space. This allows the on-disk histogram * to accurately reflect all free space even if some space * is not yet available for allocation (i.e. deferred). */ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); /* * Add back any deferred free space that has not been * added back into the in-core free tree yet. This will * ensure that we don't end up with a space map histogram * that is completely empty unless the metaslab is fully * allocated. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } } /* * Always add the free space from this sync pass to the space * map histogram. We want to make sure that the on-disk histogram * accounts for all free space. If the space map is not loaded, * then we will lose some accuracy but will correct it the next * time we load the space map. 
*/ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); /* * For sync pass 1, we avoid traversing this txg's free range tree * and instead will just swap the pointers for freeing and freed. * We can safely do this since the freed_tree is guaranteed to be * empty on the initial pass. * * Keep in mind that even if we are currently using a log spacemap * we want current frees to end up in the ms_allocatable (but not * get appended to the ms_sm) so their ranges can be reused as usual. */ if (spa_sync_pass(spa) == 1) { zfs_range_tree_swap(&msp->ms_freeing, &msp->ms_freed); ASSERT0(msp->ms_allocated_this_txg); } else { zfs_range_tree_vacate(msp->ms_freeing, zfs_range_tree_add, msp->ms_freed); } msp->ms_allocated_this_txg += zfs_range_tree_space(alloctree); zfs_range_tree_vacate(alloctree, NULL, NULL); ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(zfs_range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) & TXG_MASK])); ASSERT0(zfs_range_tree_space(msp->ms_freeing)); ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); /* * Verify that the space map object ID has been recorded in the * vdev_ms_array. 
*/ uint64_t object; VERIFY0(dmu_read(mos, vd->vdev_ms_array, msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); VERIFY3U(object, ==, space_map_object(msp->ms_sm)); mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } static void metaslab_evict(metaslab_t *msp, uint64_t txg) { if (!msp->ms_loaded || msp->ms_disabled != 0) return; for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(zfs_range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } if (msp->ms_allocator != -1) metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); if (!metaslab_debug_unload) metaslab_unload(msp); } /* * Called after a transaction group has completely synced to mark * all of the metaslab's free space as usable. */ void metaslab_sync_done(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; zfs_range_tree_t **defer_tree; int64_t alloc_delta, defer_delta; boolean_t defer_allowed = B_TRUE; ASSERT(!vd->vdev_ishole); mutex_enter(&msp->ms_lock); if (msp->ms_new) { /* this is a new metaslab, add its capacity to the vdev */ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); /* there should be no allocations nor frees at this point */ VERIFY0(msp->ms_allocated_this_txg); VERIFY0(zfs_range_tree_space(msp->ms_freed)); } ASSERT0(zfs_range_tree_space(msp->ms_freeing)); ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || vd->vdev_rz_expanding) { defer_allowed = B_FALSE; } defer_delta = 0; alloc_delta = msp->ms_allocated_this_txg - zfs_range_tree_space(msp->ms_freed); if (defer_allowed) { defer_delta = zfs_range_tree_space(msp->ms_freed) - zfs_range_tree_space(*defer_tree); } else { defer_delta -= zfs_range_tree_space(*defer_tree); } 
metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0);

	if (spa_syncing_log_sm(spa) == NULL) {
		/*
		 * If there's a metaslab_load() in progress and we don't have
		 * a log space map, it means that we probably wrote to the
		 * metaslab's space map. If this is the case, we need to
		 * make sure that we wait for the load to complete so that we
		 * have a consistent view at the in-core side of the metaslab.
		 */
		metaslab_load_wait(msp);
	} else {
		ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
	}

	/*
	 * When auto-trimming is enabled, free ranges which are added to
	 * ms_allocatable are also added to ms_trim. The ms_trim tree is
	 * periodically consumed by the vdev_autotrim_thread() which issues
	 * trims for all ranges and then vacates the tree. The ms_trim tree
	 * can be discarded at any time with the sole consequence of recent
	 * frees not being trimmed.
	 */
	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
		zfs_range_tree_walk(*defer_tree, zfs_range_tree_add,
		    msp->ms_trim);
		/*
		 * Without deferral, ms_freed is handed straight to
		 * ms_allocatable below, so it must be queued for trim too.
		 */
		if (!defer_allowed) {
			zfs_range_tree_walk(msp->ms_freed, zfs_range_tree_add,
			    msp->ms_trim);
		}
	} else {
		zfs_range_tree_vacate(msp->ms_trim, NULL, NULL);
	}

	/*
	 * Move the frees from the defer_tree back to the free
	 * range tree (if it's loaded). Swap the freed_tree and
	 * the defer_tree -- this is safe to do because we've
	 * just emptied out the defer_tree.
	 */
	zfs_range_tree_vacate(*defer_tree,
	    msp->ms_loaded ? zfs_range_tree_add : NULL, msp->ms_allocatable);
	if (defer_allowed) {
		zfs_range_tree_swap(&msp->ms_freed, defer_tree);
	} else {
		zfs_range_tree_vacate(msp->ms_freed,
		    msp->ms_loaded ? zfs_range_tree_add : NULL,
		    msp->ms_allocatable);
	}

	msp->ms_synced_length = space_map_length(msp->ms_sm);

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
*/ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } metaslab_aux_histograms_update_done(msp, defer_allowed); if (msp->ms_new) { msp->ms_new = B_FALSE; mutex_enter(&mg->mg_lock); mg->mg_ms_ready++; mutex_exit(&mg->mg_lock); } /* * Re-sort metaslab within its group now that we've adjusted * its allocatable space. */ metaslab_recalculate_weight_and_sort(msp); ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(zfs_range_tree_space(msp->ms_freeing)); ASSERT0(zfs_range_tree_space(msp->ms_freed)); ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); msp->ms_allocating_total -= msp->ms_allocated_this_txg; msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); } void metaslab_sync_reassess(metaslab_group_t *mg) { spa_t *spa = mg->mg_class->mc_spa; spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); metaslab_group_alloc_update(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); /* * Preload the next potential metaslabs but only on active * metaslab groups. We can get into a state where the metaslab * is no longer active since we dirty metaslabs as we remove a * a device, thus potentially making the metaslab group eligible * for preloading. */ if (mg->mg_activation_count > 0) { metaslab_group_preload(mg); } spa_config_exit(spa, SCL_ALLOC, FTAG); } /* * When writing a ditto block (i.e. more than one DVA for a given BP) on * the same vdev as an existing DVA of this BP, then try to allocate it * on a different metaslab than existing DVAs (i.e. a unique metaslab). 
*/ static boolean_t metaslab_is_unique(metaslab_t *msp, dva_t *dva) { uint64_t dva_ms_id; if (DVA_GET_ASIZE(dva) == 0) return (B_TRUE); if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (B_TRUE); dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; return (msp->ms_id != dva_ms_id); } /* * ========================================================================== * Metaslab allocation tracing facility * ========================================================================== */ /* * Add an allocation trace element to the allocation tracing list. */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, int allocator) { metaslab_alloc_trace_t *mat; if (!metaslab_trace_enabled) return; /* * When the tracing list reaches its maximum we remove * the second element in the list before adding a new one. * By removing the second element we preserve the original * entry as a clue to what allocations steps have already been * performed. */ if (zal->zal_size == metaslab_trace_max_entries) { metaslab_alloc_trace_t *mat_next; #ifdef ZFS_DEBUG panic("too many entries in allocation list"); #endif METASLABSTAT_BUMP(metaslabstat_trace_over_limit); zal->zal_size--; mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); list_remove(&zal->zal_list, mat_next); kmem_cache_free(metaslab_alloc_trace_cache, mat_next); } mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); list_link_init(&mat->mat_list_node); mat->mat_mg = mg; mat->mat_msp = msp; mat->mat_size = psize; mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; /* * The list is part of the zio so locking is not required. Only * a single thread will perform allocations for a given zio. 
*/ list_insert_tail(&zal->zal_list, mat); zal->zal_size++; ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); } void metaslab_trace_init(zio_alloc_list_t *zal) { list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), offsetof(metaslab_alloc_trace_t, mat_list_node)); zal->zal_size = 0; } void metaslab_trace_fini(zio_alloc_list_t *zal) { metaslab_alloc_trace_t *mat; while ((mat = list_remove_head(&zal->zal_list)) != NULL) kmem_cache_free(metaslab_alloc_trace_cache, mat); list_destroy(&zal->zal_list); zal->zal_size = 0; } /* * ========================================================================== * Metaslab block operations * ========================================================================== */ static void metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag, int flags, int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag); } static void metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; metaslab_class_allocator_t *mca = &mg->mg_class->mc_allocator[allocator]; uint64_t max = mg->mg_max_alloc_queue_depth; uint64_t cur = mga->mga_cur_max_alloc_queue_depth; while (cur < max) { if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, cur, cur + 1) == cur) { atomic_inc_64(&mca->mca_alloc_max_slots); return; } cur = mga->mga_cur_max_alloc_queue_depth; } } void metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag, int flags, int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; 
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
	(void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
	/* A completed I/O lets the allocator's queue depth limit grow. */
	if (io_complete)
		metaslab_group_increment_qdepth(mg, allocator);
}

/*
 * Debug-only check (compiled in under ZFS_DEBUG) that `tag` holds no
 * reference on the allocation queue depth refcount of `allocator` in the
 * metaslab group of any DVA in `bp`. Without ZFS_DEBUG this is a no-op.
 */
void
metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag,
    int allocator)
{
#ifdef ZFS_DEBUG
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	for (int d = 0; d < ndvas; d++) {
		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
		metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
		VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
	}
#endif
}

/*
 * Try to allocate `size` bytes from `msp` using the class's msop_alloc op.
 *
 * The caller must hold ms_lock, and the metaslab must not be condensing,
 * disabled or new (all verified). On success the range is removed from
 * ms_allocatable, cleared from ms_trim and added to this txg's
 * ms_allocating tree; the vdev is dirtied for `txg` if that tree was empty.
 * ms_max_size is refreshed whether or not the allocation succeeded.
 *
 * Returns the start offset of the allocation, or -1ULL on failure.
 */
static uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	uint64_t start;
	zfs_range_tree_t *rt = msp->ms_allocatable;
	metaslab_class_t *mc = msp->ms_group->mg_class;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	VERIFY(!msp->ms_condensing);
	VERIFY0(msp->ms_disabled);
	VERIFY0(msp->ms_new);

	start = mc->mc_ops->msop_alloc(msp, size);
	if (start != -1ULL) {
		metaslab_group_t *mg = msp->ms_group;
		vdev_t *vd = mg->mg_vd;

		/* Both offset and size must be aligned to the vdev's ashift. */
		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
		VERIFY3U(zfs_range_tree_space(rt) - size, <=, msp->ms_size);
		zfs_range_tree_remove(rt, start, size);
		zfs_range_tree_clear(msp->ms_trim, start, size);

		/* First allocation in this txg: mark the metaslab dirty. */
		if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

		zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], start,
		    size);
		msp->ms_allocating_total += size;

		/* Track the last successful allocation */
		msp->ms_alloc_txg = txg;
		metaslab_verify_space(msp, txg);
	}

	/*
	 * Now that we've attempted the allocation we need to update the
	 * metaslab's maximum block size since it may have changed.
	 */
	msp->ms_max_size = metaslab_largest_allocatable(msp);
	return (start);
}

/*
 * Find the metaslab with the highest weight that is less than what we've
 * already tried.
In the common case, this means that we will examine each
 * metaslab at most once. Note that concurrent callers could reorder metaslabs
 * by activation/passivation once we have dropped the mg_lock. If a metaslab is
 * activated by another thread, and we fail to allocate from the metaslab we
 * have selected, we may not try the newly-activated metaslab, and instead
 * activate another metaslab. This is not optimal, but generally does not cause
 * any problems (a possible exception being if every metaslab is completely full
 * except for the newly-activated metaslab which we fail to examine).
 *
 * The walk starts at the tree entry matching `search`, or at the nearest
 * entry after it. When a metaslab is returned, `search` is updated from it
 * (with ms_start advanced by one) and *was_active reports whether it was
 * already assigned to an allocator. Returns NULL when the tree is exhausted
 * or, if !try_hard, once more than zfs_metaslab_find_max_tries candidates
 * have been examined.
 */
static metaslab_t *
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
    dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
    boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
    boolean_t *was_active)
{
	avl_index_t idx;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	/* Resume at `search`, or at its successor if it is not in the tree. */
	metaslab_t *msp = avl_find(t, search, &idx);
	if (msp == NULL)
		msp = avl_nearest(t, idx, AVL_AFTER);

	uint_t tries = 0;
	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
		int i;

		/* Bound the search unless the caller is trying hard. */
		if (!try_hard && tries > zfs_metaslab_find_max_tries) {
			METASLABSTAT_BUMP(metaslabstat_too_many_tries);
			return (NULL);
		}
		tries++;

		if (!metaslab_should_allocate(msp, asize, try_hard)) {
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_TOO_SMALL, allocator);
			continue;
		}

		/*
		 * If the selected metaslab is condensing or disabled, or
		 * hasn't gone through a metaslab_sync_done(), then skip it.
		 */
		if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new)
			continue;

		*was_active = msp->ms_allocator != -1;
		/*
		 * If we're activating as primary, this is our first allocation
		 * from this disk, so we don't need to check how close we are.
		 * If the metaslab under consideration was already active,
		 * we're getting desperate enough to steal another allocator's
		 * metaslab, so we still don't care about distances.
*/ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) break; for (i = 0; i < d; i++) { if (want_unique && !metaslab_is_unique(msp, &dva[i])) break; /* try another metaslab */ } if (i == d) break; } if (msp != NULL) { search->ms_weight = msp->ms_weight; search->ms_start = msp->ms_start + 1; search->ms_allocator = msp->ms_allocator; search->ms_primary = msp->ms_primary; } return (msp); } static void metaslab_active_mask_verify(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) return; if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(!msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY3S(msp->ms_allocator, ==, -1); return; } } static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, int allocator, boolean_t try_hard) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; for (int i = 0; i < d; i++) { if (activation_weight == METASLAB_WEIGHT_PRIMARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_CLAIM; break; } } /* * If we don't have enough metaslabs active to fill the entire array, we * just use the 0th slot. 
*/ if (mg->mg_ms_ready < mg->mg_allocators * 3) allocator = 0; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; /* * At the end of the metaslab tree are the already-active metaslabs, * first the primaries, then the secondaries. When we resume searching * through the tree, we need to consider ms_allocator and ms_primary so * we start in the location right after where we left off, and don't * accidentally loop forever considering the same metaslabs. */ search->ms_allocator = -1; search->ms_primary = B_TRUE; for (;;) { boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); if (activation_weight == METASLAB_WEIGHT_PRIMARY && mga->mga_primary != NULL) { msp = mga->mga_primary; /* * Even though we don't hold the ms_lock for the * primary metaslab, those fields should not * change while we hold the mg_lock. Thus it is * safe to make assertions on them. */ ASSERT(msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && mga->mga_secondary != NULL) { msp = mga->mga_secondary; /* * See comment above about the similar assertions * for the primary metaslab. */ ASSERT(!msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, want_unique, asize, allocator, try_hard, zal, search, &was_active); } mutex_exit(&mg->mg_lock); if (msp == NULL) { kmem_free(search, sizeof (*search)); return (-1ULL); } mutex_enter(&msp->ms_lock); metaslab_active_mask_verify(msp); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. 
*/
#if 0
		DTRACE_PROBE3(ms__activation__attempt,
		    metaslab_t *, msp, uint64_t, activation_weight,
		    boolean_t, was_active);
#endif

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock. We check the
		 * active status first to see if we need to select
		 * a new metaslab.
		 */
		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
			ASSERT3S(msp->ms_allocator, ==, -1);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If the metaslab was activated for another allocator
		 * while we were waiting on the ms_lock above, or it's
		 * a primary and we're seeking a secondary (or vice versa),
		 * we go back and select a new metaslab.
		 */
		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    (msp->ms_allocator != -1) &&
		    (msp->ms_allocator != allocator || ((activation_weight ==
		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
			ASSERT(msp->ms_loaded);
			ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
			    msp->ms_allocator != -1);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * This metaslab was used for claiming regions allocated
		 * by the ZIL during pool import. Once these regions are
		 * claimed we don't need to keep the CLAIM bit set
		 * anymore. Passivate this metaslab to zero its activation
		 * mask.
		 */
		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
		    activation_weight != METASLAB_WEIGHT_CLAIM) {
			ASSERT(msp->ms_loaded);
			ASSERT3S(msp->ms_allocator, ==, -1);
			metaslab_passivate(msp, msp->ms_weight &
			    ~METASLAB_WEIGHT_CLAIM);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		metaslab_set_selected_txg(msp, txg);

		int activation_error =
		    metaslab_activate(msp, allocator, activation_weight);
		metaslab_active_mask_verify(msp);

		/*
		 * If the metaslab was activated by another thread for
		 * another allocator or activation_weight (EBUSY), or it
		 * failed because another metaslab was assigned as primary
		 * for this allocator (EEXIST) we continue using this
		 * metaslab for our allocation, rather than going on to a
		 * worse metaslab (we waited for that metaslab to be loaded
		 * after all).
		 *
		 * If the activation failed due to an I/O error or ENOSPC we
		 * skip to the next metaslab.
		 */
		boolean_t activated;
		if (activation_error == 0) {
			activated = B_TRUE;
		} else if (activation_error == EBUSY ||
		    activation_error == EEXIST) {
			activated = B_FALSE;
		} else {
			mutex_exit(&msp->ms_lock);
			continue;
		}
		ASSERT(msp->ms_loaded);

		/*
		 * Now that we have the lock, recheck to see if we should
		 * continue to use this metaslab for this allocation. The
		 * metaslab is now loaded so metaslab_should_allocate()
		 * can accurately determine if the allocation attempt should
		 * proceed.
		 */
		if (!metaslab_should_allocate(msp, asize, try_hard)) {
			/* Passivate this metaslab and select a new one. */
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_TOO_SMALL, allocator);
			goto next;
		}

		/*
		 * If this metaslab is currently condensing then pick again
		 * as we can't manipulate this metaslab until it's committed
		 * to disk. If this metaslab is being initialized, we shouldn't
		 * allocate from it since the allocated region might be
		 * overwritten after allocation.
*/ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } else if (msp->ms_disabled > 0) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_DISABLED, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } offset = metaslab_block_alloc(msp, asize, txg); metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ if (activated) metaslab_segment_may_passivate(msp); break; } next: ASSERT(msp->ms_loaded); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. */ #if 0 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, uint64_t, asize); #endif /* * We were unable to allocate from this metaslab so determine * a new weight for this metaslab. Now that we have loaded * the metaslab we can provide a better hint to the metaslab * selector. * * For space-based metaslabs, we use the maximum block size. * This information is only available when the metaslab * is loaded and is more accurate than the generic free * space weight that was calculated by metaslab_weight(). * This information allows us to quickly compare the maximum * available allocation in the metaslab to the allocation * size being requested. * * For segment-based metaslabs, determine the new weight * based on the highest bucket in the range tree. We * explicitly use the loaded segment weight (i.e. the range * tree histogram) since it contains the space that is * currently available for allocation and is accurate * even within a sync pass. 
*/
		uint64_t weight;
		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
			weight = metaslab_largest_allocatable(msp);
			WEIGHT_SET_SPACEBASED(weight);
		} else {
			weight = metaslab_weight_from_range_tree(msp);
		}

		if (activated) {
			metaslab_passivate(msp, weight);
		} else {
			/*
			 * For the case where we use the metaslab that is
			 * active for another allocator we want to make
			 * sure that we retain the activation mask.
			 *
			 * Note that we could attempt to use something like
			 * metaslab_recalculate_weight_and_sort() that
			 * retains the activation mask here. That function
			 * uses metaslab_weight() to set the weight though
			 * which is not as accurate as the calculations
			 * above.
			 */
			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
			metaslab_group_sort(mg, msp, weight);
		}
		metaslab_active_mask_verify(msp);

		/*
		 * We have just failed an allocation attempt, check
		 * that metaslab_should_allocate() agrees. Otherwise,
		 * we may end up in an infinite loop retrying the same
		 * metaslab.
		 */
		ASSERT(!metaslab_should_allocate(msp, asize, try_hard));

		mutex_exit(&msp->ms_lock);
	}
	mutex_exit(&msp->ms_lock);
	kmem_free(search, sizeof (*search));
	return (offset);
}

/*
 * Allocate asize bytes from metaslab group mg by calling
 * metaslab_group_alloc_normal(), then update the group's counters while
 * holding mg_lock.
 *
 * Returns the value produced by metaslab_group_alloc_normal(); -1ULL is
 * treated as failure. mg_allocations is incremented on every call. On
 * failure mg_failed_allocations is incremented as well and a
 * TRACE_GROUP_FAILURE entry is added to zal.
 */
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
    uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
    int allocator, boolean_t try_hard)
{
	uint64_t offset;

	offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
	    dva, d, allocator, try_hard);

	mutex_enter(&mg->mg_lock);
	if (offset == -1ULL) {
		mg->mg_failed_allocations++;
		metaslab_trace_add(zal, mg, NULL, asize, d,
		    TRACE_GROUP_FAILURE, allocator);
		if (asize == SPA_GANGBLOCKSIZE) {
			/*
			 * This metaslab group was unable to allocate
			 * the minimum gang block size so it must be out of
			 * space. We must notify the allocation throttle
			 * to start skipping allocation attempts to this
			 * metaslab group until more space becomes available.
			 * Note: this failure cannot be caused by the
			 * allocation throttle since the allocation throttle
			 * is only responsible for skipping devices and
			 * not failing block allocations.
			 */
			mg->mg_no_free_space = B_TRUE;
		}
	}
	mg->mg_allocations++;
	mutex_exit(&mg->mg_lock);
	return (offset);
}

/*
 * Allocate a block for the specified i/o.
 *
 * On success dva[d] is filled in (vdev id, offset, gang bit and asize) and
 * 0 is returned. ENOSPC is returned either when the forced-ganging test
 * hook fires, or when a full walk of the metaslab groups (repeated once
 * with try_hard set, if the conditions near the end of the function allow
 * it) finds no space; in the latter case dva[d] is zeroed before returning.
 */
int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
    zio_alloc_list_t *zal, int allocator)
{
	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	boolean_t try_hard = B_FALSE;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 * This will result in more split blocks when using device removal,
	 * and a large number of split blocks coupled with ztest-induced
	 * damage can result in extremely long reconstruction times. This
	 * will also test spilling from special to normal.
	 */
	if (psize >= metaslab_force_ganging &&
	    metaslab_force_ganging_pct > 0 &&
	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
		    allocator);
		return (SET_ERROR(ENOSPC));
	}

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mca_rotor or mca_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly. It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs. If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible. If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about. Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data. With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header. That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists or its mg has been closed (e.g. by
		 * device removal). Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL && vd->vdev_mg != NULL) {
			mg = vdev_get_mg(vd, mc);

			if (flags & METASLAB_HINTBP_AVOID)
				mg = mg->mg_next;
		} else {
			mg = mca->mca_rotor;
		}
	} else if (d != 0) {
		/* Continue from the group after the one used for dva[d - 1]. */
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		ASSERT(mca->mca_rotor != NULL);
		mg = mca->mca_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mca->mca_rotor;

	/* Remember where the walk started so the loop below can terminate. */
	rotor = mg;
top:
	do {
		boolean_t allocatable;

		ASSERT(mg->mg_activation_count == 1);
		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (try_hard) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}

		/*
		 * Determine if the selected metaslab group is eligible
		 * for allocations. If we're ganging then don't allow
		 * this metaslab group to skip allocations since that would
		 * inadvertently return ENOSPC and suspend the pool
		 * even though space is still available.
		 */
		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
			allocatable = metaslab_group_allocatable(mg, rotor,
			    flags, psize, allocator, d);
		}

		if (!allocatable) {
			metaslab_trace_add(zal, mg, NULL, psize, d,
			    TRACE_NOT_ALLOCATABLE, allocator);
			goto next;
		}

		/*
		 * Avoid writing single-copy data to an unhealthy,
		 * non-redundant vdev, unless we've already tried all
		 * other vdevs.
		 */
		if (vd->vdev_state < VDEV_STATE_HEALTHY &&
		    d == 0 && !try_hard && vd->vdev_children == 0) {
			metaslab_trace_add(zal, mg, NULL, psize, d,
			    TRACE_VDEV_ERROR, allocator);
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		/*
		 * If we don't need to try hard, then require that the
		 * block be on a different metaslab from any other DVAs
		 * in this BP (unique=true). If we are trying hard, then
		 * allow any metaslab to be used (unique=false).
		 */
		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
		    !try_hard, dva, d, allocator, try_hard);

		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 *
			 * Bias is also used to compensate for unequally
			 * sized vdevs so that space is allocated fairly.
			 */
			if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vs_free = vs->vs_space - vs->vs_alloc;
				int64_t mc_free = mc->mc_space - mc->mc_alloc;
				int64_t ratio;

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 *
				 * This basically introduces a zero-centered
				 * bias towards the devices with the most
				 * free space, while compensating for vdev
				 * size differences.
				 *
				 * Examples:
				 *  vdev V1 = 16M/128M
				 *  vdev V2 = 16M/128M
				 *  ratio(V1) = 100% ratio(V2) = 100%
				 *
				 *  vdev V1 = 16M/128M
				 *  vdev V2 = 64M/128M
				 *  ratio(V1) = 127% ratio(V2) =  72%
				 *
				 *  vdev V1 = 16M/128M
				 *  vdev V2 = 64M/512M
				 *  ratio(V1) =  40% ratio(V2) = 160%
				 */
				/* The "+ 1" keeps the divisor non-zero. */
				ratio = (vs_free * mc->mc_alloc_groups * 100) /
				    (mc_free + 1);
				mg->mg_bias = ((ratio - 100) *
				    (int64_t)mg->mg_aliquot) / 100;
			} else if (!metaslab_bias_enabled) {
				mg->mg_bias = 0;
			}

			/*
			 * Advance the rotor once this group has received its
			 * aliquot (plus bias), or unconditionally for ZIL
			 * allocations.
			 */
			if ((flags & METASLAB_ZIL) ||
			    atomic_add_64_nv(&mca->mca_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mca->mca_rotor = mg->mg_next;
				mca->mca_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d],
			    ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		/* This group did not work out; move the rotor past it. */
		mca->mca_rotor = mg->mg_next;
		mca->mca_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	/*
	 * If we haven't tried hard, perhaps do so now.
	 */
	if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
	    GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
	    psize <= 1 << spa->spa_min_ashift)) {
		METASLABSTAT_BUMP(metaslabstat_try_hard);
		try_hard = B_TRUE;
		goto top;
	}

	memset(&dva[d], 0, sizeof (dva_t));

	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
	return (SET_ERROR(ENOSPC));
}

/*
 * Queue the range [offset, offset + asize) of a concrete vdev for freeing.
 *
 * The range is added to the owning metaslab's ms_checkpointing tree when
 * checkpoint is set and to ms_freeing otherwise. If both of those trees
 * were empty beforehand, the metaslab is dirtied for the syncing txg.
 */
void
metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
    boolean_t checkpoint)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;

	ASSERT(vdev_is_concrete(vd));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	/* The range must be ashift-aligned and lie inside this metaslab. */
	VERIFY(!msp->ms_condensing);
	VERIFY3U(offset, >=, msp->ms_start);
	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));

	metaslab_check_free_impl(vd, offset, asize);

	mutex_enter(&msp->ms_lock);
	if (zfs_range_tree_is_empty(msp->ms_freeing) &&
	    zfs_range_tree_is_empty(msp->ms_checkpointing)) {
		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
	}

	if (checkpoint) {
		ASSERT(spa_has_checkpoint(spa));
		zfs_range_tree_add(msp->ms_checkpointing, offset, asize);
	} else {
		zfs_range_tree_add(msp->ms_freeing, offset, asize);
	}
	mutex_exit(&msp->ms_lock);
}

/*
 * vdev_op_remap callback used by metaslab_free_impl(). arg points at the
 * caller's boolean_t checkpoint flag. A segment that lands on a vdev which
 * itself has a remap op is marked obsolete; any other segment is handed
 * back to metaslab_free_impl().
 */
void
metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	(void) inner_offset;
	boolean_t *checkpoint = arg;

	ASSERT3P(checkpoint, !=, NULL);

	if (vd->vdev_ops->vdev_op_remap != NULL)
		vdev_indirect_mark_obsolete(vd, offset, size);
	else
		metaslab_free_impl(vd, offset, size, *checkpoint);
}

/*
 * Free the range [offset, offset + size) on top-level vdev vd, choosing
 * the path by vdev kind: a concrete vdev that is currently being removed,
 * a vdev with a remap op, or an ordinary concrete vdev. Nothing is done
 * when the syncing txg is past the freeze txg.
 */
static void
metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
    boolean_t checkpoint)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
		return;

	if (spa->spa_vdev_removal != NULL &&
	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
	    vdev_is_concrete(vd)) {
		/*
		 * Note: we check if the vdev is concrete because when
		 * we complete the removal, we first change the vdev to be
		 * an indirect vdev (in open context), and then (in syncing
		 * context) clear spa_vdev_removal.
		 */
		free_from_removing_vdev(vd, offset, size);
	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
		vdev_indirect_mark_obsolete(vd, offset, size);
		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_free_impl_cb, &checkpoint);
	} else {
		metaslab_free_concrete(vd, offset, size, checkpoint);
	}
}

/* State shared between spa_remap_blkptr() and remap_blkptr_cb(). */
typedef struct remap_blkptr_cb_arg {
	blkptr_t *rbca_bp;		/* block pointer being remapped */
	spa_remap_cb_t rbca_cb;		/* optional callback; may be NULL */
	vdev_t *rbca_remap_vd;		/* vdev reported by next rbca_cb call */
	uint64_t rbca_remap_offset;	/* offset reported by next rbca_cb call */
	void *rbca_cb_arg;		/* opaque argument passed to rbca_cb */
} remap_blkptr_cb_arg_t;

/*
 * vdev_op_remap callback that rewrites dva[0] of the block pointer in arg
 * (a remap_blkptr_cb_arg_t) to point at (vd, offset). Segments whose size
 * differs from the asize of dva[0] are ignored, leaving the BP untouched.
 */
static void
remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	remap_blkptr_cb_arg_t *rbca = arg;
	blkptr_t *bp = rbca->rbca_bp;

	/* We can not remap split blocks. */
	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
		return;
	ASSERT0(inner_offset);

	if (rbca->rbca_cb != NULL) {
		/*
		 * At this point we know that we are not handling split
		 * blocks and we invoke the callback on the previous
		 * vdev which must be indirect.
		 */
		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);

		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);

		/* set up remap_blkptr_cb_arg for the next call */
		rbca->rbca_remap_vd = vd;
		rbca->rbca_remap_offset = offset;
	}

	/*
	 * The phys birth time is that of dva[0].  This ensures that we know
	 * when each dva was written, so that resilver can determine which
	 * blocks need to be scrubbed (i.e. those written during the time
	 * the vdev was offline).  It also ensures that the key used in
	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
	 * we didn't change the phys_birth, a lookup in the ARC for a
	 * remapped BP could find the data that was previously stored at
	 * this vdev + offset.
	 */
	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
	    DVA_GET_VDEV(&bp->blk_dva[0]));
	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
	uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
	BP_SET_PHYSICAL_BIRTH(bp, physical_birth);

	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
}

/*
 * If the block pointer contains any indirect DVAs, modify them to refer to
 * concrete DVAs.  Note that this will sometimes not be possible, leaving
 * the indirect DVA in place.  This happens if the indirect DVA spans multiple
 * segments in the mapping (i.e. it is a "split block").
 *
 * If the BP was remapped, calls the callback on the original dva (note the
 * callback can be called multiple times if the original indirect DVA refers
 * to another indirect DVA, etc).
 *
 * Returns TRUE if the BP was remapped.
*/
boolean_t
spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
{
	/* Remapping needs both the tunable and the pool feature. */
	if (!zfs_remap_blkptr_enable ||
	    !spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
		return (B_FALSE);

	/*
	 * Two kinds of BP must keep their DVA[0] exactly as written:
	 *
	 *  - dedup BPs, because ddt_phys_select() depends on DVA[0] being
	 *    the same in the BP as in the DDT (dedup table);
	 *
	 *  - gang BPs, because zio_checksum_gang_verifier() depends on the
	 *    DVA[0] in the BP used to read the gang block header (GBH)
	 *    being the same as the DVA[0] that was allocated for the GBH.
	 */
	if (BP_GET_DEDUP(bp) || BP_IS_GANG(bp))
		return (B_FALSE);

	/* Embedded BPs carry no DVA, so there is nothing to remap. */
	if (BP_GET_NDVAS(bp) < 1)
		return (B_FALSE);

	/*
	 * Only dva[0] is ever remapped.  Remapping the other dvas would
	 * lose track of what their phys birth txg is.
	 */
	dva_t *dva0 = &bp->blk_dva[0];
	uint64_t dva_offset = DVA_GET_OFFSET(dva0);
	uint64_t dva_asize = DVA_GET_ASIZE(dva0);
	vdev_t *top_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva0));

	/* A vdev without a remap op has nothing to translate. */
	if (top_vd->vdev_ops->vdev_op_remap == NULL)
		return (B_FALSE);

	remap_blkptr_cb_arg_t cb_state = {
		.rbca_bp = bp,
		.rbca_cb = callback,
		.rbca_remap_vd = top_vd,
		.rbca_remap_offset = dva_offset,
		.rbca_cb_arg = arg,
	};

	/*
	 * remap_blkptr_cb() runs once per level of indirection, in order,
	 * until a concrete vdev is reached or a split block is encountered.
	 * It advances rbca_remap_vd and rbca_remap_offset as it moves from
	 * one indirect vdev to the next vdev (concrete or indirect again).
	 */
	top_vd->vdev_ops->vdev_op_remap(top_vd, dva_offset, dva_asize,
	    remap_blkptr_cb, &cb_state);

	/* DVA[0] still on the starting vdev means it was a split block. */
	if (DVA_GET_VDEV(dva0) == top_vd->vdev_id)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Undo the allocation of a DVA which happened in the given transaction group.
*/ void metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { metaslab_t *msp; vdev_t *vd; uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (txg > spa_freeze_txg(spa)) return; if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) || (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu", (u_longlong_t)vdev, (u_longlong_t)offset, (u_longlong_t)size); return; } ASSERT(!vd->vdev_removing); ASSERT(vdev_is_concrete(vd)); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); if (DVA_GET_GANG(dva)) size = vdev_gang_header_asize(vd); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); zfs_range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); msp->ms_allocating_total -= size; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); zfs_range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } /* * Free the block represented by the given DVA. */ void metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, vdev); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (DVA_GET_GANG(dva)) { size = vdev_gang_header_asize(vd); } metaslab_free_impl(vd, offset, size, checkpoint); } /* * Reserve some allocation slots. The reservation system must be called * before we call into the allocator. 
If there aren't any available slots
 * then the I/O will be throttled until an I/O completes and its slots are
 * freed up. The function returns true if it was successful in placing
 * the reservation.
 *
 * The slot limit is not enforced for gang allocations or when
 * METASLAB_MUST_RESERVE is set in flags. On success the zio is also
 * flagged with ZIO_FLAG_IO_ALLOCATING.
 */
boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots,
    int allocator, zio_t *zio, int flags)
{
	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
	uint64_t max = mca->mca_alloc_max_slots;

	ASSERT(mc->mc_alloc_throttle_enabled);
	if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
	    zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
		/*
		 * The potential race between _count() and _add() is covered
		 * by the allocator lock in most cases, or irrelevant due to
		 * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others.
		 * But even if we assume some other non-existing scenario, the
		 * worst that can happen is few more I/Os get to allocation
		 * earlier, that is not a problem.
		 *
		 * We reserve the slots individually so that we can unreserve
		 * them individually when an I/O completes.
		 */
		zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Drop `slots` references held by zio on the allocation-slot refcount of
 * the given class and allocator.
 */
void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
    int allocator, zio_t *zio)
{
	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];

	ASSERT(mc->mc_alloc_throttle_enabled);
	zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio);
}

/*
 * Claim the range [offset, offset + size) on a concrete vdev.
 *
 * With txg == 0 this is a dry run: the range is only checked against
 * ms_allocatable and nothing is modified. Otherwise the range is removed
 * from ms_allocatable and cleared from ms_trim and, on a writeable pool,
 * added to the allocating tree for txg.
 *
 * Returns 0 on success, ENXIO if offset lies beyond the vdev's last
 * metaslab, ENOENT if zfs_range_tree_contains() does not find the range in
 * ms_allocatable, or the error from metaslab_activate() (EBUSY is treated
 * as success).
 */
static int
metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;
	int error = 0;

	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
		return (SET_ERROR(ENXIO));

	ASSERT3P(vd->vdev_ms, !=, NULL);
	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
		if (error == EBUSY) {
			/* Already active elsewhere; that is good enough. */
			ASSERT(msp->ms_loaded);
			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
			error = 0;
		}
	}

	if (error == 0 &&
	    !zfs_range_tree_contains(msp->ms_allocatable, offset, size))
		error = SET_ERROR(ENOENT);

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	VERIFY(!msp->ms_condensing);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) - size, <=,
	    msp->ms_size);
	zfs_range_tree_remove(msp->ms_allocatable, offset, size);
	zfs_range_tree_clear(msp->ms_trim, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(8) */
		metaslab_class_t *mc = msp->ms_group->mg_class;
		multilist_sublist_t *mls =
		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
		if (!multilist_link_active(&msp->ms_class_txg_node)) {
			msp->ms_selected_txg = txg;
			multilist_sublist_insert_head(mls, msp);
		}
		multilist_sublist_unlock(mls);

		/* Dirty the metaslab only for the first range of this txg. */
		if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK],
		    offset, size);
		msp->ms_allocating_total += size;
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

/* Argument block passed from metaslab_claim_impl() to its remap callback. */
typedef struct metaslab_claim_cb_arg_t {
	uint64_t	mcca_txg;	/* txg forwarded to each claim */
	int		mcca_error;	/* first error seen, 0 if none */
} metaslab_claim_cb_arg_t;

/*
 * vdev_op_remap callback for metaslab_claim_impl(): claims each segment via
 * metaslab_claim_concrete() until one fails; once mcca_error is non-zero
 * the remaining segments are skipped.
 */
static void
metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	(void) inner_offset;
	metaslab_claim_cb_arg_t *mcca_arg = arg;

	if (mcca_arg->mcca_error == 0) {
		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
		    size, mcca_arg->mcca_txg);
	}
}

/*
 * Claim [offset, offset + size) on vd for txg. For a vdev with a remap op
 * the range is first claimed on the vdevs it maps to (through the callback
 * above) and then, if that succeeded, on vd itself. Returns 0 or the first
 * error encountered.
 */
int
metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
{
	if (vd->vdev_ops->vdev_op_remap != NULL) {
		metaslab_claim_cb_arg_t arg;

		/*
		 * Only zdb(8) can claim on indirect vdevs.  This is used
		 * to detect leaks of mapped space (that are not accounted
		 * for in the obsolete counts, spacemap, or bpobj).
		 */
		ASSERT(!spa_writeable(vd->vdev_spa));
		arg.mcca_error = 0;
		arg.mcca_txg = txg;

		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_claim_impl_cb, &arg);

		if (arg.mcca_error == 0) {
			arg.mcca_error = metaslab_claim_concrete(vd,
			    offset, size, txg);
		}
		return (arg.mcca_error);
	} else {
		return (metaslab_claim_concrete(vd, offset, size, txg));
	}
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
*/ static int metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { return (SET_ERROR(ENXIO)); } ASSERT(DVA_IS_VALID(dva)); if (DVA_GET_GANG(dva)) size = vdev_gang_header_asize(vd); return (metaslab_claim_impl(vd, offset, size, txg)); } int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, zio_t *zio, int allocator) { dva_t *dva = bp->blk_dva; dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; int error = 0; ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); if (mc->mc_allocator[allocator].mca_rotor == NULL) { /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); return (SET_ERROR(ENOSPC)); } ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, !=, NULL); for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator, B_FALSE); memset(&dva[d], 0, sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); } else { /* * Update the metaslab group's queue depth * based on the newly allocated dva. 
*/ metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); spa_config_exit(spa, SCL_ALLOC, FTAG); BP_SET_BIRTH(bp, txg, 0); return (0); } void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that * the blocks that we free that are part of the checkpoint won't be * reused until the checkpoint is discarded or we revert to it. * * The checkpoint flag is passed down the metaslab_free code path * and is set whenever we want to add a block to the checkpoint's * accounting. That is, we "checkpoint" blocks that existed at the * time the checkpoint was created and are therefore referenced by * the checkpointed uberblock. * * Note that, we don't checkpoint any blocks if the current * syncing txg <= spa_checkpoint_txg. We want these frees to sync * normally as they will be referenced by the checkpointed uberblock. */ boolean_t checkpoint = B_FALSE; if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint * there is no way it was created in the current txg. 
*/ ASSERT(!now); ASSERT3U(spa_syncing_txg(spa), ==, txg); checkpoint = B_TRUE; } spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { if (now) { metaslab_unalloc_dva(spa, &dva[d], txg); } else { ASSERT3U(txg, ==, spa_syncing_txg(spa)); metaslab_free_dva(spa, &dva[d], checkpoint); } } spa_config_exit(spa, SCL_FREE, FTAG); } int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); int error = 0; ASSERT(!BP_IS_HOLE(bp)); if (txg != 0) { /* * First do a dry run to make sure all DVAs are claimable, * so we don't have to unwind from partial failures below. */ if ((error = metaslab_claim(spa, bp, 0)) != 0) return (error); } spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { error = metaslab_claim_dva(spa, &dva[d], txg); if (error != 0) break; } spa_config_exit(spa, SCL_ALLOC, FTAG); ASSERT(error == 0 || txg == 0); return (error); } static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner, (void) arg; if (vd->vdev_ops == &vdev_indirect_ops) return; metaslab_check_free_impl(vd, offset, size); } static void metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) { metaslab_t *msp; spa_t *spa __maybe_unused = vd->vdev_spa; if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; if (vd->vdev_ops->vdev_op_remap != NULL) { vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_check_free_impl_cb, NULL); return; } ASSERT(vdev_is_concrete(vd)); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if (msp->ms_loaded) { zfs_range_tree_verify_not_present(msp->ms_allocatable, offset, size); } /* * Check all segments that currently exist in the freeing pipeline. 
* * It would intuitively make sense to also check the current allocating * tree since metaslab_unalloc_dva() exists for extents that are * allocated and freed in the same sync pass within the same txg. * Unfortunately there are places (e.g. the ZIL) where we allocate a * segment but then we free part of it within the same txg * [see zil_sync()]. Thus, we don't call zfs_range_tree_verify() in the * current allocating tree. */ zfs_range_tree_verify_not_present(msp->ms_freeing, offset, size); zfs_range_tree_verify_not_present(msp->ms_checkpointing, offset, size); zfs_range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) zfs_range_tree_verify_not_present(msp->ms_defer[j], offset, size); zfs_range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } void metaslab_check_free(spa_t *spa, const blkptr_t *bp) { if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); if (DVA_GET_GANG(&bp->blk_dva[i])) size = vdev_gang_header_asize(vd); ASSERT3P(vd, !=, NULL); metaslab_check_free_impl(vd, offset, size); } spa_config_exit(spa, SCL_VDEV, FTAG); } static void metaslab_group_disable_wait(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); while (mg->mg_disabled_updating) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } } static void metaslab_group_disabled_increment(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); ASSERT(mg->mg_disabled_updating); while (mg->mg_ms_disabled >= max_disabled_ms) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } mg->mg_ms_disabled++; ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); } /* * Mark the metaslab as disabled to prevent any 
allocations on this metaslab.
 * We must also track how many metaslabs are currently disabled within a
 * metaslab group and limit them to prevent allocation failures from
 * occurring because all metaslabs are disabled.
 */
void
metaslab_disable(metaslab_t *msp)
{
	ASSERT(!MUTEX_HELD(&msp->ms_lock));
	metaslab_group_t *mg = msp->ms_group;

	mutex_enter(&mg->mg_ms_disabled_lock);

	/*
	 * To keep an accurate count of how many threads have disabled
	 * a specific metaslab group, we only allow one thread to mark
	 * the metaslab group at a time. This ensures that the value of
	 * mg_ms_disabled will be accurate when we decide to mark a metaslab
	 * group as disabled. To do this we force all other threads
	 * to wait till the metaslab group's mg_disabled_updating flag is no
	 * longer set.
	 */
	metaslab_group_disable_wait(mg);
	mg->mg_disabled_updating = B_TRUE;
	/* Only the first disable of a metaslab counts against the group. */
	if (msp->ms_disabled == 0) {
		metaslab_group_disabled_increment(mg);
	}
	mutex_enter(&msp->ms_lock);
	msp->ms_disabled++;
	mutex_exit(&msp->ms_lock);

	mg->mg_disabled_updating = B_FALSE;
	cv_broadcast(&mg->mg_ms_disabled_cv);
	mutex_exit(&mg->mg_ms_disabled_lock);
}

/*
 * Undo one metaslab_disable() call on msp. When the metaslab's disabled
 * count drops to zero the group's count is decremented, waiters on
 * mg_ms_disabled_cv are woken and, if unload is set, the metaslab is
 * unloaded. If sync is set, txg_wait_synced() is called first.
 */
void
metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
{
	metaslab_group_t *mg = msp->ms_group;
	spa_t *spa = mg->mg_vd->vdev_spa;

	/*
	 * Wait for the outstanding IO to be synced to prevent newly
	 * allocated blocks from being overwritten.  This is used by
	 * initialize and TRIM which are modifying unallocated space.
	 */
	if (sync)
		txg_wait_synced(spa_get_dsl(spa), 0);

	mutex_enter(&mg->mg_ms_disabled_lock);
	mutex_enter(&msp->ms_lock);
	if (--msp->ms_disabled == 0) {
		mg->mg_ms_disabled--;
		cv_broadcast(&mg->mg_ms_disabled_cv);
		if (unload)
			metaslab_unload(msp);
	}
	mutex_exit(&msp->ms_lock);
	mutex_exit(&mg->mg_ms_disabled_lock);
}

/* Set the in-memory ms_unflushed_dirty flag of the metaslab. */
void
metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
{
	ms->ms_unflushed_dirty = dirty;
}

/*
 * Write the metaslab's unflushed txg to the MOS object recorded in the
 * top-level vdev's ZAP under VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS. The
 * object is created and registered in the ZAP on first use. Each metaslab
 * owns one fixed-size entry located at ms_id * sizeof (entry).
 */
static void
metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
{
	vdev_t *vd = ms->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);

	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));

	metaslab_unflushed_phys_t entry = {
		.msp_unflushed_txg = metaslab_unflushed_txg(ms),
	};
	uint64_t entry_size = sizeof (entry);
	uint64_t entry_offset = ms->ms_id * entry_size;

	uint64_t object = 0;
	int err = zap_lookup(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
	    &object);
	if (err == ENOENT) {
		/* First use on this vdev: create the object and record it. */
		object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
		    SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
		VERIFY0(zap_add(mos, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
		    &object, tx));
	} else {
		VERIFY0(err);
	}

	dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
	    &entry, tx);
}

/*
 * Record txg as the metaslab's unflushed txg, both in memory and on disk
 * (in that order, since the on-disk update reads the in-memory value).
 */
void
metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
{
	ms->ms_unflushed_txg = txg;
	metaslab_update_ondisk_flush_data(ms, tx);
}

/* Return the in-memory ms_unflushed_dirty flag of the metaslab. */
boolean_t
metaslab_unflushed_dirty(metaslab_t *ms)
{
	return (ms->ms_unflushed_dirty);
}

/* Return the in-memory ms_unflushed_txg of the metaslab. */
uint64_t
metaslab_unflushed_txg(metaslab_t *ms)
{
	return (ms->ms_unflushed_txg);
}

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW,
	"Allocation granularity (a.k.a.
stripe size)"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW, "Load all metaslabs when pool is first opened"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, "Prevent metaslabs from being unloaded"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, "Preload potential metaslabs during reassessment"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW, "Max number of metaslabs per group to preload"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW, "Delay in txgs after metaslab was last used before unloading"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW, "Delay in milliseconds after metaslab was last used before unloading"); ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be free to make it " "eligible for allocation"); ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be considered eligible " "for allocations unless all metaslab groups within the metaslab class " "have also crossed this threshold"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW, "Use the fragmentation metric to prefer less fragmented metaslabs"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT, ZMOD_RW, "Fragmentation for metaslab to allow allocation"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW, "Prefer metaslabs with lower LBAs"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW, "Enable metaslab group biasing"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, ZMOD_RW, "Enable segment-based metaslab selection"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); 
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, "Blocks larger than this size are sometimes forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW, "Percentage of large blocks that will be forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, "Max distance (bytes) to search forward before using size tree"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, "When looking in size tree, use largest segment instead of exact fit"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64, ZMOD_RW, "How long to trust the cached max chunk size of a metaslab"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW, "Percentage of memory that can be used to store metaslab range trees"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, ZMOD_RW, "Try hard to allocate before ganging"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW, "Normally only consider this many of the best metaslabs in each vdev"); ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator, param_set_active_allocator, param_get_charp, ZMOD_RW, "SPA active allocator"); diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 3cbd5712e1d3..8bb9a0724e61 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -1,875 +1,875 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013, 2019 by Delphix. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include /* * Range trees are tree-based data structures that can be used to * track free space or generally any space allocation information. * A range tree keeps track of individual segments and automatically * provides facilities such as adjacent extent merging and extent * splitting in response to range add/remove requests. * * A range tree starts out completely empty, with no segments in it. * Adding an allocation via zfs_range_tree_add to the range tree can either: * 1) create a new extent * 2) extend an adjacent extent * 3) merge two adjacent extents * Conversely, removing an allocation via zfs_range_tree_remove can: * 1) completely remove an extent * 2) shorten an extent (if the allocation was near one of its ends) * 3) split an extent into two extents, in effect punching a hole * * A range tree is also capable of 'bridging' gaps when adding * allocations. This is useful for cases when close proximity of * allocations is an important detail that needs to be represented * in the range tree. See zfs_range_tree_set_gap(). The default behavior * is not to bridge gaps (i.e. the maximum allowed gap size is 0). * * In order to traverse a range tree, use either the zfs_range_tree_walk() * or zfs_range_tree_vacate() functions. 
* * To obtain more accurate information on individual segment * operations that the range tree performs "under the hood", you can * specify a set of callbacks by passing a zfs_range_tree_ops_t structure * to the zfs_range_tree_create function. Any callbacks that are non-NULL * are then called at the appropriate times. * * The range tree code also supports a special variant of range trees * that can bridge small gaps between segments. This kind of tree is used * by the dsl scanning code to group I/Os into mostly sequential chunks to * optimize disk performance. The code here attempts to do this with as * little memory and computational overhead as possible. One limitation of * this implementation is that segments of range trees with gaps can only * support removing complete segments. */ static inline void zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt) { ASSERT3U(rt->rt_type, <, ZFS_RANGE_SEG_NUM_TYPES); size_t size = 0; switch (rt->rt_type) { case ZFS_RANGE_SEG32: - size = sizeof (range_seg32_t); + size = sizeof (zfs_range_seg32_t); break; case ZFS_RANGE_SEG64: - size = sizeof (range_seg64_t); + size = sizeof (zfs_range_seg64_t); break; case ZFS_RANGE_SEG_GAP: - size = sizeof (range_seg_gap_t); + size = sizeof (zfs_range_seg_gap_t); break; default: __builtin_unreachable(); } memcpy(dest, src, size); } void zfs_range_tree_stat_verify(zfs_range_tree_t *rt) { zfs_range_seg_t *rs; zfs_btree_index_t where; - uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; + uint64_t hist[ZFS_RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; int i; for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt); int idx = highbit64(size) - 1; hist[idx]++; ASSERT3U(hist[idx], !=, 0); } - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { if (hist[i] != rt->rt_histogram[i]) { zfs_dbgmsg("i=%d, hist=%px, 
hist=%llu, rt_hist=%llu", i, hist, (u_longlong_t)hist[i], (u_longlong_t)rt->rt_histogram[i]); } VERIFY3U(hist[i], ==, rt->rt_histogram[i]); } } static void zfs_range_tree_stat_incr(zfs_range_tree_t *rt, zfs_range_seg_t *rs) { uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); rt->rt_histogram[idx]++; ASSERT3U(rt->rt_histogram[idx], !=, 0); } static void zfs_range_tree_stat_decr(zfs_range_tree_t *rt, zfs_range_seg_t *rs) { uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); ASSERT3U(rt->rt_histogram[idx], !=, 0); rt->rt_histogram[idx]--; } __attribute__((always_inline)) inline static int zfs_range_tree_seg32_compare(const void *x1, const void *x2) { - const range_seg32_t *r1 = x1; - const range_seg32_t *r2 = x2; + const zfs_range_seg32_t *r1 = x1; + const zfs_range_seg32_t *r2 = x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } __attribute__((always_inline)) inline static int zfs_range_tree_seg64_compare(const void *x1, const void *x2) { - const range_seg64_t *r1 = x1; - const range_seg64_t *r2 = x2; + const zfs_range_seg64_t *r1 = x1; + const zfs_range_seg64_t *r2 = x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } __attribute__((always_inline)) inline static int zfs_range_tree_seg_gap_compare(const void *x1, const void *x2) { - const range_seg_gap_t *r1 = x1; - const range_seg_gap_t *r2 = x2; + const zfs_range_seg_gap_t *r1 = x1; + const zfs_range_seg_gap_t *r2 = x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); return ((r1->rs_start >= r2->rs_end) 
- (r1->rs_end <= r2->rs_start)); } -ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, range_seg32_t, +ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, zfs_range_seg32_t, zfs_range_tree_seg32_compare) -ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, range_seg64_t, +ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t, zfs_range_tree_seg64_compare) -ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf, range_seg_gap_t, - zfs_range_tree_seg_gap_compare) +ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf, + zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare) zfs_range_tree_t * zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, uint64_t gap) { zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP); ASSERT3U(shift, <, 64); ASSERT3U(type, <=, ZFS_RANGE_SEG_NUM_TYPES); size_t size; int (*compare) (const void *, const void *); bt_find_in_buf_f bt_find; switch (type) { case ZFS_RANGE_SEG32: - size = sizeof (range_seg32_t); + size = sizeof (zfs_range_seg32_t); compare = zfs_range_tree_seg32_compare; bt_find = zfs_range_tree_seg32_find_in_buf; break; case ZFS_RANGE_SEG64: - size = sizeof (range_seg64_t); + size = sizeof (zfs_range_seg64_t); compare = zfs_range_tree_seg64_compare; bt_find = zfs_range_tree_seg64_find_in_buf; break; case ZFS_RANGE_SEG_GAP: - size = sizeof (range_seg_gap_t); + size = sizeof (zfs_range_seg_gap_t); compare = zfs_range_tree_seg_gap_compare; bt_find = zfs_range_tree_seg_gap_find_in_buf; break; default: panic("Invalid range seg type %d", type); } zfs_btree_create(&rt->rt_root, compare, bt_find, size); rt->rt_ops = ops; rt->rt_gap = gap; rt->rt_arg = arg; rt->rt_type = type; rt->rt_start = start; rt->rt_shift = shift; if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); return (rt); } zfs_range_tree_t * zfs_range_tree_create(const 
zfs_range_tree_ops_t *ops, zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift) { return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0)); } void zfs_range_tree_destroy(zfs_range_tree_t *rt) { VERIFY0(rt->rt_space); if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); zfs_btree_destroy(&rt->rt_root); kmem_free(rt, sizeof (*rt)); } void zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs, int64_t delta) { if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) { zfs_panic_recover("zfs: attempting to decrease fill to or " "below 0; probable double remove in segment [%llx:%llx]", (longlong_t)zfs_rs_get_start(rs, rt), (longlong_t)zfs_rs_get_end(rs, rt)); } if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) { zfs_panic_recover("zfs: attempting to increase fill beyond " "max; probable double add in segment [%llx:%llx]", (longlong_t)zfs_rs_get_start(rs, rt), (longlong_t)zfs_rs_get_end(rs, rt)); } if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); zfs_rs_set_fill(rs, rt, zfs_rs_get_fill(rs, rt) + delta); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } static void zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) { zfs_range_tree_t *rt = arg; zfs_btree_index_t where; zfs_range_seg_t *rs_before, *rs_after, *rs; - range_seg_max_t tmp, rsearch; + zfs_range_seg_max_t tmp, rsearch; uint64_t end = start + size, gap = rt->rt_gap; uint64_t bridge_size = 0; boolean_t merge_before, merge_after; ASSERT3U(size, !=, 0); ASSERT3U(fill, <=, size); ASSERT3U(start + size, >, start); zfs_rs_set_start(&rsearch, rt, start); zfs_rs_set_end(&rsearch, rt, end); rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* * If this is a gap-supporting range tree, it is possible that we * are inserting into an existing segment. 
In this case simply * bump the fill count and call the remove / add callbacks. If the * new range will extend an existing segment, we remove the * existing one, apply the new extent to it and re-insert it using * the normal code paths. */ if (rs != NULL) { if (gap == 0) { zfs_panic_recover("zfs: adding existent segment to " "range tree (offset=%llx size=%llx)", (longlong_t)start, (longlong_t)size); return; } uint64_t rstart = zfs_rs_get_start(rs, rt); uint64_t rend = zfs_rs_get_end(rs, rt); if (rstart <= start && rend >= end) { zfs_range_tree_adjust_fill(rt, rs, fill); return; } if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); zfs_range_tree_stat_decr(rt, rs); rt->rt_space -= rend - rstart; fill += zfs_rs_get_fill(rs, rt); start = MIN(start, rstart); end = MAX(end, rend); size = end - start; zfs_btree_remove(&rt->rt_root, rs); zfs_range_tree_add_impl(rt, start, size, fill); return; } ASSERT3P(rs, ==, NULL); /* * Determine whether or not we will have to merge with our neighbors. * If gap != 0, we might need to merge with our neighbors even if we * aren't directly touching. 
*/ zfs_btree_index_t where_before, where_after; rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before); rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after); merge_before = (rs_before != NULL && zfs_rs_get_end(rs_before, rt) >= start - gap); merge_after = (rs_after != NULL && zfs_rs_get_start(rs_after, rt) <= end + gap); if (merge_before && gap != 0) bridge_size += start - zfs_rs_get_end(rs_before, rt); if (merge_after && gap != 0) bridge_size += zfs_rs_get_start(rs_after, rt) - end; if (merge_before && merge_after) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); } zfs_range_tree_stat_decr(rt, rs_before); zfs_range_tree_stat_decr(rt, rs_after); zfs_rs_copy(rs_after, &tmp, rt); uint64_t before_start = zfs_rs_get_start_raw(rs_before, rt); uint64_t before_fill = zfs_rs_get_fill(rs_before, rt); uint64_t after_fill = zfs_rs_get_fill(rs_after, rt); zfs_btree_remove_idx(&rt->rt_root, &where_before); /* * We have to re-find the node because our old reference is * invalid as soon as we do any mutating btree operations. 
*/ rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after); ASSERT3P(rs_after, !=, NULL); zfs_rs_set_start_raw(rs_after, rt, before_start); zfs_rs_set_fill(rs_after, rt, after_fill + before_fill + fill); rs = rs_after; } else if (merge_before) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); zfs_range_tree_stat_decr(rt, rs_before); uint64_t before_fill = zfs_rs_get_fill(rs_before, rt); zfs_rs_set_end(rs_before, rt, end); zfs_rs_set_fill(rs_before, rt, before_fill + fill); rs = rs_before; } else if (merge_after) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); zfs_range_tree_stat_decr(rt, rs_after); uint64_t after_fill = zfs_rs_get_fill(rs_after, rt); zfs_rs_set_start(rs_after, rt, start); zfs_rs_set_fill(rs_after, rt, after_fill + fill); rs = rs_after; } else { rs = &tmp; zfs_rs_set_start(rs, rt, start); zfs_rs_set_end(rs, rt, end); zfs_rs_set_fill(rs, rt, fill); zfs_btree_add_idx(&rt->rt_root, rs, &where); } if (gap != 0) { ASSERT3U(zfs_rs_get_fill(rs, rt), <=, zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)); } else { ASSERT3U(zfs_rs_get_fill(rs, rt), ==, zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)); } if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); zfs_range_tree_stat_incr(rt, rs); rt->rt_space += size + bridge_size; } void zfs_range_tree_add(void *arg, uint64_t start, uint64_t size) { zfs_range_tree_add_impl(arg, start, size, size); } static void zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, boolean_t do_fill) { zfs_btree_index_t where; zfs_range_seg_t *rs; - range_seg_max_t rsearch, rs_tmp; + zfs_range_seg_max_t rsearch, rs_tmp; uint64_t end = start + size; boolean_t left_over, right_over; VERIFY3U(size, !=, 0); VERIFY3U(size, <=, rt->rt_space); if (rt->rt_type == ZFS_RANGE_SEG64) ASSERT3U(start + size, >, start); zfs_rs_set_start(&rsearch, 
rt, start); zfs_rs_set_end(&rsearch, rt, end); rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* Make sure we completely overlap with someone */ if (rs == NULL) { zfs_panic_recover("zfs: removing nonexistent segment from " "range tree (offset=%llx size=%llx)", (longlong_t)start, (longlong_t)size); return; } /* * Range trees with gap support must only remove complete segments * from the tree. This allows us to maintain accurate fill accounting * and to ensure that bridged sections are not leaked. If we need to * remove less than the full segment, we can only adjust the fill count. */ if (rt->rt_gap != 0) { if (do_fill) { if (zfs_rs_get_fill(rs, rt) == size) { start = zfs_rs_get_start(rs, rt); end = zfs_rs_get_end(rs, rt); size = end - start; } else { zfs_range_tree_adjust_fill(rt, rs, -size); return; } } else if (zfs_rs_get_start(rs, rt) != start || zfs_rs_get_end(rs, rt) != end) { zfs_panic_recover("zfs: freeing partial segment of " "gap tree (offset=%llx size=%llx) of " "(offset=%llx size=%llx)", (longlong_t)start, (longlong_t)size, (longlong_t)zfs_rs_get_start(rs, rt), (longlong_t)zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)); return; } } VERIFY3U(zfs_rs_get_start(rs, rt), <=, start); VERIFY3U(zfs_rs_get_end(rs, rt), >=, end); left_over = (zfs_rs_get_start(rs, rt) != start); right_over = (zfs_rs_get_end(rs, rt) != end); zfs_range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { - range_seg_max_t newseg; + zfs_range_seg_max_t newseg; zfs_rs_set_start(&newseg, rt, end); zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt)); zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end); zfs_range_tree_stat_incr(rt, &newseg); // This modifies the buffer already inside the range tree zfs_rs_set_end(rs, rt, start); zfs_rs_copy(rs, &rs_tmp, rt); if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL) zfs_btree_add_idx(&rt->rt_root, &newseg, 
&where); else zfs_btree_add(&rt->rt_root, &newseg); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg); } else if (left_over) { // This modifies the buffer already inside the range tree zfs_rs_set_end(rs, rt, start); zfs_rs_copy(rs, &rs_tmp, rt); } else if (right_over) { // This modifies the buffer already inside the range tree zfs_rs_set_start(rs, rt, end); zfs_rs_copy(rs, &rs_tmp, rt); } else { zfs_btree_remove_idx(&rt->rt_root, &where); rs = NULL; } if (rs != NULL) { /* * The fill of the leftover segment will always be equal to * the size, since we do not support removing partial segments * of range trees with gaps. */ zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - zfs_rs_get_start_raw(rs, rt)); zfs_range_tree_stat_incr(rt, &rs_tmp); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg); } rt->rt_space -= size; } void zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size) { zfs_range_tree_remove_impl(arg, start, size, B_FALSE); } void zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size) { zfs_range_tree_remove_impl(rt, start, size, B_TRUE); } void zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs, uint64_t newstart, uint64_t newsize) { int64_t delta = newsize - (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)); zfs_range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); zfs_rs_set_start(rs, rt, newstart); zfs_rs_set_end(rs, rt, newstart + newsize); zfs_range_tree_stat_incr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); rt->rt_space += delta; } static zfs_range_seg_t * zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size) { - range_seg_max_t rsearch; + zfs_range_seg_max_t rsearch; uint64_t end = start + size; VERIFY(size != 0); 
zfs_rs_set_start(&rsearch, rt, start); zfs_rs_set_end(&rsearch, rt, end); return (zfs_btree_find(&rt->rt_root, &rsearch, NULL)); } zfs_range_seg_t * zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size) { if (rt->rt_type == ZFS_RANGE_SEG64) ASSERT3U(start + size, >, start); zfs_range_seg_t *rs = zfs_range_tree_find_impl(rt, start, size); if (rs != NULL && zfs_rs_get_start(rs, rt) <= start && zfs_rs_get_end(rs, rt) >= start + size) { return (rs); } return (NULL); } void zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t off, uint64_t size) { zfs_range_seg_t *rs = zfs_range_tree_find(rt, off, size); if (rs != NULL) panic("segment already in tree; rs=%p", (void *)rs); } boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size) { return (zfs_range_tree_find(rt, start, size) != NULL); } /* * Returns the first subset of the given range which overlaps with the range * tree. Returns true if there is a segment in the range, and false if there * isn't. */ boolean_t zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size, uint64_t *ostart, uint64_t *osize) { if (rt->rt_type == ZFS_RANGE_SEG64) ASSERT3U(start + size, >, start); - range_seg_max_t rsearch; + zfs_range_seg_max_t rsearch; zfs_rs_set_start(&rsearch, rt, start); zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) + 1); zfs_btree_index_t where; zfs_range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); if (rs != NULL) { *ostart = start; *osize = MIN(size, zfs_rs_get_end(rs, rt) - start); return (B_TRUE); } rs = zfs_btree_next(&rt->rt_root, &where, &where); if (rs == NULL || zfs_rs_get_start(rs, rt) > start + size) return (B_FALSE); *ostart = zfs_rs_get_start(rs, rt); *osize = MIN(start + size, zfs_rs_get_end(rs, rt)) - zfs_rs_get_start(rs, rt); return (B_TRUE); } /* * Ensure that this range is not in the tree, regardless of whether * it is currently in the tree. 
*/ void zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size) { zfs_range_seg_t *rs; if (size == 0) return; if (rt->rt_type == ZFS_RANGE_SEG64) ASSERT3U(start + size, >, start); while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) { uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start); uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size); zfs_range_tree_remove(rt, free_start, free_end - free_start); } } void zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst) { zfs_range_tree_t *rt; ASSERT0(zfs_range_tree_space(*rtdst)); ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root)); rt = *rtsrc; *rtsrc = *rtdst; *rtdst = rt; } void zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func, void *arg) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); if (func != NULL) { zfs_range_seg_t *rs; zfs_btree_index_t *cookie = NULL; while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)); } } else { zfs_btree_clear(&rt->rt_root); } memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram)); rt->rt_space = 0; } void zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func, void *arg) { zfs_btree_index_t where; for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)); } } zfs_range_seg_t * zfs_range_tree_first(zfs_range_tree_t *rt) { return (zfs_btree_first(&rt->rt_root, NULL)); } uint64_t zfs_range_tree_space(zfs_range_tree_t *rt) { return (rt->rt_space); } uint64_t zfs_range_tree_numsegs(zfs_range_tree_t *rt) { return ((rt == NULL) ? 
0 : zfs_btree_numnodes(&rt->rt_root)); } boolean_t zfs_range_tree_is_empty(zfs_range_tree_t *rt) { ASSERT(rt != NULL); return (zfs_range_tree_space(rt) == 0); } /* * Remove any overlapping ranges between the given segment [start, end) * from removefrom. Add non-overlapping leftovers to addto. */ void zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, zfs_range_tree_t *removefrom, zfs_range_tree_t *addto) { zfs_btree_index_t where; - range_seg_max_t starting_rs; + zfs_range_seg_max_t starting_rs; zfs_rs_set_start(&starting_rs, removefrom, start); zfs_rs_set_end_raw(&starting_rs, removefrom, zfs_rs_get_start_raw(&starting_rs, removefrom) + 1); zfs_range_seg_t *curr = zfs_btree_find(&removefrom->rt_root, &starting_rs, &where); if (curr == NULL) curr = zfs_btree_next(&removefrom->rt_root, &where, &where); zfs_range_seg_t *next; for (; curr != NULL; curr = next) { if (start == end) return; VERIFY3U(start, <, end); /* there is no overlap */ if (end <= zfs_rs_get_start(curr, removefrom)) { zfs_range_tree_add(addto, start, end - start); return; } uint64_t overlap_start = MAX(zfs_rs_get_start(curr, removefrom), start); uint64_t overlap_end = MIN(zfs_rs_get_end(curr, removefrom), end); uint64_t overlap_size = overlap_end - overlap_start; ASSERT3S(overlap_size, >, 0); - range_seg_max_t rs; + zfs_range_seg_max_t rs; zfs_rs_copy(curr, &rs, removefrom); zfs_range_tree_remove(removefrom, overlap_start, overlap_size); if (start < overlap_start) zfs_range_tree_add(addto, start, overlap_start - start); start = overlap_end; next = zfs_btree_find(&removefrom->rt_root, &rs, &where); /* * If we find something here, we only removed part of the * curr segment. Either there's some left at the end * because we've reached the end of the range we're removing, * or there's some left at the start because we started * partway through the range. Either way, we continue with * the loop. 
If it's the former, we'll return at the start of * the loop, and if it's the latter we'll see if there is more * area to process. */ if (next != NULL) { ASSERT(start == end || start == zfs_rs_get_end(&rs, removefrom)); } next = zfs_btree_next(&removefrom->rt_root, &where, &where); } VERIFY3P(curr, ==, NULL); if (start != end) { VERIFY3U(start, <, end); zfs_range_tree_add(addto, start, end - start); } else { VERIFY3U(start, ==, end); } } /* * For each entry in rt, if it exists in removefrom, remove it * from removefrom. Otherwise, add it to addto. */ void zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt, zfs_range_tree_t *removefrom, zfs_range_tree_t *addto) { zfs_btree_index_t where; for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { zfs_range_tree_remove_xor_add_segment(zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt), removefrom, addto); } } uint64_t zfs_range_tree_min(zfs_range_tree_t *rt) { zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL); return (rs != NULL ? zfs_rs_get_start(rs, rt) : 0); } uint64_t zfs_range_tree_max(zfs_range_tree_t *rt) { zfs_range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL); return (rs != NULL ? zfs_rs_get_end(rs, rt) : 0); } uint64_t zfs_range_tree_span(zfs_range_tree_t *rt) { return (zfs_range_tree_max(rt) - zfs_range_tree_min(rt)); } diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index e9e03e05c86a..36e15b8d73af 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -1,1111 +1,1111 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * Note on space map block size: * * The data for a given space map can be kept on blocks of any size. * Larger blocks entail fewer I/O operations, but they also cause the * DMU to keep more data in-core, and also to waste more I/O bandwidth * when only a few blocks have changed since the last transaction group. */ /* * Enabled whenever we want to stress test the use of double-word * space map entries. */ boolean_t zfs_force_some_double_word_sm_entries = B_FALSE; /* * Override the default indirect block size of 128K, instead use 16K for * spacemaps (2^14 bytes). This dramatically reduces write inflation since * appending to a spacemap typically has to write one data block (4KB) and one * or two indirect blocks (16K-32K, rather than 128K). */ int space_map_ibs = 14; boolean_t sm_entry_is_debug(uint64_t e) { return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX); } boolean_t sm_entry_is_single_word(uint64_t e) { uint8_t prefix = SM_PREFIX_DECODE(e); return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX); } boolean_t sm_entry_is_double_word(uint64_t e) { return (SM_PREFIX_DECODE(e) == SM2_PREFIX); } /* * Iterate through the space map, invoking the callback on each (non-debug) * space map entry. Stop after reading 'end' bytes of the space map. 
*/ int space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) { uint64_t blksz = sm->sm_blksz; ASSERT3U(blksz, !=, 0); ASSERT3U(end, <=, space_map_length(sm)); ASSERT0(P2PHASE(end, sizeof (uint64_t))); dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end, ZIO_PRIORITY_SYNC_READ); int error = 0; uint64_t txg = 0, sync_pass = 0; for (uint64_t block_base = 0; block_base < end && error == 0; block_base += blksz) { dmu_buf_t *db; error = dmu_buf_hold(sm->sm_os, space_map_object(sm), block_base, FTAG, &db, DMU_READ_PREFETCH); if (error != 0) return (error); uint64_t *block_start = db->db_data; uint64_t block_length = MIN(end - block_base, blksz); uint64_t *block_end = block_start + (block_length / sizeof (uint64_t)); VERIFY0(P2PHASE(block_length, sizeof (uint64_t))); VERIFY3U(block_length, !=, 0); ASSERT3U(blksz, ==, db->db_size); for (uint64_t *block_cursor = block_start; block_cursor < block_end && error == 0; block_cursor++) { uint64_t e = *block_cursor; if (sm_entry_is_debug(e)) { /* * Debug entries are only needed to record the * current TXG and sync pass if available. * * Note though that sometimes there can be * debug entries that are used as padding * at the end of space map blocks in-order * to not split a double-word entry in the * middle between two blocks. These entries * have their TXG field set to 0 and we * skip them without recording the TXG. 
* [see comment in space_map_write_seg()] */ uint64_t e_txg = SM_DEBUG_TXG_DECODE(e); if (e_txg != 0) { txg = e_txg; sync_pass = SM_DEBUG_SYNCPASS_DECODE(e); } else { ASSERT0(SM_DEBUG_SYNCPASS_DECODE(e)); } continue; } uint64_t raw_offset, raw_run, vdev_id; maptype_t type; if (sm_entry_is_single_word(e)) { type = SM_TYPE_DECODE(e); vdev_id = SM_NO_VDEVID; raw_offset = SM_OFFSET_DECODE(e); raw_run = SM_RUN_DECODE(e); } else { /* it is a two-word entry */ ASSERT(sm_entry_is_double_word(e)); raw_run = SM2_RUN_DECODE(e); vdev_id = SM2_VDEV_DECODE(e); /* move on to the second word */ block_cursor++; e = *block_cursor; VERIFY3P(block_cursor, <=, block_end); type = SM2_TYPE_DECODE(e); raw_offset = SM2_OFFSET_DECODE(e); } uint64_t entry_offset = (raw_offset << sm->sm_shift) + sm->sm_start; uint64_t entry_run = raw_run << sm->sm_shift; VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); ASSERT3U(entry_offset, >=, sm->sm_start); ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size); ASSERT3U(entry_run, <=, sm->sm_size); ASSERT3U(entry_offset + entry_run, <=, sm->sm_start + sm->sm_size); space_map_entry_t sme = { .sme_type = type, .sme_vdev = vdev_id, .sme_offset = entry_offset, .sme_run = entry_run, .sme_txg = txg, .sme_sync_pass = sync_pass }; error = callback(&sme, arg); } dmu_buf_rele(db, FTAG); } return (error); } /* * Reads the entries from the last block of the space map into * buf in reverse order. Populates nwords with number of words * in the last block. * * Refer to block comment within space_map_incremental_destroy() * to understand why this function is needed. */ static int space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, uint64_t bufsz, uint64_t *nwords) { int error = 0; dmu_buf_t *db; /* * Find the offset of the last word in the space map and use * that to read the last block of the space map with * dmu_buf_hold(). 
*/ uint64_t last_word_offset = sm->sm_phys->smp_length - sizeof (uint64_t); error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset, FTAG, &db, DMU_READ_NO_PREFETCH); if (error != 0) return (error); ASSERT3U(sm->sm_object, ==, db->db_object); ASSERT3U(sm->sm_blksz, ==, db->db_size); ASSERT3U(bufsz, >=, db->db_size); ASSERT(nwords != NULL); uint64_t *words = db->db_data; *nwords = (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t)); uint64_t n = *nwords; uint64_t j = n - 1; for (uint64_t i = 0; i < n; i++) { uint64_t entry = words[i]; if (sm_entry_is_double_word(entry)) { /* * Since we are populating the buffer backwards * we have to be extra careful and add the two * words of the double-word entry in the right * order. */ ASSERT3U(j, >, 0); buf[j - 1] = entry; i++; ASSERT3U(i, <, n); entry = words[i]; buf[j] = entry; j -= 2; } else { ASSERT(sm_entry_is_debug(entry) || sm_entry_is_single_word(entry)); buf[j] = entry; j--; } } /* * Assert that we wrote backwards all the * way to the beginning of the buffer. */ ASSERT3S(j, ==, -1); dmu_buf_rele(db, FTAG); return (error); } /* * Note: This function performs destructive actions - specifically * it deletes entries from the end of the space map. Thus, callers * should ensure that they are holding the appropriate locks for * the space map that they provide. */ int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, dmu_tx_t *tx) { uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); uint64_t *buf = zio_buf_alloc(bufsz); dmu_buf_will_dirty(sm->sm_dbuf, tx); /* * Ideally we would want to iterate from the beginning of the * space map to the end in incremental steps. The issue with this * approach is that we don't have any field on-disk that points * us where to start between each step. 
We could try zeroing out * entries that we've destroyed, but this doesn't work either as * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]). * * As a result, we destroy its entries incrementally starting from * the end after applying the callback to each of them. * * The problem with this approach is that we cannot literally * iterate through the words in the space map backwards as we * can't distinguish two-word space map entries from their second * word. Thus we do the following: * * 1] We get all the entries from the last block of the space map * and put them into a buffer in reverse order. This way the * last entry comes first in the buffer, the second to last is * second, etc. * 2] We iterate through the entries in the buffer and we apply * the callback to each one. As we move from entry to entry we * we decrease the size of the space map, deleting effectively * each entry. * 3] If there are no more entries in the space map or the callback * returns a value other than 0, we stop iterating over the * space map. If there are entries remaining and the callback * returned 0, we go back to step [1]. 
*/ int error = 0; while (space_map_length(sm) > 0 && error == 0) { uint64_t nwords = 0; error = space_map_reversed_last_block_entries(sm, buf, bufsz, &nwords); if (error != 0) break; ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t)); for (uint64_t i = 0; i < nwords; i++) { uint64_t e = buf[i]; if (sm_entry_is_debug(e)) { sm->sm_phys->smp_length -= sizeof (uint64_t); continue; } int words = 1; uint64_t raw_offset, raw_run, vdev_id; maptype_t type; if (sm_entry_is_single_word(e)) { type = SM_TYPE_DECODE(e); vdev_id = SM_NO_VDEVID; raw_offset = SM_OFFSET_DECODE(e); raw_run = SM_RUN_DECODE(e); } else { ASSERT(sm_entry_is_double_word(e)); words = 2; raw_run = SM2_RUN_DECODE(e); vdev_id = SM2_VDEV_DECODE(e); /* move to the second word */ i++; e = buf[i]; ASSERT3P(i, <=, nwords); type = SM2_TYPE_DECODE(e); raw_offset = SM2_OFFSET_DECODE(e); } uint64_t entry_offset = (raw_offset << sm->sm_shift) + sm->sm_start; uint64_t entry_run = raw_run << sm->sm_shift; VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); VERIFY3U(entry_offset, >=, sm->sm_start); VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size); VERIFY3U(entry_run, <=, sm->sm_size); VERIFY3U(entry_offset + entry_run, <=, sm->sm_start + sm->sm_size); space_map_entry_t sme = { .sme_type = type, .sme_vdev = vdev_id, .sme_offset = entry_offset, .sme_run = entry_run }; error = callback(&sme, arg); if (error != 0) break; if (type == SM_ALLOC) sm->sm_phys->smp_alloc -= entry_run; else sm->sm_phys->smp_alloc += entry_run; sm->sm_phys->smp_length -= words * sizeof (uint64_t); } } if (space_map_length(sm) == 0) { ASSERT0(error); ASSERT0(space_map_allocated(sm)); } zio_buf_free(buf, bufsz); return (error); } typedef struct space_map_load_arg { space_map_t *smla_sm; zfs_range_tree_t *smla_rt; maptype_t smla_type; } space_map_load_arg_t; static int space_map_load_callback(space_map_entry_t *sme, void *arg) { space_map_load_arg_t *smla = arg; if (sme->sme_type == smla->smla_type) 
{ VERIFY3U(zfs_range_tree_space(smla->smla_rt) + sme->sme_run, <=, smla->smla_sm->sm_size); zfs_range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run); } else { zfs_range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run); } return (0); } /* * Load the spacemap into the rangetree, like space_map_load. But only * read the first 'length' bytes of the spacemap. */ int space_map_load_length(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype, uint64_t length) { space_map_load_arg_t smla; VERIFY0(zfs_range_tree_space(rt)); if (maptype == SM_FREE) zfs_range_tree_add(rt, sm->sm_start, sm->sm_size); smla.smla_rt = rt; smla.smla_sm = sm; smla.smla_type = maptype; int err = space_map_iterate(sm, length, space_map_load_callback, &smla); if (err != 0) zfs_range_tree_vacate(rt, NULL, NULL); return (err); } /* * Load the space map disk into the specified range tree. Segments of maptype * are added to the range tree, other segment types are removed. */ int space_map_load(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype) { return (space_map_load_length(sm, rt, maptype, space_map_length(sm))); } void space_map_histogram_clear(space_map_t *sm) { if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) return; memset(sm->sm_phys->smp_histogram, 0, sizeof (sm->sm_phys->smp_histogram)); } boolean_t space_map_histogram_verify(space_map_t *sm, zfs_range_tree_t *rt) { /* * Verify that the in-core range tree does not have any * ranges smaller than our sm_shift size. */ for (int i = 0; i < sm->sm_shift; i++) { if (rt->rt_histogram[i] != 0) return (B_FALSE); } return (B_TRUE); } void space_map_histogram_add(space_map_t *sm, zfs_range_tree_t *rt, dmu_tx_t *tx) { int idx = 0; ASSERT(dmu_tx_is_syncing(tx)); VERIFY3U(space_map_object(sm), !=, 0); if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) return; dmu_buf_will_dirty(sm->sm_dbuf, tx); ASSERT(space_map_histogram_verify(sm, rt)); /* * Transfer the content of the range tree histogram to the space * map histogram. 
The space map histogram contains 32 buckets ranging * between 2^sm_shift to 2^(32+sm_shift-1). The range tree, * however, can represent ranges from 2^0 to 2^63. Since the space * map only cares about allocatable blocks (minimum of sm_shift) we * can safely ignore all ranges in the range tree smaller than sm_shift. */ - for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + for (int i = sm->sm_shift; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { /* * Since the largest histogram bucket in the space map is * 2^(32+sm_shift-1), we need to normalize the values in * the range tree for any bucket larger than that size. For * example given an sm_shift of 9, ranges larger than 2^40 * would get normalized as if they were 1TB ranges. Assume * the range tree had a count of 5 in the 2^44 (16TB) bucket, * the calculation below would normalize this to 5 * 2^4 (16). */ ASSERT3U(i, >=, idx + sm->sm_shift); sm->sm_phys->smp_histogram[idx] += rt->rt_histogram[i] << (i - idx - sm->sm_shift); /* * Increment the space map's index as long as we haven't * reached the maximum bucket size. Accumulate all ranges * larger than the max bucket size into the last bucket. */ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { ASSERT3U(idx + sm->sm_shift, ==, i); idx++; ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); } } } static void space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) { dmu_buf_will_dirty(sm->sm_dbuf, tx); uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | SM_DEBUG_ACTION_ENCODE(maptype) | SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) | SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length, sizeof (dentry), &dentry, tx); sm->sm_phys->smp_length += sizeof (dentry); } /* * Writes one or more entries given a segment. * * Note: The function may release the dbuf from the pointer initially * passed to it, and return a different dbuf. 
Also, the space map's
 * dbuf must be dirty for the changes in sm_phys to take effect.
 */
/*
 * Arguments, as used by the body below:
 *   sm           space map being appended to; sm_phys->smp_length is
 *                advanced by every word written.
 *   rstart/rend  segment bounds; asserted to lie within
 *                [sm_start, sm_start + sm_size].
 *   maptype      encoded into the type field of each entry.
 *   vdev_id      encoded only into two-word entries; must be
 *                SM_NO_VDEVID when words == 1.
 *   words        width of the entries to emit, 1 or 2.
 *   dbp          in/out: dbuf covering the current append position; it is
 *                replaced whenever the write moves on to the next block.
 *   tag          hold tag used to release and re-hold the dbuf.
 *   tx           transaction used to dirty any newly held dbuf.
 */
static void
space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend,
    maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp,
    const void *tag, dmu_tx_t *tx)
{
	ASSERT3U(words, !=, 0);
	ASSERT3U(words, <=, 2);

	/* ensure the vdev_id can be represented by the space map */
	ASSERT3U(vdev_id, <=, SM_NO_VDEVID);

	/*
	 * if this is a single word entry, ensure that no vdev was
	 * specified.
	 */
	IMPLY(words == 1, vdev_id == SM_NO_VDEVID);

	dmu_buf_t *db = *dbp;
	ASSERT3U(db->db_size, ==, sm->sm_blksz);

	/*
	 * View the held block as an array of 64-bit words and position the
	 * cursor at the word that corresponds to the current smp_length.
	 */
	uint64_t *block_base = db->db_data;
	uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
	uint64_t *block_cursor = block_base +
	    (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);

	ASSERT3P(block_cursor, <=, block_end);

	/* Both values are expressed in units of (1 << sm_shift) bytes. */
	uint64_t size = (rend - rstart) >> sm->sm_shift;
	uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift;
	uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;

	ASSERT3U(rstart, >=, sm->sm_start);
	ASSERT3U(rstart, <, sm->sm_start + sm->sm_size);
	ASSERT3U(rend - rstart, <=, sm->sm_size);
	ASSERT3U(rend, <=, sm->sm_start + sm->sm_size);

	/*
	 * Each pass emits at most run_max units, so a segment longer than
	 * that is written as several consecutive entries.
	 */
	while (size != 0) {
		ASSERT3P(block_cursor, <=, block_end);

		/*
		 * If we are at the end of this block, flush it and start
		 * writing again from the beginning.
		 */
		if (block_cursor == block_end) {
			dmu_buf_rele(db, tag);

			uint64_t next_word_offset = sm->sm_phys->smp_length;
			VERIFY0(dmu_buf_hold(sm->sm_os,
			    space_map_object(sm), next_word_offset,
			    tag, &db, DMU_READ_PREFETCH));
			dmu_buf_will_dirty(db, tx);

			/* update caller's dbuf */
			*dbp = db;

			ASSERT3U(db->db_size, ==, sm->sm_blksz);

			block_base = db->db_data;
			block_cursor = block_base;
			block_end = block_base +
			    (db->db_size / sizeof (uint64_t));
		}

		/*
		 * If we are writing a two-word entry and we only have one
		 * word left on this block, just pad it with an empty debug
		 * entry and write the two-word entry in the next block.
		 */
		uint64_t *next_entry = block_cursor + 1;
		if (next_entry == block_end && words > 1) {
			ASSERT3U(words, ==, 2);
			*block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
			    SM_DEBUG_ACTION_ENCODE(0) |
			    SM_DEBUG_SYNCPASS_ENCODE(0) |
			    SM_DEBUG_TXG_ENCODE(0);
			block_cursor++;
			sm->sm_phys->smp_length += sizeof (uint64_t);
			ASSERT3P(block_cursor, ==, block_end);
			/* Loop again so the block-end branch above runs. */
			continue;
		}

		uint64_t run_len = MIN(size, run_max);
		switch (words) {
		case 1:
			*block_cursor = SM_OFFSET_ENCODE(start) |
			    SM_TYPE_ENCODE(maptype) |
			    SM_RUN_ENCODE(run_len);
			block_cursor++;
			break;
		case 2:
			/* write the first word of the entry */
			*block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(run_len) |
			    SM2_VDEV_ENCODE(vdev_id);
			block_cursor++;

			/* move on to the second word of the entry */
			ASSERT3P(block_cursor, <, block_end);
			*block_cursor = SM2_TYPE_ENCODE(maptype) |
			    SM2_OFFSET_ENCODE(start);
			block_cursor++;
			break;
		default:
			panic("%d-word space map entries are not supported",
			    words);
			break;
		}
		/* Account for the entry just written. */
		sm->sm_phys->smp_length += words * sizeof (uint64_t);

		start += run_len;
		size -= run_len;
	}
	ASSERT0(size);
}

/*
 * Note: The space map's dbuf must be dirty for the changes in sm_phys to
 * take effect.
 *
 * Appends one intro debug entry and then one or more entries for every
 * segment of rt, in btree order, to the space map object. The hold taken
 * here on the block being appended to is released before returning.
 */
static void
space_map_write_impl(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype,
    uint64_t vdev_id, dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dmu_buf_t *db;

	space_map_write_intro_debug(sm, maptype, tx);

#ifdef ZFS_DEBUG
	/*
	 * We do this right after we write the intro debug entry
	 * because the estimate does not take it into account.
	 *
	 * NOTE(review): the estimate is requested with SM_NO_VDEVID rather
	 * than the vdev_id argument -- confirm this is intended.
	 */
	uint64_t initial_objsize = sm->sm_phys->smp_length;
	uint64_t estimated_growth =
	    space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
	uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
#endif

	/*
	 * Find the offset right after the last word in the space map
	 * and use that to get a hold of the last block, so we can
	 * start appending to it.
	 */
	uint64_t next_word_offset = sm->sm_phys->smp_length;
	VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
	    next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
	ASSERT3U(db->db_size, ==, sm->sm_blksz);

	dmu_buf_will_dirty(db, tx);

	zfs_btree_t *t = &rt->rt_root;
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL;
	    rs = zfs_btree_next(t, &where, &where)) {
		/* Segment offset and length in units of (1 << sm_shift). */
		uint64_t offset = (zfs_rs_get_start(rs, rt) - sm->sm_start) >>
		    sm->sm_shift;
		uint64_t length = (zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt)) >> sm->sm_shift;
		uint8_t words = 1;

		/*
		 * We only write two-word entries when both of the following
		 * are true:
		 *
		 * [1] The feature is enabled.
		 * [2] The offset or run is too big for a single-word entry,
		 *	or the vdev_id is set (meaning not equal to
		 *	SM_NO_VDEVID).
		 *
		 * Note that for purposes of testing we've added the case that
		 * we write two-word entries occasionally when the feature is
		 * enabled and zfs_force_some_double_word_sm_entries has been
		 * set.
		 */
		if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
		    (offset >= (1ULL << SM_OFFSET_BITS) ||
		    length > SM_RUN_MAX ||
		    vdev_id != SM_NO_VDEVID ||
		    (zfs_force_some_double_word_sm_entries &&
		    random_in_range(100) == 0)))
			words = 2;

		/* May swap db for the next block; db is passed by address. */
		space_map_write_seg(sm, zfs_rs_get_start(rs, rt),
		    zfs_rs_get_end(rs, rt), maptype, vdev_id, words,
		    &db, FTAG, tx);
	}

	dmu_buf_rele(db, FTAG);

#ifdef ZFS_DEBUG
	/*
	 * We expect our estimation to be based on the worst case
	 * scenario [see comment in space_map_estimate_optimal_size()].
	 * Therefore we expect the actual objsize to be equal or less
	 * than whatever we estimated it to be.
	 */
	ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
#endif
}

/*
 * Note: This function manipulates the state of the given space map but
 * does not hold any locks implicitly. Thus the caller is responsible
 * for synchronizing writes to the space map.
*/ void space_map_write(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype, uint64_t vdev_id, dmu_tx_t *tx) { ASSERT(dsl_pool_sync_context(dmu_objset_pool(sm->sm_os))); VERIFY3U(space_map_object(sm), !=, 0); dmu_buf_will_dirty(sm->sm_dbuf, tx); /* * This field is no longer necessary since the in-core space map * now contains the object number but is maintained for backwards * compatibility. */ sm->sm_phys->smp_object = sm->sm_object; if (zfs_range_tree_is_empty(rt)) { VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object); return; } if (maptype == SM_ALLOC) sm->sm_phys->smp_alloc += zfs_range_tree_space(rt); else sm->sm_phys->smp_alloc -= zfs_range_tree_space(rt); uint64_t nodes = zfs_btree_numnodes(&rt->rt_root); uint64_t rt_space = zfs_range_tree_space(rt); space_map_write_impl(sm, rt, maptype, vdev_id, tx); /* * Ensure that the space_map's accounting wasn't changed * while we were in the middle of writing it out. */ VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root)); VERIFY3U(zfs_range_tree_space(rt), ==, rt_space); } static int space_map_open_impl(space_map_t *sm) { int error; u_longlong_t blocks; error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf); if (error) return (error); dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks); sm->sm_phys = sm->sm_dbuf->db_data; return (0); } int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, uint64_t start, uint64_t size, uint8_t shift) { space_map_t *sm; int error; ASSERT(*smp == NULL); ASSERT(os != NULL); ASSERT(object != 0); sm = kmem_alloc(sizeof (space_map_t), KM_SLEEP); sm->sm_start = start; sm->sm_size = size; sm->sm_shift = shift; sm->sm_os = os; sm->sm_object = object; sm->sm_blksz = 0; sm->sm_dbuf = NULL; sm->sm_phys = NULL; error = space_map_open_impl(sm); if (error != 0) { space_map_close(sm); return (error); } *smp = sm; return (0); } void space_map_close(space_map_t *sm) { if (sm == NULL) return; if (sm->sm_dbuf != NULL) dmu_buf_rele(sm->sm_dbuf, sm); sm->sm_dbuf 
= NULL; sm->sm_phys = NULL; kmem_free(sm, sizeof (*sm)); } void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) { objset_t *os = sm->sm_os; spa_t *spa = dmu_objset_spa(os); dmu_object_info_t doi; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(dmu_tx_is_syncing(tx)); VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa)); dmu_object_info_from_db(sm->sm_dbuf, &doi); /* * If the space map has the wrong bonus size (because * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or * the wrong block size (because space_map_blksz has changed), * free and re-allocate its object with the updated sizes. * * Otherwise, just truncate the current object. */ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && doi.doi_bonus_size != sizeof (space_map_phys_t)) || doi.doi_data_block_size != blocksize || doi.doi_metadata_block_size != 1 << space_map_ibs) { zfs_dbgmsg("txg %llu, spa %s, sm %px, reallocating " "object[%llu]: old bonus %llu, old blocksz %u", (u_longlong_t)dmu_tx_get_txg(tx), spa_name(spa), sm, (u_longlong_t)sm->sm_object, (u_longlong_t)doi.doi_bonus_size, doi.doi_data_block_size); space_map_free(sm, tx); dmu_buf_rele(sm->sm_dbuf, sm); sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx); VERIFY0(space_map_open_impl(sm)); } else { VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx)); /* * If the spacemap is reallocated, its histogram * will be reset. Do the same in the common case so that * bugs related to the uncommon case do not go unnoticed. 
*/ memset(sm->sm_phys->smp_histogram, 0, sizeof (sm->sm_phys->smp_histogram)); } dmu_buf_will_dirty(sm->sm_dbuf, tx); sm->sm_phys->smp_length = 0; sm->sm_phys->smp_alloc = 0; } uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(os); uint64_t object; int bonuslen; if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); bonuslen = sizeof (space_map_phys_t); ASSERT3U(bonuslen, <=, dmu_bonus_max()); } else { bonuslen = SPACE_MAP_SIZE_V0; } object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize, space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); return (object); } void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(os); if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { dmu_object_info_t doi; VERIFY0(dmu_object_info(os, smobj, &doi)); if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) { spa_feature_decr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); } } VERIFY0(dmu_object_free(os, smobj, tx)); } void space_map_free(space_map_t *sm, dmu_tx_t *tx) { if (sm == NULL) return; space_map_free_obj(sm->sm_os, space_map_object(sm), tx); sm->sm_object = 0; } /* * Given a range tree, it makes a worst-case estimate of how much * space would the tree's segments take if they were written to * the given space map. */ uint64_t space_map_estimate_optimal_size(space_map_t *sm, zfs_range_tree_t *rt, uint64_t vdev_id) { spa_t *spa = dmu_objset_spa(sm->sm_os); uint64_t shift = sm->sm_shift; uint64_t *histogram = rt->rt_histogram; uint64_t entries_for_seg = 0; /* * In order to get a quick estimate of the optimal size that this * range tree would have on-disk as a space map, we iterate through * its histogram buckets instead of iterating through its nodes. 
* * Note that this is a highest-bound/worst-case estimate for the * following reasons: * * 1] We assume that we always add a debug padding for each block * we write and we also assume that we start at the last word * of a block attempting to write a two-word entry. * 2] Rounding up errors due to the way segments are distributed * in the buckets of the range tree's histogram. * 3] The activation of zfs_force_some_double_word_sm_entries * (tunable) when testing. * * = Math and Rounding Errors = * * rt_histogram[i] bucket of a range tree represents the number * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given * that, we want to divide the buckets into groups: Buckets that * can be represented using a single-word entry, ones that can * be represented with a double-word entry, and ones that can * only be represented with multiple two-word entries. * * [Note that if the new encoding feature is not enabled there * are only two groups: single-word entry buckets and multiple * single-word entry buckets. The information below assumes * two-word entries enabled, but it can easily applied when * the feature is not enabled] * * To find the highest bucket that can be represented with a * single-word entry we look at the maximum run that such entry * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that * the run of a space map entry is shifted by sm_shift, thus we * add it to the exponent]. This way, excluding the value of the * maximum run that can be represented by a single-word entry, * all runs that are smaller exist in buckets 0 to * SM_RUN_BITS + shift - 1. * * To find the highest bucket that can be represented with a * double-word entry, we follow the same approach. Finally, any * bucket higher than that are represented with multiple two-word * entries. 
To be more specific, if the highest bucket whose * segments can be represented with a single two-word entry is X, * then bucket X+1 will need 2 two-word entries for each of its * segments, X+2 will need 4, X+3 will need 8, ...etc. * * With all of the above we make our estimation based on bucket * groups. There is a rounding error though. As we mentioned in * the example with the one-word entry, the maximum run that can * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of * that length fall into the next bucket (and bucket group) where * we start counting two-word entries and this is one more reason * why the estimated size may end up being bigger than the actual * size written. */ uint64_t size = 0; uint64_t idx = 0; if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) || (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) { /* * If we are trying to force some double word entries just * assume the worst-case of every single word entry being * written as a double word entry. */ uint64_t entry_size = (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) && zfs_force_some_double_word_sm_entries) ? 
(2 * sizeof (uint64_t)) : sizeof (uint64_t); uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1; for (; idx <= single_entry_max_bucket; idx++) size += histogram[idx] * entry_size; if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) { - for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + for (; idx < ZFS_RANGE_TREE_HISTOGRAM_SIZE; idx++) { ASSERT3U(idx, >=, single_entry_max_bucket); entries_for_seg = 1ULL << (idx - single_entry_max_bucket); size += histogram[idx] * entries_for_seg * entry_size; } return (size); } } ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)); uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1; for (; idx <= double_entry_max_bucket; idx++) size += histogram[idx] * 2 * sizeof (uint64_t); - for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + for (; idx < ZFS_RANGE_TREE_HISTOGRAM_SIZE; idx++) { ASSERT3U(idx, >=, double_entry_max_bucket); entries_for_seg = 1ULL << (idx - double_entry_max_bucket); size += histogram[idx] * entries_for_seg * 2 * sizeof (uint64_t); } /* * Assume the worst case where we start with the padding at the end * of the current block and we add an extra padding entry at the end * of all subsequent blocks. */ size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t); return (size); } uint64_t space_map_object(space_map_t *sm) { return (sm != NULL ? sm->sm_object : 0); } int64_t space_map_allocated(space_map_t *sm) { return (sm != NULL ? sm->sm_phys->smp_alloc : 0); } uint64_t space_map_length(space_map_t *sm) { return (sm != NULL ? sm->sm_phys->smp_length : 0); } uint64_t space_map_nblocks(space_map_t *sm) { if (sm == NULL) return (0); return (DIV_ROUND_UP(space_map_length(sm), sm->sm_blksz)); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 40fd75b83639..74e36c0300f0 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1,6588 +1,6588 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). 
* You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright (c) 2021, Klara Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are * part of the spa_embedded_log_class. The metaslab with the most free space * in each vdev is selected for this purpose when the pool is opened (or a * vdev is added). See vdev_metaslab_init(). * * Log blocks can be allocated from the following locations. Each one is tried * in order until the allocation succeeds: * 1. dedicated log vdevs, aka "slog" (spa_log_class) * 2. 
embedded slog metaslabs (spa_embedded_log_class) * 3. other metaslabs in normal vdevs (spa_normal_class) * * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer * than this number of metaslabs in the vdev. This ensures that we don't set * aside an unreasonable amount of space for the ZIL. If set to less than * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %llu: %s", indent, "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? 
vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Virtual device management. */ static vdev_ops_t *const vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, *const *opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { if (mc == spa_embedded_log_class(vd->vdev_spa) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else return (vd->vdev_mg); } void -vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs, range_seg64_t *remain_rs) +vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { (void) vd, (void) remain_rs; physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(8). 
*/ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } return (asize); } uint64_t vdev_default_min_asize(vdev_t *vd) { return (vd->vdev_min_asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, uint64_t)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } /* * Get the minimal allocation size for the top-level vdev. 
*/
uint64_t
vdev_get_min_alloc(vdev_t *vd)
{
	/* A type-specific op, when present, overrides the ashift default. */
	if (vd->vdev_ops->vdev_op_min_alloc != NULL)
		return (vd->vdev_ops->vdev_op_min_alloc(vd));

	return (1ULL << vd->vdev_ashift);
}

/*
 * Get the parity level for a top-level vdev. Vdev types that supply no
 * nparity op report zero.
 */
uint64_t
vdev_get_nparity(vdev_t *vd)
{
	if (vd->vdev_ops->vdev_op_nparity == NULL)
		return (0);

	return (vd->vdev_ops->vdev_op_nparity(vd));
}

/*
 * Look up the numeric vdev property prop in the first non-zero ZAP object
 * of vd, tried in the order root, top, leaf.
 *
 * Returns EINVAL when vd has none of those ZAP objects. Otherwise returns
 * the zap_lookup() result; on ENOENT *value is additionally set to the
 * property's numeric default, and ENOENT is still returned.
 */
static int
vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
{
	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	uint64_t zapobj = vd->vdev_root_zap;

	if (zapobj == 0)
		zapobj = vd->vdev_top_zap;
	if (zapobj == 0)
		zapobj = vd->vdev_leaf_zap;
	if (zapobj == 0)
		return (EINVAL);

	int rc = zap_lookup(mos, zapobj, vdev_prop_to_name(prop),
	    sizeof (uint64_t), 1, value);

	if (rc == ENOENT)
		*value = vdev_prop_default_numeric(prop);

	return (rc);
}

/*
 * Get the number of data disks for a top-level vdev.
*/ uint64_t vdev_get_ndisks(vdev_t *vd) { uint64_t ndisks = 1; if (vd->vdev_ops->vdev_op_ndisks != NULL) ndisks = vd->vdev_ops->vdev_op_ndisks(vd); return (ndisks); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { memcpy(newchild, pvd->vdev_child, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. 
*/ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. 
*/
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;
	vdev_indirect_config_t *vic;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
	vic = &vd->vdev_indirect_config;

	/* The first vdev allocated for a pool becomes its root vdev. */
	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_load_guid();
	}

	/* A guid of 0 means "generate one", except for hole vdevs. */
	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);
	vic->vic_prev_indirect_vdev = UINT64_MAX;

	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
	vd->vdev_obsolete_segments = zfs_range_tree_create(NULL,
	    ZFS_RANGE_SEG64, NULL, 0, 0);

	/*
	 * Initialize rate limit structs for events. We rate limit ZIO delay
	 * and checksum events so that we don't overwhelm ZED with thousands
	 * of events when a disk is acting up.
	 */
	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
	    1);
	zfs_ratelimit_init(&vd->vdev_deadman_rl,
	    &zfs_deadman_events_per_second, 1);
	zfs_ratelimit_init(&vd->vdev_dio_verify_rl,
	    &zfs_dio_write_verify_events_per_second, 1);
	zfs_ratelimit_init(&vd->vdev_checksum_rl,
	    &zfs_checksum_events_per_second, 1);

	/*
	 * Default Thresholds for tuning ZED
	 */
	vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N);
	vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
	vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
	vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
	vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
	vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);

	list_link_init(&vd->vdev_config_dirty_node);
	list_link_init(&vd->vdev_state_dirty_node);
	list_link_init(&vd->vdev_initialize_node);
	list_link_init(&vd->vdev_leaf_node);
	list_link_init(&vd->vdev_trim_node);

	/*
	 * Every lock and condition variable initialized below has a matching
	 * destroy call in vdev_free().
	 */
	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);

	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);

	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);

	mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);

	/* One (initially empty) range tree per DTL type. */
	for (int t = 0; t < DTL_TYPES; t++) {
		vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
		    NULL, 0, 0);
	}

	txg_list_create(&vd->vdev_ms_list, spa,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list, spa,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);

	return (vd);
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 *
 * On success the new vdev is attached to 'parent' (which may be NULL),
 * stored in *vdp, and 0 is returned.  On failure an errno is returned:
 * EINVAL for a missing or unknown type, a missing or mismatched id/guid, or
 * a non-root vdev allocated before the root; ENOTSUP when the pool version
 * or feature set does not support the requested vdev; or whatever the vdev
 * type's vdev_op_init callback returns.  All failure returns happen before
 * vdev_alloc_common() is called, so no vdev_t needs to be torn down on
 * those paths.
 *
 * The caller must hold all SCL_* config locks as writer.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	const char *type;
	uint64_t guid = 0, islog;
	vdev_t *vd;
	vdev_indirect_config_t *vic;
	const char *tmp = NULL;
	int rc;
	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
	/* Top-level means the parent exists and itself has no parent. */
	boolean_t top_level = (parent && !parent->vdev_parent);

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (SET_ERROR(EINVAL));

	if ((ops = vdev_getops(type)) == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (SET_ERROR(EINVAL));

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (SET_ERROR(ENOTSUP));

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
		return (SET_ERROR(ENOTSUP));

	if (top_level && alloctype == VDEV_ALLOC_ADD) {
		const char *bias;

		/*
		 * If creating a top-level vdev, check for allocation
		 * classes input.
		 */
		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
		    &bias) == 0) {
			alloc_bias = vdev_derive_alloc_bias(bias);

			/* spa_vdev_add() expects feature to be enabled */
			if (spa->spa_load_state != SPA_LOAD_CREATE &&
			    !spa_feature_is_enabled(spa,
			    SPA_FEATURE_ALLOCATION_CLASSES)) {
				return (SET_ERROR(ENOTSUP));
			}
		}

		/* spa_vdev_add() expects feature to be enabled */
		if (ops == &vdev_draid_ops &&
		    spa->spa_load_state != SPA_LOAD_CREATE &&
		    !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
			return (SET_ERROR(ENOTSUP));
		}
	}

	/*
	 * Initialize the vdev specific data.  This is done before calling
	 * vdev_alloc_common() since it may fail and this simplifies the
	 * error reporting and cleanup code paths.
	 */
	void *tsd = NULL;
	if (ops->vdev_op_init != NULL) {
		rc = ops->vdev_op_init(spa, nv, &tsd);
		if (rc != 0) {
			return (rc);
		}
	}

	vd = vdev_alloc_common(spa, id, guid, ops);
	vd->vdev_tsd = tsd;
	vd->vdev_islog = islog;

	if (top_level && alloc_bias != VDEV_BIAS_NONE)
		vd->vdev_alloc_bias = alloc_bias;

	/* String properties are duplicated; vdev_free() releases them. */
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0)
		vd->vdev_path = spa_strdup(tmp);

	/*
	 * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
	 * fault on a vdev and want it to persist across imports (like with
	 * zpool offline -f).
	 */
	rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
	if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
		vd->vdev_faulted = 1;
		vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
	}

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0)
		vd->vdev_devid = spa_strdup(tmp);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0)
		vd->vdev_physpath = spa_strdup(tmp);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
	    &tmp) == 0)
		vd->vdev_enc_sysfs_path = spa_strdup(tmp);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0)
		vd->vdev_fru = spa_strdup(tmp);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Indirect-vdev configuration.  The asserts check the values left by
	 * vdev_alloc_common(); each lookup overwrites its field only when the
	 * key is present in the nvlist.
	 */
	vic = &vd->vdev_indirect_config;

	ASSERT0(vic->vic_mapping_object);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
	    &vic->vic_mapping_object);
	ASSERT0(vic->vic_births_object);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
	    &vic->vic_births_object);
	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
	    &vic->vic_prev_indirect_vdev);

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement. Ignore pool ashift for vdev
	 * attach case.
	 */
	if (alloctype != VDEV_ALLOC_ATTACH) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
		    &vd->vdev_ashift);
	} else {
		vd->vdev_attaching = B_TRUE;
	}

	/*
	 * Retrieve the vdev creation time.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	    &vd->vdev_crtxg);

	if (vd->vdev_ops == &vdev_root_ops &&
	    (alloctype == VDEV_ALLOC_LOAD ||
	    alloctype == VDEV_ALLOC_SPLIT ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
		    &vd->vdev_root_zap);
	}

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (top_level &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
		    &vd->vdev_noalloc);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
		    &vd->vdev_removing);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
		    &vd->vdev_top_zap);
		vd->vdev_rz_expanding = nvlist_exists(nv,
		    ZPOOL_CONFIG_RAIDZ_EXPANDING);
	} else {
		ASSERT0(vd->vdev_top_zap);
	}

	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
		    alloctype == VDEV_ALLOC_ADD ||
		    alloctype == VDEV_ALLOC_SPLIT ||
		    alloctype == VDEV_ALLOC_ROOTPOOL);
		/* Note: metaslab_group_create() is now deferred */
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv,
		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
	} else {
		ASSERT0(vd->vdev_leaf_zap);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */

	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}

		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			uint64_t spare = 0;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
			    &spare) == 0 && spare)
				spa_spare_add(vd);
		}

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
		    &vd->vdev_resilver_txg);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
		    &vd->vdev_rebuild_txg);

		if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
			vdev_defer_resilver(vd);

		/*
		 * In general, when importing a pool we want to ignore the
		 * persistent fault state, as the diagnosis made on another
		 * system may not be valid in the current context.  The only
		 * exception is if we forced a vdev to a persistently faulted
		 * state with 'zpool offline -f'.  The persistent fault will
		 * remain across imports until cleared.
		 *
		 * Local vdevs will remain in the faulted state.
		 */
		if (spa_load_state(spa) == SPA_LOAD_OPEN ||
		    spa_load_state(spa) == SPA_LOAD_IMPORT) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);

			if (vd->vdev_faulted || vd->vdev_degraded) {
				const char *aux;

				vd->vdev_label_aux =
				    VDEV_AUX_ERR_EXCEEDED;
				/*
				 * Only an "external" aux state keeps the
				 * fault; otherwise vdev_faulted is cleared.
				 */
				if (nvlist_lookup_string(nv,
				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
				    strcmp(aux, "external") == 0)
					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
				else
					vd->vdev_faulted = 0ULL;
			}
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}

/*
 * Tear down and free a vdev and, recursively, all of its children.
 *
 * The vdev is closed first, its children are freed, type-specific state is
 * released through vdev_op_fini (if set), metaslab groups are destroyed,
 * the vdev is unlinked from its parent, and finally every resource set up
 * by vdev_alloc_common() and vdev_alloc() is released.  The asserts at the
 * top require that the initialize, trim, autotrim and rebuild threads have
 * already exited.
 */
void
vdev_free(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);

	/*
	 * Scan queues are normally destroyed at the end of a scan. If the
	 * queue exists here, that implies the vdev is being removed while
	 * the scan is still running.
	 */
	if (vd->vdev_scan_io_queue != NULL) {
		mutex_enter(&vd->vdev_scan_io_queue_lock);
		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
		vd->vdev_scan_io_queue = NULL;
		mutex_exit(&vd->vdev_scan_io_queue_lock);
	}

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	 * Free all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	/*
	 * Each child detaches itself via vdev_remove_child(), which frees the
	 * child array once it is empty and subtracts the child's guid sum.
	 */
	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	if (vd->vdev_ops->vdev_op_fini != NULL)
		vd->vdev_ops->vdev_op_fini(vd);

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
		vd->vdev_mg = NULL;
	}
	if (vd->vdev_log_mg != NULL) {
		ASSERT0(vd->vdev_ms_count);
		metaslab_group_destroy(vd->vdev_log_mg);
		vd->vdev_log_mg = NULL;
	}

	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);
	ASSERT(!list_link_active(&vd->vdev_leaf_node));

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);

	if (vd->vdev_enc_sysfs_path)
		spa_strfree(vd->vdev_enc_sysfs_path);

	if (vd->vdev_fru)
		spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_close(vd->vdev_dtl_sm);
	for (int t = 0; t < DTL_TYPES; t++) {
		zfs_range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
		zfs_range_tree_destroy(vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	/* The births and mapping objects are either both set or both NULL. */
	EQUIV(vd->vdev_indirect_births != NULL,
	    vd->vdev_indirect_mapping != NULL);
	if (vd->vdev_indirect_births != NULL) {
		vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
		vdev_indirect_births_close(vd->vdev_indirect_births);
	}

	if (vd->vdev_obsolete_sm != NULL) {
		ASSERT(vd->vdev_removing ||
		    vd->vdev_ops == &vdev_indirect_ops);
		space_map_close(vd->vdev_obsolete_sm);
		vd->vdev_obsolete_sm = NULL;
	}
	zfs_range_tree_destroy(vd->vdev_obsolete_segments);
	rw_destroy(&vd->vdev_indirect_rwlock);
	mutex_destroy(&vd->vdev_obsolete_lock);

	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);
	mutex_destroy(&vd->vdev_scan_io_queue_lock);

	mutex_destroy(&vd->vdev_initialize_lock);
	mutex_destroy(&vd->vdev_initialize_io_lock);
	cv_destroy(&vd->vdev_initialize_io_cv);
	cv_destroy(&vd->vdev_initialize_cv);

	mutex_destroy(&vd->vdev_trim_lock);
	mutex_destroy(&vd->vdev_autotrim_lock);
	mutex_destroy(&vd->vdev_trim_io_lock);
	cv_destroy(&vd->vdev_trim_cv);
	cv_destroy(&vd->vdev_autotrim_cv);
	cv_destroy(&vd->vdev_autotrim_kick_cv);
	cv_destroy(&vd->vdev_trim_io_cv);

	mutex_destroy(&vd->vdev_rebuild_lock);
	cv_destroy(&vd->vdev_rebuild_cv);

	zfs_ratelimit_fini(&vd->vdev_delay_rl);
	zfs_ratelimit_fini(&vd->vdev_deadman_rl);
	zfs_ratelimit_fini(&vd->vdev_dio_verify_rl);
	zfs_ratelimit_fini(&vd->vdev_checksum_rl);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Transfer top-level vdev state from svd to tvd.
 *
 * Each piece of state is moved rather than copied: after it is assigned to
 * tvd, the corresponding field in svd is reset (0, NULL, VDEV_BIAS_NONE or
 * -1ULL as appropriate).  The exception visible here is
 * vdev_rebuild_config, which is copied without being cleared in svd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;
	tvd->vdev_top_zap = svd->vdev_top_zap;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;
	svd->vdev_top_zap = 0;

	/* If tvd already has a metaslab group it must be the same as svd's. */
	if (tvd->vdev_mg)
		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
	if (tvd->vdev_log_mg)
		ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_log_mg = svd->vdev_log_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_log_mg = NULL;
	svd->vdev_ms = NULL;

	/* Re-point the groups' back-references at their new owner. */
	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;
	if (tvd->vdev_log_mg != NULL)
		tvd->vdev_log_mg->mg_vd = tvd;

	tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
	svd->vdev_checkpoint_sm = NULL;

	tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
	svd->vdev_alloc_bias = VDEV_BIAS_NONE;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	/*
	 * State which may be set on a top-level vdev that's in the
	 * process of being removed.
	 */
	ASSERT0(tvd->vdev_indirect_config.vic_births_object);
	ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
	ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
	ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
	ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
	ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
	ASSERT0(tvd->vdev_noalloc);
	ASSERT0(tvd->vdev_removing);
	ASSERT0(tvd->vdev_rebuilding);
	tvd->vdev_noalloc = svd->vdev_noalloc;
	tvd->vdev_removing = svd->vdev_removing;
	tvd->vdev_rebuilding = svd->vdev_rebuilding;
	tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
	tvd->vdev_indirect_config = svd->vdev_indirect_config;
	tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
	tvd->vdev_indirect_births = svd->vdev_indirect_births;
	zfs_range_tree_swap(&svd->vdev_obsolete_segments,
	    &tvd->vdev_obsolete_segments);
	tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
	svd->vdev_indirect_config.vic_mapping_object = 0;
	svd->vdev_indirect_config.vic_births_object = 0;
	svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
	svd->vdev_indirect_mapping = NULL;
	svd->vdev_indirect_births = NULL;
	svd->vdev_obsolete_sm = NULL;
	svd->vdev_noalloc = 0;
	svd->vdev_removing = 0;
	svd->vdev_rebuilding = 0;

	/* Move per-txg dirty-list membership from svd to tvd. */
	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;

	dsl_scan_io_queue_vdev_xfer(svd, tvd);
}

/*
 * Set vdev_top to 'tvd' on 'vd' and on every vdev beneath it.  A NULL 'vd'
 * is ignored.
 */
static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.  There is no need to
 * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
 *
 * The new vdev takes cvd's slot in cvd's parent and cvd becomes its child
 * 0.  Returns the new vdev.  The caller must hold all SCL_* config locks as
 * writer.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_max_asize = cvd->vdev_max_asize;
	mvd->vdev_psize = cvd->vdev_psize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	/* mvd has no children yet, so cvd becomes child 0. */
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	/* If mvd is now the top-level vdev, it inherits cvd's top state. */
	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 *
 * cvd must be the only child of its parent; cvd takes its parent's slot in
 * the grandparent and the old parent is freed.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;
	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);

	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
	 */
	if (mvd->vdev_top == mvd) {
		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
		cvd->vdev_orig_guid = cvd->vdev_guid;
		cvd->vdev_guid += guid_delta;
		cvd->vdev_guid_sum += guid_delta;

		/*
		 * If pool not set for autoexpand, we need to also preserve
		 * mvd's asize to prevent automatic expansion of cvd.
		 * Otherwise if we are adjusting the mirror by attaching and
		 * detaching children of non-uniform sizes, the mirror could
		 * autoexpand, unexpectedly requiring larger devices to
		 * re-establish the mirror.
		 */
		if (!cvd->vdev_spa->spa_autoexpand)
			cvd->vdev_asize = mvd->vdev_asize;
	}
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

/*
 * Choose GCD for spa_gcd_alloc.
 *
 * Iterative Euclidean algorithm; returns 'a' unchanged when 'b' is 0.
 */
static uint64_t
vdev_gcd(uint64_t a, uint64_t b)
{
	while (b != 0) {
		uint64_t t = b;
		b = a % b;
		a = t;
	}
	return (a);
}

/*
 * Set spa_min_alloc and spa_gcd_alloc.
*/ static void vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; if (spa->spa_gcd_alloc == INT_MAX) { spa->spa_gcd_alloc = min_alloc; } else { spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } } void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); if (!vd->vdev_islog) { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd, 1); } /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundary * setting had to wait until now. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. 
*/ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab * group. */ if (vd->vdev_mg->mg_class == spa_normal_class(spa) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; uint64_t smallest = UINT64_MAX; /* * Note, we only search the new metaslabs, because the old * (pre-existing) ones may be active (e.g. have non-empty * range_tree's), and we don't move them to the new * metaslab_t. */ for (uint64_t m = oldc; m < newc; m++) { uint64_t alloc = space_map_allocated(vd->vdev_ms[m]->ms_sm); if (alloc < smallest) { slog_msid = m; smallest = alloc; } } metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; /* * The metaslab was marked as dirty at the end of * metaslab_init(). Remove it from the dirty list so that we * can uninitialize and reinitialize it to the new class. 
*/ if (txg != 0) { (void) txg_list_remove_this(&vd->vdev_ms_list, slog_ms, txg); } uint64_t sm_obj = space_map_object(slog_ms->ms_sm); metaslab_fini(slog_ms); VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, &vd->vdev_ms[slog_msid])); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is marked as non-allocating then don't * activate the metaslabs since we want to ensure that * no allocations are performed on this device. */ if (vd->vdev_noalloc) { /* track non-allocating vdev space */ spa->spa_nonallocating_dspace += spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. 
*/
		vd->vdev_checkpoint_sm = NULL;
	}

	if (vd->vdev_ms != NULL) {
		metaslab_group_t *mg = vd->vdev_mg;

		metaslab_group_passivate(mg);
		if (vd->vdev_log_mg != NULL) {
			ASSERT(!vd->vdev_islog);
			metaslab_group_passivate(vd->vdev_log_mg);
		}

		uint64_t count = vd->vdev_ms_count;
		for (uint64_t m = 0; m < count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];
			if (msp != NULL)
				metaslab_fini(msp);
		}
		vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
		vd->vdev_ms_count = 0;

-		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+		for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
			ASSERT0(mg->mg_histogram[i]);
			if (vd->vdev_log_mg != NULL)
				ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
		}
	}
	ASSERT0(vd->vdev_ms_count);
}

/*
 * State shared by all zios that make up one probe of a vdev.  It is
 * allocated in vdev_probe() and freed by vdev_probe_done() when the
 * probe's ZIO_TYPE_NULL zio completes.
 */
typedef struct vdev_probe_stats {
	/* set when a probe read completed with io_error == 0 */
	boolean_t	vps_readable;
	/* set when a probe write completed with io_error == 0 */
	boolean_t	vps_writeable;
	/* B_TRUE when vdev_probe() was called with a non-NULL zio */
	boolean_t	vps_zio_done_probe;
	/* zio flags used for the probe's read and write zios */
	int		vps_flags;
} vdev_probe_stats_t;

/*
 * Done callback used for every zio of a probe: the pad-region reads, the
 * writes issued in response to successful reads, and the ZIO_TYPE_NULL
 * probe zio itself.  The branch taken depends on zio->io_type.
 */
static void
vdev_probe_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	vdev_probe_stats_t *vps = zio->io_private;

	ASSERT(vd->vdev_probe_zio != NULL);

	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_error == 0)
			vps->vps_readable = 1;
		if (zio->io_error == 0 && spa_writeable(spa)) {
			/*
			 * Write the data just read back to the same offset,
			 * reusing the read's abd; the write's done callback
			 * (this function) frees the abd.
			 */
			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
			    zio->io_offset, zio->io_size, zio->io_abd,
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
		} else {
			abd_free(zio->io_abd);
		}
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
		if (zio->io_error == 0)
			vps->vps_writeable = 1;
		abd_free(zio->io_abd);
	} else if (zio->io_type == ZIO_TYPE_NULL) {
		zio_t *pio;
		zio_link_t *zl;

		/* Fold the read/write results into the vdev's sticky flags. */
		vd->vdev_cant_read |= !vps->vps_readable;
		vd->vdev_cant_write |= !vps->vps_writeable;
		vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
		    vd->vdev_cant_read, vd->vdev_cant_write);

		if (vdev_readable(vd) &&
		    (vdev_writeable(vd) || !spa_writeable(spa))) {
			zio->io_error = 0;
		} else {
			ASSERT(zio->io_error != 0);
			vdev_dbgmsg(vd, "failed probe");
			(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
			    spa, vd, NULL, NULL, 0);
			zio->io_error = SET_ERROR(ENXIO);

			/*
			 * If this probe was initiated from zio pipeline, then
			 * change the state in a spa_async_request. Probes that
			 * were initiated from a vdev_open can change the state
			 * as part of the open call.
			 */
			if (vps->vps_zio_done_probe) {
				vd->vdev_fault_wanted = B_TRUE;
				spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
			}
		}

		/* Allow a new probe to be created for this vdev. */
		mutex_enter(&vd->vdev_probe_lock);
		ASSERT(vd->vdev_probe_zio == zio);
		vd->vdev_probe_zio = NULL;
		mutex_exit(&vd->vdev_probe_lock);

		/* Fail every parent zio for which the vdev is inaccessible. */
		zl = NULL;
		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
			if (!vdev_accessible(vd, pio))
				pio->io_error = SET_ERROR(ENXIO);

		kmem_free(vps, sizeof (*vps));
	}
}

/*
 * Determine whether this device is accessible.
 *
 * Read and write to several known locations: the pad regions of each
 * vdev label but the first, which we leave alone in case it contains
 * a VTOC.
 *
 * When 'zio' is NULL the newly created probe zio is returned without
 * having been issued, so the caller can wait on it.  When 'zio' is
 * non-NULL it is made a parent of the (new or already in-flight) probe
 * zio and NULL is returned; NULL is also returned immediately if 'zio'
 * is itself a probe I/O.
 */
zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
{
	spa_t *spa = vd->vdev_spa;
	vdev_probe_stats_t *vps = NULL;
	zio_t *pio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/*
	 * Don't probe the probe.
	 */
	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
		return (NULL);

	/*
	 * To prevent 'probe storms' when a device fails, we create
	 * just one probe i/o at a time.  All zios that want to probe
	 * this vdev will become parents of the probe io.
	 */
	mutex_enter(&vd->vdev_probe_lock);

	if ((pio = vd->vdev_probe_zio) == NULL) {
		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
		    ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
		vps->vps_zio_done_probe = (zio != NULL);

		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
			/*
			 * vdev_cant_read and vdev_cant_write can only
			 * transition from TRUE to FALSE when we have the
			 * SCL_ZIO lock as writer; otherwise they can only
			 * transition from FALSE to TRUE.  This ensures that
			 * any zio looking at these values can assume that
			 * failures persist for the life of the I/O.  That's
			 * important because when a device has intermittent
			 * connectivity problems, we want to ensure that
			 * they're ascribed to the device (ENXIO) and not
			 * the zio (EIO).
			 *
			 * Since we hold SCL_ZIO as writer here, clear both
			 * values so the probe can reevaluate from first
			 * principles.
			 */
			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
			vd->vdev_cant_read = B_FALSE;
			vd->vdev_cant_write = B_FALSE;
		}

		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
		    vdev_probe_done, vps,
		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
	}

	if (zio != NULL)
		zio_add_child(zio, pio);

	mutex_exit(&vd->vdev_probe_lock);

	/* vps == NULL means a probe was already in flight; nothing to issue. */
	if (vps == NULL) {
		ASSERT(zio != NULL);
		return (NULL);
	}

	/* Label 0 is deliberately skipped: the loop starts at l = 1. */
	for (int l = 1; l < VDEV_LABELS; l++) {
		zio_nowait(zio_read_phys(pio, vd,
		    vdev_label_offset(vd->vdev_psize, l,
		    offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
		    abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
	}

	if (zio == NULL)
		return (pio);

	zio_nowait(pio);
	return (NULL);
}

/*
 * Callback taking a vdev_t as its void argument: runs vdev_load() on it
 * and stores the result in vdev_load_error.
 */
static void
vdev_load_child(void *arg)
{
	vdev_t *vd = arg;

	vd->vdev_load_error = vdev_load(vd);
}

/*
 * Taskq callback: opens the vdev and stores the result in vdev_open_error.
 * vdev_open_thread is set to the current thread for the duration of the
 * call, which is what the ownership assertion in vdev_open() checks.
 */
static void
vdev_open_child(void *arg)
{
	vdev_t *vd = arg;

	vd->vdev_open_thread = curthread;
	vd->vdev_open_error = vdev_open(vd);
	vd->vdev_open_thread = NULL;
}

/*
 * Returns B_TRUE if this vdev or any of its descendants is backed by a
 * zvol.  The zvol check itself is only compiled for _KERNEL builds.
 */
static boolean_t
vdev_uses_zvols(vdev_t *vd)
{
#ifdef _KERNEL
	if (zvol_is_zvol(vd->vdev_path))
		return (B_TRUE);
#endif

	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_uses_zvols(vd->vdev_child[c]))
			return (B_TRUE);

	return (B_FALSE);
}

/*
 * Returns B_TRUE if the passed child should be opened.
 */
static boolean_t
vdev_default_open_children_func(vdev_t *vd)
{
	(void) vd;
	return (B_TRUE);
}

/*
 * Open the requested child vdevs.  If any of the leaf vdevs are using
 * a ZFS volume then do the opens in a single thread.  This avoids a
 * deadlock when the current thread is holding the spa_namespace_lock.
*/ static void vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { int children = vd->vdev_children; taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE) continue; if (tq == NULL || vdev_uses_zvols(vd)) { cvd->vdev_open_error = vdev_open(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_open_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } vd->vdev_nonrot &= cvd->vdev_nonrot; } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } } /* * Open all child vdevs. */ void vdev_open_children(vdev_t *vd) { vdev_open_children_impl(vd, vdev_default_open_children_func); } /* * Conditionally open a subset of child vdevs. */ void vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) { vdev_open_children_impl(vd, open_func); } /* * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE * changed, this algorithm can not change, otherwise it would inconsistently * account for existing bp's. We also hard-code txg 0 for the same reason * since expanded RAIDZ vdevs can use a different asize for different birth * txg's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> SPA_MINBLOCKSHIFT); } } /* * Choose the best of two ashifts, preferring one between logical ashift * (absolute minimum) and administrator defined maximum, otherwise take * the biggest of the two. 
*/
uint64_t
vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
{
	/*
	 * A candidate is "preferred" when it lies strictly above the logical
	 * ashift and does not exceed the administrator's maximum.
	 */
	boolean_t a_preferred =
	    (a > logical && a <= zfs_vdev_max_auto_ashift);
	boolean_t b_preferred =
	    (b > logical && b <= zfs_vdev_max_auto_ashift);

	/* Exactly one preferred candidate: it wins outright. */
	if (a_preferred && !b_preferred)
		return (a);
	if (b_preferred && !a_preferred)
		return (b);

	/* Both or neither preferred: take the larger. */
	return (MAX(a, b));
}

/*
 * Maximize performance by inflating the configured ashift for top level
 * vdevs to be as close to the physical ashift as possible while maintaining
 * administrator defined limits and ensuring it doesn't go below the
 * logical ashift.
 */
static void
vdev_ashift_optimize(vdev_t *vd)
{
	ASSERT(vd == vd->vdev_top);

	if (vd->vdev_ashift >= vd->vdev_physical_ashift ||
	    vd->vdev_physical_ashift > zfs_vdev_max_auto_ashift) {
		/*
		 * Either the configured ashift already reaches the physical
		 * ashift, or the physical ashift is beyond the allowed
		 * maximum.  In the unusual case where the logical ashift
		 * exceeds the physical one we cannot cap by the maximum, as
		 * that would cause failures; we only raise the value to the
		 * minimum ashift if needed.
		 */
		vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
		    vd->vdev_ashift);
		return;
	}

	/* Inflate toward the physical ashift, bounded by the min/max knobs. */
	vd->vdev_ashift = MIN(
	    MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
	    MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift));
}

/*
 * Prepare a virtual device for access.
*/
int
vdev_open(vdev_t *vd)
{
	/*
	 * Returns 0 on success, otherwise an errno value; on the failure
	 * paths below the vdev state is updated through vdev_set_state()
	 * before returning.
	 */
	spa_t *spa = vd->vdev_spa;
	int error;
	uint64_t osize = 0;
	uint64_t max_osize = 0;
	uint64_t asize, max_asize, psize;
	uint64_t logical_ashift = 0;
	uint64_t physical_ashift = 0;

	/*
	 * The caller is either the thread recorded in vdev_open_thread or
	 * holds all of SCL_STATE_ALL as writer.
	 */
	ASSERT(vd->vdev_open_thread == curthread ||
	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	/* Start from a clean slate for this open attempt. */
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	vd->vdev_cant_read = B_FALSE;
	vd->vdev_cant_write = B_FALSE;
	vd->vdev_fault_wanted = B_FALSE;
	vd->vdev_remove_wanted = B_FALSE;
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	/*
	 * If this vdev is not removed, check its fault status.  If it's
	 * faulted, bail out of the open.
	 */
	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (SET_ERROR(ENXIO));
	}

	/* The type-specific open fills in the sizes and ashifts. */
	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
	    &logical_ashift, &physical_ashift);

	/* Keep the device in removed state if unplugged */
	if (error == ENOENT && vd->vdev_removed) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED,
		    VDEV_AUX_NONE);
		return (error);
	}

	/*
	 * Physical volume size should never be larger than its max size, unless
	 * the disk has shrunk while we were reading it or the device is buggy
	 * or damaged: either way it's not safe for use, bail out of the open.
	 */
	if (osize > max_osize) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_OPEN_FAILED);
		return (SET_ERROR(ENXIO));
	}

	/*
	 * Reset the vdev_reopening flag so that we actually close
	 * the vdev on error.
	 */
	vd->vdev_reopening = B_FALSE;
	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));

	if (error) {
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
			    vd->vdev_stat.vs_aux);
		} else {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    vd->vdev_stat.vs_aux);
		}
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	/*
	 * Recheck the faulted flag now that we have confirmed that
	 * the vdev is accessible.  If we're faulted, bail.
	 */
	if (vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	}

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
	}

	/*
	 * For hole or missing vdevs we just return success.
	 */
	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
		return (0);

	/* Any child that is not healthy degrades this vdev. */
	for (int c = 0; c < vd->vdev_children; c++) {
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}
	}

	/* Align both sizes to a multiple of the label size. */
	osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t);
	max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t);

	if (vd->vdev_children == 0) {
		/* Leaf: the labels are carved out of the device itself. */
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
		    VDEV_LABEL_END_SIZE);
	} else {
		/* Interior: the reported size is used as-is, psize is 0. */
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = 0;
		asize = osize;
		max_asize = max_osize;
	}

	/*
	 * If the vdev was expanded, record this so that we can re-create the
	 * uberblock rings in labels {2,3}, during the next sync.
	 */
	if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
		vd->vdev_copy_uberblocks = B_TRUE;

	vd->vdev_psize = psize;

	/*
	 * Make sure the allocatable size hasn't shrunk too much.
	 */
	if (asize < vd->vdev_min_asize) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LABEL);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * We can always set the logical/physical ashift members since
	 * their values are only used to calculate the vdev_ashift when
	 * the device is first added to the config. These values should
	 * not be used for anything else since they may change whenever
	 * the device is reopened and we don't store them in the label.
	 */
	vd->vdev_physical_ashift =
	    MAX(physical_ashift, vd->vdev_physical_ashift);
	vd->vdev_logical_ashift = MAX(logical_ashift,
	    vd->vdev_logical_ashift);

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For compatibility, a different ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_max_asize = max_asize;

		/*
		 * If the vdev_ashift was not overridden at creation time
		 * (0) or the override value is impossible for the device,
		 * then set it to the logical ashift and optimize the ashift.
		 */
		if (vd->vdev_ashift < vd->vdev_logical_ashift) {
			vd->vdev_ashift = vd->vdev_logical_ashift;

			if (vd->vdev_logical_ashift > ASHIFT_MAX) {
				vdev_set_state(vd, B_TRUE,
				    VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_ASHIFT_TOO_BIG);
				return (SET_ERROR(EDOM));
			}

			if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
				vdev_ashift_optimize(vd);
			vd->vdev_attaching = B_FALSE;
		}
		if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
		    vd->vdev_ashift > ASHIFT_MAX)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_ASHIFT);
			return (SET_ERROR(EDOM));
		}
	} else {
		/*
		 * Make sure the alignment required hasn't increased.
		 */
		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
		    vd->vdev_ops->vdev_op_leaf) {
			(void) zfs_ereport_post(
			    FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
			    spa, vd, NULL, NULL, 0);
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (SET_ERROR(EDOM));
		}
		vd->vdev_max_asize = max_asize;
	}

	/*
	 * If all children are healthy we update asize if either:
	 * The asize has increased, due to a device expansion caused by dynamic
	 * LUN growth or vdev replacement, and automatic expansion is enabled;
	 * making the additional space available.
	 *
	 * The asize has decreased, due to a device shrink usually caused by a
	 * vdev replace with a smaller device. This ensures that calculations
	 * based on max_asize and asize e.g. esize are always valid. It's safe
	 * to do this as we've already validated that asize is greater than
	 * vdev_min_asize.
	 */
	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
	    ((asize > vd->vdev_asize &&
	    (vd->vdev_expanding || spa->spa_autoexpand)) ||
	    (asize < vd->vdev_asize)))
		vd->vdev_asize = asize;

	vdev_set_min_asize(vd);

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
		return (error);
	}

	/*
	 * Track the minimum allocation size.
	 */
	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
	    vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
		uint64_t min_alloc = vdev_get_min_alloc(vd);
		vdev_spa_set_alloc(spa, min_alloc);
	}

	/*
	 * If this is a leaf vdev, assess whether a resilver is needed.
	 * But don't do this if we are doing a reopen for a scrub, since
	 * this would just restart the scrub we are already doing.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
		dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);

	return (0);
}

/*
 * Taskq callback: validates the vdev and stores the result in
 * vdev_validate_error.  vdev_validate_thread is set to the current thread
 * for the duration of the call.
 */
static void
vdev_validate_child(void *arg)
{
	vdev_t *vd = arg;

	vd->vdev_validate_thread = curthread;
	vd->vdev_validate_error = vdev_validate(vd);
	vd->vdev_validate_thread = NULL;
}

/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents. This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
*/
int
vdev_validate(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	taskq_t *tq = NULL;
	nvlist_t *label;
	uint64_t guid = 0, aux_guid = 0, top_guid;
	uint64_t state;
	nvlist_t *nvl;
	uint64_t txg;
	int children = vd->vdev_children;

	/* Validation can be switched off entirely via vdev_validate_skip. */
	if (vdev_validate_skip)
		return (0);

	if (children > 0) {
		tq = taskq_create("vdev_validate", children, minclsyspri,
		    children, children, TASKQ_PREPOPULATE);
	}

	/*
	 * Validate the children first: inline when there is no taskq or the
	 * child uses zvols, otherwise dispatched to the taskq.
	 */
	for (uint64_t c = 0; c < children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (tq == NULL || vdev_uses_zvols(cvd)) {
			vdev_validate_child(cvd);
		} else {
			VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
			    TQ_SLEEP) != TASKQID_INVALID);
		}
	}
	if (tq != NULL) {
		taskq_wait(tq);
		taskq_destroy(tq);
	}
	/* Any child failure is reported to the caller as EBADF. */
	for (int c = 0; c < children; c++) {
		int error = vd->vdev_child[c]->vdev_validate_error;

		if (error != 0)
			return (SET_ERROR(EBADF));
	}

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation.  Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
		return (0);

	/*
	 * If we are performing an extreme rewind, we allow for a label that
	 * was modified at a point after the current txg.
	 * If config lock is not held do not check for the txg. spa_sync could
	 * be updating the vdev's label before updating spa_last_synced_txg.
	 */
	if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
	    spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
		txg = UINT64_MAX;
	else
		txg = spa_last_synced_txg(spa);

	/*
	 * From here on, every failure path that has a label must free it
	 * with nvlist_free() before returning.
	 */
	if ((label = vdev_label_read_config(vd, txg)) == NULL) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LABEL);
		vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
		    "txg %llu", (u_longlong_t)txg);
		return (0);
	}

	/*
	 * Determine if this vdev has been split off into another
	 * pool.  If so, then refuse to open it.
	 */
	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
	    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_SPLIT_POOL);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
		return (0);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
		    ZPOOL_CONFIG_POOL_GUID);
		return (0);
	}

	/*
	 * If config is not trusted then ignore the spa guid check. This is
	 * necessary because if the machine crashed during a re-guid the new
	 * guid might have been written to all of the vdev labels, but not the
	 * cached config. The check will be performed again once we have the
	 * trusted config from the MOS.
	 */
	if (spa->spa_trust_config && guid != spa_guid(spa)) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
		    "match config (%llu != %llu)", (u_longlong_t)guid,
		    (u_longlong_t)spa_guid(spa));
		return (0);
	}

	/* aux_guid becomes the label's original guid, or 0 if absent. */
	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
	    nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
	    &aux_guid) != 0)
		aux_guid = 0;

	/* 'guid' is reused here: it now holds the label's vdev guid. */
	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
		    ZPOOL_CONFIG_GUID);
		return (0);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
	    != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
		    ZPOOL_CONFIG_TOP_GUID);
		return (0);
	}

	/*
	 * If this vdev just became a top-level vdev because its sibling was
	 * detached, it will have adopted the parent's vdev guid -- but the
	 * label may or may not be on disk yet. Fortunately, either version
	 * of the label will have the same top guid, so if we're a top-level
	 * vdev, we can safely compare to that instead.
	 * However, if the config comes from a cachefile that failed to update
	 * after the detach, a top-level vdev will appear as a non top-level
	 * vdev in the config. Also relax the constraints if we perform an
	 * extreme rewind.
	 *
	 * If we split this vdev off instead, then we also check the
	 * original pool's guid. We don't want to consider the vdev
	 * corrupt if it is partway through a split operation.
	 */
	if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
		boolean_t mismatch = B_FALSE;
		if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
			if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
				mismatch = B_TRUE;
		} else {
			if (vd->vdev_guid != top_guid &&
			    vd->vdev_top->vdev_guid != guid)
				mismatch = B_TRUE;
		}

		if (mismatch) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			vdev_dbgmsg(vd, "vdev_validate: config guid "
			    "doesn't match label guid");
			vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
			    (u_longlong_t)vd->vdev_guid,
			    (u_longlong_t)vd->vdev_top->vdev_guid);
			vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
			    "aux_guid %llu", (u_longlong_t)guid,
			    (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
			return (0);
		}
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
	    &state) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
		    ZPOOL_CONFIG_POOL_STATE);
		return (0);
	}

	nvlist_free(label);

	/*
	 * If this is a verbatim import, no need to check the
	 * state of the pool.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
	    spa_load_state(spa) == SPA_LOAD_OPEN &&
	    state != POOL_STATE_ACTIVE) {
		vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
		    "for spa %s", (u_longlong_t)state, spa->spa_name);
		return (SET_ERROR(EBADF));
	}

	/*
	 * If we were able to open and validate a vdev that was
	 * previously marked permanently unavailable, clear that state
	 * now.
	 */
	if (vd->vdev_not_present)
		vd->vdev_not_present = 0;

	return (0);
}

/*
 * Update the string *dvd from svd.  If both are set and differ, *dvd is
 * freed and replaced with a copy of svd; if only svd is set, *dvd becomes a
 * copy of it.  When svd is NULL nothing changes.  'prefix' names the field
 * and 'guid' identifies the vdev in the debug message.
 */
static void
vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid)
{
	if (svd != NULL && *dvd != NULL) {
		if (strcmp(svd, *dvd) != 0) {
			zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed "
			    "from '%s' to '%s'", (u_longlong_t)guid, prefix,
			    *dvd, svd);
			spa_strfree(*dvd);
			*dvd = spa_strdup(svd);
		}
	} else if (svd != NULL) {
		*dvd = spa_strdup(svd);
		zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
		    (u_longlong_t)guid, *dvd);
	}
}

/*
 * Copy the path, devid, physpath and enclosure sysfs path strings from the
 * source vdev (svd) to the destination vdev (dvd).
 */
static void
vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
{
	char *old, *new;

	vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path,
	    dvd->vdev_guid);

	vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid,
	    dvd->vdev_guid);

	vdev_update_path("vdev_physpath", svd->vdev_physpath,
	    &dvd->vdev_physpath, dvd->vdev_guid);

	/*
	 * Our enclosure sysfs path may have changed between imports.
	 * Unlike the fields above, a source value of NULL clears the
	 * destination here.
	 */
	old = dvd->vdev_enc_sysfs_path;
	new = svd->vdev_enc_sysfs_path;

	if ((old != NULL && new == NULL) ||
	    (old == NULL && new != NULL) ||
	    ((old != NULL && new != NULL) && strcmp(new, old) != 0)) {
		/*
		 * NOTE(review): 'old' or 'new' may be NULL here and is
		 * passed to a '%s' conversion -- confirm that zfs_dbgmsg()
		 * tolerates NULL string arguments.
		 */
		zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path "
		    "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
		    old, new);

		if (dvd->vdev_enc_sysfs_path)
			spa_strfree(dvd->vdev_enc_sysfs_path);

		if (svd->vdev_enc_sysfs_path) {
			dvd->vdev_enc_sysfs_path = spa_strdup(
			    svd->vdev_enc_sysfs_path);
		} else {
			dvd->vdev_enc_sysfs_path = NULL;
		}
	}
}

/*
 * Recursively copy vdev paths from one vdev to another. Source and destination
 * vdev trees must have same geometry otherwise return error.
Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. 
* Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(vd != NULL); ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. 
Instead we only obtain the leaf's physical size.
 * If the leaf has never been opened then open it, as usual.
 */
void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/* set the reopening flag unless we're taking the vdev offline */
	vd->vdev_reopening = !vd->vdev_offline;
	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		(void) vdev_validate_aux(vd);
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    vd->vdev_aux == &spa->spa_l2cache) {
			/*
			 * In case the vdev is present we should evict all ARC
			 * buffers and pointers to log blocks and reclaim their
			 * space before restoring its contents to L2ARC.
			 */
			if (l2arc_vdev_present(vd)) {
				l2arc_rebuild_vdev(vd, B_TRUE);
			} else {
				l2arc_add_vdev(spa, vd);
			}
			spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
		}
	} else {
		(void) vdev_validate(vd);
	}

	/*
	 * Recheck if resilver is still needed and cancel any
	 * scheduled resilver if resilver is unneeded.
	 */
	if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
	    spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
		mutex_enter(&spa->spa_async_lock);
		spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
		mutex_exit(&spa->spa_async_lock);
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

/*
 * Open a vdev for creation, load its DTLs and initialize its labels.
 * Returns 0 on success; on any failure the vdev is closed again and an
 * errno value is returned.  'isreplacing' selects VDEV_LABEL_REPLACE
 * instead of VDEV_LABEL_CREATE for the label initialization.
 */
int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : SET_ERROR(ENXIO));
	}

	/*
	 * Recursively load DTLs and initialize all labels.
	 */
	if ((error = vdev_dtl_load(vd)) != 0 ||
	    (error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * Choose the metaslab shift (log2 of the metaslab size) for a vdev from its
 * vdev_asize and store it in vdev_ms_shift.
 */
void
vdev_metaslab_set_size(vdev_t *vd)
{
	uint64_t asize = vd->vdev_asize;
	uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
	uint64_t ms_shift;

	/*
	 * There are two dimensions to the metaslab sizing calculation:
	 * the size of the metaslab and the count of metaslabs per vdev.
	 *
	 * The default values used below are a good balance between memory
	 * usage (larger metaslab size means more memory needed for loaded
	 * metaslabs; more metaslabs means more memory needed for the
	 * metaslab_t structs), metaslab load time (larger metaslabs take
	 * longer to load), and metaslab sync time (more metaslabs means
	 * more time spent syncing all of them).
	 *
	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
	 * The range of the dimensions are as follows:
	 *
	 *	2^29 <= ms_size  <= 2^34
	 *	  16 <= ms_count <= 131,072
	 *
	 * On the lower end of vdev sizes, we aim for metaslabs sizes of
	 * at least 512MB (2^29) to minimize fragmentation effects when
	 * testing with smaller devices.  However, the count constraint
	 * of at least 16 metaslabs will override this minimum size goal.
	 *
	 * On the upper end of vdev sizes, we aim for a maximum metaslab
	 * size of 16GB.  However, we will cap the total count to 2^17
	 * metaslabs to keep our memory footprint in check and let the
	 * metaslab size grow from there if that limit is hit.
	 *
	 * The net effect of applying above constraints is summarized below.
	 *
	 *   vdev size       metaslab count
	 *  --------------|-----------------
	 *      < 8GB        ~16
	 *  8GB   - 100GB   one per 512MB
	 *  100GB - 3TB     ~200
	 *  3TB   - 2PB     one per 16GB
	 *      > 2PB       ~131,072
	 *  --------------------------------
	 *
	 *  Finally, note that all of the above calculate the initial
	 *  number of metaslabs. Expanding a top-level vdev will result
	 *  in additional metaslabs being allocated making it possible
	 *  to exceed the zfs_vdev_ms_count_limit.
	 */

	if (ms_count < zfs_vdev_min_ms_count)
		ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
	else if (ms_count > zfs_vdev_default_ms_count)
		ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
	else
		ms_shift = zfs_vdev_default_ms_shift;

	if (ms_shift < SPA_MAXBLOCKSHIFT) {
		ms_shift = SPA_MAXBLOCKSHIFT;
	} else if (ms_shift > zfs_vdev_max_ms_shift) {
		ms_shift = zfs_vdev_max_ms_shift;
		/* cap the total count to constrain memory footprint */
		if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
			ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
	}

	vd->vdev_ms_shift = ms_shift;
	ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
}

/*
 * Mark a top-level vdev dirty for 'txg'.  Depending on 'flags', 'arg' is
 * added to the vdev's metaslab list (VDD_METASLAB) or DTL list (VDD_DTL);
 * the vdev itself is always added to the pool's per-txg vdev list.
 */
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	/* indirect vdevs don't have metaslabs or dtls */
	ASSERT(vdev_is_concrete(vd) || flags == 0);
	ASSERT(ISP2(flags));
	ASSERT(spa_writeable(vd->vdev_spa));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

/*
 * Recursively call vdev_dirty() on the top-level vdev of every leaf below
 * vd, passing the leaf itself as the argument.
 */
void
vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);

	if (vd->vdev_ops->vdev_op_leaf)
		vdev_dirty(vd->vdev_top, flags, vd, txg);
}

/*
 * DTLs.
 *
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication.  There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in 'maxfaults' or more children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
 */

/*
 * Add the txg range [txg, txg + size) to the vdev's DTL of type 't', unless
 * the range tree already contains it.
 */
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	zfs_range_tree_t *rt = vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
	ASSERT(spa_writeable(vd->vdev_spa));

	mutex_enter(&vd->vdev_dtl_lock);
	if (!zfs_range_tree_contains(rt, txg, size))
		zfs_range_tree_add(rt, txg, size);
	mutex_exit(&vd->vdev_dtl_lock);
}

/*
 * Returns the result of zfs_range_tree_contains() for [txg, txg + size) on
 * the vdev's DTL of type 't'; an empty DTL always yields B_FALSE.
 */
boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	zfs_range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	/*
	 * While we are loading the pool, the DTLs have not been loaded yet.
	 * This isn't a problem but it can result in devices being tried
	 * which are known to not have the data.  In which case, the import
	 * is relying on the checksum to ensure that we get the right data.
	 * Note that while importing we are only reading the MOS, which is
	 * always checksummed.
	 */
	mutex_enter(&vd->vdev_dtl_lock);
	if (!zfs_range_tree_is_empty(rt))
		dirty = zfs_range_tree_contains(rt, txg, size);
	mutex_exit(&vd->vdev_dtl_lock);

	return (dirty);
}

/*
 * Returns B_TRUE if the vdev's DTL of type 't' is empty.
 */
boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	zfs_range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(&vd->vdev_dtl_lock);
	empty = zfs_range_tree_is_empty(rt);
	mutex_exit(&vd->vdev_dtl_lock);

	return (empty);
}

/*
 * Check if the txg falls within the range which must be
 * resilvered.  DVAs outside this range can always be skipped.
 */
boolean_t
vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
    uint64_t phys_birth)
{
	(void) dva, (void) psize;

	/* Set by sequential resilver. */
	if (phys_birth == TXG_UNKNOWN)
		return (B_TRUE);

	return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
}

/*
 * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
 * Leaf vdevs and vdev types without a vdev_op_need_resilver callback
 * always answer B_TRUE.
 */
boolean_t
vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
    uint64_t phys_birth)
{
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
	    vd->vdev_ops->vdev_op_leaf)
		return (B_TRUE);

	return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
	    phys_birth));
}

/*
 * Returns the txg immediately preceding the lowest txg in the leaf vdev's
 * DTL_MISSING range tree (i.e. zfs_range_tree_min() - 1).  The caller must
 * hold vdev_dtl_lock and the DTL must not be empty.
 */
static uint64_t
vdev_dtl_min(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	return (zfs_range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1);
}

/*
 * Returns the highest txg in the DTL.
*/ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (zfs_range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. 
it is not in the range (scn_min_txg, scn_max_txg] * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ static void vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess_impl(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done, faulting); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. */ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. 
We * only want to excise regions on vdevs that were available * during the entire duration of this scan. */ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. 
*/ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!zfs_range_tree_is_empty( vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } zfs_range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], zfs_range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) zfs_range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); zfs_range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); /* * For the faulting case, treat members of a replacing vdev * as if they are not available. It's more likely than not that * a vdev in a replacing vdev could encounter read errors so * treat it as not being able to contribute. */ if (!vdev_readable(vd) || (faulting && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) { zfs_range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); } else { zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], zfs_range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); } /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. 
*/ if (txg != 0 && zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && zfs_range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); } else { mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) { /* leaf vdevs only */ continue; } if (t == DTL_PARTIAL) { /* i.e. non-zero */ minref = 1; } else if (vdev_get_nparity(vd) != 0) { /* RAIDZ, DRAID */ minref = vdev_get_nparity(vd) + 1; } else { /* any kind of mirror */ minref = vd->vdev_children; } space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { raidz_dtl_reassessed(vd); } } void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { return (vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done, rebuild_done, B_FALSE)); } /* * Iterate over all the vdevs except spare, and post kobj events */ void vdev_post_kobj_evt(vdev_t *vd) { if (vd->vdev_ops->vdev_op_kobj_evt_post && vd->vdev_kobj_flag == B_FALSE) { vd->vdev_kobj_flag = B_TRUE; vd->vdev_ops->vdev_op_kobj_evt_post(vd); } for (int c = 0; c < vd->vdev_children; c++) vdev_post_kobj_evt(vd->vdev_child[c]); } /* * Iterate over all the vdevs except spare, and clear kobj events */ void vdev_clear_kobj_evt(vdev_t *vd) { vd->vdev_kobj_flag = B_FALSE; 
for (int c = 0; c < vd->vdev_children; c++) vdev_clear_kobj_evt(vd->vdev_child[c]); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; zfs_range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); /* * If the dtl cannot be sync'd there is no need to open it. */ if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps) return (0); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); zfs_range_tree_walk(rt, zfs_range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } zfs_range_tree_vacate(rt, NULL, NULL); zfs_range_tree_destroy(rt); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? 
VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); vd->vdev_root_zap = vdev_create_link_zap(vd, tx); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; zfs_range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; zfs_range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); 
ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); zfs_range_tree_vacate(rtsync, NULL, NULL); zfs_range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be * - offlined * - detached * - removed * - faulted * without losing data. 
*/ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; boolean_t faulting = vd->vdev_state == VDEV_STATE_FAULTED; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. 
*/
int
vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	/* Without a top-level ZAP there is nowhere to look. */
	if (vd->vdev_top_zap == 0) {
		*sm_obj = 0;
		return (0);
	}

	int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj);
	if (error == ENOENT) {
		*sm_obj = 0;
		error = 0;
	}

	return (error);
}

/*
 * Load the persistent state of 'vd' and, recursively, of all its children.
 *
 * Children are loaded first (in parallel through a taskq when 'vd' is the
 * root vdev); the first child error found is returned.  The vdev's own
 * state is then read in this order: deflate ratio, RAID-Z state, allocation
 * bias, failfast property, rebuild state, numeric error-threshold
 * properties, metaslabs and checkpoint space map (top-level vdevs), DTL
 * (leaves), and finally the obsolete space map.
 *
 * Returns 0 on success or an errno value.  Several of the failure paths
 * also move the vdev to VDEV_STATE_CANT_OPEN.
 */
int
vdev_load(vdev_t *vd)
{
	int children = vd->vdev_children;
	int error = 0;
	taskq_t *tq = NULL;

	/*
	 * It's only worthwhile to use the taskq for the root vdev, because the
	 * slow part is metaslab_init, and that only happens for top-level
	 * vdevs.
	 */
	if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
		tq = taskq_create("vdev_load", children, minclsyspri,
		    children, children, TASKQ_PREPOPULATE);
	}

	/*
	 * Recursively load all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (tq == NULL || vdev_uses_zvols(cvd)) {
			cvd->vdev_load_error = vdev_load(cvd);
		} else {
			VERIFY(taskq_dispatch(tq, vdev_load_child,
			    cvd, TQ_SLEEP) != TASKQID_INVALID);
		}
	}

	if (tq != NULL) {
		taskq_wait(tq);
		taskq_destroy(tq);
	}

	/*
	 * Report the first child failure.  A distinct name is used so the
	 * function-level 'error' is not shadowed.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {
		int child_error = vd->vdev_child[c]->vdev_load_error;

		if (child_error != 0)
			return (child_error);
	}

	vdev_set_deflate_ratio(vd);

	if (vd->vdev_ops == &vdev_raidz_ops) {
		error = vdev_raidz_load(vd);
		if (error != 0)
			return (error);
	}

	/*
	 * On spa_load path, grab the allocation bias from our zap
	 */
	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
		spa_t *spa = vd->vdev_spa;
		char bias_str[64];

		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
		    bias_str);
		if (error == 0) {
			ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
			vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
		} else if (error != ENOENT) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
			    "failed [error=%d]",
			    (u_longlong_t)vd->vdev_top_zap, error);
			return (error);
		}
	}

	/*
	 * Load the failfast property; a missing entry selects the default
	 * and any other lookup failure is only logged.
	 */
	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
		spa_t *spa = vd->vdev_spa;
		uint64_t failfast;

		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
		    vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
		    1, &failfast);
		if (error == 0) {
			vd->vdev_failfast = failfast & 1;
		} else if (error == ENOENT) {
			vd->vdev_failfast = vdev_prop_default_numeric(
			    VDEV_PROP_FAILFAST);
		} else {
			vdev_dbgmsg(vd,
			    "vdev_load: zap_lookup(top_zap=%llu) "
			    "failed [error=%d]",
			    (u_longlong_t)vd->vdev_top_zap, error);
		}
	}

	/*
	 * Load any rebuild state from the top-level vdev zap.
	 */
	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
		error = vdev_rebuild_load(vd);
		if (error && error != ENOTSUP) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
			    "failed [error=%d]", error);
			return (error);
		}
	}

	/*
	 * Load the numeric error-threshold properties.  Failures other than
	 * ENOENT are logged but do not fail the load.
	 */
	if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) {
		uint64_t zapobj;

		if (vd->vdev_top_zap != 0)
			zapobj = vd->vdev_top_zap;
		else
			zapobj = vd->vdev_leaf_zap;

		error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N,
		    &vd->vdev_checksum_n);
		if (error && error != ENOENT)
			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
			    "failed [error=%d]", (u_longlong_t)zapobj, error);

		error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T,
		    &vd->vdev_checksum_t);
		if (error && error != ENOENT)
			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
			    "failed [error=%d]", (u_longlong_t)zapobj, error);

		error = vdev_prop_get_int(vd, VDEV_PROP_IO_N,
		    &vd->vdev_io_n);
		if (error && error != ENOENT)
			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
			    "failed [error=%d]", (u_longlong_t)zapobj, error);

		error = vdev_prop_get_int(vd, VDEV_PROP_IO_T,
		    &vd->vdev_io_t);
		if (error && error != ENOENT)
			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
			    "failed [error=%d]", (u_longlong_t)zapobj, error);

		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
		    &vd->vdev_slow_io_n);
		if (error && error != ENOENT)
			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
			    "failed [error=%d]", (u_longlong_t)zapobj, error);

		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
		    &vd->vdev_slow_io_t);
		if (error && error != ENOENT)
			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
			    "failed [error=%d]", (u_longlong_t)zapobj, error);
	}

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
		vdev_metaslab_group_create(vd);

		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
			    "asize=%llu", (u_longlong_t)vd->vdev_ashift,
			    (u_longlong_t)vd->vdev_asize);
			return (SET_ERROR(ENXIO));
		}

		error = vdev_metaslab_init(vd, 0);
		if (error != 0) {
			vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
			    "[error=%d]", error);
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (error);
		}

		uint64_t checkpoint_sm_obj;
		error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
		if (error == 0 && checkpoint_sm_obj != 0) {
			objset_t *mos = spa_meta_objset(vd->vdev_spa);
			ASSERT(vd->vdev_asize != 0);
			ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);

			error = space_map_open(&vd->vdev_checkpoint_sm,
			    mos, checkpoint_sm_obj, 0, vd->vdev_asize,
			    vd->vdev_ashift);
			if (error != 0) {
				vdev_dbgmsg(vd, "vdev_load: space_map_open "
				    "failed for checkpoint spacemap (obj %llu) "
				    "[error=%d]",
				    (u_longlong_t)checkpoint_sm_obj, error);
				return (error);
			}
			ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);

			/*
			 * Since the checkpoint_sm contains free entries
			 * exclusively we can use space_map_allocated() to
			 * indicate the cumulative checkpointed space that
			 * has been freed.
			 */
			vd->vdev_stat.vs_checkpoint_space =
			    -space_map_allocated(vd->vdev_checkpoint_sm);
			vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
			    vd->vdev_stat.vs_checkpoint_space;
		} else if (error != 0) {
			vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
			    "checkpoint space map object from vdev ZAP "
			    "[error=%d]", error);
			return (error);
		}
	}

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
		    "[error=%d]", error);
		return (error);
	}

	uint64_t obsolete_sm_object;
	error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
	if (error == 0 && obsolete_sm_object != 0) {
		objset_t *mos = vd->vdev_spa->spa_meta_objset;
		ASSERT(vd->vdev_asize != 0);
		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);

		if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
		    obsolete_sm_object, 0, vd->vdev_asize, 0))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
			    "obsolete spacemap (obj %llu) [error=%d]",
			    (u_longlong_t)obsolete_sm_object, error);
			return (error);
		}
	} else if (error != 0) {
		vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
		    "space map object from vdev ZAP [error=%d]", error);
		return (error);
	}

	return (0);
}

/*
 * The special vdev case is used for hot spares and l2cache devices.  Its
 * sole purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
*/
int
vdev_validate_aux(vdev_t *vd)
{
	uint64_t pool_version, label_guid, pool_state;
	nvlist_t *config;
	boolean_t sane;

	/* An unreadable device cannot be validated; report success. */
	if (!vdev_readable(vd))
		return (0);

	config = vdev_label_read_config(vd, -1ULL);
	if (config == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	/*
	 * The label is sane when it carries a supported version, our own
	 * guid, and a pool state entry.  The checks short-circuit in this
	 * order.
	 *
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	sane = nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &pool_version) == 0 &&
	    SPA_VERSION_IS_SUPPORTED(pool_version) &&
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
	    &label_guid) == 0 &&
	    label_guid == vd->vdev_guid &&
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    &pool_state) == 0;

	if (!sane) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
	}

	nvlist_free(config);
	return (sane ? 0 : -1);
}

/*
 * Free the object referenced by the VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS
 * entry of the vdev's top-level ZAP, and remove that entry.  Nothing is
 * done when the vdev has no top-level ZAP or the entry does not exist.
 */
static void
vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
{
	objset_t *mos = spa_meta_objset(vd->vdev_spa);
	uint64_t txgs_obj = 0;
	int rc;

	if (vd->vdev_top_zap == 0)
		return;

	rc = zap_lookup(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
	    &txgs_obj);
	if (rc == ENOENT)
		return;
	VERIFY0(rc);

	VERIFY0(dmu_object_free(mos, txgs_obj, tx));
	VERIFY0(zap_remove(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
}

/*
 * Free the objects used to store this vdev's spacemaps, and the array
 * that points to them.
*/ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) { metaslab_sync_reassess(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_sync_reassess(vd->vdev_log_mg); } } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (zfs_range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. 
*/ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } /* * Return the amount of space that should be (or was) allocated for the given * psize (compressed block size) in the given TXG. Note that for expanded * RAIDZ vdevs, the size allocated for older BP's may be larger. See * vdev_raidz_asize(). */ uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) { return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. 
*/
int
vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
	vdev_t *vd, *tvd;

	spa_vdev_state_enter(spa, SCL_NONE);

	/* Only leaf vdevs that exist in this pool can be faulted. */
	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));

	tvd = vd->vdev_top;

	/*
	 * If user did a 'zpool offline -f' then make the fault persist across
	 * reboots.
	 */
	if (aux == VDEV_AUX_EXTERNAL_PERSIST) {
		/*
		 * There are two kinds of forced faults: temporary and
		 * persistent.  Temporary faults go away at pool import, while
		 * persistent faults stay set.  Both types of faults can be
		 * cleared with a zpool clear.
		 *
		 * We tell if a vdev is persistently faulted by looking at the
		 * ZPOOL_CONFIG_AUX_STATE nvpair.  If it's set to "external" at
		 * import then it's a persistent fault.  Otherwise, it's
		 * temporary.  We get ZPOOL_CONFIG_AUX_STATE set to "external"
		 * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL.  This
		 * tells vdev_config_generate() (which gets run later) to set
		 * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
		 */
		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
		vd->vdev_tmpoffline = B_FALSE;
		aux = VDEV_AUX_EXTERNAL;
	} else {
		vd->vdev_tmpoffline = B_TRUE;
	}

	/*
	 * We don't directly use the aux state here, but if we do a
	 * vdev_reopen(), we need this value to be present to remember why we
	 * were faulted.
	 */
	vd->vdev_label_aux = aux;

	/*
	 * Faulted state takes precedence over degraded.
	 */
	vd->vdev_delayed_close = B_FALSE;
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);

	/*
	 * If this device has the only valid copy of the data, then
	 * back off and simply mark the vdev as degraded instead.
	 * Log devices and aux vdevs are exempt from this check.
	 */
	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
		vd->vdev_degraded = 1ULL;
		vd->vdev_faulted = 0ULL;

		/*
		 * If we reopen the device and it's not dead, only then do we
		 * mark it degraded.
		 */
		vdev_reopen(tvd);

		if (vdev_readable(vd))
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
	}

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
 * user that something is wrong.  The vdev continues to operate as normal as far
 * as I/O is concerned.
 */
int
vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
	vdev_t *vd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));

	/*
	 * If the vdev is already faulted or degraded, then don't do anything.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded)
		return (spa_vdev_state_exit(spa, NULL, 0));

	vd->vdev_degraded = 1ULL;
	/* The degraded state is only entered when the vdev is not dead. */
	if (!vdev_is_dead(vd))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
		    aux);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Flag the vdev identified by 'guid' as wanting removal and request the
 * asynchronous SPA_ASYNC_REMOVE task.
 *
 * Returns ENODEV if no such vdev exists and EEXIST if a probe of the leaf
 * succeeds.  Returns 0 without doing anything when the vdev is already
 * removed or is expanding.
 */
int
vdev_remove_wanted(spa_t *spa, uint64_t guid)
{
	vdev_t *vd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));

	/*
	 * If the vdev is already removed, or expanding which can trigger
	 * repartition add/remove events, then don't do anything.
	 */
	if (vd->vdev_removed || vd->vdev_expanding)
		return (spa_vdev_state_exit(spa, NULL, 0));

	/*
	 * Confirm the vdev has been removed, otherwise don't do anything.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));

	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(spa, SPA_ASYNC_REMOVE);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Online the given vdev.
 *
 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
 * spare device should be detached when the device finishes resilvering.
* Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. 
We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. */ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); /* * Asynchronously detach spare vdev if resilver or * rebuild is not required */ if (vd->vdev_unspare && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && !vdev_rebuild_active(tvd)) spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); } return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. 
We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. 
Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect or removed vdev. */ if (!vdev_is_concrete(vd) || vd->vdev_removed) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? 
rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; /* Clear recent error events cache (i.e. duplicate events tracking) */ zfs_ereport_clear(spa, vd); } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. 
*/ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { /* * Exclude the dRAID spare when aggregating to avoid double counting * the ops and bytes. These IOs are counted by the physical leaves. */ if (cvd->vdev_ops == &vdev_draid_spare_ops) return; for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { (void) cvd; int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we 
assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { memcpy(vs, &vd->vdev_stat, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. 
Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN_TYPED( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift, uint64_t); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; if (vd->vdev_physical_ashift <= ASHIFT_MAX) vs->vs_physical_ashift = vd->vdev_physical_ashift; else vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { /* * The vdev fragmentation rating doesn't take into * account the embedded slog metaslab (vdev_log_mg). * Since it's only one metaslab, it would have a tiny * impact on the overall fragmentation. */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } vs->vs_noalloc = MAX(vd->vdev_noalloc, tvd ? 
tvd->vdev_noalloc : 0); } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; /* Suppress ASAN false positive */ #ifdef __SANITIZE_ADDRESS__ vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; #else vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; #endif zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. 
*/ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). To avoid * double counting repaired bytes the virtual dRAID * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); } vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_FLUSH. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_FLUSH; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to categorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM, * ZIO_PRIORITY_REBUILD. 
*/ if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). 
* * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { (void) defer_delta; int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. 
*/ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. 
*/ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. 
*/ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. 
If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport is
 * generated.
 *
 * 'isopen' is B_TRUE when called as part of an open, 'state' is the new
 * state and 'aux' the reason code stored in vdev_stat.vs_aux.
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	/* No state transition: only the aux reason is refreshed. */
	if (state == vd->vdev_state) {
		/*
		 * Since vdev_offline() code path is already in an offline
		 * state we can miss a statechange event to OFFLINE. Check
		 * the previous state to catch this condition.
		 */
		if (vd->vdev_ops->vdev_op_leaf &&
		    (state == VDEV_STATE_OFFLINE) &&
		    (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
			/* post an offline state change */
			zfs_post_state_change(spa, vd, vd->vdev_prevstate);
		}
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	/* Remember the state being left; used for the ereport and ZED event. */
	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device unless the device has requested
	 * a delayed close (i.e. we're about to remove or fault the device).
	 * Otherwise, we keep accessible but invalid devices open forever.
	 * We don't call vdev_close() itself, because that implies some extra
	 * checks (offline, etc) that we don't want here.  This is limited to
	 * leaf devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
	    vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the vdev_removed flag is set, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We also let this be if
		 * it is one of our special test online cases, which is only
		 * attempting to online the device and shouldn't generate an FMA
		 * fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import or recovery, we
		 * mark it as "not available", which signifies that it was
		 * never there to begin with.  Failure to open such a device
		 * is not considered an error.
		 */
		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event.  If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {
			const char *class;

			/* Map the aux reason onto an ereport class. */
			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			case VDEV_AUX_BAD_ASHIFT:
				class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			(void) zfs_ereport_post(class, spa, vd, NULL, NULL,
			    save_state);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	/*
	 * Notify ZED of any significant state-change on a leaf vdev.
	 */
	if (vd->vdev_ops->vdev_op_leaf) {
		/* preserve original state from a vdev_reopen() */
		if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
		    (vd->vdev_prevstate != vd->vdev_state) &&
		    (save_state <= VDEV_STATE_CLOSED))
			save_state = vd->vdev_prevstate;

		/* filter out state change due to initial vdev_open */
		if (save_state > VDEV_STATE_CLOSED)
			zfs_post_state_change(spa, vd, save_state);
	}

	if (!isopen && vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Return B_TRUE only if every child of the (non-leaf) vdev is in
 * VDEV_STATE_OFFLINE.  A vdev with no children also yields B_TRUE.
 */
boolean_t
vdev_children_are_offline(vdev_t *vd)
{
	ASSERT(!vd->vdev_ops->vdev_op_leaf);

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
			return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Check the vdev configuration to ensure that it's capable of supporting
 * a root pool. We do not support partial configuration.
*/ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_spa->spa_raidz_expand == NULL || vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. 
*/
void
vdev_split(vdev_t *vd)
{
	vdev_t *parent = vd->vdev_parent;
	vdev_t *first;

	VERIFY3U(parent->vdev_children, >, 1);

	/* Detach vd and close the gap it leaves in the child array. */
	vdev_remove_child(parent, vd);
	vdev_compact_children(parent);

	ASSERT3P(parent->vdev_child, !=, NULL);

	first = parent->vdev_child[0];
	if (parent->vdev_children == 1) {
		/* A single remaining child no longer needs its parent. */
		vdev_remove_parent(first);
		first->vdev_splitting = B_TRUE;
	}
	vdev_propagate_state(first);
}

/*
 * Walk the subtree rooted at 'vd' and, for every leaf that has active
 * I/Os, invoke zio_deadman() on the head of its active list when that
 * I/O has been outstanding for longer than spa_deadman_synctime().
 * 'tag' is passed through to zio_deadman().
 */
void
vdev_deadman(vdev_t *vd, const char *tag)
{
	for (int i = 0; i < vd->vdev_children; i++)
		vdev_deadman(vd->vdev_child[i], tag);

	/* Only leaf vdevs are inspected below. */
	if (!vd->vdev_ops->vdev_op_leaf)
		return;

	vdev_queue_t *queue = &vd->vdev_queue;

	mutex_enter(&queue->vq_lock);
	if (queue->vq_active > 0) {
		spa_t *spa = vd->vdev_spa;

		zfs_dbgmsg("slow vdev: %s has %u active IOs",
		    vd->vdev_path, queue->vq_active);

		/*
		 * Look at the head of all the pending queues,
		 * if any I/O has been outstanding for longer than
		 * the spa_deadman_synctime invoke the deadman logic.
		 */
		zio_t *head_zio = list_head(&queue->vq_active_list);
		uint64_t outstanding = gethrtime() - head_zio->io_timestamp;

		if (outstanding > spa_deadman_synctime(spa))
			zio_deadman(head_zio, tag);
	}
	mutex_exit(&queue->vq_lock);
}

/*
 * Mark the leaf vdev 'vd', and the pool it belongs to, as having a
 * deferred resilver.
 */
void
vdev_defer_resilver(vdev_t *vd)
{
	spa_t *pool = vd->vdev_spa;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	vd->vdev_resilver_deferred = B_TRUE;
	pool->spa_resilver_deferred = B_TRUE;
}

/*
 * Clears the resilver deferred flag on all leaf devs under vd. Returns
 * B_TRUE if we have devices that need to be resilvered and are available to
 * accept resilver I/Os.
*/ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } boolean_t -vdev_xlate_is_empty(range_seg64_t *rs) +vdev_xlate_is_empty(zfs_range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } /* * Translate a logical range to the first contiguous physical range for the * specified vdev_t. This function is initially called with a leaf vdev and * will walk each parent vdev until it reaches a top-level vdev. Once the * top-level is reached the physical range is initialized and the recursive * function begins to unwind. As it unwinds it calls the parent's vdev * specific translation function to do the real conversion. */ void -vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs, range_seg64_t *remain_rs) +vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs); } else { /* * We've reached the top-level vdev, initialize the physical * range to the logical range and set an empty remaining * range then start to unwind. 
*/ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; remain_rs->rs_start = logical_rs->rs_start; remain_rs->rs_end = logical_rs->rs_start; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical and any remaining components by calling * the vdev specific translate function. */ - range_seg64_t intermediate = { 0 }; + zfs_range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } void -vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, +vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { - range_seg64_t iter_rs = *logical_rs; - range_seg64_t physical_rs; - range_seg64_t remain_rs; + zfs_range_seg64_t iter_rs = *logical_rs; + zfs_range_seg64_t physical_rs; + zfs_range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); /* * With raidz and dRAID, it's possible that the logical range * does not live on this leaf vdev. Only when there is a non- * zero physical size call the provided function. */ if (!vdev_xlate_is_empty(&physical_rs)) func(arg, &physical_rs); iter_rs = remain_rs; } } static char * vdev_name(vdev_t *vd, char *buf, int buflen) { if (vd->vdev_path == NULL) { if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { strlcpy(buf, vd->vdev_spa->spa_name, buflen); } else if (!vd->vdev_ops->vdev_op_leaf) { snprintf(buf, buflen, "%s-%llu", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id); } } else { strlcpy(buf, vd->vdev_path, buflen); } return (buf); } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. 
*/ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. */ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } /* * Add a (source=src, propname=propval) list to an nvlist. */ static void vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval, uint64_t intval, zprop_source_t src) { nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } static void vdev_props_set_sync(void *arg, dmu_tx_t *tx) { vdev_t *vd; nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; uint64_t objid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS); vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); /* this vdev could get removed while waiting for this sync task */ if (vd == NULL) return; /* * Set vdev property values in the vdev props mos object. 
*/ if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { panic("unexpected vdev type"); } /* apply every requested property to the chosen ZAP object while holding spa_props_lock */ mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { uint64_t intval; const char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; switch (prop = vdev_name_to_prop(propname)) { /* user property: stored as a string; a name rejected by vdev_prop_user() is skipped without error */ case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { strval = fnvpair_value_string(elem); if (strlen(strval) == 0) { /* remove the property if value == "" */ (void) zap_remove(mos, objid, propname, tx); } else { VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); } spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } break; default: /* normalize the property name */ propname = vdev_prop_to_name(prop); proptype = vdev_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); /* an index value must map to a string, otherwise VERIFY0 fires; the string itself is not used */ if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(vdev_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, objid, propname, sizeof (uint64_t), 1, &intval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%lld", (u_longlong_t)vdev_guid, nvpair_name(elem), (longlong_t)intval); } else { panic("invalid vdev property type %u", nvpair_type(elem)); } } } mutex_exit(&spa->spa_props_lock); } /* * Validate the vdev properties in 'innvl' and apply them. 'innvl' must * carry the vdev GUID and the property list. Properties with an * in-core effect are handled in this function; the list is then handed * to the vdev_props_set_sync sync task. On the first failing property * its name and the error code are added to 'outnvl' and that error is * returned; otherwise the result of dsl_sync_task() is returned. */ int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; nvpair_t *elem = 
NULL; uint64_t vdev_guid; nvlist_t *nvprops; int error = 0; /* NOTE(review): vd has already been dereferenced to initialize spa, so this assertion cannot catch a NULL vd */ ASSERT(vd != NULL); /* Check that vdev has a zap we can use */ if (vd->vdev_root_zap == 0 && vd->vdev_top_zap == 0 && vd->vdev_leaf_zap == 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, &nvprops) != 0) return (SET_ERROR(EINVAL)); /* from here on vd is the vdev found by the GUID carried in innvl */ if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) return (SET_ERROR(EINVAL)); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { const char *propname = nvpair_name(elem); vdev_prop_t prop = vdev_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; /* reject invalid user property names and read-only properties up front */ if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) { error = EINVAL; goto end; } if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) { error = EROFS; goto end; } /* Special Processing */ switch (prop) { case VDEV_PROP_PATH: /* a vdev without an existing path cannot be given one here */ if (vd->vdev_path == NULL) { error = EROFS; break; } if (nvpair_value_string(elem, &strval) != 0) { error = EINVAL; break; } /* New path must start with /dev/ */ if (strncmp(strval, "/dev/", 5)) { error = EINVAL; break; } error = spa_vdev_setpath(spa, vdev_guid, strval); break; case VDEV_PROP_ALLOCATING: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } /* proceed only when intval equals vdev_noalloc -- NOTE(review): presumably any other value means the requested state is already in effect; confirm */ if (intval != vd->vdev_noalloc) break; if (intval == 0) error = spa_vdev_noalloc(spa, vdev_guid); else error = spa_vdev_alloc(spa, vdev_guid); break; case VDEV_PROP_FAILFAST: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } /* only the low bit of the value is kept */ vd->vdev_failfast = intval & 1; break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_n = intval; break; case VDEV_PROP_CHECKSUM_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_t = intval; break; case VDEV_PROP_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } 
vd->vdev_io_n = intval; break; case VDEV_PROP_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_t = intval; break; case VDEV_PROP_SLOW_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_n = intval; break; case VDEV_PROP_SLOW_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_t = intval; break; default: /* Most processing is done in vdev_props_set_sync */ break; } /* on failure, record the property name with the error code as its value in outnvl and stop */ end: if (error != 0) { intval = error; vdev_prop_add_list(outnvl, propname, strval, intval, 0); return (error); } } return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync, innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* * Fetch properties of 'vd' into 'outnvl'. 'innvl' must carry the vdev * GUID. If it also carries a property list, only the listed properties * are looked up; otherwise every string-valued entry of the vdev's ZAP * object is returned. Returns 0 on success, with ENOENT not treated as * a failure, or an error code. */ int vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int err = 0; uint64_t objid; uint64_t vdev_guid; nvpair_t *elem = NULL; nvlist_t *nvprops = NULL; uint64_t intval = 0; char *strval = NULL; const char *propname = NULL; vdev_prop_t prop; /* NOTE(review): vd and spa were already dereferenced by the initializers above, so these assertions cannot catch a NULL vd */ ASSERT(vd != NULL); ASSERT(mos != NULL); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); /* the result is not checked: the property list is optional and nvprops is tested for NULL below */ nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (SET_ERROR(EINVAL)); } ASSERT(objid != 0); mutex_enter(&spa->spa_props_lock); if (nvprops != NULL) { /* scratch buffer shared by every vdev_name() call in this loop */ char namebuf[64] = { 0 }; while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { intval = 0; strval = NULL; propname = nvpair_name(elem); prop = vdev_name_to_prop(propname); zprop_source_t src = ZPROP_SRC_DEFAULT; uint64_t integer_size, num_integers; switch (prop) { /* Special Read-only Properties */ case VDEV_PROP_NAME: strval = vdev_name(vd, namebuf, sizeof (namebuf)); if (strval == NULL) continue; vdev_prop_add_list(outnvl, propname,
strval, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_CAPACITY: /* percent used; reported as 0 when vs_dspace is 0, which avoids dividing by zero */ intval = (vd->vdev_stat.vs_dspace == 0) ? 0 : (vd->vdev_stat.vs_alloc * 100 / vd->vdev_stat.vs_dspace); vdev_prop_add_list(outnvl, propname, NULL, intval, ZPROP_SRC_NONE); continue; case VDEV_PROP_STATE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_state, ZPROP_SRC_NONE); continue; case VDEV_PROP_GUID: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_guid, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_asize, ZPROP_SRC_NONE); continue; case VDEV_PROP_PSIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_psize, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASHIFT: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_ashift, ZPROP_SRC_NONE); continue; case VDEV_PROP_SIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); continue; case VDEV_PROP_FREE: /* free space is vs_dspace minus vs_alloc */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace - vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_ALLOCATED: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_EXPANDSZ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRAGMENTATION: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_fragmentation, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARITY: vdev_prop_add_list(outnvl, propname, NULL, vdev_get_nparity(vd), ZPROP_SRC_NONE); continue; /* the string properties below are left out of outnvl when the vdev has no value for them */ case VDEV_PROP_PATH: if (vd->vdev_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_DEVID: if (vd->vdev_devid == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_devid, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PHYS_PATH: if (vd->vdev_physpath == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_physpath, 0, ZPROP_SRC_NONE); continue; case
VDEV_PROP_ENC_PATH: if (vd->vdev_enc_sysfs_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRU: if (vd->vdev_fru == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_fru, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARENT: /* omitted for a vdev that has no parent */ if (vd->vdev_parent != NULL) { strval = vdev_name(vd->vdev_parent, namebuf, sizeof (namebuf)); vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); } continue; case VDEV_PROP_CHILDREN: /* * Build a comma-separated list of child names. The zeroed * buffer is only allocated when there is at least one child, * so strval is non-NULL whenever the loop body runs and stays * NULL (property omitted) otherwise. */ if (vd->vdev_children > 0) strval = kmem_zalloc(ZAP_MAXVALUELEN, KM_SLEEP); for (uint64_t i = 0; i < vd->vdev_children; i++) { const char *vname; vname = vdev_name(vd->vdev_child[i], namebuf, sizeof (namebuf)); if (vname == NULL) vname = "(unknown)"; if (strlen(strval) > 0) strlcat(strval, ",", ZAP_MAXVALUELEN); strlcat(strval, vname, ZAP_MAXVALUELEN); } if (strval != NULL) { vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); kmem_free(strval, ZAP_MAXVALUELEN); } continue; case VDEV_PROP_NUMCHILDREN: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_children, ZPROP_SRC_NONE); continue; /* error counters taken from vdev_stat */ case VDEV_PROP_READ_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_read_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_WRITE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_write_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_CHECKSUM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_checksum_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_INITIALIZE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_initialize_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_TRIM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_trim_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_SLOW_IOS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_slow_ios, ZPROP_SRC_NONE); continue; /* per-zio-type operation counters from vs_ops[] */ case VDEV_PROP_OPS_NULL: vdev_prop_add_list(outnvl, propname, NULL,
vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; /* per-zio-type byte counters from vs_bytes[] */ case VDEV_PROP_BYTES_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space.
*/ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_REMOVING: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; case VDEV_PROP_RAIDZ_EXPANDING: /* Only expose this for raidz */ if (vd->vdev_ops == &vdev_raidz_ops) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_rz_expanding, ZPROP_SRC_NONE); } continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_has_trim, ZPROP_SRC_NONE); } continue; /* Numeric Properties */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ if (vd->vdev_mg == NULL && vd->vdev_top != NULL) { src = ZPROP_SRC_NONE; intval = ZPROP_BOOLEAN_NA; } else { err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; } vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; case VDEV_PROP_FAILFAST: src = ZPROP_SRC_LOCAL; strval = NULL; err = zap_lookup(mos, objid, nvpair_name(elem), sizeof (uint64_t), 1, &intval); /* a missing ZAP entry means the default value; err is reset so the loop keeps going */ if (err == ENOENT) { intval = vdev_prop_default_numeric( prop); err = 0; } else if (err) { break; } if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; vdev_prop_add_list(outnvl, propname, strval, intval, src); break; /* * NOTE(review): here, and in VDEV_PROP_ALLOCATING above, an ENOENT * from vdev_prop_get_int() is tolerated for reporting the value but * is left in err. The err check that follows this switch then ends * the loop, so later requested properties are not processed. * VDEV_PROP_FAILFAST resets err to 0 in the same situation -- * confirm which behavior is intended. */ case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: case VDEV_PROP_SLOW_IO_N: case VDEV_PROP_SLOW_IO_T: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; /* Text Properties */ case VDEV_PROP_COMMENT: /* Exists in the ZAP below */ /* FALLTHRU */ case VDEV_PROP_USERPROP: /* User Properties */ src = ZPROP_SRC_LOCAL; err = 
zap_length(mos, objid, nvpair_name(elem), &integer_size, &num_integers); if (err) break; /* integer_size selects the handling; sizes other than 8 and 1 add nothing and leave err at 0 */ switch (integer_size) { case 8: /* User properties cannot be integers */ err = EINVAL; break; case 1: /* string property */ strval = kmem_alloc(num_integers, KM_SLEEP); err = zap_lookup(mos, objid, nvpair_name(elem), 1, num_integers, strval); if (err) { kmem_free(strval, num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, num_integers); break; } break; default: err = ENOENT; break; } /* any error, ENOENT included, stops processing of the remaining requested properties */ if (err) break; } } else { /* * Get all properties from the MOS vdev property object. */ zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, mos, objid); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { intval = 0; strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; propname = za->za_name; switch (za->za_integer_length) { case 8: /* We do not allow integer user properties */ /* This is likely an internal value */ break; case 1: /* string property */ strval = kmem_alloc(za->za_num_integers, KM_SLEEP); err = zap_lookup(mos, objid, za->za_name, 1, za->za_num_integers, strval); /* NOTE(review): this break only leaves the switch; err is then overwritten by the next zap_cursor_retrieve(), so a lookup failure just skips the entry */ if (err) { kmem_free(strval, za->za_num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, za->za_num_integers); break; default: break; } } zap_cursor_fini(&zc); zap_attribute_free(za); } mutex_exit(&spa->spa_props_lock); /* ENOENT is not reported to the caller as a failure */ if (err && err != ENOENT) { return (err); } return (0); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, "Default lower limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, "Default upper limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev,
zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, "Rate limit hung IO (deadman) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, "Rate Direct I/O write verify events to this many per second"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, "Direct I/O writes will perform for checksum verification before " "commiting write"); ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, raidz_impl, param_set_raidz_impl, param_get_raidz_impl, ZMOD_RW, "RAIDZ implementation"); diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 419c8ac5bb28..45f8bcfbd4ed 
100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1,2821 +1,2821 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_draid_io_verify() */ #endif /* * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is * comprised of multiple raidz redundancy groups which are spread over the * dRAID children. To ensure an even distribution, and avoid hot spots, a * permutation mapping is applied to the order of the dRAID children. * This mixing effectively distributes the parity columns evenly over all * of the disks in the dRAID. * * This is beneficial because it means when resilvering all of the disks * can participate thereby increasing the available IOPs and bandwidth. * Furthermore, by reserving a small fraction of each child's total capacity * virtual distributed spare disks can be created. These spares similarly * benefit from the performance gains of spanning all of the children. 
The * consequence of which is that resilvering to a distributed spare can * substantially reduce the time required to restore full parity to a pool * with failed disks. * * === dRAID group layout === * * First, let's define a "row" in the configuration to be a 16M chunk from * each physical drive at the same offset. This is the minimum allowable * size since it must be possible to store a full 16M block when there is * only a single data column. Next, we define a "group" to be a set of * sequential disks containing both the parity and data columns. We allow * groups to span multiple rows in order to align any group size to any * number of physical drives. Finally, a "slice" is comprised of the rows * which contain the target number of groups. The permutation mappings * are applied in a round robin fashion to each slice. * * Given D+P drives in a group (including parity drives) and C-S physical * drives (not including the spare drives), we can distribute the groups * across R rows without remainder by selecting the least common multiple * of D+P and C-S as the number of groups; i.e. ngroups = LCM(D+P, C-S). * * In the example below, there are C=14 physical drives in the configuration * with S=2 drives worth of spare capacity. Each group has a width of 9 * which includes D=8 data and P=1 parity drive. There are 4 groups and * 3 rows per slice. Each group has a size of 144M (16M * 9) and a slice * size is 576M (144M * 4). When allocating from a dRAID each group is * filled before moving on to the next as shown in slice0 below.
* * data disks (8 data + 1 parity) spares (2) * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * ^ | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0 * | +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * | | group 0 | group 1..| | * | +-----------------------------------+-----------+-------| * | | 0 1 2 3 4 5 6 7 8 | 36 37 38| | r * | | 9 10 11 12 13 14 15 16 17| 45 46 47| | o * | | 18 19 20 21 22 23 24 25 26| 54 55 56| | w * | 27 28 29 30 31 32 33 34 35| 63 64 65| | 0 * s +-----------------------+-----------------------+-------+ * l | ..group 1 | group 2.. | | * i +-----------------------+-----------------------+-------+ * c | 39 40 41 42 43 44| 72 73 74 75 76 77| | r * e | 48 49 50 51 52 53| 81 82 83 84 85 86| | o * 0 | 57 58 59 60 61 62| 90 91 92 93 94 95| | w * | 66 67 68 69 70 71| 99 100 101 102 103 104| | 1 * | +-----------+-----------+-----------------------+-------+ * | |..group 2 | group 3 | | * | +-----------+-----------+-----------------------+-------+ * | | 78 79 80|108 109 110 111 112 113 114 115 116| | r * | | 87 88 89|117 118 119 120 121 122 123 124 125| | o * | | 96 97 98|126 127 128 129 130 131 132 133 134| | w * v |105 106 107|135 136 137 138 139 140 141 142 143| | 2 * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1 * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * l | group 4 | group 5..| | row 3 * i +-----------------------+-----------+-----------+-------| * c | ..group 5 | group 6.. | | row 4 * e +-----------+-----------+-----------------------+-------+ * 1 |..group 6 | group 7 | | row 5 * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2 * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ * l | group 8 | group 9..| | row 6 * i +-----------------------------------------------+-------| * c | ..group 9 | group 10.. 
| | row 7 * e +-----------------------+-----------------------+-------+ * 2 |..group 10 | group 11 | | row 8 * +-----------+-----------------------------------+-------+ * * This layout has several advantages over requiring that each row contain * a whole number of groups. * * 1. The group count is not a relevant parameter when defining a dRAID * layout. Only the group width is needed, and *all* groups will have * the desired size. * * 2. All possible group widths (<= physical disk count) can be supported. * * 3. The logic within vdev_draid.c is simplified when the group width is * the same for all groups (although some of the logic around computing * permutation numbers and drive offsets is more complicated). * * N.B. The following array describes all valid dRAID permutation maps. * Each row is used to generate a permutation map for a different number * of children from a unique seed. The seeds were generated and carefully * evaluated by the 'draid' utility in order to provide balanced mappings. * In addition to the seed a checksum of the in-memory mapping is stored * for verification. * * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed, * with a given permutation map) is the ratio of the amounts of I/O that will * be sent to the least and most busy disks when resilvering. The average * imbalance ratio (of a given number of disks and permutation map) is the * average of the ratios of all possible single and double disk failures. * * In order to achieve a low imbalance ratio the number of permutations in * the mapping must be significantly larger than the number of children. * For dRAID the number of permutations has been limited to 512 to minimize * the map size. This does result in a gradually increasing imbalance ratio * as seen in the table below. Increasing the number of permutations for * larger child counts would reduce the imbalance ratio. 
However, in practice * when there are a large number of children each child is responsible for * fewer total IOs so it's less of a concern. * * Note these values are hard coded and must never be changed. Existing * pools depend on the same mapping always being generated in order to * read and write from the correct locations. Any change would make * existing pools completely inaccessible. */ static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = { { 2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d }, /* 1.000 */ { 3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 }, /* 1.000 */ { 4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 }, /* 1.000 */ { 5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 }, /* 1.010 */ { 6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 }, /* 1.031 */ { 7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee }, /* 1.043 */ { 8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 }, /* 1.059 */ { 9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 }, /* 1.056 */ { 10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 }, /* 1.072 */ { 11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c }, /* 1.083 */ { 12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e }, /* 1.097 */ { 13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 }, /* 1.100 */ { 14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 }, /* 1.121 */ { 15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 }, /* 1.103 */ { 16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 }, /* 1.111 */ { 17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe }, /* 1.133 */ { 18, 256, 0xaa549746b1cbb81c, 0x00000026f02494c9 }, /* 1.131 */ { 19, 256, 0x892e343f2f31d690, 0x00000029eb392835 }, /* 1.130 */ { 20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c }, /* 1.141 */ { 21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 }, /* 1.139 */ { 22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 }, /* 1.150 */ { 23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f }, /* 1.174 */ { 24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 }, /* 1.168 */ { 25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 }, /* 
1.180 */ { 26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba }, /* 1.226 */ { 27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 }, /* 1.228 */ { 28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c }, /* 1.217 */ { 29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c }, /* 1.239 */ { 30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 }, /* 1.238 */ { 31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f }, /* 1.273 */ { 32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 }, /* 1.191 */ { 33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 }, /* 1.199 */ { 34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 }, /* 1.195 */ { 35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 }, /* 1.201 */ { 36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef }, /* 1.194 */ { 37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 }, /* 1.237 */ { 38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 }, /* 1.242 */ { 39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd }, /* 1.231 */ { 40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 }, /* 1.233 */ { 41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 }, /* 1.271 */ { 42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 }, /* 1.263 */ { 43, 512, 0xbaa5125faa781854, 0x000001c76789e278 }, /* 1.270 */ { 44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb }, /* 1.281 */ { 45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 }, /* 1.282 */ { 46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b }, /* 1.286 */ { 47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 }, /* 1.329 */ { 48, 512, 0xbecd9c2571312c5d, 0x000002320fe2872b }, /* 1.286 */ { 49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 }, /* 1.322 */ { 50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 }, /* 1.335 */ { 51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 }, /* 1.305 */ { 52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf }, /* 1.330 */ { 53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 }, /* 1.365 */ { 54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 }, /* 1.334 */ { 55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 }, /* 1.364 */ { 56, 512, 0x947a7d3bdaab44c5, 
0x000003046680f72e }, /* 1.374 */ { 57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 }, /* 1.363 */ { 58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 }, /* 1.401 */ { 59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c }, /* 1.392 */ { 60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 }, /* 1.360 */ { 61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd }, /* 1.396 */ { 62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c }, /* 1.453 */ { 63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 }, /* 1.437 */ { 64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 }, /* 1.402 */ { 65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 }, /* 1.459 */ { 66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 }, /* 1.423 */ { 67, 512, 0x910b9714f698a877, 0x00000451ea65d5db }, /* 1.447 */ { 68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 }, /* 1.450 */ { 69, 512, 0x836d4968fbaa3706, 0x000004954068a380 }, /* 1.455 */ { 70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d }, /* 1.463 */ { 71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 }, /* 1.463 */ { 72, 512, 0x42763a680d5bed8e, 0x000005084275c680 }, /* 1.452 */ { 73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab }, /* 1.498 */ { 74, 512, 0x9fa08548b1621a44, 0x0000054708019247 }, /* 1.526 */ { 75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 }, /* 1.491 */ { 76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 }, /* 1.470 */ { 77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 }, /* 1.527 */ { 78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 }, /* 1.509 */ { 79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e }, /* 1.569 */ { 80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c }, /* 1.555 */ { 81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 }, /* 1.509 */ { 82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 }, /* 1.596 */ { 83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e }, /* 1.568 */ { 84, 512, 0xba02545069ddc6dc, 0x000006d19861364f }, /* 1.541 */ { 85, 512, 0x447c73192c35073e, 0x000006fce315ce35 }, /* 1.623 */ { 86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b }, /* 1.620 */ { 87, 512, 
0x4874cf98541a35e0, 0x00000758382a2273 }, /* 1.597 */ { 88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b }, /* 1.575 */ { 89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc }, /* 1.627 */ { 90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb }, /* 1.596 */ { 91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 }, /* 1.622 */ { 92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e }, /* 1.695 */ { 93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c }, /* 1.605 */ { 94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc }, /* 1.625 */ { 95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 }, /* 1.687 */ { 96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a }, /* 1.621 */ { 97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 }, /* 1.699 */ { 98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b }, /* 1.688 */ { 99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce }, /* 1.642 */ { 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc }, /* 1.683 */ { 101, 512, 0xf7d4dd8445b46765, 0x000009e5dc542259 }, /* 1.755 */ { 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 }, /* 1.692 */ { 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 }, /* 1.747 */ { 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 }, /* 1.751 */ { 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 }, /* 1.751 */ { 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f }, /* 1.726 */ { 107, 512, 0xca09c50ccee2d054, 0x00000b1c359c047d }, /* 1.788 */ { 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 }, /* 1.740 */ { 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 }, /* 1.780 */ { 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 }, /* 1.836 */ { 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 }, /* 1.778 */ { 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 }, /* 1.831 */ { 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df }, /* 1.825 */ { 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 }, /* 1.826 */ { 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 }, /* 1.843 */ { 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d }, /* 1.826 */ { 117, 512, 0x533a6090238afd4c, 
0x00000d336f115d1b }, /* 1.803 */ { 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 }, /* 1.857 */ { 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 }, /* 1.877 */ { 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 }, /* 1.849 */ { 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d }, /* 1.867 */ { 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 }, /* 1.978 */ { 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d }, /* 1.947 */ { 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea }, /* 1.865 */ { 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f }, /* 1.881 */ { 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b }, /* 1.882 */ { 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e }, /* 1.867 */ { 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e }, /* 1.972 */ { 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 }, /* 1.896 */ { 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d }, /* 1.965 */ { 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 }, /* 1.963 */ { 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 }, /* 1.925 */ { 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 }, /* 1.862 */ { 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 }, /* 2.042 */ { 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 }, /* 1.935 */ { 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 }, /* 2.005 */ { 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c }, /* 2.041 */ { 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 }, /* 1.997 */ { 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 }, /* 1.996 */ { 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d }, /* 2.053 */ { 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a }, /* 1.971 */ { 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 }, /* 2.018 */ { 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd }, /* 1.961 */ { 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 }, /* 2.046 */ { 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb }, /* 1.968 */ { 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 }, /* 2.143 */ { 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 
}, /* 2.064 */ { 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 }, /* 2.023 */ { 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c }, /* 2.136 */ { 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 }, /* 2.063 */ { 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 }, /* 1.974 */ { 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 }, /* 2.210 */ { 153, 512, 0xf7fd345307c2480e, 0x000016e251f28b6a }, /* 2.006 */ { 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 }, /* 2.193 */ { 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 }, /* 2.163 */ { 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc }, /* 2.046 */ { 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 }, /* 2.084 */ { 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 }, /* 2.264 */ { 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 }, /* 2.074 */ { 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 }, /* 2.282 */ { 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf }, /* 2.148 */ { 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 }, /* 2.355 */ { 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 }, /* 2.164 */ { 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a }, /* 2.393 */ { 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 }, /* 2.178 */ { 166, 512, 0x896967bf495c34d2, 0x00001afb8319b9fc }, /* 2.334 */ { 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b }, /* 2.266 */ { 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 }, /* 2.304 */ { 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d }, /* 2.218 */ { 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff }, /* 2.377 */ { 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 }, /* 2.155 */ { 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 }, /* 2.404 */ { 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 }, /* 2.205 */ { 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d }, /* 2.359 */ { 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 }, /* 2.158 */ { 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b }, /* 2.614 */ { 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc }, /* 2.239 */ { 
178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc }, /* 2.493 */ { 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c }, /* 2.327 */ { 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 }, /* 2.231 */ { 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c }, /* 2.237 */ { 182, 512, 0xe6035defea48f933, 0x00002038e3346658 }, /* 2.691 */ { 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e }, /* 2.170 */ { 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 }, /* 2.600 */ { 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc }, /* 2.391 */ { 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 }, /* 2.677 */ { 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c }, /* 2.410 */ { 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 }, /* 2.776 */ { 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 }, /* 2.266 */ { 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 }, /* 2.717 */ { 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c }, /* 2.474 */ { 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 }, /* 2.673 */ { 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 }, /* 2.420 */ { 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 }, /* 2.898 */ { 195, 512, 0x08d903b4daedc2e0, 0x0000250d1e15886c }, /* 2.363 */ { 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e }, /* 2.747 */ { 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 }, /* 2.531 */ { 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 }, /* 2.707 */ { 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 }, /* 2.315 */ { 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf }, /* 3.012 */ { 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 }, /* 2.378 */ { 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 }, /* 2.969 */ { 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d }, /* 2.594 */ { 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd }, /* 2.763 */ { 205, 512, 0xd47b160a1b1022c8, 0x00002903e8b52411 }, /* 2.457 */ { 206, 512, 0xc02fc96684715a16, 0x0000297515608601 }, /* 3.057 */ { 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 }, /* 2.590 */ { 208, 512, 
0x9e3be6e5448b4f33, 0x00002a2846ed074b }, /* 3.047 */ { 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 }, /* 2.676 */ { 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 }, /* 2.993 */ { 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 }, /* 2.457 */ { 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 }, /* 3.182 */ { 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 }, /* 2.563 */ { 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 }, /* 3.025 */ { 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f }, /* 2.730 */ { 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 }, /* 3.036 */ { 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 }, /* 2.722 */ { 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 }, /* 3.356 */ { 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 }, /* 2.697 */ { 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 }, /* 2.979 */ { 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 }, /* 2.858 */ { 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e }, /* 3.258 */ { 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 }, /* 2.693 */ { 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 }, /* 3.259 */ { 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c }, /* 2.733 */ { 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 }, /* 3.235 */ { 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 }, /* 2.983 */ { 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e }, /* 3.308 */ { 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 }, /* 2.715 */ { 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f }, /* 3.540 */ { 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 }, /* 2.779 */ { 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c }, /* 3.084 */ { 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc }, /* 2.987 */ { 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae }, /* 3.341 */ { 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 }, /* 2.793 */ { 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 }, /* 3.518 */ { 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 }, /* 2.962 */ { 238, 512, 0x2fcec95266f9352c, 
0x00003785c8df24a9 }, /* 3.196 */ { 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 }, /* 2.914 */ { 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 }, /* 3.408 */ { 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 }, /* 2.903 */ { 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 }, /* 3.778 */ { 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c }, /* 3.026 */ { 244, 512, 0xc740263f0301efa8, 0x00003a147146512d }, /* 3.347 */ { 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d }, /* 3.212 */ { 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 }, /* 3.482 */ { 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 }, /* 3.146 */ { 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f }, /* 3.626 */ { 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 }, /* 2.952 */ { 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e }, /* 3.463 */ { 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 }, /* 3.131 */ { 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c }, /* 3.538 */ { 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac }, /* 2.974 */ { 254, 512, 0xc4674129750499de, 0x00003e99e86d3e95 }, /* 3.843 */ { 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 }, /* 3.088 */ }; /* * Verify the map is valid. Each device index must appear exactly * once in every row, and the permutation array checksum must match. 
*/ static int verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms, uint64_t checksum) { int countssz = sizeof (uint16_t) * children; uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP); for (int i = 0; i < nperms; i++) { for (int j = 0; j < children; j++) { uint8_t val = perms[(i * children) + j]; if (val >= children || counts[val] != i) { kmem_free(counts, countssz); return (EINVAL); } counts[val]++; } } if (checksum != 0) { int permssz = sizeof (uint8_t) * children * nperms; zio_cksum_t cksum; fletcher_4_native_varsize(perms, permssz, &cksum); if (checksum != cksum.zc_word[0]) { kmem_free(counts, countssz); return (ECKSUM); } } kmem_free(counts, countssz); return (0); } /* * Generate the permutation array for the draid_map_t. These maps control * the placement of all data in a dRAID. Therefore it's critical that the * seed always generates the same mapping. We provide our own pseudo-random * number generator for this purpose. */ int vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) { VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN); VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN); VERIFY3U(map->dm_seed, !=, 0); VERIFY3U(map->dm_nperms, !=, 0); VERIFY3P(map->dm_perms, ==, NULL); #ifdef _KERNEL /* * The kernel code always provides both a map_seed and checksum. * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide * a zero checksum when generating new candidate maps. 
*/ VERIFY3U(map->dm_checksum, !=, 0); #endif uint64_t children = map->dm_children; uint64_t nperms = map->dm_nperms; int rowsz = sizeof (uint8_t) * children; int permssz = rowsz * nperms; uint8_t *perms; /* Allocate the permutation array */ perms = vmem_alloc(permssz, KM_SLEEP); /* Setup an initial row with a known pattern */ uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP); for (int i = 0; i < children; i++) initial_row[i] = i; uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed }; uint8_t *current_row, *previous_row = initial_row; /* * Perform a Fisher-Yates shuffle of each row using the previous * row as the starting point. An initial_row with known pattern * is used as the input for the first row. */ for (int i = 0; i < nperms; i++) { current_row = &perms[i * children]; memcpy(current_row, previous_row, rowsz); for (int j = children - 1; j > 0; j--) { uint64_t k = vdev_draid_rand(draid_seed) % (j + 1); uint8_t val = current_row[j]; current_row[j] = current_row[k]; current_row[k] = val; } previous_row = current_row; } kmem_free(initial_row, rowsz); int error = verify_perms(perms, children, nperms, map->dm_checksum); if (error) { vmem_free(perms, permssz); return (error); } *permsp = perms; return (0); } /* * Lookup the fixed draid_map_t for the requested number of children. */ int vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) { for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) { if (draid_maps[i].dm_children == children) { *mapp = &draid_maps[i]; return (0); } } return (ENOENT); } /* * Lookup the permutation array and iteration id for the provided offset. 
*/ static void vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex, uint8_t **base, uint64_t *iter) { uint64_t ncols = vdc->vdc_children; uint64_t poff = pindex % (vdc->vdc_nperms * ncols); *base = vdc->vdc_perms + (poff / ncols) * ncols; *iter = poff % ncols; } static inline uint64_t vdev_draid_permute_id(vdev_draid_config_t *vdc, uint8_t *base, uint64_t iter, uint64_t index) { return ((base[index] + iter) % vdc->vdc_children); } /* * Return the asize which is the psize rounded up to a full group width. * i.e. vdev_draid_psize_to_asize(). */ static uint64_t vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { (void) txg; vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_ashift; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); uint64_t rows = ((psize - 1) / (vdc->vdc_ndata << ashift)) + 1; uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift; ASSERT3U(asize, !=, 0); ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0); return (asize); } /* * Deflate the asize to the psize, this includes stripping parity. */ uint64_t vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT0(asize % vdc->vdc_groupwidth); return ((asize / vdc->vdc_groupwidth) * vdc->vdc_ndata); } /* * Convert a logical offset to the corresponding group number. */ static uint64_t vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); return (offset / vdc->vdc_groupsz); } /* * Convert a group number to the logical starting offset for that group. */ static uint64_t vdev_draid_group_to_offset(vdev_t *vd, uint64_t group) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); return (group * vdc->vdc_groupsz); } /* * Full stripe writes. When writing, all columns (D+P) are required. Parity * is calculated over all the columns, including empty zero filled sectors, * and each is written to disk. 
While only the data columns are needed for * a normal read, all of the columns are required for reconstruction when * performing a sequential resilver. * * For "big columns" it's sufficient to map the correct range of the zio ABD. * Partial columns require allocating a gang ABD in order to zero fill the * empty sectors. When the column is empty a zero filled sector must be * mapped. In all cases the data ABDs must be the same size as the parity * ABDs (e.g. rc->rc_size == parity_size). */ static void vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) { uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; uint64_t parity_size = rr->rr_col[0].rc_size; uint64_t abd_off = abd_offset; ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(parity_size, ==, abd_get_size(rr->rr_col[0].rc_abd)); for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size == 0) { /* empty data column (small write), add a skip sector */ ASSERT3U(skip_size, ==, parity_size); rc->rc_abd = abd_get_zeros(skip_size); } else if (rc->rc_size == parity_size) { /* this is a "big column" */ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, abd_off, rc->rc_size); } else { /* short data column, add a skip sector */ ASSERT3U(rc->rc_size + skip_size, ==, parity_size); rc->rc_abd = abd_alloc_gang(); abd_gang_add(rc->rc_abd, abd_get_offset_size( zio->io_abd, abd_off, rc->rc_size), B_TRUE); abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size), B_TRUE); } ASSERT3U(abd_get_size(rc->rc_abd), ==, parity_size); abd_off += rc->rc_size; rc->rc_size = parity_size; } IMPLY(abd_offset != 0, abd_off == zio->io_size); } /* * Scrub/resilver reads. In order to store the contents of the skip sectors * an additional ABD is allocated. The columns are handled in the same way * as a full stripe write except instead of using the zero ABD the newly * allocated skip ABD is used to back the skip sectors. 
In all cases the
 * data ABD must be the same size as the parity ABDs.
 *
 * The skip ABD (rr->rr_abd_empty) holds rr_nempty sectors and is carved up
 * one sector at a time, in column order.
 */
static void
vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
{
	uint64_t abd_off = abd_offset;
	uint64_t skip_off = 0;
	uint64_t parity_size = rr->rr_col[0].rc_size;
	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
	ASSERT3P(rr->rr_abd_empty, ==, NULL);

	/* Backing store for every skip sector of this row. */
	if (rr->rr_nempty > 0) {
		rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
		    B_FALSE);
	}

	uint64_t col = rr->rr_firstdatacol;
	while (col < rr->rr_cols) {
		raidz_col_t *rc = &rr->rr_col[col++];
		const uint64_t data_size = rc->rc_size;

		if (data_size == 0) {
			/* no data at all (small read): only a skip sector */
			ASSERT3U(skip_size, ==, parity_size);
			ASSERT3U(rr->rr_nempty, !=, 0);
			rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
			    skip_off, skip_size);
			skip_off += skip_size;
		} else if (data_size != parity_size) {
			/* one sector short: data followed by a skip sector */
			ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
			ASSERT3U(rr->rr_nempty, !=, 0);
			rc->rc_abd = abd_alloc_gang();
			abd_gang_add(rc->rc_abd, abd_get_offset_size(
			    zio->io_abd, abd_off, data_size), B_TRUE);
			abd_gang_add(rc->rc_abd, abd_get_offset_size(
			    rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
			skip_off += skip_size;
		} else {
			/* "big column": map the zio ABD range as is */
			rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
			    zio->io_abd, abd_off, data_size);
		}

		uint64_t abd_size = abd_get_size(rc->rc_abd);
		ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd));

		/*
		 * Advance past the real data only, then grow rc_size so
		 * the skip sector takes part in later parity calculations.
		 */
		abd_off += data_size;
		rc->rc_size = abd_size;
	}

	IMPLY(abd_offset != 0, abd_off == zio->io_size);
	ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
}

/*
 * Normal reads. In this common case only the columns containing data
 * are read in to the zio ABDs. Neither the parity columns nor empty skip
 * sectors are read unless the checksum fails verification.
In which case * vdev_raidz_read_all() will call vdev_draid_map_alloc_empty() to expand * the raid map in order to allow reconstruction using the parity data and * skip sectors. */ static void vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) { uint64_t abd_off = abd_offset; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size > 0) { rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, abd_off, rc->rc_size); abd_off += rc->rc_size; } } IMPLY(abd_offset != 0, abd_off == zio->io_size); } /* * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key * difference is that an ABD is allocated to back skip sectors so they may * be read in to memory, verified, and repaired if needed. */ void vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) { uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; uint64_t parity_size = rr->rr_col[0].rc_size; uint64_t skip_off = 0; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); ASSERT3P(rr->rr_abd_empty, ==, NULL); if (rr->rr_nempty > 0) { rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, B_FALSE); } for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size == 0) { /* empty data column (small read), add a skip sector */ ASSERT3U(skip_size, ==, parity_size); ASSERT3U(rr->rr_nempty, !=, 0); ASSERT3P(rc->rc_abd, ==, NULL); rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty, skip_off, skip_size); skip_off += skip_size; } else if (rc->rc_size == parity_size) { /* this is a "big column", nothing to add */ ASSERT3P(rc->rc_abd, !=, NULL); } else { /* * short data column, add a skip sector and clear * rc_tried to force the entire column to be re-read * thereby including the missing skip sector data * which is needed for reconstruction. 
*/ ASSERT3U(rc->rc_size + skip_size, ==, parity_size); ASSERT3U(rr->rr_nempty, !=, 0); ASSERT3P(rc->rc_abd, !=, NULL); ASSERT(!abd_is_gang(rc->rc_abd)); abd_t *read_abd = rc->rc_abd; rc->rc_abd = abd_alloc_gang(); abd_gang_add(rc->rc_abd, read_abd, B_TRUE); abd_gang_add(rc->rc_abd, abd_get_offset_size( rr->rr_abd_empty, skip_off, skip_size), B_TRUE); skip_off += skip_size; rc->rc_tried = 0; } /* * Increase rc_size so the empty ABD is included in subsequent * parity calculations. */ rc->rc_size = parity_size; } ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size); } /* * Verify that all empty sectors are zero filled before using them to * calculate parity. Otherwise, silent corruption in an empty sector will * result in bad parity being generated. That bad parity will then be * considered authoritative and overwrite the good parity on disk. This * is possible because the checksum is only calculated over the data, * thus it cannot be used to detect damage in empty sectors. */ int vdev_draid_map_verify_empty(zio_t *zio, raidz_row_t *rr) { uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; uint64_t parity_size = rr->rr_col[0].rc_size; uint64_t skip_off = parity_size - skip_size; uint64_t empty_off = 0; int ret = 0; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); ASSERT3P(rr->rr_abd_empty, !=, NULL); ASSERT3U(rr->rr_bigcols, >, 0); void *zero_buf = kmem_zalloc(skip_size, KM_SLEEP); for (int c = rr->rr_bigcols; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT3P(rc->rc_abd, !=, NULL); ASSERT3U(rc->rc_size, ==, parity_size); if (abd_cmp_buf_off(rc->rc_abd, zero_buf, skip_off, skip_size) != 0) { vdev_raidz_checksum_error(zio, rc, rc->rc_abd); abd_zero_off(rc->rc_abd, skip_off, skip_size); rc->rc_error = SET_ERROR(ECKSUM); ret++; } empty_off += skip_size; } ASSERT3U(empty_off, ==, abd_get_size(rr->rr_abd_empty)); kmem_free(zero_buf, skip_size); return (ret); } /* * Given a logical address within a dRAID configuration, return the physical * address on the 
first drive in the group that this address maps to
 * (at position 'start' in permutation number 'perm').
 *
 * Both outputs are always written:
 *   *perm  - the group number divided by vdc_ngroups
 *   *start - (group * groupwidth) % ndisks, the slot at which the group
 *            begins within that permutation
 * The return value is a byte offset (sectors shifted left by ashift).
 */
static uint64_t
vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset,
    uint64_t *perm, uint64_t *start)
{
	vdev_draid_config_t *vdc = vd->vdev_tsd;

	/* b is the dRAID (parent) sector offset. */
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t b_offset = logical_offset >> ashift;

	/*
	 * The height of a row in units of the vdev's minimum sector size.
	 * This is the amount of data written to each disk of each group
	 * in a given permutation.
	 */
	uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift;

	/*
	 * We cycle through a disk permutation every groupsz * ngroups chunk
	 * of address space. Note that ngroups * groupsz must be a multiple
	 * of the number of data drives (ndisks) in order to guarantee
	 * alignment. So, for example, if our row height is 16MB, our group
	 * size is 10, and there are 13 data drives in the draid, then ngroups
	 * will be 13, we will change permutation every 2.08GB and each
	 * disk will have 160MB of data per chunk.
	 */
	uint64_t groupwidth = vdc->vdc_groupwidth;
	uint64_t ngroups = vdc->vdc_ngroups;
	uint64_t ndisks = vdc->vdc_ndisks;

	/*
	 * groupstart is where the group this IO will land in "starts" in
	 * the permutation array.
	 */
	uint64_t group = logical_offset / vdc->vdc_groupsz;
	uint64_t groupstart = (group * groupwidth) % ndisks;
	/* Equivalent to checking groupwidth <= ndisks. */
	ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart);
	*start = groupstart;

	/* b_offset is the sector offset within a group chunk */
	b_offset = b_offset % (rowheight_sectors * groupwidth);
	ASSERT0(b_offset % groupwidth);

	/*
	 * Find the starting byte offset on each child vdev:
	 * - within a permutation there are ngroups groups spread over the
	 *   rows, where each row covers a slice portion of the disk
	 * - each permutation has (groupwidth * ngroups) / ndisks rows
	 * - so each permutation covers rows * slice portion of the disk
	 * - so we need to find the row where this IO group target begins
	 */
	*perm = group / ngroups;
	/* rows in all earlier permutations + rows before this group */
	uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) +
	    (((group % ngroups) * groupwidth) / ndisks);

	/* b_offset / groupwidth is the sector offset within the row */
	return (((rowheight_sectors * row) +
	    (b_offset / groupwidth)) << ashift);
}

/*
 * Build the raidz_row_t for the part of an I/O which starts at io_offset
 * and fits within a single group.
 *
 *   rrp        - receives the newly allocated row
 *   io_offset  - logical offset at which this row begins
 *   abd_offset - offset into zio->io_abd of this row's data
 *   abd_size   - number of bytes remaining to be mapped
 *
 * Returns the number of bytes covered by this row.  This is abd_size
 * unless the I/O would run past the end of the group, in which case it is
 * limited to the space remaining in the group.
 */
static uint64_t
vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
    uint64_t abd_offset, uint64_t abd_size)
{
	vdev_t *vd = zio->io_vd;
	vdev_draid_config_t *vdc = vd->vdev_tsd;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t io_size = abd_size;
	uint64_t io_asize = vdev_draid_asize(vd, io_size, 0);
	uint64_t group = vdev_draid_offset_to_group(vd, io_offset);
	/* start_offset is the logical offset at which the next group begins */
	uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1);

	/*
	 * Limit the io_size to the space remaining in the group. A second
	 * row in the raidz_map_t is created for the remainder.
	 */
	if (io_offset + io_asize > start_offset) {
		io_size = vdev_draid_asize_to_psize(vd,
		    start_offset - io_offset);
	}

	/*
	 * At most a block may span the logical end of one group and the start
	 * of the next group. Therefore, at the end of a group the io_size must
	 * span the group width evenly and the remainder must be aligned to the
	 * start of the next group.
	 */
	IMPLY(abd_offset == 0 && io_size < zio->io_size,
	    (io_asize >> ashift) % vdc->vdc_groupwidth == 0);
	IMPLY(abd_offset != 0,
	    vdev_draid_group_to_offset(vd, group) == io_offset);

	/* Lookup starting byte offset on each child vdev */
	uint64_t groupstart, perm;
	uint64_t physical_offset =
	    vdev_draid_logical_to_physical(vd, io_offset, &perm, &groupstart);

	/*
	 * If there are fewer than groupwidth drives available after the group
	 * start, the group is going to wrap onto the next row. 'wrap' is the
	 * group disk number that starts on the next row.
	 */
	uint64_t ndisks = vdc->vdc_ndisks;
	uint64_t groupwidth = vdc->vdc_groupwidth;
	/* wrap == groupwidth means the column loop below never wraps */
	uint64_t wrap = groupwidth;
	if (groupstart + groupwidth > ndisks)
		wrap = ndisks - groupstart;

	/* The io size in units of the vdev's minimum sector size. */
	const uint64_t psize = io_size >> ashift;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	uint64_t q = psize / vdc->vdc_ndata;

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = psize - q * vdc->vdc_ndata;

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity);
	ASSERT3U(bc, <, groupwidth);

	/* The total number of data and parity sectors for this I/O. */
	uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 0 : 1)));

	ASSERT3U(vdc->vdc_nparity, >, 0);

	/* One column per disk of the group; parity columns come first. */
	raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth, zio);
	rr->rr_bigcols = bc;
	rr->rr_firstdatacol = vdc->vdc_nparity;
#ifdef ZFS_DEBUG
	rr->rr_offset = io_offset;
	rr->rr_size = io_size;
#endif
	*rrp = rr;

	uint8_t *base;
	uint64_t iter, asize = 0;
	vdev_draid_get_perm(vdc, perm, &base, &iter);
	for (uint64_t i = 0; i < groupwidth; i++) {
		raidz_col_t *rc = &rr->rr_col[i];
		/* c is this column's slot within the permutation */
		uint64_t c = (groupstart + i) % ndisks;

		/* increment the offset if we wrap to the next row */
		if (i == wrap)
			physical_offset += VDEV_DRAID_ROWHEIGHT;

		rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
		rc->rc_offset = physical_offset;

		/*
		 * Big columns carry q + 1 sectors and the rest carry q,
		 * which is nothing at all when q == 0.
		 */
		if (q == 0 && i >= bc)
			rc->rc_size = 0;
		else if (i < bc)
			rc->rc_size = (q + 1) << ashift;
		else
			rc->rc_size = q << ashift;

		asize += rc->rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	/* Sectors needed to pad 'tot' up to a multiple of the group width. */
	rr->rr_nempty = roundup(tot, groupwidth) - tot;
	IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc);

	/* Allocate buffers for the parity columns */
	for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
	}

	/*
	 * Map buffers for data columns and allocate/map buffers for skip
	 * sectors. There are three distinct cases for dRAID which are
	 * required to support sequential rebuild.
	 */
	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_draid_map_alloc_write(zio, abd_offset, rr);
	} else if ((rr->rr_nempty > 0) &&
	    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
		vdev_draid_map_alloc_scrub(zio, abd_offset, rr);
	} else {
		ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
		vdev_draid_map_alloc_read(zio, abd_offset, rr);
	}

	return (io_size);
}

/*
 * Allocate the raidz mapping to be applied to the dRAID I/O. The parity
 * calculations for dRAID are identical to raidz however there are a few
 * differences in the layout.
 *
 * - dRAID always allocates a full stripe width. Any extra sectors due
 *   to this padding are zero filled and written to disk.
They will be read * back during a scrub or repair operation since they are included in * the parity calculation. This property enables sequential resilvering. * * - When the block at the logical offset spans redundancy groups then two * rows are allocated in the raidz_map_t. One row resides at the end of * the first group and the other at the start of the following group. */ static raidz_map_t * vdev_draid_map_alloc(zio_t *zio) { raidz_row_t *rr[2]; uint64_t abd_offset = 0; uint64_t abd_size = zio->io_size; uint64_t io_offset = zio->io_offset; uint64_t size; int nrows = 1; size = vdev_draid_map_alloc_row(zio, &rr[0], io_offset, abd_offset, abd_size); if (size < abd_size) { vdev_t *vd = zio->io_vd; io_offset += vdev_draid_asize(vd, size, 0); abd_offset += size; abd_size -= size; nrows++; ASSERT3U(io_offset, ==, vdev_draid_group_to_offset( vd, vdev_draid_offset_to_group(vd, io_offset))); ASSERT3U(abd_offset, <, zio->io_size); ASSERT3U(abd_size, !=, 0); size = vdev_draid_map_alloc_row(zio, &rr[1], io_offset, abd_offset, abd_size); VERIFY3U(size, ==, abd_size); } raidz_map_t *rm; rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]), KM_SLEEP); rm->rm_ops = vdev_raidz_math_get_ops(); rm->rm_nrows = nrows; rm->rm_row[0] = rr[0]; if (nrows == 2) rm->rm_row[1] = rr[1]; return (rm); } /* * Given an offset into a dRAID return the next group width aligned offset * which can be used to start an allocation. */ static uint64_t vdev_draid_get_astart(vdev_t *vd, const uint64_t start) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); return (roundup(start, vdc->vdc_groupwidth << vd->vdev_ashift)); } /* * Allocatable space for dRAID is (children - nspares) * sizeof(smallest child) * rounded down to the last full slice. So each child must provide at least * 1 / (children - nspares) of its asize. 
*/ static uint64_t vdev_draid_min_asize(vdev_t *vd) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); return (VDEV_DRAID_REFLOW_RESERVE + (vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks)); } /* * When using dRAID the minimum allocation size is determined by the number * of data disks in the redundancy group. Full stripes are always used. */ static uint64_t vdev_draid_min_alloc(vdev_t *vd) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); return (vdc->vdc_ndata << vd->vdev_ashift); } /* * Returns true if the txg range does not exist on any leaf vdev. * * A dRAID spare does not fit into the DTL model. While it has child vdevs * there is no redundancy among them, and the effective child vdev is * determined by offset. Essentially we do a vdev_dtl_reassess() on the * fly by replacing a dRAID spare with the child vdev under the offset. * Note that it is a recursive process because the child vdev can be * another dRAID spare and so on. */ boolean_t vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg, uint64_t size) { if (vd->vdev_ops == &vdev_spare_ops || vd->vdev_ops == &vdev_replacing_ops) { /* * Check all of the readable children, if any child * contains the txg range the data it is not missing. */ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (!vdev_readable(cvd)) continue; if (!vdev_draid_missing(cvd, physical_offset, txg, size)) return (B_FALSE); } return (B_TRUE); } if (vd->vdev_ops == &vdev_draid_spare_ops) { /* * When sequentially resilvering we don't have a proper * txg range so instead we must presume all txgs are * missing on this vdev until the resilver completes. */ if (vd->vdev_rebuild_txg != 0) return (B_TRUE); /* * DTL_MISSING is set for all prior txgs when a resilver * is started in spa_vdev_attach(). 
*/ if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) return (B_TRUE); /* * Consult the DTL on the relevant vdev. Either a vdev * leaf or spare/replace mirror child may be returned so * we must recursively call vdev_draid_missing_impl(). */ vd = vdev_draid_spare_get_child(vd, physical_offset); if (vd == NULL) return (B_TRUE); return (vdev_draid_missing(vd, physical_offset, txg, size)); } return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); } /* * Returns true if the txg is only partially replicated on the leaf vdevs. */ static boolean_t vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg, uint64_t size) { if (vd->vdev_ops == &vdev_spare_ops || vd->vdev_ops == &vdev_replacing_ops) { /* * Check all of the readable children, if any child is * missing the txg range then it is partially replicated. */ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (!vdev_readable(cvd)) continue; if (vdev_draid_partial(cvd, physical_offset, txg, size)) return (B_TRUE); } return (B_FALSE); } if (vd->vdev_ops == &vdev_draid_spare_ops) { /* * When sequentially resilvering we don't have a proper * txg range so instead we must presume all txgs are * missing on this vdev until the resilver completes. */ if (vd->vdev_rebuild_txg != 0) return (B_TRUE); /* * DTL_MISSING is set for all prior txgs when a resilver * is started in spa_vdev_attach(). */ if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) return (B_TRUE); /* * Consult the DTL on the relevant vdev. Either a vdev * leaf or spare/replace mirror child may be returned so * we must recursively call vdev_draid_missing_impl(). */ vd = vdev_draid_spare_get_child(vd, physical_offset); if (vd == NULL) return (B_TRUE); return (vdev_draid_partial(vd, physical_offset, txg, size)); } return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); } /* * Determine if the vdev is readable at the given offset. 
*/ boolean_t vdev_draid_readable(vdev_t *vd, uint64_t physical_offset) { if (vd->vdev_ops == &vdev_draid_spare_ops) { vd = vdev_draid_spare_get_child(vd, physical_offset); if (vd == NULL) return (B_FALSE); } if (vd->vdev_ops == &vdev_spare_ops || vd->vdev_ops == &vdev_replacing_ops) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (!vdev_readable(cvd)) continue; if (vdev_draid_readable(cvd, physical_offset)) return (B_TRUE); } return (B_FALSE); } return (vdev_readable(vd)); } /* * Returns the first distributed spare found under the provided vdev tree. */ static vdev_t * vdev_draid_find_spare(vdev_t *vd) { if (vd->vdev_ops == &vdev_draid_spare_ops) return (vd); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *svd = vdev_draid_find_spare(vd->vdev_child[c]); if (svd != NULL) return (svd); } return (NULL); } /* * Returns B_TRUE if the passed in vdev is currently "faulted". * Faulted, in this context, means that the vdev represents a * replacing or sparing vdev tree. */ static boolean_t vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset) { if (vd->vdev_ops == &vdev_draid_spare_ops) { vd = vdev_draid_spare_get_child(vd, physical_offset); if (vd == NULL) return (B_FALSE); /* * After resolving the distributed spare to a leaf vdev * check the parent to determine if it's "faulted". */ vd = vd->vdev_parent; } return (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); } /* * Determine if the dRAID block at the logical offset is degraded. * Used by sequential resilver. 
*/ static boolean_t vdev_draid_group_degraded(vdev_t *vd, uint64_t offset) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset); uint64_t groupstart, perm; uint64_t physical_offset = vdev_draid_logical_to_physical(vd, offset, &perm, &groupstart); uint8_t *base; uint64_t iter; vdev_draid_get_perm(vdc, perm, &base, &iter); for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { uint64_t c = (groupstart + i) % vdc->vdc_ndisks; uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c); vdev_t *cvd = vd->vdev_child[cid]; /* Group contains a faulted vdev. */ if (vdev_draid_faulted(cvd, physical_offset)) return (B_TRUE); /* * Always check groups with active distributed spares * because any vdev failure in the pool will affect them. */ if (vdev_draid_find_spare(cvd) != NULL) return (B_TRUE); } return (B_FALSE); } /* * Determine if the txg is missing. Used by healing resilver. */ static boolean_t vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg, uint64_t size) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset); uint64_t groupstart, perm; uint64_t physical_offset = vdev_draid_logical_to_physical(vd, offset, &perm, &groupstart); uint8_t *base; uint64_t iter; vdev_draid_get_perm(vdc, perm, &base, &iter); for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { uint64_t c = (groupstart + i) % vdc->vdc_ndisks; uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c); vdev_t *cvd = vd->vdev_child[cid]; /* Transaction group is known to be partially replicated. */ if (vdev_draid_partial(cvd, physical_offset, txg, size)) return (B_TRUE); /* * Always check groups with active distributed spares * because any vdev failure in the pool will affect them. 
*/ if (vdev_draid_find_spare(cvd) != NULL) return (B_TRUE); } return (B_FALSE); } /* * Find the smallest child asize and largest sector size to calculate the * available capacity. Distributed spares are ignored since their capacity * is also based of the minimum child size in the top-level dRAID. */ static void vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, uint64_t *logical_ashiftp, uint64_t *physical_ashiftp) { uint64_t logical_ashift = 0, physical_ashift = 0; uint64_t asize = 0, max_asize = 0; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_ops == &vdev_draid_spare_ops) continue; asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); } for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_ops == &vdev_draid_spare_ops) continue; physical_ashift = vdev_best_ashift(logical_ashift, physical_ashift, cvd->vdev_physical_ashift); } *asizep = asize; *max_asizep = max_asize; *logical_ashiftp = logical_ashift; *physical_ashiftp = physical_ashift; } /* * Open spare vdevs. */ static boolean_t vdev_draid_open_spares(vdev_t *vd) { return (vd->vdev_ops == &vdev_draid_spare_ops || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); } /* * Open all children, excluding spares. */ static boolean_t vdev_draid_open_children(vdev_t *vd) { return (!vdev_draid_open_spares(vd)); } /* * Open a top-level dRAID vdev. 
*/ static int vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t nparity = vdc->vdc_nparity; int open_errors = 0; if (nparity > VDEV_DRAID_MAXPARITY || vd->vdev_children < nparity + 1) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } /* * First open the normal children then the distributed spares. This * ordering is important to ensure the distributed spares calculate * the correct psize in the event that the dRAID vdevs were expanded. */ vdev_open_children_subset(vd, vdev_draid_open_children); vdev_open_children_subset(vd, vdev_draid_open_spares); /* Verify enough of the children are available to continue. */ for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_open_error != 0) { if ((++open_errors) > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (SET_ERROR(ENXIO)); } } } /* * Allocatable capacity is the sum of the space on all children less * the number of distributed spares rounded down to last full row * and then to the last full group. An additional 32MB of scratch * space is reserved at the end of each child for use by the dRAID * expansion feature. */ uint64_t child_asize, child_max_asize; vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize, logical_ashift, physical_ashift); /* * Should be unreachable since the minimum child size is 64MB, but * we want to make sure an underflow absolutely cannot occur here. 
*/ if (child_asize < VDEV_DRAID_REFLOW_RESERVE || child_max_asize < VDEV_DRAID_REFLOW_RESERVE) { return (SET_ERROR(ENXIO)); } child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) / VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT; child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) / VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT; *asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * vdc->vdc_groupsz); *max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * vdc->vdc_groupsz); return (0); } /* * Close a top-level dRAID vdev. */ static void vdev_draid_close(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c] != NULL) vdev_close(vd->vdev_child[c]); } } /* * Return the maximum asize for a rebuild zio in the provided range * given the following constraints. A dRAID chunks may not: * * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or * - Span dRAID redundancy groups. */ static uint64_t vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, uint64_t max_segment) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); uint64_t ashift = vd->vdev_ashift; uint64_t ndata = vdc->vdc_ndata; uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift), SPA_MAXBLOCKSIZE); ASSERT3U(vdev_draid_get_astart(vd, start), ==, start); ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0); /* Chunks must evenly span all data columns in the group. */ psize = (((psize >> ashift) / ndata) * ndata) << ashift; uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize)); /* Reduce the chunk size to the group space remaining. 
*/ uint64_t group = vdev_draid_offset_to_group(vd, start); uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start; chunk_size = MIN(chunk_size, left); ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0); ASSERT3U(vdev_draid_offset_to_group(vd, start), ==, vdev_draid_offset_to_group(vd, start + chunk_size - 1)); return (chunk_size); } /* * Align the start of the metaslab to the group width and slightly reduce * its size to a multiple of the group width. Since full stripe writes are * required by dRAID this space is unallocable. Furthermore, aligning the * metaslab start is important for vdev initialize and TRIM which both operate * on metaslab boundaries which vdev_xlate() expects to be aligned. */ static void vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size) { vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); uint64_t sz = vdc->vdc_groupwidth << vd->vdev_ashift; uint64_t astart = vdev_draid_get_astart(vd, *ms_start); uint64_t asize = ((*ms_size - (astart - *ms_start)) / sz) * sz; *ms_start = astart; *ms_size = asize; ASSERT0(*ms_start % sz); ASSERT0(*ms_size % sz); } /* * Add virtual dRAID spares to the list of valid spares. In order to accomplish * this the existing array must be freed and reallocated with the additional * entries. 
 */
int
vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp,
    uint64_t next_vdev_id)
{
	uint64_t draid_nspares = 0;
	uint64_t ndraid = 0;
	int error;

	/* Count the dRAID children of vd and the spares they provide. */
	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_t *cvd = vd->vdev_child[i];

		if (cvd->vdev_ops == &vdev_draid_ops) {
			vdev_draid_config_t *vdc = cvd->vdev_tsd;
			draid_nspares += vdc->vdc_nspares;
			ndraid++;
		}
	}

	/* Nothing to add, only report the number of dRAID vdevs found. */
	if (draid_nspares == 0) {
		*ndraidp = ndraid;
		return (0);
	}

	nvlist_t **old_spares, **new_spares;
	uint_t old_nspares;
	error = nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &old_spares, &old_nspares);
	/* A failed lookup is treated as "no existing spares". */
	if (error)
		old_nspares = 0;

	/* Allocate memory and copy the existing spares. */
	new_spares = kmem_alloc(sizeof (nvlist_t *) *
	    (draid_nspares + old_nspares), KM_SLEEP);
	for (uint_t i = 0; i < old_nspares; i++)
		new_spares[i] = fnvlist_dup(old_spares[i]);

	/* Add new distributed spares to ZPOOL_CONFIG_SPARES. */
	uint64_t n = old_nspares;
	for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) {
		vdev_t *cvd = vd->vdev_child[vdev_id];
		char path[64];

		if (cvd->vdev_ops != &vdev_draid_ops)
			continue;

		vdev_draid_config_t *vdc = cvd->vdev_tsd;
		uint64_t nspares = vdc->vdc_nspares;
		uint64_t nparity = vdc->vdc_nparity;

		for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) {
			/*
			 * The spare's path is built from the dRAID type
			 * name, the parity level, the vdev id and the
			 * spare id.
			 */
			memset(path, 0, sizeof (path));
			(void) snprintf(path, sizeof (path) - 1,
			    "%s%llu-%llu-%llu", VDEV_TYPE_DRAID,
			    (u_longlong_t)nparity,
			    (u_longlong_t)next_vdev_id + vdev_id,
			    (u_longlong_t)spare_id);

			nvlist_t *spare = fnvlist_alloc();
			fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path);
			fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
			    VDEV_TYPE_DRAID_SPARE);
			fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID,
			    cvd->vdev_guid);
			fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID,
			    spare_id);
			fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0);
			fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1);
			fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1);
			fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT,
			    cvd->vdev_ashift);

			new_spares[n] = spare;
			n++;
		}
	}

	/* Replace the spares array in nvroot with the combined list. */
	if (n > 0) {
		(void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES);
		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    (const nvlist_t **)new_spares, n);
	}

	/* Release the local nvlists and the array that held them. */
	for (int i = 0; i < n; i++)
		nvlist_free(new_spares[i]);

	kmem_free(new_spares, sizeof (*new_spares) * n);
	*ndraidp = ndraid;

	return (0);
}

/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered.
 */
static boolean_t
vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
    uint64_t phys_birth)
{
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t asize = vdev_draid_asize(vd, psize, 0);

	if (phys_birth == TXG_UNKNOWN) {
		/*
		 * Sequential resilver.  There is no meaningful phys_birth
		 * for this block, we can only determine if block resides
		 * in a degraded group in which case it must be resilvered.
		 */
		ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==,
		    vdev_draid_offset_to_group(vd, offset + asize - 1));

		return (vdev_draid_group_degraded(vd, offset));
	} else {
		/*
		 * Healing resilver.  TXGs not in DTL_PARTIAL are intact,
		 * as are blocks in non-degraded groups.
		 */
		if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
			return (B_FALSE);

		if (vdev_draid_group_missing(vd, offset, phys_birth, 1))
			return (B_TRUE);

		/* The block may span groups in which case check both. */
		if (vdev_draid_offset_to_group(vd, offset) !=
		    vdev_draid_offset_to_group(vd, offset + asize - 1)) {
			if (vdev_draid_group_missing(vd,
			    offset + asize, phys_birth, 1))
				return (B_TRUE);
		}

		return (B_FALSE);
	}
}

/*
 * Returns B_TRUE if any leaf vdev in the tree rooted at vd has a non-zero
 * vdev_rebuild_txg.
 */
static boolean_t
vdev_draid_rebuilding(vdev_t *vd)
{
	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
		return (B_TRUE);

	for (int i = 0; i < vd->vdev_children; i++) {
		if (vdev_draid_rebuilding(vd->vdev_child[i])) {
			return (B_TRUE);
		}
	}

	return (B_FALSE);
}

/*
 * Debug-only check that the physical range computed for column 'col' of the
 * row matches what vdev_xlate() produces for the row's logical range.
 * This is a no-op unless ZFS_DEBUG is defined.
 */
static void
vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
{
#ifdef ZFS_DEBUG
-	range_seg64_t logical_rs, physical_rs, remain_rs;
+	zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
	logical_rs.rs_start = rr->rr_offset;
	logical_rs.rs_end = logical_rs.rs_start +
	    vdev_draid_asize(vd, rr->rr_size, 0);

	raidz_col_t *rc = &rr->rr_col[col];
	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
	ASSERT(vdev_xlate_is_empty(&remain_rs));
	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end);
#endif
}

/*
 * For write operations:
 * 1. Generate the parity data
 * 2. Create child zio write operations to each column's vdev, for both
 *    data and parity.  A gang ABD is allocated by vdev_draid_map_alloc()
 *    if a skip sector needs to be added to a column.
 */
static void
vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_generate_parity_row(rm, rr);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * Empty columns are zero filled and included in the parity
		 * calculation and therefore must be written.
		 */
		ASSERT3U(rc->rc_size, !=, 0);

		/* Verify physical to logical translation */
		vdev_draid_io_verify(vd, rr, c);

		zio_nowait(zio_vdev_child_io(zio, NULL,
		    vd->vdev_child[rc->rc_devidx], rc->rc_offset,
		    rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
		    0, vdev_raidz_child_done, rc));
	}
}

/*
 * For read operations:
 * 1. The vdev_draid_map_alloc() function will create a minimal raidz
 *    mapping for the read based on the zio->io_flags.  There are two
 *    possible mappings either 1) a normal read, or 2) a scrub/resilver.
 * 2. Create the zio read operations.  This will include all parity
 *    columns and skip sectors for a scrub/resilver.
 */
static void
vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;

	/* Sequential rebuild must do IO at redundancy group boundary. */
	IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last.  Any errors along the way will force us to read the parity.
	 * For scrub/resilver IOs which verify skip sectors, a gang ABD will
	 * have been allocated to store them and rc->rc_size is increased.
	 */
	for (int c = rr->rr_cols - 1; c >= 0; c--) {
		raidz_col_t *rc = &rr->rr_col[c];
		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

		/* Unreadable child: record ENXIO and count it as missing. */
		if (!vdev_draid_readable(cvd, rc->rc_offset)) {
			if (c >= rr->rr_firstdatacol)
				rr->rr_missingdata++;
			else
				rr->rr_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;
			rc->rc_skipped = 1;
			continue;
		}

		/* Child lacks this txg: record ESTALE and count it. */
		if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) {
			if (c >= rr->rr_firstdatacol)
				rr->rr_missingdata++;
			else
				rr->rr_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}

		/*
		 * Empty columns may be read during vdev_draid_io_done().
		 * Only skip them after the readable and missing checks
		 * verify they are available.
		 */
		if (rc->rc_size == 0) {
			rc->rc_skipped = 1;
			continue;
		}

		if (zio->io_flags & ZIO_FLAG_RESILVER) {
			vdev_t *svd;

			/*
			 * Sequential rebuilds need to always consider the data
			 * on the child being rebuilt to be stale.  This is
			 * important when all columns are available to aid
			 * known reconstruction in identifying which columns
			 * contain incorrect data.
			 *
			 * Furthermore, all repairs need to be constrained to
			 * the devices being rebuilt because without a checksum
			 * we cannot verify the data is actually correct and
			 * performing an incorrect repair could result in
			 * locking in damage and making the data unrecoverable.
			 */
			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
				if (vdev_draid_rebuilding(cvd)) {
					if (c >= rr->rr_firstdatacol)
						rr->rr_missingdata++;
					else
						rr->rr_missingparity++;
					rc->rc_error = SET_ERROR(ESTALE);
					rc->rc_skipped = 1;
					rc->rc_allow_repair = 1;
					continue;
				} else {
					rc->rc_allow_repair = 0;
				}
			} else {
				rc->rc_allow_repair = 1;
			}

			/*
			 * If this child is a distributed spare then the
			 * offset might reside on the vdev being replaced.
			 * In which case this data must be written to the
			 * new device.  Failure to do so would result in
			 * checksum errors when the old device is detached
			 * and the pool is scrubbed.
			 */
			if ((svd = vdev_draid_find_spare(cvd)) != NULL) {
				svd = vdev_draid_spare_get_child(svd,
				    rc->rc_offset);
				if (svd && (svd->vdev_ops == &vdev_spare_ops ||
				    svd->vdev_ops == &vdev_replacing_ops)) {
					rc->rc_force_repair = 1;
					if (vdev_draid_rebuilding(svd))
						rc->rc_allow_repair = 1;
				}
			}

			/*
			 * Always issue a repair IO to this child when it's
			 * a spare or replacing vdev with an active rebuild.
			 */
			if ((cvd->vdev_ops == &vdev_spare_ops ||
			    cvd->vdev_ops == &vdev_replacing_ops) &&
			    vdev_draid_rebuilding(cvd)) {
				rc->rc_force_repair = 1;
				rc->rc_allow_repair = 1;
			}
		}
	}

	/*
	 * Either a parity or data column is missing this means a repair
	 * may be attempted by vdev_draid_io_done().  Expand the raid map
	 * to read in empty columns which are needed along with the parity
	 * during reconstruction.
	 */
	if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) &&
	    rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) {
		vdev_draid_map_alloc_empty(zio, rr);
	}

	/*
	 * Second pass: issue the child reads.  Columns that already carry
	 * an error or have no data are skipped.  Parity columns are only
	 * read when data is missing or for a scrub/resilver.
	 */
	for (int c = rr->rr_cols - 1; c >= 0; c--) {
		raidz_col_t *rc = &rr->rr_col[c];
		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

		if (rc->rc_error || rc->rc_size == 0)
			continue;

		if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}
}

/*
 * Start an IO operation to a dRAID vdev.
 */
static void
vdev_draid_io_start(zio_t *zio)
{
	vdev_t *vd __maybe_unused = zio->io_vd;

	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
	ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset));

	/* Attach the raidz map to the zio as its vdev-specific data. */
	raidz_map_t *rm = vdev_draid_map_alloc(zio);
	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	if (zio->io_type == ZIO_TYPE_WRITE) {
		for (int i = 0; i < rm->rm_nrows; i++) {
			vdev_draid_io_start_write(zio, rm->rm_row[i]);
		}
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_READ);

		for (int i = 0; i < rm->rm_nrows; i++) {
			vdev_draid_io_start_read(zio, rm->rm_row[i]);
		}
	}

	zio_execute(zio);
}

/*
 * Complete an IO operation on a dRAID vdev.  The raidz logic can be applied
 * to dRAID since the layout is fully described by the raidz_map_t.
 */
static void
vdev_draid_io_done(zio_t *zio)
{
	vdev_raidz_io_done(zio);
}

/*
 * Set the state of a top-level dRAID vdev from the counts of faulted and
 * degraded children: more faulted children than parity makes the vdev
 * unopenable, any faulted or degraded child makes it degraded, otherwise
 * it is healthy.
 */
static void
vdev_draid_state_change(vdev_t *vd, int faulted, int degraded)
{
	vdev_draid_config_t *vdc = vd->vdev_tsd;
	ASSERT(vd->vdev_ops == &vdev_draid_ops);

	if (faulted > vdc->vdc_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Translate a logical range on the parent dRAID vdev into the physical
 * range it occupies on the child cvd.  At most one redundancy group is
 * translated per call; the untranslated tail of the logical range is
 * returned in remain_rs.  When cvd is not a member of the group the
 * physical range returned is empty (rs_start == rs_end).
 */
static void
-vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+vdev_draid_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs,
+    zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
{
	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_draid_ops);

	vdev_draid_config_t *vdc = raidvd->vdev_tsd;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* Make sure the offsets are block-aligned */
	ASSERT0(logical_rs->rs_start % (1 << ashift));
	ASSERT0(logical_rs->rs_end % (1 << ashift));

	uint64_t logical_start = logical_rs->rs_start;
	uint64_t logical_end = logical_rs->rs_end;

	/*
	 * Unaligned ranges must be skipped.  All metaslabs are correctly
	 * aligned so this should not happen, but this case is handled in
	 * case it's needed by future callers.
	 */
	uint64_t astart = vdev_draid_get_astart(raidvd, logical_start);
	if (astart != logical_start) {
		physical_rs->rs_start = logical_start;
		physical_rs->rs_end = logical_start;
		remain_rs->rs_start = MIN(astart, logical_end);
		remain_rs->rs_end = logical_end;
		return;
	}

	/*
	 * Unlike with mirrors and raidz a dRAID logical range can map
	 * to multiple non-contiguous physical ranges.  This is handled by
	 * limiting the size of the logical range to a single group and
	 * setting the remain argument such that it describes the remaining
	 * unmapped logical range.  This is stricter than absolutely
	 * necessary but helps simplify the logic below.
	 */
	uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start);
	uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1);
	if (logical_end > nextstart)
		logical_end = nextstart;

	/* Find the starting offset for each vdev in the group */
	uint64_t perm, groupstart;
	uint64_t start = vdev_draid_logical_to_physical(raidvd,
	    logical_start, &perm, &groupstart);
	uint64_t end = start;

	uint8_t *base;
	uint64_t iter, id;
	vdev_draid_get_perm(vdc, perm, &base, &iter);

	/*
	 * Check if the passed child falls within the group.  If it does
	 * update the start and end to reflect the physical range.
	 * Otherwise, leave them unmodified which will result in an empty
	 * (zero-length) physical range being returned.
	 */
	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
		uint64_t c = (groupstart + i) % vdc->vdc_ndisks;

		if (c == 0 && i != 0) {
			/* the group wrapped, increment the start */
			start += VDEV_DRAID_ROWHEIGHT;
			end = start;
		}

		id = vdev_draid_permute_id(vdc, base, iter, c);
		if (id == cvd->vdev_id) {
			/* Logical length in sectors, limited to this group. */
			uint64_t b_size = (logical_end >> ashift) -
			    (logical_start >> ashift);
			ASSERT3U(b_size, >, 0);
			/* This child's share, rounded up to a whole sector. */
			end = start + ((((b_size - 1) /
			    vdc->vdc_groupwidth) + 1) << ashift);
			break;
		}
	}
	physical_rs->rs_start = start;
	physical_rs->rs_end = end;

	/*
	 * Only top-level vdevs are allowed to set remain_rs because
	 * when .vdev_op_xlate() is called for their children the full
	 * logical range is not provided by vdev_xlate().
	 */
	remain_rs->rs_start = logical_end;
	remain_rs->rs_end = logical_rs->rs_end;

	ASSERT3U(physical_rs->rs_start, <=, logical_start);
	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
	    logical_end - logical_start);
}

/*
 * Add dRAID specific fields to the config nvlist.
*/
static void
vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv)
{
	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
	vdev_draid_config_t *vdc = vd->vdev_tsd;

	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups);
}

/*
 * Initialize private dRAID specific fields from the nvlist.
 *
 * Reads the ndata, nparity, nspares, ngroups and children entries from nv
 * and validates them.  On success a newly allocated vdev_draid_config_t is
 * stored in *tsd and 0 is returned.  Every failure path returns EINVAL and
 * leaves *tsd untouched.  The spa argument is unused.
 */
static int
vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	(void) spa;
	uint64_t ndata, nparity, nspares, ngroups;
	int error;

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata))
		return (SET_ERROR(EINVAL));

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) ||
	    nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
		return (SET_ERROR(EINVAL));
	}

	uint_t children;
	nvlist_t **child;
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0 || children == 0 ||
	    children > VDEV_DRAID_MAX_CHILDREN) {
		return (SET_ERROR(EINVAL));
	}

	/*
	 * NOTE(review): children - (ndata + nparity) is evaluated in
	 * unsigned arithmetic and ndata has not been range checked at this
	 * point, so the subtraction can wrap for a malformed config.  The
	 * children < (ndata + nparity + nspares) test below rejects the
	 * plain case; confirm that a wrapping sum cannot slip through.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) ||
	    nspares > 100 || nspares > (children - (ndata + nparity))) {
		return (SET_ERROR(EINVAL));
	}

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) ||
	    ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) {
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Validate the minimum number of children exist per group for the
	 * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4).
	 */
	if (children < (ndata + nparity + nspares))
		return (SET_ERROR(EINVAL));

	/*
	 * Create the dRAID configuration using the pool nvlist configuration
	 * and the fixed mapping for the correct number of children.
*/
	vdev_draid_config_t *vdc;
	const draid_map_t *map;

	error = vdev_draid_lookup_map(children, &map);
	if (error)
		return (SET_ERROR(EINVAL));

	vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP);
	vdc->vdc_ndata = ndata;
	vdc->vdc_nparity = nparity;
	vdc->vdc_nspares = nspares;
	vdc->vdc_children = children;
	vdc->vdc_ngroups = ngroups;
	vdc->vdc_nperms = map->dm_nperms;

	error = vdev_draid_generate_perms(map, &vdc->vdc_perms);
	if (error) {
		/* Nothing else has been allocated yet; release vdc only. */
		kmem_free(vdc, sizeof (*vdc));
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Derived constants.
	 */
	vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity;
	vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares;
	vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT;
	vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) /
	    vdc->vdc_ndisks;

	ASSERT3U(vdc->vdc_groupwidth, >=, 2);
	ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks);
	ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT);
	ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT);
	ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0);
	ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) %
	    vdc->vdc_ndisks, ==, 0);

	*tsd = vdc;

	return (0);
}

/*
 * Release the dRAID configuration attached to vd: the permutation table
 * (vdc_children * vdc_nperms bytes) followed by the configuration itself.
 */
static void
vdev_draid_fini(vdev_t *vd)
{
	vdev_draid_config_t *vdc = vd->vdev_tsd;

	vmem_free(vdc->vdc_perms, sizeof (uint8_t) *
	    vdc->vdc_children * vdc->vdc_nperms);
	kmem_free(vdc, sizeof (*vdc));
}

/* Return vdc_nparity from the dRAID configuration attached to vd. */
static uint64_t
vdev_draid_nparity(vdev_t *vd)
{
	vdev_draid_config_t *vdc = vd->vdev_tsd;

	return (vdc->vdc_nparity);
}

/*
 * Return vdc_ndisks from the dRAID configuration attached to vd.  It is
 * derived in vdev_draid_init() as vdc_children minus vdc_nspares.
 */
static uint64_t
vdev_draid_ndisks(vdev_t *vd)
{
	vdev_draid_config_t *vdc = vd->vdev_tsd;

	return (vdc->vdc_ndisks);
}

/* Operations vector for the dRAID vdev type. */
vdev_ops_t vdev_draid_ops = {
	.vdev_op_init = vdev_draid_init,
	.vdev_op_fini = vdev_draid_fini,
	.vdev_op_open = vdev_draid_open,
	.vdev_op_close = vdev_draid_close,
	.vdev_op_asize = vdev_draid_asize,
	.vdev_op_min_asize = vdev_draid_min_asize,
	.vdev_op_min_alloc = vdev_draid_min_alloc,
	.vdev_op_io_start = vdev_draid_io_start,
	.vdev_op_io_done = vdev_draid_io_done,
	.vdev_op_state_change = vdev_draid_state_change,
.vdev_op_need_resilver = vdev_draid_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_draid_xlate,
	.vdev_op_rebuild_asize = vdev_draid_rebuild_asize,
	.vdev_op_metaslab_init = vdev_draid_metaslab_init,
	.vdev_op_config_generate = vdev_draid_config_generate,
	.vdev_op_nparity = vdev_draid_nparity,
	.vdev_op_ndisks = vdev_draid_ndisks,
	.vdev_op_type = VDEV_TYPE_DRAID,
	.vdev_op_leaf = B_FALSE,
};


/*
 * A dRAID distributed spare is a virtual leaf vdev which is included in the
 * parent dRAID configuration.  The last N columns of the dRAID permutation
 * table are used to determine on which dRAID children a specific offset
 * should be written.  These spare leaf vdevs can only be used to replace
 * faulted children in the same dRAID configuration.
 */

/*
 * Distributed spare state.  vds_top_guid and vds_spare_id are assigned by
 * vdev_draid_spare_init() and are not modified afterwards in this file.
 * vds_draid_vdev is set by vdev_draid_spare_open() and reset to NULL by
 * vdev_draid_spare_close().
 */
typedef struct {
	vdev_t *vds_draid_vdev;		/* top-level parent dRAID vdev */
	uint64_t vds_top_guid;		/* top-level parent dRAID guid */
	uint64_t vds_spare_id;		/* spare id (0 - vdc->vdc_nspares-1) */
} vdev_draid_spare_t;

/*
 * Returns the parent dRAID vdev to which the distributed spare belongs.
 * This may be safely called even when the vdev is not open: when the cached
 * parent pointer is NULL the parent is looked up by vds_top_guid under the
 * root vdev, and the result of that lookup is returned as is.
 */
vdev_t *
vdev_draid_spare_get_parent(vdev_t *vd)
{
	vdev_draid_spare_t *vds = vd->vdev_tsd;

	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);

	if (vds->vds_draid_vdev != NULL)
		return (vds->vds_draid_vdev);

	return (vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev,
	    vds->vds_top_guid));
}

/*
 * A dRAID spare is active when it is the child of a vdev using the
 * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops.  A spare without a
 * parent is never active.
 */
static boolean_t
vdev_draid_spare_is_active(vdev_t *vd)
{
	vdev_t *pvd = vd->vdev_parent;

	if (pvd != NULL && (pvd->vdev_ops == &vdev_spare_ops ||
	    pvd->vdev_ops == &vdev_replacing_ops ||
	    pvd->vdev_ops == &vdev_draid_ops)) {
		return (B_TRUE);
	} else {
		return (B_FALSE);
	}
}

/*
 * Given a dRAID distributed spare vdev, returns the physical child vdev
 * on which the provided offset resides.  This may involve recursing through
 * multiple layers of distributed spares.  Note that offset is relative to
 * this vdev.  Returns NULL when the spare is closed.
 */
vdev_t *
vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset)
{
	vdev_draid_spare_t *vds = vd->vdev_tsd;

	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);

	/* The vdev is closed */
	if (vds->vds_draid_vdev == NULL)
		return (NULL);

	vdev_t *tvd = vds->vds_draid_vdev;
	vdev_draid_config_t *vdc = tvd->vdev_tsd;

	ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops);
	ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares);

	uint8_t *base;
	uint64_t iter;
	/* The permutation in use is selected by the device slice index. */
	uint64_t perm = physical_offset / vdc->vdc_devslicesz;

	vdev_draid_get_perm(vdc, perm, &base, &iter);

	/* Spare N is taken from the Nth column counting from the last. */
	uint64_t cid = vdev_draid_permute_id(vdc, base, iter,
	    (tvd->vdev_children - 1) - vds->vds_spare_id);
	vdev_t *cvd = tvd->vdev_child[cid];

	/* The selected child may itself be a distributed spare. */
	if (cvd->vdev_ops == &vdev_draid_spare_ops)
		return (vdev_draid_spare_get_child(cvd, physical_offset));

	return (cvd);
}

/*
 * Closing a distributed spare only drops the cached pointer to the parent
 * dRAID vdev.  vds_top_guid and vds_spare_id are left intact;
 * vdev_draid_spare_open() looks the parent up again by vds_top_guid.
 */
static void
vdev_draid_spare_close(vdev_t *vd)
{
	vdev_draid_spare_t *vds = vd->vdev_tsd;
	vds->vds_draid_vdev = NULL;
}

/*
 * Opening a dRAID spare device is done by looking up the associated dRAID
 * top-level vdev guid from the spare configuration.
*/
static int
vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	vdev_draid_spare_t *vds = vd->vdev_tsd;
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	uint64_t asize, max_asize;

	vdev_t *tvd = vdev_lookup_by_guid(rvd, vds->vds_top_guid);
	if (tvd == NULL) {
		/*
		 * When spa_vdev_add() is labeling new spares the
		 * associated dRAID is not attached to the root vdev
		 * nor does this spare have a parent.  Simulate a valid
		 * device in order to allow the label to be initialized
		 * and the distributed spare added to the configuration.
		 */
		if (vd->vdev_parent == NULL) {
			*psize = *max_psize = SPA_MINDEVSIZE;
			*logical_ashift = *physical_ashift = ASHIFT_MIN;
			return (0);
		}

		return (SET_ERROR(EINVAL));
	}

	/* The guid must resolve to a dRAID vdev that has a configuration. */
	vdev_draid_config_t *vdc = tvd->vdev_tsd;
	if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL)
		return (SET_ERROR(EINVAL));

	if (vds->vds_spare_id >= vdc->vdc_nspares)
		return (SET_ERROR(EINVAL));

	/*
	 * Neither tvd->vdev_asize nor tvd->vdev_max_asize can be used here
	 * because the caller may be vdev_draid_open() in which case the
	 * values are stale as they have not yet been updated by vdev_open().
	 * To avoid this always recalculate the dRAID asize and max_asize.
	 */
	vdev_draid_calculate_asize(tvd, &asize, &max_asize,
	    logical_ashift, physical_ashift);

	/* Report the sizes plus VDEV_LABEL_START_SIZE and _END_SIZE. */
	*psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	*max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;

	/* Cache the parent; vdev_draid_spare_close() resets this to NULL. */
	vds->vds_draid_vdev = tvd;

	return (0);
}

/*
 * Completed distributed spare IO.  Store the result in the parent zio
 * as if it had performed the operation itself.  Only the first error is
 * preserved if there are multiple errors.
 */
static void
vdev_draid_spare_child_done(zio_t *zio)
{
	zio_t *pio = zio->io_private;

	/*
	 * IOs are issued to non-writable vdevs in order to keep their
	 * DTLs accurate.  However, we don't want to propagate the
	 * error in to the distributed spare's DTL.
When resilvering
	 * vdev_draid_need_resilver() will consult the relevant DTL
	 * to determine if the data is missing and must be repaired.
	 */
	if (!vdev_writeable(zio->io_vd))
		return;

	/* Keep the first error recorded on the parent. */
	if (pio->io_error == 0)
		pio->io_error = zio->io_error;
}

/*
 * Returns a valid label nvlist for the distributed spare vdev.  This is
 * used to bypass the IO pipeline to avoid the complexity of constructing
 * a complete label with valid checksum to return when read.  The nvlist is
 * allocated here with fnvlist_alloc() and handed to the caller.
 */
nvlist_t *
vdev_draid_read_config_spare(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	spa_aux_vdev_t *sav = &spa->spa_spares;
	uint64_t guid = vd->vdev_guid;

	nvlist_t *nv = fnvlist_alloc();
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa));
	fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE,
	    vdev_draid_spare_is_active(vd) ?
	    POOL_STATE_ACTIVE : POOL_STATE_SPARE);

	/*
	 * Use the guid of the distributed spare with the same path in the
	 * spa spares list (sav_vdevs) when there is one.  Otherwise the
	 * guid of vd itself is reported.
	 */
	for (int i = 0; i < sav->sav_count; i++) {
		if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops &&
		    strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) {
			guid = sav->sav_vdevs[i]->vdev_guid;
			break;
		}
	}

	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid);

	return (nv);
}

/*
 * Handle any flush requested of the distributed spare.  All children must be
 * flushed.  Child results are reported through
 * vdev_draid_spare_child_done(); the value returned here is always 0.
 */
static int
vdev_draid_spare_flush(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	int error = 0;

	for (int c = 0; c < vd->vdev_children; c++) {
		zio_nowait(zio_vdev_child_io(zio, NULL,
		    vd->vdev_child[c], zio->io_offset, zio->io_abd,
		    zio->io_size, zio->io_type, zio->io_priority, 0,
		    vdev_draid_spare_child_done, zio));
	}

	return (error);
}

/*
 * Initiate an IO to the distributed spare.
For normal IOs this entails using
 * the zio->io_offset and permutation table to calculate which child dRAID vdev
 * is responsible for the data. Then passing along the zio to that child to
 * perform the actual IO. The label ranges are not stored on disk and require
 * some special handling which is described below.
 */
static void
vdev_draid_spare_io_start(zio_t *zio)
{
	vdev_t *cvd = NULL, *vd = zio->io_vd;
	vdev_draid_spare_t *vds = vd->vdev_tsd;
	/* Offset with VDEV_LABEL_START_SIZE removed; used for child IOs. */
	uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	/*
	 * NOTE(review): this tests vd->vdev_tsd, whereas
	 * vdev_draid_spare_close() only clears vds_draid_vdev.  A closed
	 * spare with a valid tsd is reported further down by
	 * vdev_draid_spare_get_child() returning NULL.
	 */
	if (vds == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_FLUSH:
		zio->io_error = vdev_draid_spare_flush(zio);
		break;

	case ZIO_TYPE_WRITE:
		if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
			/*
			 * Accept probe IOs and config writers to simulate the
			 * existence of an on disk label.  vdev_label_sync(),
			 * vdev_uberblock_sync() and vdev_copy_uberblocks()
			 * skip the distributed spares.  This only leaves
			 * vdev_label_init() which is allowed to succeed to
			 * avoid adding special cases to that function.
			 */
			if (zio->io_flags & ZIO_FLAG_PROBE ||
			    zio->io_flags & ZIO_FLAG_CONFIG_WRITER) {
				zio->io_error = 0;
			} else {
				zio->io_error = SET_ERROR(EIO);
			}
		} else {
			cvd = vdev_draid_spare_get_child(vd, offset);

			if (cvd == NULL) {
				zio->io_error = SET_ERROR(ENXIO);
			} else {
				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
				    offset, zio->io_abd, zio->io_size,
				    zio->io_type, zio->io_priority, 0,
				    vdev_draid_spare_child_done, zio));
			}
		}
		break;

	case ZIO_TYPE_READ:
		if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
			/*
			 * Accept probe IOs to simulate the existence of a
			 * label.  vdev_label_read_config() bypasses the
			 * pipeline to read the label configuration and
			 * vdev_uberblock_load() skips distributed spares
			 * when attempting to locate the best uberblock.
*/
			if (zio->io_flags & ZIO_FLAG_PROBE) {
				zio->io_error = 0;
			} else {
				zio->io_error = SET_ERROR(EIO);
			}
		} else {
			cvd = vdev_draid_spare_get_child(vd, offset);

			/* Unlike writes, reads also need a readable child. */
			if (cvd == NULL || !vdev_readable(cvd)) {
				zio->io_error = SET_ERROR(ENXIO);
			} else {
				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
				    offset, zio->io_abd, zio->io_size,
				    zio->io_type, zio->io_priority, 0,
				    vdev_draid_spare_child_done, zio));
			}
		}
		break;

	case ZIO_TYPE_TRIM:
		/* The vdev label ranges are never trimmed */
		ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset));

		cvd = vdev_draid_spare_get_child(vd, offset);

		/* Fail with ENXIO when the child has vdev_has_trim unset. */
		if (cvd == NULL || !cvd->vdev_has_trim) {
			zio->io_error = SET_ERROR(ENXIO);
		} else {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    offset, zio->io_abd, zio->io_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_draid_spare_child_done, zio));
		}
		break;

	default:
		/* Every other IO type is rejected. */
		zio->io_error = SET_ERROR(ENOTSUP);
		break;
	}

	zio_execute(zio);
}

/*
 * Nothing is done here on completion.  Child results are already folded
 * into the parent zio by vdev_draid_spare_child_done().
 */
static void
vdev_draid_spare_io_done(zio_t *zio)
{
	(void) zio;
}

/*
 * Lookup the full spare config in spa->spa_spares.sav_config and
 * return the top_guid and spare_id for the named spare.
*/
static int
vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp,
    uint64_t *spare_idp)
{
	nvlist_t **spares;
	uint_t nspares;
	int error;

	/*
	 * Return values:
	 *   0       a matching distributed spare was found and both
	 *           *top_guidp and *spare_idp were set
	 *   ENOENT  there is no spares config, or no distributed spare
	 *           with the same path as nv
	 *   EINVAL  nv has no path, or the matching spare lacks a
	 *           top guid or spare id
	 * The output arguments are only written on success.
	 */
	if ((spa->spa_spares.sav_config == NULL) ||
	    (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)) {
		return (SET_ERROR(ENOENT));
	}

	const char *spare_name;
	error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name);
	if (error != 0)
		return (SET_ERROR(EINVAL));

	for (int i = 0; i < nspares; i++) {
		nvlist_t *spare = spares[i];
		uint64_t top_guid, spare_id;
		const char *type, *path;

		/* Skip non-distributed spares */
		error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type);
		if (error != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0)
			continue;

		/* Skip spares with the wrong name */
		error = nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &path);
		if (error != 0 || strcmp(path, spare_name) != 0)
			continue;

		/* Found the matching spare */
		error = nvlist_lookup_uint64(spare,
		    ZPOOL_CONFIG_TOP_GUID, &top_guid);
		if (error == 0) {
			error = nvlist_lookup_uint64(spare,
			    ZPOOL_CONFIG_SPARE_ID, &spare_id);
		}

		if (error != 0) {
			return (SET_ERROR(EINVAL));
		} else {
			*top_guidp = top_guid;
			*spare_idp = spare_id;
			return (0);
		}
	}

	return (SET_ERROR(ENOENT));
}

/*
 * Initialize private dRAID spare specific fields from the nvlist.
 */
static int
vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	vdev_draid_spare_t *vds;
	uint64_t top_guid = 0;
	uint64_t spare_id;

	/*
	 * In the normal case check the list of spares stored in the spa
	 * to lookup the top_guid and spare_id for provided spare config.
	 * When creating a new pool or adding vdevs the spare list is not
	 * yet populated and the values are provided in the passed config.
*/
	if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID,
		    &top_guid) != 0)
			return (SET_ERROR(EINVAL));

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID,
		    &spare_id) != 0)
			return (SET_ERROR(EINVAL));
	}

	/* The parent pointer stays NULL until vdev_draid_spare_open(). */
	vds = kmem_alloc(sizeof (vdev_draid_spare_t), KM_SLEEP);
	vds->vds_draid_vdev = NULL;
	vds->vds_top_guid = top_guid;
	vds->vds_spare_id = spare_id;

	*tsd = vds;

	return (0);
}

/* Free the vdev_draid_spare_t allocated by vdev_draid_spare_init(). */
static void
vdev_draid_spare_fini(vdev_t *vd)
{
	kmem_free(vd->vdev_tsd, sizeof (vdev_draid_spare_t));
}

/* Add the top guid and spare id of the distributed spare to nv. */
static void
vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv)
{
	vdev_draid_spare_t *vds = vd->vdev_tsd;

	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);

	fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vds->vds_top_guid);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, vds->vds_spare_id);
}

/* Operations vector for the dRAID distributed spare (leaf) vdev type. */
vdev_ops_t vdev_draid_spare_ops = {
	.vdev_op_init = vdev_draid_spare_init,
	.vdev_op_fini = vdev_draid_spare_fini,
	.vdev_op_open = vdev_draid_spare_open,
	.vdev_op_close = vdev_draid_spare_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_draid_spare_io_start,
	.vdev_op_io_done = vdev_draid_spare_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_draid_spare_config_generate,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DRAID_SPARE,
	.vdev_op_leaf = B_TRUE,
};
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index 008e014ecfdc..f6e2662bd40f 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -1,833 +1,833 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016, 2024 by Delphix. All rights reserved.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Value that is written to disk during initialization.
 */
static uint64_t zfs_initialize_value = 0xdeadbeefdeadbeeeULL;

/* maximum number of I/Os outstanding per leaf vdev */
static const int zfs_initialize_limit = 1;

/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
static uint64_t zfs_initialize_chunk_size = 1024 * 1024;

/*
 * Returns B_TRUE when initializing of vd should stop: an exit was
 * requested, the vdev is not writeable or was detached, or its top-level
 * vdev has vdev_removing or vdev_rz_expanding set.
 */
static boolean_t
vdev_initialize_should_stop(vdev_t *vd)
{
	return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
	    vd->vdev_detached || vd->vdev_top->vdev_removing ||
	    vd->vdev_top->vdev_rz_expanding);
}

/*
 * Sync task that persists the initialize progress of one leaf vdev (last
 * offset, action time and state) in its leaf ZAP.  arg points to a
 * heap-allocated guid which is freed here.
 */
static void
vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * We pass in the guid instead of the vdev_t since the vdev may
	 * have been freed prior to the sync task being processed.  This
	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
	 * stop the initializing thread, schedule the sync task, and free
	 * the vdev. Later when the scheduled sync task is invoked, it would
	 * find that the vdev has been freed.
*/
	uint64_t guid = *(uint64_t *)arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing ||
	    !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding)
		return;

	/* Consume the offset recorded for this txg and reset the slot. */
	uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
	vd->vdev_initialize_offset[txg & TXG_MASK] = 0;

	VERIFY(vd->vdev_leaf_zap != 0);

	objset_t *mos = vd->vdev_spa->spa_meta_objset;

	if (last_offset > 0) {
		vd->vdev_initialize_last_offset = last_offset;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (last_offset), 1, &last_offset, tx));
	}
	if (vd->vdev_initialize_action_time > 0) {
		uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
		    1, &val, tx));
	}

	/* The state is always written, even when nothing else changed. */
	uint64_t initialize_state = vd->vdev_initialize_state;
	VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
	    VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
	    &initialize_state, tx));
}

/*
 * Sync task that removes the initialize entries (last offset, state and
 * action time) from the leaf ZAP of one vdev and clears the in-core last
 * offset and action time.  arg points to a heap-allocated guid which is
 * freed here.  Entries that do not exist are ignored.
 */
static void
vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t guid = *(uint64_t *)arg;

	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
		return;

	ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE);
	ASSERT3U(vd->vdev_leaf_zap, !=, 0);

	vd->vdev_initialize_last_offset = 0;
	vd->vdev_initialize_action_time = 0;

	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	int error;

	error = zap_remove(mos, vd->vdev_leaf_zap,
	    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, tx);
	VERIFY(error == 0 || error == ENOENT);

	error = zap_remove(mos, vd->vdev_leaf_zap,
	    VDEV_LEAF_ZAP_INITIALIZE_STATE, tx);
	VERIFY(error == 0 || error == ENOENT);

	error = zap_remove(mos, vd->vdev_leaf_zap,
	    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, tx);
	VERIFY(error == 0 || error == ENOENT);
}

/*
 * Move vd to new_state.  The caller must hold vdev_initialize_lock.  A
 * request for the current state is a no-op.  Otherwise the in-core state is
 * updated, a sync task is scheduled to update (or, for
 * VDEV_INITIALIZE_NONE, remove) the on-disk entries, a history record is
 * logged, and waiters are notified unless the new state is ACTIVE.
 */
static void
vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	spa_t *spa = vd->vdev_spa;

	if (new_state == vd->vdev_initialize_state)
		return;

	/*
	 * Copy the vd's guid, this will be freed by the sync task.
	 */
	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
	*guid = vd->vdev_guid;

	/*
	 * Record the time of this state change, unless we are leaving the
	 * SUSPENDED state, in which case the previously recorded time is
	 * preserved.
	 */
	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
		vd->vdev_initialize_action_time = gethrestime_sec();
	}

	vdev_initializing_state_t old_state = vd->vdev_initialize_state;
	vd->vdev_initialize_state = new_state;

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));

	if (new_state != VDEV_INITIALIZE_NONE) {
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_initialize_zap_update_sync, guid, tx);
	} else {
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_initialize_zap_remove_sync, guid, tx);
	}

	switch (new_state) {
	case VDEV_INITIALIZE_ACTIVE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s activated", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_SUSPENDED:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s suspended", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_CANCELED:
		/* Only log a cancel that interrupted an initialization. */
		if (old_state == VDEV_INITIALIZE_ACTIVE ||
		    old_state == VDEV_INITIALIZE_SUSPENDED)
			spa_history_log_internal(spa, "initialize", tx,
			    "vdev=%s canceled", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_COMPLETE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s complete", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_NONE:
		spa_history_log_internal(spa, "uninitialize", tx,
		    "vdev=%s", vd->vdev_path);
		break;
	default:
		panic("invalid state %llu", (unsigned long long)new_state);
	}

	dmu_tx_commit(tx);

	if (new_state != VDEV_INITIALIZE_ACTIVE)
		spa_notify_waiters(spa);
}

/*
 * Completion callback for the writes issued by vdev_initialize_write().
 * Updates the progress accounting, drops the in-flight count, wakes any
 * thread waiting on vdev_initialize_io_cv and releases the SCL_STATE_ALL
 * hold taken by the issuer.
 */
static void
vdev_initialize_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	mutex_enter(&vd->vdev_initialize_io_lock);
	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
		/*
		 * The
I/O failed because the vdev was unavailable; roll the
		 * last offset back. (This works because spa_sync waits on
		 * spa_txg_zio before it runs sync tasks.)
		 */
		uint64_t *off =
		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
		*off = MIN(*off, zio->io_offset);
	} else {
		/*
		 * Since initializing is best-effort, we ignore I/O errors and
		 * rely on vdev_probe to determine if the errors are more
		 * critical.
		 */
		if (zio->io_error != 0)
			vd->vdev_stat.vs_initialize_errors++;

		/* Failed writes are still counted as done. */
		vd->vdev_initialize_bytes_done += zio->io_orig_size;
	}
	ASSERT3U(vd->vdev_initialize_inflight, >, 0);
	vd->vdev_initialize_inflight--;
	cv_broadcast(&vd->vdev_initialize_io_cv);
	mutex_exit(&vd->vdev_initialize_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * Takes care of physical writing and limiting # of concurrent ZIOs.
 *
 * Blocks until fewer than zfs_initialize_limit writes are in flight for vd,
 * then issues one write of size bytes at offset start with data as the
 * source buffer.  Returns 0 once the write has been issued, or EINTR
 * (without issuing it) when vdev_initialize_should_stop() is true.
 */
static int
vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
{
	spa_t *spa = vd->vdev_spa;

	/* Limit inflight initializing I/Os */
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	vd->vdev_initialize_inflight++;
	mutex_exit(&vd->vdev_initialize_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	/* Released by vdev_initialize_cb(), or below on the EINTR path. */
	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_initialize_lock);

	if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
		/* The guid copy is freed by the sync task. */
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_initialize_zap_update_sync, guid, tx);
	}

	/*
	 * We know the vdev struct will still be around since all
	 * consumers of vdev_free must stop the initialization first.
*/
	if (vdev_initialize_should_stop(vd)) {
		/* Undo the in-flight accounting done above. */
		mutex_enter(&vd->vdev_initialize_io_lock);
		ASSERT3U(vd->vdev_initialize_inflight, >, 0);
		vd->vdev_initialize_inflight--;
		mutex_exit(&vd->vdev_initialize_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_initialize_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_initialize_lock);

	/*
	 * Remember how far this txg has written.  The value is picked up by
	 * vdev_initialize_zap_update_sync().
	 */
	vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
	zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
	    size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
	    ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
	/* vdev_initialize_cb releases SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}

/*
 * Callback to fill each ABD chunk with zfs_initialize_value. len must be
 * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
 * allocation will guarantee these for us.  Always returns 0.
 */
static int
vdev_initialize_block_fill(void *buf, size_t len, void *unused)
{
	(void) unused;

	ASSERT0(len % sizeof (uint64_t));
	for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
		*(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
	}
	return (0);
}

/*
 * Allocate an ABD of zfs_initialize_chunk_size bytes and fill it with
 * zfs_initialize_value.  Release it with vdev_initialize_block_free().
 */
static abd_t *
vdev_initialize_block_alloc(void)
{
	/* Allocate ABD for filler data */
	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);

	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
	    vdev_initialize_block_fill, NULL);

	return (data);
}

/* Free a filler buffer returned by vdev_initialize_block_alloc(). */
static void
vdev_initialize_block_free(abd_t *data)
{
	abd_free(data);
}

/*
 * Write the filler buffer over every segment in vd->vdev_initialize_tree.
 * Each segment is split into writes of at most zfs_initialize_chunk_size
 * bytes and offset by VDEV_LABEL_START_SIZE.  Returns the first error
 * reported by vdev_initialize_write(), or 0 when every write was issued.
 */
static int
vdev_initialize_ranges(vdev_t *vd, abd_t *data)
{
	zfs_range_tree_t *rt = vd->vdev_initialize_tree;
	zfs_btree_t *bt = &rt->rt_root;
	zfs_btree_index_t where;

	for (zfs_range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
	    rs = zfs_btree_next(bt, &where, &where)) {
		uint64_t size = zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt);

		/* Split range into legally-sized physical chunks */
		uint64_t writes_required =
		    ((size - 1) / zfs_initialize_chunk_size) + 1;

		for (uint64_t w = 0; w < writes_required; w++) {
			int error;

			/* The final chunk may be shorter than the rest. */
			error = vdev_initialize_write(vd,
			    VDEV_LABEL_START_SIZE + zfs_rs_get_start(rs, rt) +
			    (w * zfs_initialize_chunk_size),
			    MIN(size - (w * zfs_initialize_chunk_size),
			    zfs_initialize_chunk_size), data);
			if (error != 0)
				return (error);
		}
	}
	return (0);
}

/*
 * vdev_xlate_walk() callback.  arg points to a uint64_t that is raised to
 * the rs_end of physical_rs when that is larger.
 */
static void
-vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+vdev_initialize_xlate_last_rs_end(void *arg, zfs_range_seg64_t *physical_rs)
{
	uint64_t *last_rs_end = (uint64_t *)arg;

	if (physical_rs->rs_end > *last_rs_end)
		*last_rs_end = physical_rs->rs_end;
}

/*
 * vdev_xlate_walk() callback.  arg is the vdev.  Adds the size of
 * physical_rs to the byte estimate and credits the part of it that lies
 * below vdev_initialize_last_offset as already done.
 */
static void
-vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
+vdev_initialize_xlate_progress(void *arg, zfs_range_seg64_t *physical_rs)
{
	vdev_t *vd = (vdev_t *)arg;

	uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
	vd->vdev_initialize_bytes_est += size;

	/*
	 * NOTE(review): when vdev_initialize_last_offset equals rs_end
	 * neither branch below runs, so a segment that ends exactly at the
	 * last offset is not credited as done.  Confirm this is intended.
	 */
	if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
		vd->vdev_initialize_bytes_done += size;
	} else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
	    vd->vdev_initialize_last_offset < physical_rs->rs_end) {
		vd->vdev_initialize_bytes_done +=
		    vd->vdev_initialize_last_offset - physical_rs->rs_start;
	}
}

/*
 * Recompute vdev_initialize_bytes_est and vdev_initialize_bytes_done for vd
 * from scratch by visiting every metaslab of its top-level vdev.  The
 * caller must hold SCL_CONFIG as reader or writer.
 */
static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_initialize_bytes_est = 0;
	vd->vdev_initialize_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		/* Unallocated metaslab space divided by the ndisks count. */
		uint64_t ms_free = (msp->ms_size -
		    metaslab_allocated_space(msp)) /
		    vdev_get_ndisks(vd->vdev_top);

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
*/
-		range_seg64_t logical_rs, physical_rs, remain_rs;
+		zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;

		/* Metaslab space after this offset has not been initialized */
		vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
			/* Not started: count as estimate only. */
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/* Metaslab space before this offset has been initialized */
		uint64_t last_rs_end = physical_rs.rs_end;
		if (!vdev_xlate_is_empty(&remain_rs)) {
			/*
			 * The first translation did not cover the whole
			 * metaslab.  Walk the remainder to find the largest
			 * physical end offset.
			 */
			vdev_xlate_walk(vd, &remain_rs,
			    vdev_initialize_xlate_last_rs_end, &last_rs_end);
		}

		if (vd->vdev_initialize_last_offset > last_rs_end) {
			/* Fully done: count as both done and estimate. */
			vd->vdev_initialize_bytes_done += ms_free;
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of initializing this
		 * metaslab. Load it and walk the free tree for more accurate
		 * progress estimation.
*/
		VERIFY0(metaslab_load(msp));

		zfs_btree_index_t where;
		zfs_range_tree_t *rt = msp->ms_allocatable;
		for (zfs_range_seg_t *rs =
		    zfs_btree_first(&rt->rt_root, &where); rs;
		    rs = zfs_btree_next(&rt->rt_root, &where,
		    &where)) {
			/* Translate each allocatable segment and tally it. */
			logical_rs.rs_start = zfs_rs_get_start(rs, rt);
			logical_rs.rs_end = zfs_rs_get_end(rs, rt);

			vdev_xlate_walk(vd, &logical_rs,
			    vdev_initialize_xlate_progress, vd);
		}
		mutex_exit(&msp->ms_lock);
	}
}

/*
 * Load the last initialized offset of vd from its leaf ZAP when the state
 * is ACTIVE or SUSPENDED (a missing entry is treated as offset 0), then
 * recalculate the progress counters.  Returns 0 or the zap_lookup() error.
 * The caller must hold SCL_CONFIG as reader or writer.
 */
static int
vdev_initialize_load(vdev_t *vd)
{
	int err = 0;
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
	    vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (vd->vdev_initialize_last_offset), 1,
		    &vd->vdev_initialize_last_offset);
		if (err == ENOENT) {
			vd->vdev_initialize_last_offset = 0;
			err = 0;
		}
	}

	/* Progress is recalculated even when the lookup failed. */
	vdev_initialize_calculate_progress(vd);
	return (err);
}

/*
 * vdev_xlate_walk() callback.  arg is the vdev.  Adds physical_rs to
 * vd->vdev_initialize_tree, skipping segments that end at or before
 * vdev_initialize_last_offset and trimming the start of a segment that
 * straddles it.
 */
static void
-vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+vdev_initialize_xlate_range_add(void *arg, zfs_range_seg64_t *physical_rs)
{
	vdev_t *vd = arg;

	/* Only add segments that we have not visited yet */
	if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
		return;

	/* Pick up where we left off mid-range.
*/
	if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
		    "(%llu, %llu)", vd->vdev_path,
		    (u_longlong_t)physical_rs->rs_start,
		    (u_longlong_t)physical_rs->rs_end,
		    (u_longlong_t)vd->vdev_initialize_last_offset,
		    (u_longlong_t)physical_rs->rs_end);
		ASSERT3U(physical_rs->rs_end, >,
		    vd->vdev_initialize_last_offset);
		/* Note that this modifies the segment passed by the caller. */
		physical_rs->rs_start = vd->vdev_initialize_last_offset;
	}

	ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);

	zfs_range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
	    physical_rs->rs_end - physical_rs->rs_start);
}

/*
 * Convert the logical range into physical ranges and add them to the
 * range tree of the vdev (vd->vdev_initialize_tree).  arg is the leaf vdev.
 */
static void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
	vdev_t *vd = arg;
-	range_seg64_t logical_rs;
+	zfs_range_seg64_t logical_rs;
	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
}

/*
 * Body of the thread created by vdev_initialize().  arg is the leaf vdev.
 * For each metaslab of the top-level vdev the allocatable ranges are
 * translated to this vdev and overwritten with the filler pattern.  When
 * done, or when a write reports an error, the thread waits for in-flight
 * writes, updates the state and exits.
 */
static __attribute__((noreturn)) void
vdev_initialize_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	int error = 0;
	uint64_t ms_count = 0;

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	vd->vdev_initialize_last_offset = 0;
	VERIFY0(vdev_initialize_load(vd));

	/* One filler buffer is shared by every write of this thread. */
	abd_t *deadbeef = vdev_initialize_block_alloc();

	vd->vdev_initialize_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
	    NULL, 0, 0);

	for (uint64_t i = 0; !vd->vdev_detached &&
	    i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		boolean_t unload_when_done = B_FALSE;

		/*
		 * If we've expanded the top-level vdev or it's our
		 * first pass, calculate our progress.
*/ if (vd->vdev_top->vdev_ms_count != ms_count) { vdev_initialize_calculate_progress(vd); ms_count = vd->vdev_top->vdev_ms_count; } spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); mutex_enter(&msp->ms_lock); if (!msp->ms_loaded && !msp->ms_loading) unload_when_done = B_TRUE; VERIFY0(metaslab_load(msp)); zfs_range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, vd); mutex_exit(&msp->ms_lock); error = vdev_initialize_ranges(vd, deadbeef); metaslab_enable(msp, B_TRUE, unload_when_done); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); zfs_range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); if (error != 0) break; } spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_enter(&vd->vdev_initialize_io_lock); while (vd->vdev_initialize_inflight > 0) { cv_wait(&vd->vdev_initialize_io_cv, &vd->vdev_initialize_io_lock); } mutex_exit(&vd->vdev_initialize_io_lock); zfs_range_tree_destroy(vd->vdev_initialize_tree); vdev_initialize_block_free(deadbeef); vd->vdev_initialize_tree = NULL; mutex_enter(&vd->vdev_initialize_lock); if (!vd->vdev_initialize_exit_wanted) { if (vdev_writeable(vd)) { vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); } else if (vd->vdev_faulted) { vdev_initialize_change_state(vd, VDEV_INITIALIZE_CANCELED); } } ASSERT(vd->vdev_initialize_thread != NULL || vd->vdev_initialize_inflight == 0); /* * Drop the vdev_initialize_lock while we sync out the * txg since it's possible that a device might be trying to * come online and must check to see if it needs to restart an * initialization. That thread will be holding the spa_config_lock * which would prevent the txg_wait_synced from completing. */ mutex_exit(&vd->vdev_initialize_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&vd->vdev_initialize_lock); vd->vdev_initialize_thread = NULL; cv_broadcast(&vd->vdev_initialize_cv); mutex_exit(&vd->vdev_initialize_lock); thread_exit(); } /* * Initiates a device. Caller must hold vdev_initialize_lock. 
* Device must be a leaf and not already be initializing. */ void vdev_initialize(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); ASSERT(!vd->vdev_top->vdev_rz_expanding); /* Mark the vdev ACTIVE before the worker thread is created. */ vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); vd->vdev_initialize_thread = thread_create(NULL, 0, vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); } /* * Uninitializes a device by setting its initialize state to * VDEV_INITIALIZE_NONE. Caller must hold vdev_initialize_lock. * Device must be a leaf and must not have an initialize thread running. */ void vdev_uninitialize(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE); } /* * Wait for the initialize thread to be terminated (cancelled or stopped). * Caller must hold vdev_initialize_lock, which cv_wait() releases while * blocked. Clears vdev_initialize_exit_wanted once the thread is gone. */ static void vdev_initialize_stop_wait_impl(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); while (vd->vdev_initialize_thread != NULL) cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); ASSERT3P(vd->vdev_initialize_thread, ==, NULL); vd->vdev_initialize_exit_wanted = B_FALSE; } /* * Wait for the initialize thread of every vdev on vd_list to exit. Each * vdev is removed from the list before it is waited on, so vd_list is empty * on return. */ void vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) { (void) spa; vdev_t *vd; ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop_wait_impl(vd); mutex_exit(&vd->vdev_initialize_lock); } } /* * Stop initializing a device, with the resultant initializing state being * tgt_state. For blocking behavior pass NULL for vd_list.
Otherwise, when * a list_t is provided the stopping vdev is inserted into the list. Callers * are then required to call vdev_initialize_stop_wait() to block for all the * initialization threads to exit. The caller must hold vdev_initialize_lock * and must not be writing to the spa config, as the initializing thread may * try to enter the config as a reader before exiting. */ void vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, list_t *vd_list) { ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER)); ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); /* * Allow cancel requests to proceed even if the initialize thread * has stopped. */ if (vd->vdev_initialize_thread == NULL && tgt_state != VDEV_INITIALIZE_CANCELED) { return; } vdev_initialize_change_state(vd, tgt_state); vd->vdev_initialize_exit_wanted = B_TRUE; if (vd_list == NULL) { /* Blocking mode: wait here for the thread to exit. */ vdev_initialize_stop_wait_impl(vd); } else { /* Deferred mode: the caller waits later via vdev_initialize_stop_wait(). */ ASSERT(MUTEX_HELD(&spa_namespace_lock) || vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } /* * Recursively walk the vdev tree rooted at vd and call * vdev_initialize_stop() on every concrete leaf, taking that leaf's * vdev_initialize_lock around the call. */ static void vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state, list_t *vd_list) { if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop(vd, tgt_state, vd_list); mutex_exit(&vd->vdev_initialize_lock); return; } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state, vd_list); } } /* * Convenience function to stop initializing of a vdev tree and set all * initialize thread pointers to NULL.
*/
void
vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
{
	spa_t *spa = vd->vdev_spa;
	list_t vd_list;

	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
	    spa->spa_export_thread == curthread);

	list_create(&vd_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_initialize_node));

	/*
	 * Request a stop on every leaf first, then wait for all of the
	 * threads, so the threads can wind down concurrently.
	 */
	vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
	vdev_initialize_stop_wait(spa, &vd_list);

	if (vd->vdev_spa->spa_sync_on) {
		/* Make sure that our state has been synced to disk */
		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	}

	list_destroy(&vd_list);
}

/*
 * Reload the persisted initialize state and action time of vd from its
 * leaf ZAP (when it has one), then either load the saved progress for
 * reporting only (suspended or offline vdevs) or restart initialization
 * (ACTIVE, writeable, no thread running, top-level vdev neither being
 * removed nor expanding). Recurses into every child of vd.
 *
 * A missing ZAP entry (ENOENT) leaves the corresponding default in place:
 * VDEV_INITIALIZE_NONE for the state and 0 for the action time.
 */
void
vdev_initialize_restart(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
	    vd->vdev_spa->spa_load_thread == curthread);
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_leaf_zap != 0) {
		mutex_enter(&vd->vdev_initialize_lock);
		uint64_t initialize_state = VDEV_INITIALIZE_NONE;
		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
		    sizeof (initialize_state), 1, &initialize_state);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_state = initialize_state;

		uint64_t timestamp = 0;
		/* The value pointer must be the address of 'timestamp'. */
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
		    sizeof (timestamp), 1, &timestamp);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_action_time = timestamp;

		if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
		    vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) {
			/* load progress for reporting, but don't resume */
			VERIFY0(vdev_initialize_load(vd));
		} else if (vd->vdev_initialize_state ==
		    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
		    !vd->vdev_top->vdev_removing &&
		    !vd->vdev_top->vdev_rz_expanding &&
		    vd->vdev_initialize_thread == NULL) {
			vdev_initialize(vd);
		}

		mutex_exit(&vd->vdev_initialize_lock);
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_restart(vd->vdev_child[i]);
	}
}

EXPORT_SYMBOL(vdev_initialize);
EXPORT_SYMBOL(vdev_uninitialize);
EXPORT_SYMBOL(vdev_initialize_stop);
EXPORT_SYMBOL(vdev_initialize_stop_all); EXPORT_SYMBOL(vdev_initialize_stop_wait); EXPORT_SYMBOL(vdev_initialize_restart); ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, U64, ZMOD_RW, "Value written during zpool initialize"); ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, U64, ZMOD_RW, "Size in bytes of writes by zpool initialize"); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 9d12bc2eb0a2..2c4e0c1c4848 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1,2166 +1,2167 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ /* * Virtual Device Labels * --------------------- * * The vdev label serves several distinct purposes: * * 1. Uniquely identify this device as part of a ZFS pool and confirm its * identity within the pool. * * 2. Verify that all the devices given in a configuration are present * within the pool. * * 3. Determine the uberblock for the pool. * * 4. In case of an import operation, determine the configuration of the * toplevel vdev of which it is a part. * * 5. 
If an import operation cannot find all the devices in the pool, * provide enough information to the administrator to determine which * devices are missing. * * It is important to note that while the kernel is responsible for writing the * label, it only consumes the information in the first three cases. The * latter information is only consumed in userland when determining the * configuration to import a pool. * * * Label Organization * ------------------ * * Before describing the contents of the label, it's important to understand how * the labels are written and updated with respect to the uberblock. * * When the pool configuration is altered, either because it was newly created * or a device was added, we want to update all the labels such that we can deal * with fatal failure at any point. To this end, each disk has two labels which * are updated before and after the uberblock is synced. Assuming we have * labels and an uberblock with the following transaction groups: * * L1 UB L2 * +------+ +------+ +------+ * | | | | | | * | t10 | | t10 | | t10 | * | | | | | | * +------+ +------+ +------+ * * In this stable state, the labels and the uberblock were all updated within * the same transaction group (10). Each label is mirrored and checksummed, so * that we can detect when we fail partway through writing the label. * * In order to identify which labels are valid, the labels are written in the * following manner: * * 1. For each vdev, update 'L1' to the new label * 2. Update the uberblock * 3. For each vdev, update 'L2' to the new label * * Given arbitrary failure, we can determine the correct label to use based on * the transaction group. If we fail after updating L1 but before updating the * UB, we will notice that L1's transaction group is greater than the uberblock, * so L2 must be valid. If we fail after writing the uberblock but before * writing L2, we will notice that L2's transaction group is less than L1, and * therefore L1 is valid. 
* * Another added complexity is that not every label is updated when the config * is synced. If we add a single device, we do not want to have to re-write * every label for every device in the pool. This means that both L1 and L2 may * be older than the pool uberblock, because the necessary information is stored * on another vdev. * * * On-disk Format * -------------- * * The vdev label consists of two distinct parts, and is wrapped within the * vdev_label_t structure. The label includes 8k of padding to permit legacy * VTOC disk labels, but is otherwise ignored. * * The first half of the label is a packed nvlist which contains pool wide * properties, per-vdev properties, and configuration information. It is * described in more detail below. * * The latter half of the label consists of a redundant array of uberblocks. * These uberblocks are updated whenever a transaction group is committed, * or when the configuration is updated. When a pool is loaded, we scan each * vdev for the 'best' uberblock. * * * Configuration Information * ------------------------- * * The nvlist describing the pool and vdev contains the following elements: * * version ZFS on-disk version * name Pool name * state Pool state * txg Transaction group in which this label was written * pool_guid Unique identifier for this pool * vdev_tree An nvlist describing vdev tree. * features_for_read * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * * top_guid Unique ID for top-level vdev in which this is contained * guid Unique ID for the leaf vdev * * The 'vs' configuration follows the format described in 'spa_config.c'. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Basic routines to read and write from a vdev label. * Used throughout the rest of this file. 
*/ /* * Return the device offset of byte 'offset' within label 'l' for a device of * physical size 'psize'. Labels in the first half (l < VDEV_LABELS / 2) are * laid out back to back from offset 0; the remaining labels are placed so that * the last one ends at psize. psize must be a multiple of * sizeof (vdev_label_t). */ uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset) { ASSERT(offset < sizeof (vdev_label_t)); ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); } /* * Returns the index of the vdev label associated with the passed in offset, * or -1 when the computed index is not below VDEV_LABELS. Offsets at or * beyond psize - VDEV_LABEL_END_SIZE are first rebased so that they index the * second half of the labels. */ int vdev_label_number(uint64_t psize, uint64_t offset) { int l; if (offset >= psize - VDEV_LABEL_END_SIZE) { offset -= psize - VDEV_LABEL_END_SIZE; offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); } l = offset / sizeof (vdev_label_t); return (l < VDEV_LABELS ? l : -1); } /* * Issue, without waiting (zio_nowait), a physical read of 'size' bytes at * 'offset' within label 'l' of vd as a child of 'zio', checksummed with * ZIO_CHECKSUM_LABEL. The caller must hold SCL_STATE as reader or writer and * must pass ZIO_FLAG_CONFIG_WRITER in flags. */ static void vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_read_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); } /* * Write counterpart of vdev_label_read(): issue, without waiting, a physical * write of 'size' bytes at 'offset' within label 'l' of vd as a child of * 'zio'. The same locking and flag requirements apply. */ void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_write_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); } /* * Generate the nvlist representing this vdev's stats. The vdev_stat_t is * added to nv as a uint64 array under ZPOOL_CONFIG_VDEV_STATS, and the * extended stats (queue depths, latency and request-size histograms, slow I/O * and Direct I/O verify error counts) are added as a nested nvlist under * ZPOOL_CONFIG_VDEV_STATS_EX. */ void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) { nvlist_t *nvx; vdev_stat_t *vs; vdev_stat_ex_t *vsx; vs = kmem_alloc(sizeof (*vs), KM_SLEEP); vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP); vdev_get_stats_ex(vd, vs, vsx); fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t)); /* * Add extended stats into a special extended stats nvlist. This keeps * all the extended stats nicely grouped together. The extended stats * nvlist is then added to the main nvlist. */ nvx = fnvlist_alloc(); /* ZIOs in flight to disk */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_REBUILD]); /* ZIOs pending */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_REBUILD]); /* Histograms */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_READ], ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ])); 
fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_WRITE], ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, vsx->vsx_disk_histo[ZIO_TYPE_READ], ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, vsx->vsx_disk_histo[ZIO_TYPE_WRITE], ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD])); /* Request sizes */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE])); 
fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD])); /* IO delays */ 
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); /* Direct I/O write verify errors */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS, vs->vs_dio_verify_errors); /* Add extended stats nvlist to main nvlist */ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); /* fnvlist_add_nvlist() was given nvx above; release our copy and the scratch stat buffers. */ fnvlist_free(nvx); kmem_free(vs, sizeof (*vs)); kmem_free(vsx, sizeof (*vsx)); } /* * For the root vdev only, add pool-wide progress stats to nvl: scan, removal, * checkpoint and raidz expansion. Each is added as a uint64 array and only * when its spa_*_get_stats() call returns 0. Does nothing for any other vdev. */ static void root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) { spa_t *spa = vd->vdev_spa; if (vd != spa->spa_root_vdev) return; /* provide either current or previous scan information */ pool_scan_stat_t ps; if (spa_scan_get_stats(spa, &ps) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, sizeof (pool_scan_stat_t) / sizeof (uint64_t)); } pool_removal_stat_t prs; if (spa_removal_get_stats(spa, &prs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, sizeof (prs) / sizeof (uint64_t)); } pool_checkpoint_stat_t pcs; if (spa_checkpoint_get_stats(spa, &pcs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } pool_raidz_expand_stat_t pres; if (spa_raidz_expand_get_stats(spa, &pres) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, sizeof (pres) / sizeof (uint64_t)); } } /* * For a top-level vdev only, add its rebuild stats to nvl as a uint64 array * when vdev_rebuild_get_stats() returns 0. */ static void top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) { if (vd == vd->vdev_top) { vdev_rebuild_stat_t vrs; if (vdev_rebuild_get_stats(vd, &vrs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs, sizeof (vrs) / sizeof (uint64_t)); } } } /* * Generate the nvlist representing this vdev's config.
*/ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) { nvlist_t *nv = NULL; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; nv = fnvlist_alloc(); fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_path != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); if (vd->vdev_devid != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); if (vd->vdev_physpath != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vd->vdev_physpath); if (vd->vdev_enc_sysfs_path != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path); if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); if (vd->vdev_ops->vdev_op_config_generate != NULL) vd->vdev_ops->vdev_op_config_generate(vd, nv); if (vd->vdev_wholedisk != -1ULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); } if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); if (vd->vdev_isspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); if (flags & VDEV_CONFIG_L2CACHE) fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array); fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, vd->vdev_ms_shift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_noalloc) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, vd->vdev_noalloc); } /* * Slog devices are removed synchronously so don't * persist the vdev_removing flag to 
the label. */ if (vd->vdev_removing && !vd->vdev_islog) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } /* zpool command expects alloc class data */ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { const char *bias = NULL; switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: bias = VDEV_ALLOC_BIAS_LOG; break; case VDEV_BIAS_SPECIAL: bias = VDEV_ALLOC_BIAS_SPECIAL; break; case VDEV_BIAS_DEDUP: bias = VDEV_ALLOC_BIAS_DEDUP; break; default: ASSERT3U(vd->vdev_alloc_bias, ==, VDEV_BIAS_NONE); } fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, bias); } } if (vd->vdev_dtl_sm != NULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, space_map_object(vd->vdev_dtl_sm)); } if (vic->vic_mapping_object != 0) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, vic->vic_mapping_object); } if (vic->vic_births_object != 0) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, vic->vic_births_object); } if (vic->vic_prev_indirect_vdev != UINT64_MAX) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, vic->vic_prev_indirect_vdev); } if (vd->vdev_crtxg) fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); if (vd->vdev_expansion_time) fnvlist_add_uint64(nv, ZPOOL_CONFIG_EXPANSION_TIME, vd->vdev_expansion_time); if (flags & VDEV_CONFIG_MOS) { if (vd->vdev_leaf_zap != 0) { ASSERT(vd->vdev_ops->vdev_op_leaf); fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, vd->vdev_leaf_zap); } if (vd->vdev_top_zap != 0) { ASSERT(vd == vd->vdev_top); fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, vd->vdev_top_zap); } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap != 0 && spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, vd->vdev_root_zap); } if (vd->vdev_resilver_deferred) { ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(spa->spa_resilver_deferred); fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER); } } if (getstats) { vdev_config_generate_stats(vd, nv); 
root_vdev_actions_getprogress(vd, nv); top_vdev_actions_getprogress(vd, nv); /* * Note: this can be called from open context * (spa_get_stats()), so we need the rwlock to prevent * the mapping from being changed by condensing. */ rw_enter(&vd->vdev_indirect_rwlock, RW_READER); if (vd->vdev_indirect_mapping != NULL) { ASSERT(vd->vdev_indirect_births != NULL); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, vdev_indirect_mapping_size(vim)); } rw_exit(&vd->vdev_indirect_rwlock); if (vd->vdev_mg != NULL && vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { /* * Compute approximately how much memory would be used * for the indirect mapping if this device were to * be removed. * * Note: If the frag metric is invalid, then not * enough metaslabs have been converted to have * histograms. */ uint64_t seg_count = 0; uint64_t to_alloc = vd->vdev_stat.vs_alloc; /* * There are the same number of allocated segments * as free segments, so we will have at least one * entry per free segment. However, small free * segments (smaller than vdev_removal_max_span) * will be combined with adjacent allocated segments * as a single mapping. */ - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; + i++) { if (i + 1 < highbit64(vdev_removal_max_span) - 1) { to_alloc += vd->vdev_mg->mg_histogram[i] << (i + 1); } else { seg_count += vd->vdev_mg->mg_histogram[i]; } } /* * The maximum length of a mapping is * zfs_remove_max_segment, so we need at least one entry * per zfs_remove_max_segment of allocated data. 
*/ seg_count += to_alloc / spa_remove_max_segment(spa); fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, seg_count * sizeof (vdev_indirect_mapping_entry_phys_t)); } } if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; uint64_t c; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); for (c = 0; c < vd->vdev_children; c++) { child[c] = vdev_config_generate(spa, vd->vdev_child[c], getstats, flags); } fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, (const nvlist_t * const *)child, vd->vdev_children); for (c = 0; c < vd->vdev_children; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); } else { const char *aux = NULL; if (vd->vdev_offline && !vd->vdev_tmpoffline) fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); if (vd->vdev_resilver_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, vd->vdev_resilver_txg); if (vd->vdev_rebuild_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, vd->vdev_rebuild_txg); if (vd->vdev_faulted) fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); if (vd->vdev_degraded) fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); if (vd->vdev_removed) fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); if (vd->vdev_unspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); if (vd->vdev_ishole) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); /* Set the reason why we're FAULTED/DEGRADED. */ switch (vd->vdev_stat.vs_aux) { case VDEV_AUX_ERR_EXCEEDED: aux = "err_exceeded"; break; case VDEV_AUX_EXTERNAL: aux = "external"; break; } if (aux != NULL && !vd->vdev_tmpoffline) { fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); } else { /* * We're healthy - clear any previous AUX_STATE values. 
*/ if (nvlist_exists(nv, ZPOOL_CONFIG_AUX_STATE)) nvlist_remove_all(nv, ZPOOL_CONFIG_AUX_STATE); } if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, vd->vdev_orig_guid); } } return (nv); } /* * Generate a view of the top-level vdevs. If we currently have holes * in the namespace, then generate an array which contains a list of holey * vdevs (the child indices of the hole vdevs, stored under * ZPOOL_CONFIG_HOLE_ARRAY). Additionally, add the number of top-level * children that currently exist (ZPOOL_CONFIG_VDEV_CHILDREN). */ void vdev_top_config_generate(spa_t *spa, nvlist_t *config) { vdev_t *rvd = spa->spa_root_vdev; uint64_t *array; uint_t c, idx; /* Sized for the worst case in which every top-level child is a hole. */ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_ishole) { array[idx++] = c; } } if (idx) { VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, array, idx) == 0); } VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, rvd->vdev_children) == 0); kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } /* * Returns the configuration from the label of the given vdev. For vdevs * which don't have a txg value stored on their label (i.e. spares/cache) * or have not been completely initialized (txg = 0) just return * the configuration from the first valid label we find. Otherwise, * find the most up-to-date label that does not exceed the specified * 'txg' value. * * Returns NULL if the vdev is not readable or no acceptable label was found. * The returned nvlist is not freed here. */ nvlist_t * vdev_label_read_config(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp[VDEV_LABELS]; abd_t *vp_abd[VDEV_LABELS]; zio_t *zio[VDEV_LABELS]; uint64_t best_txg = 0; uint64_t label_txg = 0; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ASSERT(vd->vdev_validate_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (!vdev_readable(vd)) return (NULL); /* * The label for a dRAID distributed spare is not stored on disk.
* Instead it is generated when needed which allows us to bypass * the pipeline when reading the config from the label. */ if (vd->vdev_ops == &vdev_draid_spare_ops) return (vdev_draid_read_config_spare(vd)); for (int l = 0; l < VDEV_LABELS; l++) { vp_abd[l] = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp[l] = abd_to_buf(vp_abd[l]); } retry: /* Issue the reads for all labels first, then wait on each one below. */ for (int l = 0; l < VDEV_LABELS; l++) { zio[l] = zio_root(spa, NULL, NULL, flags); vdev_label_read(zio[l], vd, l, vp_abd[l], offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); } for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *label = NULL; if (zio_wait(zio[l]) == 0 && nvlist_unpack(vp[l]->vp_nvlist, sizeof (vp[l]->vp_nvlist), &label, 0) == 0) { /* * Auxiliary vdevs won't have txg values in their * labels and newly added vdevs may not have been * completely initialized so just return the * configuration from the first valid label we * encounter. */ error = nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg); if ((error || label_txg == 0) && !config) { /* Keep this label as the result and wait on the remaining zios before leaving the loop. */ config = label; for (l++; l < VDEV_LABELS; l++) zio_wait(zio[l]); break; } else if (label_txg <= txg && label_txg > best_txg) { best_txg = label_txg; nvlist_free(config); config = fnvlist_dup(label); } } if (label != NULL) { nvlist_free(label); label = NULL; } } /* Nothing usable was read; retry once with ZIO_FLAG_TRYHARD set. */ if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } /* * We found a valid label but it didn't pass txg restrictions. */ if (config == NULL && label_txg != 0) { vdev_dbgmsg(vd, "label discarded as txg is too large " "(%llu > %llu)", (u_longlong_t)label_txg, (u_longlong_t)txg); } for (int l = 0; l < VDEV_LABELS; l++) { abd_free(vp_abd[l]); } return (config); } /* * Determine if a device is in use. The 'spare_guid' parameter will be filled * in with the device guid if this spare is active elsewhere on the system.
 */
/*
 * Returns B_TRUE if the device is considered in use for the given 'reason',
 * and B_FALSE otherwise (including when no label can be read or the label
 * lacks the fields required below). 'crtxg' is compared against the label's
 * ZPOOL_CONFIG_CREATE_TXG. 'spare_guid' and 'l2cache_guid' may each be NULL;
 * when non-NULL they are first reset to 0 and then set to the device guid if
 * the device turns out to be a known spare or l2cache device respectively.
 */
static boolean_t
vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
    uint64_t *spare_guid, uint64_t *l2cache_guid)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t state, pool_guid, device_guid, txg, spare_pool;
	uint64_t vdtxg = 0;
	nvlist_t *label;

	if (spare_guid)
		*spare_guid = 0ULL;
	if (l2cache_guid)
		*l2cache_guid = 0ULL;

	/*
	 * Read the label, if any, and perform some basic sanity checks.
	 */
	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
		return (B_FALSE);

	/* Optional field: vdtxg keeps its initial 0 if the lookup fails. */
	(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
	    &vdtxg);

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
	    &state) != 0 ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
	    &device_guid) != 0) {
		nvlist_free(label);
		return (B_FALSE);
	}

	/*
	 * pool_guid and txg are only looked up here, and only read below,
	 * when the label state is neither SPARE nor L2CACHE.
	 */
	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
	    (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
	    &pool_guid) != 0 ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
	    &txg) != 0)) {
		nvlist_free(label);
		return (B_FALSE);
	}

	/* Everything needed has been copied out of the label. */
	nvlist_free(label);

	/*
	 * Check to see if this device indeed belongs to the pool it claims to
	 * be a part of. The only way this is allowed is if the device is a hot
	 * spare (which we check for later on).
	 */
	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
	    !spa_guid_exists(pool_guid, device_guid) &&
	    !spa_spare_exists(device_guid, NULL, NULL) &&
	    !spa_l2cache_exists(device_guid, NULL))
		return (B_FALSE);

	/*
	 * If the transaction group is zero, then this is an initialized (but
	 * unused) label. This is only an error if the create transaction
	 * on-disk is the same as the one we're using now, in which case the
	 * user has attempted to add the same vdev multiple times in the same
	 * transaction.
	 */
	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
	    txg == 0 && vdtxg == crtxg)
		return (B_TRUE);

	/*
	 * Check to see if this is a spare device. We do an explicit check for
	 * spa_has_spare() here because it may be on our pending list of spares
	 * to add.
	 */
	if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
	    spa_has_spare(spa, device_guid)) {
		if (spare_guid)
			*spare_guid = device_guid;

		/* Any other reason falls through to the checks below. */
		switch (reason) {
		case VDEV_LABEL_CREATE:
			return (B_TRUE);

		case VDEV_LABEL_REPLACE:
			return (!spa_has_spare(spa, device_guid) ||
			    spare_pool != 0ULL);

		case VDEV_LABEL_SPARE:
			return (spa_has_spare(spa, device_guid));
		default:
			break;
		}
	}

	/*
	 * Check to see if this is an l2cache device.
	 */
	if (spa_l2cache_exists(device_guid, NULL) ||
	    spa_has_l2cache(spa, device_guid)) {
		if (l2cache_guid)
			*l2cache_guid = device_guid;

		/* Any other reason falls through to the checks below. */
		switch (reason) {
		case VDEV_LABEL_CREATE:
			return (B_TRUE);

		case VDEV_LABEL_REPLACE:
			return (!spa_has_l2cache(spa, device_guid));

		case VDEV_LABEL_L2CACHE:
			return (spa_has_l2cache(spa, device_guid));
		default:
			break;
		}
	}

	/*
	 * We can't rely on a pool's state if it's been imported
	 * read-only. Instead we look to see if the pool is marked
	 * read-only in the namespace and set the state to active.
	 * Note that 'spa' is reassigned here to the pool found by guid.
	 */
	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
	    (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
	    spa_mode(spa) == SPA_MODE_READ)
		state = POOL_STATE_ACTIVE;

	/*
	 * If the device is marked ACTIVE, then this device is in use by another
	 * pool on the system.
	 */
	return (state == POOL_STATE_ACTIVE);
}

/*
 * Build the label nvlist for an auxiliary (spare or l2cache) vdev.
 * 'reason_spare' selects POOL_STATE_SPARE; otherwise POOL_STATE_L2CACHE is
 * recorded and the vdev's ashift is added as well. The returned nvlist is
 * newly allocated; the callers in this file release it with nvlist_free().
 */
static nvlist_t *
vdev_aux_label_generate(vdev_t *vd, boolean_t reason_spare)
{
	/*
	 * For inactive hot spares and level 2 ARC devices, we generate
	 * a special label that identifies as a mutually shared hot
	 * spare or l2cache device. We write the label in case of
	 * addition or removal of hot spare or l2cache vdev (in which
	 * case we want to revert the labels).
	 */
	nvlist_t *label = fnvlist_alloc();
	fnvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
	    spa_version(vd->vdev_spa));
	fnvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, reason_spare ?
	    POOL_STATE_SPARE : POOL_STATE_L2CACHE);
	fnvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid);

	/*
	 * This is merely to facilitate reporting the ashift of the
	 * cache device through zdb.
 The actual retrieval of the
	 * ashift (in vdev_alloc()) uses the nvlist
	 * spa->spa_l2cache->sav_config (populated in
	 * spa_ld_open_aux_vdevs()).
	 */
	if (!reason_spare)
		fnvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);

	/*
	 * Add path information to help find it during pool import
	 */
	if (vd->vdev_path != NULL)
		fnvlist_add_string(label, ZPOOL_CONFIG_PATH, vd->vdev_path);
	if (vd->vdev_devid != NULL)
		fnvlist_add_string(label, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
	if (vd->vdev_physpath != NULL) {
		fnvlist_add_string(label, ZPOOL_CONFIG_PHYS_PATH,
		    vd->vdev_physpath);
	}
	return (label);
}

/*
 * Initialize a vdev label. We check to make sure each leaf device is not in
 * use, and writable. We put down an initial label which we will later
 * overwrite with a complete label. Note that it's important to do this
 * sequentially, not in parallel, so that we catch cases of multiple use of the
 * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
 * itself.
 *
 * Returns 0 on success (also for interior vdevs and non-writeable pools),
 * the first non-zero error returned for a child, EIO for a dead leaf, EBUSY
 * if vdev_inuse() reports the leaf as in use, ENAMETOOLONG or EINVAL if the
 * label cannot be packed, or the error from writing the labels.
 */
int
vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
{
	spa_t *spa = vd->vdev_spa;
	nvlist_t *label;
	vdev_phys_t *vp;
	abd_t *vp_abd;
	abd_t *bootenv;
	uberblock_t *ub;
	abd_t *ub_abd;
	zio_t *zio;
	char *buf;
	size_t buflen;
	int error;
	uint64_t spare_guid = 0, l2cache_guid = 0;
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	/* A REMOVE request inherits the aux role the vdev currently has. */
	boolean_t reason_spare = (reason == VDEV_LABEL_SPARE || (reason ==
	    VDEV_LABEL_REMOVE && vd->vdev_isspare));
	boolean_t reason_l2cache = (reason == VDEV_LABEL_L2CACHE || (reason ==
	    VDEV_LABEL_REMOVE && vd->vdev_isl2cache));

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/* Children first, one at a time; stop at the first failure. */
	for (int c = 0; c < vd->vdev_children; c++)
		if ((error = vdev_label_init(vd->vdev_child[c],
		    crtxg, reason)) != 0)
			return (error);

	/* Track the creation time for this vdev */
	vd->vdev_crtxg = crtxg;

	/* Only leaves of a writeable pool get labels written below. */
	if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
		return (0);

	/*
	 * Dead vdevs cannot be initialized.
	 */
	if (vdev_is_dead(vd))
		return (SET_ERROR(EIO));

	/*
	 * Determine if the vdev is in use.
	 */
	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
	    vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
		return (SET_ERROR(EBUSY));

	/*
	 * If this is a request to add or replace a spare or l2cache device
	 * that is in use elsewhere on the system, then we must update the
	 * guid (which was initialized to a random value) to reflect the
	 * actual GUID (which is shared between multiple pools).
	 */
	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
	    spare_guid != 0ULL) {
		uint64_t guid_delta = spare_guid - vd->vdev_guid;

		vd->vdev_guid += guid_delta;

		/* Apply the same delta to the guid sum of vd and ancestors. */
		for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
			pvd->vdev_guid_sum += guid_delta;

		/*
		 * If this is a replacement, then we want to fallthrough to the
		 * rest of the code. If we're adding a spare, then it's already
		 * labeled appropriately and we can just return.
		 */
		if (reason == VDEV_LABEL_SPARE)
			return (0);
		ASSERT(reason == VDEV_LABEL_REPLACE ||
		    reason == VDEV_LABEL_SPLIT);
	}

	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
	    l2cache_guid != 0ULL) {
		uint64_t guid_delta = l2cache_guid - vd->vdev_guid;

		vd->vdev_guid += guid_delta;

		/* Apply the same delta to the guid sum of vd and ancestors. */
		for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
			pvd->vdev_guid_sum += guid_delta;

		/*
		 * If this is a replacement, then we want to fallthrough to the
		 * rest of the code. If we're adding an l2cache, then it's
		 * already labeled appropriately and we can just return.
		 */
		if (reason == VDEV_LABEL_L2CACHE)
			return (0);
		ASSERT(reason == VDEV_LABEL_REPLACE);
	}

	/*
	 * Initialize its label.
	 */
	vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
	abd_zero(vp_abd, sizeof (vdev_phys_t));
	vp = abd_to_buf(vp_abd);

	/*
	 * Generate a label describing the pool and our top-level vdev.
	 * We mark it as being from txg 0 to indicate that it's not
	 * really part of an active pool just yet. The labels will
	 * be written again with a meaningful txg by spa_sync().
	 */
	if (reason_spare || reason_l2cache) {
		label = vdev_aux_label_generate(vd, reason_spare);

		/*
		 * When spare or l2cache (aux) vdev is added during pool
		 * creation, spa->spa_uberblock is not written until this
		 * point. Write it on next config sync.
		 */
		if (uberblock_verify(&spa->spa_uberblock))
			spa->spa_aux_sync_uber = B_TRUE;
	} else {
		uint64_t txg = 0ULL;

		/* A split is the one case labeled with the current txg. */
		if (reason == VDEV_LABEL_SPLIT)
			txg = spa->spa_uberblock.ub_txg;
		label = spa_config_generate(spa, vd, txg, B_FALSE);

		/*
		 * Add our creation time. This allows us to detect multiple
		 * vdev uses as described above, and automatically expires if we
		 * fail.
		 */
		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
		    crtxg) == 0);
	}

	/* Pack the nvlist directly into the vdev_phys_t buffer. */
	buf = vp->vp_nvlist;
	buflen = sizeof (vp->vp_nvlist);

	error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
	if (error != 0) {
		nvlist_free(label);
		abd_free(vp_abd);
		/* EFAULT means nvlist_pack ran out of room */
		return (SET_ERROR(error == EFAULT ? ENAMETOOLONG : EINVAL));
	}

	/*
	 * Initialize uberblock template.
	 */
	ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
	abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
	abd_zero_off(ub_abd, sizeof (uberblock_t),
	    VDEV_UBERBLOCK_RING - sizeof (uberblock_t));
	ub = abd_to_buf(ub_abd);
	/* The template carries txg 0, like the label generated above. */
	ub->ub_txg = 0;

	/* Initialize the 2nd padding area. */
	bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
	abd_zero(bootenv, VDEV_PAD_SIZE);

	/*
	 * Write everything in parallel.
	 */
retry:
	zio = zio_root(spa, NULL, NULL, flags);

	for (int l = 0; l < VDEV_LABELS; l++) {

		vdev_label_write(zio, vd, l, vp_abd,
		    offsetof(vdev_label_t, vl_vdev_phys),
		    sizeof (vdev_phys_t), NULL, NULL, flags);

		/*
		 * Skip the 1st padding area.
		 * Zero out the 2nd padding area where it might have
		 * left over data from previous filesystem format.
		 */
		vdev_label_write(zio, vd, l, bootenv,
		    offsetof(vdev_label_t, vl_be),
		    VDEV_PAD_SIZE, NULL, NULL, flags);

		vdev_label_write(zio, vd, l, ub_abd,
		    offsetof(vdev_label_t, vl_uberblock),
		    VDEV_UBERBLOCK_RING, NULL, NULL, flags);
	}

	error = zio_wait(zio);

	/* On failure, repeat all writes exactly once with TRYHARD set. */
	if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
		flags |= ZIO_FLAG_TRYHARD;
		goto retry;
	}

	nvlist_free(label);
	abd_free(bootenv);
	abd_free(ub_abd);
	abd_free(vp_abd);

	/*
	 * If this vdev hasn't been previously identified as a spare, then we
	 * mark it as such only if a) we are labeling it as a spare, or b) it
	 * exists as a spare elsewhere in the system. Do the same for
	 * level 2 ARC devices.
	 */
	if (error == 0 && !vd->vdev_isspare &&
	    (reason == VDEV_LABEL_SPARE ||
	    spa_spare_exists(vd->vdev_guid, NULL, NULL)))
		spa_spare_add(vd);

	if (error == 0 && !vd->vdev_isl2cache &&
	    (reason == VDEV_LABEL_L2CACHE ||
	    spa_l2cache_exists(vd->vdev_guid, NULL)))
		spa_l2cache_add(vd);

	return (error);
}

/*
 * Done callback for vdev_label_read_bootenv_impl. If this is the first
 * callback to finish, store our abd in the callback pointer. Otherwise, we
 * just free our abd and return.
 */
static void
vdev_label_read_bootenv_done(zio_t *zio)
{
	/* io_private is the root zio, whose own io_private is the abd slot. */
	zio_t *rio = zio->io_private;
	abd_t **cbp = rio->io_private;

	ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE);

	if (zio->io_error == 0) {
		/* The root zio's lock serializes access to the shared slot. */
		mutex_enter(&rio->io_lock);
		if (*cbp == NULL) {
			/* Will free this buffer in vdev_label_read_bootenv. */
			*cbp = zio->io_abd;
		} else {
			abd_free(zio->io_abd);
		}
		mutex_exit(&rio->io_lock);
	} else {
		abd_free(zio->io_abd);
	}
}

/*
 * Recursively issue reads of the boot environment area (vl_be) of every
 * label of every readable leaf below 'vd', as children of 'zio'. Each read
 * gets its own buffer, which vdev_label_read_bootenv_done() either hands to
 * the caller or frees.
 */
static void
vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags);

	/*
	 * We just use the first label that has a correct checksum; the
	 * bootloader should have rewritten them all to be the same on boot,
	 * and any changes we made since boot have been the same across all
	 * labels.
 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
		for (int l = 0; l < VDEV_LABELS; l++) {
			vdev_label_read(zio, vd, l,
			    abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE),
			    offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE,
			    vdev_label_read_bootenv_done, zio, flags);
		}
	}
}

/*
 * Read the boot environment block from the labels below 'rvd' and add its
 * contents to the caller-supplied nvlist 'bootenv'.
 *
 * Returns 0 if any label's boot environment area was read successfully,
 * otherwise the error returned by zio_wait() on the root zio.
 */
int
vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv)
{
	nvlist_t *config;
	spa_t *spa = rvd->vdev_spa;
	/* Filled in by vdev_label_read_bootenv_done() via zio->io_private. */
	abd_t *abd = NULL;
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;

	ASSERT(bootenv);
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	zio_t *zio = zio_root(spa, NULL, &abd, flags);
	vdev_label_read_bootenv_impl(zio, rvd, flags);
	int err = zio_wait(zio);

	if (abd != NULL) {
		char *buf;
		vdev_boot_envblock_t *vbe = abd_to_buf(abd);

		/* Convert the version field in place before dispatching. */
		vbe->vbe_version = ntohll(vbe->vbe_version);
		switch (vbe->vbe_version) {
		case VB_RAW:
			/*
			 * if we have textual data in vbe_bootenv, create nvlist
			 * with key "envmap".
			 */
			fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW);
			/* Force termination before treating it as a string. */
			vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
			fnvlist_add_string(bootenv, GRUB_ENVMAP,
			    vbe->vbe_bootenv);
			break;

		case VB_NVLIST:
			err = nvlist_unpack(vbe->vbe_bootenv,
			    sizeof (vbe->vbe_bootenv), &config, 0);
			if (err == 0) {
				fnvlist_merge(bootenv, config);
				nvlist_free(config);
				break;
			}
			/* An unpackable nvlist is handled like "default". */
			zfs_fallthrough;
		default:
			/* Check for FreeBSD zfs bootonce command string */
			buf = abd_to_buf(abd);
			if (*buf == '\0') {
				/* Empty block: report only the version. */
				fnvlist_add_uint64(bootenv, BOOTENV_VERSION,
				    VB_NVLIST);
				break;
			}
			fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf);
		}

		/*
		 * abd was allocated in vdev_label_read_bootenv_impl()
		 */
		abd_free(abd);
		/*
		 * If we managed to read any successfully,
		 * return success.
*/ return (0); } return (err); } int vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) { zio_t *zio; spa_t *spa = vd->vdev_spa; vdev_boot_envblock_t *bootenv; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; int error; size_t nvsize; char *nvbuf; const char *tmp; error = nvlist_size(env, &nvsize, NV_ENCODE_XDR); if (error != 0) return (SET_ERROR(error)); if (nvsize >= sizeof (bootenv->vbe_bootenv)) { return (SET_ERROR(E2BIG)); } ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); error = ENXIO; for (int c = 0; c < vd->vdev_children; c++) { int child_err; child_err = vdev_label_write_bootenv(vd->vdev_child[c], env); /* * As long as any of the disks managed to write all of their * labels successfully, return success. */ if (child_err == 0) error = child_err; } if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) || !vdev_writeable(vd)) { return (error); } ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); abd_zero(abd, VDEV_PAD_SIZE); bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); nvbuf = bootenv->vbe_bootenv; nvsize = sizeof (bootenv->vbe_bootenv); bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION); switch (bootenv->vbe_version) { case VB_RAW: if (nvlist_lookup_string(env, GRUB_ENVMAP, &tmp) == 0) { (void) strlcpy(bootenv->vbe_bootenv, tmp, nvsize); } error = 0; break; case VB_NVLIST: error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR, KM_SLEEP); break; default: error = EINVAL; break; } if (error == 0) { bootenv->vbe_version = htonll(bootenv->vbe_version); abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); } else { abd_free(abd); return (SET_ERROR(error)); } retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, abd, offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); } error = zio_wait(zio); if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } abd_free(abd); return 
(error); } /* * ========================================================================== * uberblock load/sync * ========================================================================== */ /* * Consider the following situation: txg is safely synced to disk. We've * written the first uberblock for txg + 1, and then we lose power. When we * come back up, we fail to see the uberblock for txg + 1 because, say, * it was on a mirrored device and the replica to which we wrote txg + 1 * is now offline. If we then make some changes and sync txg + 1, and then * the missing replica comes back, then for a few seconds we'll have two * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. */ static int vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg); if (likely(cmp)) return (cmp); cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp); if (likely(cmp)) return (cmp); /* * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware * ZFS, e.g. OpenZFS >= 0.7. * * If one ub has MMP and the other does not, they were written by * different hosts, which matters for MMP. So we treat no MMP/no SEQ as * a 0 value. * * Since timestamp and txg are the same if we get this far, either is * acceptable for importing the pool. 
 */
	unsigned int seq1 = 0;
	unsigned int seq2 = 0;

	if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
		seq1 = MMP_SEQ(ub1);

	if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
		seq2 = MMP_SEQ(ub2);

	return (TREE_CMP(seq1, seq2));
}

/*
 * State shared by all uberblock reads issued by vdev_uberblock_load(). It is
 * reached through the root zio's io_private and updated under the root zio's
 * io_lock in vdev_uberblock_load_done().
 */
struct ubl_cbdata {
	uberblock_t	ubl_latest;	/* Most recent uberblock */
	uberblock_t	*ubl_ubbest;	/* Best uberblock (w/r/t max_txg) */
	vdev_t		*ubl_vd;	/* vdev associated with the above */
};

/*
 * Done callback for a single uberblock slot read. A slot that was read
 * without error and passes uberblock_verify() is compared against the
 * candidates collected so far. The read buffer is always freed here.
 */
static void
vdev_uberblock_load_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	spa_t *spa = zio->io_spa;
	/* io_private is the root zio; its io_private is the ubl_cbdata. */
	zio_t *rio = zio->io_private;
	uberblock_t *ub = abd_to_buf(zio->io_abd);
	struct ubl_cbdata *cbp = rio->io_private;

	ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));

	if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
		mutex_enter(&rio->io_lock);
		/* ubl_latest ignores spa_load_max_txg; ubl_ubbest honors it. */
		if (vdev_uberblock_compare(ub, &cbp->ubl_latest) > 0) {
			cbp->ubl_latest = *ub;
		}
		if (ub->ub_txg <= spa->spa_load_max_txg &&
		    vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
			/*
			 * Keep track of the vdev in which this uberblock
			 * was found. We will use this information later
			 * to obtain the config nvlist associated with
			 * this uberblock.
			 */
			*cbp->ubl_ubbest = *ub;
			cbp->ubl_vd = vd;
		}
		mutex_exit(&rio->io_lock);
	}

	abd_free(zio->io_abd);
}

/*
 * Recursively issue a read for every uberblock slot of every label of every
 * readable leaf below 'vd', skipping dRAID distributed spares. Each read is
 * a child of 'zio', uses its own buffer and completes in
 * vdev_uberblock_load_done(). 'cbp' is only passed down the recursion; the
 * callback obtains the shared state through the root zio instead.
 */
static void
vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
    struct ubl_cbdata *cbp)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);

	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) &&
	    vd->vdev_ops != &vdev_draid_spare_ops) {
		for (int l = 0; l < VDEV_LABELS; l++) {
			for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
				vdev_label_read(zio, vd, l,
				    abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
				    B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
				    VDEV_UBERBLOCK_SIZE(vd),
				    vdev_uberblock_load_done, zio, flags);
			}
		}
	}
}

/*
 * Reads the 'best' uberblock from disk along with its associated
 * configuration. First, we read the uberblock array of each label of each
 * vdev, keeping track of the uberblock with the highest txg in each array.
* Then, we read the configuration from the same vdev as the best uberblock. */ void vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) { zio_t *zio; spa_t *spa = rvd->vdev_spa; struct ubl_cbdata cb; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; ASSERT(ub); ASSERT(config); memset(ub, 0, sizeof (uberblock_t)); memset(&cb, 0, sizeof (cb)); *config = NULL; cb.ubl_ubbest = ub; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); vdev_uberblock_load_impl(zio, rvd, flags, &cb); (void) zio_wait(zio); /* * It's possible that the best uberblock was discovered on a label * that has a configuration which was written in a future txg. * Search all labels on this vdev to find the configuration that * matches the txg for our uberblock. */ if (cb.ubl_vd != NULL) { vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); if (ub->ub_raidz_reflow_info != cb.ubl_latest.ub_raidz_reflow_info) { vdev_dbgmsg(cb.ubl_vd, "spa=%s best uberblock (txg=%llu info=0x%llx) " "has different raidz_reflow_info than latest " "uberblock (txg=%llu info=0x%llx)", spa->spa_name, (u_longlong_t)ub->ub_txg, (u_longlong_t)ub->ub_raidz_reflow_info, (u_longlong_t)cb.ubl_latest.ub_txg, (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info); memset(ub, 0, sizeof (uberblock_t)); spa_config_exit(spa, SCL_ALL, FTAG); return; } *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " "Trying again without txg restrictions."); *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX); } if (*config == NULL) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config"); } } spa_config_exit(spa, SCL_ALL, FTAG); } /* * For use when a leaf vdev is expanded. 
* The location of labels 2 and 3 changed, and at the new location the * uberblock rings are either empty or contain garbage. The sync will write * new configs there because the vdev is dirty, but expansion also needs the * uberblock rings copied. Read them from label 0 which did not move. * * Since the point is to populate labels {2,3} with valid uberblocks, * we zero uberblocks we fail to read or which are not valid. */ static void vdev_copy_uberblocks(vdev_t *vd) { abd_t *ub_abd; zio_t *write_zio; int locks = (SCL_L2ARC | SCL_ZIO); int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_READER) == SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); /* * No uberblocks are stored on distributed spares, they may be * safely skipped when expanding a leaf vdev. */ if (vd->vdev_ops == &vdev_draid_spare_ops) return; spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); write_zio = zio_root(vd->vdev_spa, NULL, NULL, flags); for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { const int src_label = 0; zio_t *zio; zio = zio_root(vd->vdev_spa, NULL, NULL, flags); vdev_label_read(zio, vd, src_label, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); if (zio_wait(zio) || uberblock_verify(abd_to_buf(ub_abd))) abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); for (int l = 2; l < VDEV_LABELS; l++) vdev_label_write(write_zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags | ZIO_FLAG_DONT_PROPAGATE); } (void) zio_wait(write_zio); spa_config_exit(vd->vdev_spa, locks, FTAG); abd_free(ub_abd); } /* * On success, increment root zio's count of good writes. * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). 
 */
static void
vdev_uberblock_sync_done(zio_t *zio)
{
	/* io_private is the shared counter handed to vdev_label_write(). */
	uint64_t *good_writes = zio->io_private;

	/* Only writes to a vdev whose top-level has an ms_array count. */
	if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
		atomic_inc_64(good_writes);
}

/*
 * Write the uberblock to all labels of all leaves of the specified vdev.
 *
 * zio:         parent zio for the issued writes.
 * good_writes: counter incremented by vdev_uberblock_sync_done().
 * ub:          uberblock to write; its txg selects the ring slot.
 * vd:          vdev subtree to walk.
 * flags:       zio flags; ZIO_FLAG_DONT_PROPAGATE is added to each write.
 */
static void
vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
    uberblock_t *ub, vdev_t *vd, int flags)
{
	for (uint64_t c = 0; c < vd->vdev_children; c++) {
		vdev_uberblock_sync(zio, good_writes,
		    ub, vd->vdev_child[c], flags);
	}

	if (!vd->vdev_ops->vdev_op_leaf)
		return;

	if (!vdev_writeable(vd))
		return;

	/*
	 * There's no need to write uberblocks to a distributed spare, they
	 * are already stored on all the leaves of the parent dRAID. For
	 * this same reason vdev_uberblock_load_impl() skips distributed
	 * spares when reading uberblocks.
	 */
	if (vd->vdev_ops == &vdev_draid_spare_ops)
		return;

	/* If the vdev was expanded, need to copy uberblock rings. */
	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
	    vd->vdev_copy_uberblocks == B_TRUE) {
		vdev_copy_uberblocks(vd);
		vd->vdev_copy_uberblocks = B_FALSE;
	}

	/*
	 * We chose a slot based on the txg. If this uberblock has a special
	 * RAIDZ expansion state, then it is essentially an update of the
	 * current uberblock (it has the same txg). However, the current
	 * state is committed, so we want to write it to a different slot. If
	 * we overwrote the same slot, and we lose power during the uberblock
	 * write, and the disk does not do single-sector overwrites
	 * atomically (even though it is required to - i.e. we should see
	 * either the old or the new uberblock), then we could lose this
	 * txg's uberblock. Rewinding to the previous txg's uberblock may not
	 * be possible because RAIDZ expansion may have already overwritten
	 * some of the data, so we need the progress indicator in the
	 * uberblock.
	 */
	/* With multihost enabled, m slots are left out of the txg rotation. */
	int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0;
	/* In the RRSS_SCRATCH_VALID state the slot index is based on txg - 1. */
	int n = (ub->ub_txg - (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) %
	    (VDEV_UBERBLOCK_COUNT(vd) - m);

	/* Copy the uberblock_t into the ABD */
	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
	/* Zero the remainder of the slot beyond the uberblock_t itself. */
	abd_zero_off(ub_abd, sizeof (uberblock_t),
	    VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t));

	for (int l = 0; l < VDEV_LABELS; l++)
		vdev_label_write(zio, vd, l, ub_abd,
		    VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
		    vdev_uberblock_sync_done, good_writes,
		    flags | ZIO_FLAG_DONT_PROPAGATE);

	/*
	 * NOTE(review): the buffer is released right after the writes are
	 * issued; presumably the in-flight zios keep it alive -- confirm
	 * against the ABD/zio implementation.
	 */
	abd_free(ub_abd);
}

/* Sync the uberblocks to all vdevs in svd[] */
int
vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
{
	spa_t *spa = svd[0]->vdev_spa;
	zio_t *zio;
	uint64_t good_writes = 0;

	zio = zio_root(spa, NULL, NULL, flags);

	for (int v = 0; v < svdcount; v++)
		vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);

	/* Aux vdevs are only included when spa_aux_sync_uber is set. */
	if (spa->spa_aux_sync_uber) {
		for (int v = 0; v < spa->spa_spares.sav_count; v++) {
			vdev_uberblock_sync(zio, &good_writes, ub,
			    spa->spa_spares.sav_vdevs[v], flags);
		}
		for (int v = 0; v < spa->spa_l2cache.sav_count; v++) {
			vdev_uberblock_sync(zio, &good_writes, ub,
			    spa->spa_l2cache.sav_vdevs[v], flags);
		}
	}
	(void) zio_wait(zio);

	/*
	 * Flush the uberblocks to disk. This ensures that the odd labels
	 * are no longer needed (because the new uberblocks and the even
	 * labels are safely on disk), so it is safe to overwrite them.
 */
	zio = zio_root(spa, NULL, NULL, flags);

	for (int v = 0; v < svdcount; v++) {
		if (vdev_writeable(svd[v])) {
			zio_flush(zio, svd[v]);
		}
	}
	if (spa->spa_aux_sync_uber) {
		/* One-shot request: clear it now that it is being served. */
		spa->spa_aux_sync_uber = B_FALSE;
		for (int v = 0; v < spa->spa_spares.sav_count; v++) {
			if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) {
				zio_flush(zio, spa->spa_spares.sav_vdevs[v]);
			}
		}
		for (int v = 0; v < spa->spa_l2cache.sav_count; v++) {
			if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) {
				zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]);
			}
		}
	}

	(void) zio_wait(zio);

	/* A single counted uberblock write is enough for success. */
	return (good_writes >= 1 ? 0 : EIO);
}

/*
 * On success, increment the count of good writes for our top-level vdev.
 */
static void
vdev_label_sync_done(zio_t *zio)
{
	uint64_t *good_writes = zio->io_private;

	if (zio->io_error == 0)
		atomic_inc_64(good_writes);
}

/*
 * If there weren't enough good writes, indicate failure to the parent.
 * Also frees the counter, which vdev_label_sync_list() allocated per vdev.
 */
static void
vdev_label_sync_top_done(zio_t *zio)
{
	uint64_t *good_writes = zio->io_private;

	if (*good_writes == 0)
		zio->io_error = SET_ERROR(EIO);

	kmem_free(good_writes, sizeof (uint64_t));
}

/*
 * We ignore errors for log and cache devices, simply free the private data.
 */
static void
vdev_label_sync_ignore_done(zio_t *zio)
{
	kmem_free(zio->io_private, sizeof (uint64_t));
}

/*
 * Write all even or odd labels to all leaves of the specified vdev.
 * 'l' is the first label index to write; every second label from there on
 * is written, so 0 selects the even labels and 1 the odd ones.
 */
static void
vdev_label_sync(zio_t *zio, uint64_t *good_writes,
    vdev_t *vd, int l, uint64_t txg, int flags)
{
	nvlist_t *label;
	vdev_phys_t *vp;
	abd_t *vp_abd;
	char *buf;
	size_t buflen;
	vdev_t *pvd = vd->vdev_parent;
	boolean_t spare_in_use = B_FALSE;

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_label_sync(zio, good_writes,
		    vd->vdev_child[c], l, txg, flags);
	}

	if (!vd->vdev_ops->vdev_op_leaf)
		return;

	if (!vdev_writeable(vd))
		return;

	/*
	 * The top-level config never needs to be written to a distributed
	 * spare. When read vdev_dspare_label_read_config() will generate
	 * the config for the vdev_label_read_config().
 */
	if (vd->vdev_ops == &vdev_draid_spare_ops)
		return;

	/* A spare whose parent is a spare vdev is currently in use. */
	if (pvd && pvd->vdev_ops == &vdev_spare_ops)
		spare_in_use = B_TRUE;

	/*
	 * Generate a label describing the top-level config to which we belong.
	 * Inactive spares and l2cache devices get the aux label instead.
	 */
	if ((vd->vdev_isspare && !spare_in_use) || vd->vdev_isl2cache) {
		label = vdev_aux_label_generate(vd, vd->vdev_isspare);
	} else {
		label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
	}

	vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
	abd_zero(vp_abd, sizeof (vdev_phys_t));
	vp = abd_to_buf(vp_abd);

	buf = vp->vp_nvlist;
	buflen = sizeof (vp->vp_nvlist);

	/* If packing fails nothing is written and no good write is counted. */
	if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) {
		/* Step by two: stays on labels of the same parity as 'l'. */
		for (; l < VDEV_LABELS; l += 2) {
			vdev_label_write(zio, vd, l, vp_abd,
			    offsetof(vdev_label_t, vl_vdev_phys),
			    sizeof (vdev_phys_t),
			    vdev_label_sync_done, good_writes,
			    flags | ZIO_FLAG_DONT_PROPAGATE);
		}
	}

	abd_free(vp_abd);
	nvlist_free(label);
}

/*
 * Write the labels of parity 'l' (0 = even, 1 = odd) for every vdev on the
 * pool's config dirty list, and for the aux vdevs whose sav_label_sync flag
 * is set, then flush those vdevs.
 *
 * Returns the result of waiting for the label writes; the result of the
 * subsequent flush is ignored.
 */
static int
vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
{
	list_t *dl = &spa->spa_config_dirty_list;
	vdev_t *vd;
	zio_t *zio;
	int error;

	/*
	 * Write the new labels to disk.
	 */
	zio = zio_root(spa, NULL, NULL, flags);

	for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
		uint64_t *good_writes;

		ASSERT(!vd->vdev_ishole);

		/*
		 * Per-vdev counter, freed by whichever done callback is
		 * attached to 'vio'. Log and aux vdevs use the callback
		 * that ignores the count.
		 */
		good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		zio_t *vio = zio_null(zio, spa, NULL,
		    (vd->vdev_islog || vd->vdev_aux != NULL) ?
		    vdev_label_sync_ignore_done : vdev_label_sync_top_done,
		    good_writes, flags);
		vdev_label_sync(vio, good_writes, vd, l, txg, flags);
		zio_nowait(vio);
	}

	/*
	 * AUX path may have changed during import
	 */
	spa_aux_vdev_t *sav[2] = {&spa->spa_spares, &spa->spa_l2cache};
	for (int i = 0; i < 2; i++) {
		for (int v = 0; v < sav[i]->sav_count; v++) {
			uint64_t *good_writes;
			if (!sav[i]->sav_label_sync)
				continue;
			good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
			zio_t *vio = zio_null(zio, spa, NULL,
			    vdev_label_sync_ignore_done, good_writes, flags);
			vdev_label_sync(vio, good_writes, sav[i]->sav_vdevs[v],
			    l, txg, flags);
			zio_nowait(vio);
		}
	}

	error = zio_wait(zio);

	/*
	 * Flush the new labels to disk.
	 */
	zio = zio_root(spa, NULL, NULL, flags);

	for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd))
		zio_flush(zio, vd);

	for (int i = 0; i < 2; i++) {
		if (!sav[i]->sav_label_sync)
			continue;
		for (int v = 0; v < sav[i]->sav_count; v++)
			zio_flush(zio, sav[i]->sav_vdevs[v]);
		/* The request is cleared only after the odd-label pass. */
		if (l == 1)
			sav[i]->sav_label_sync = B_FALSE;
	}

	(void) zio_wait(zio);

	return (error);
}

/*
 * Sync the uberblock and any changes to the vdev configuration.
 *
 * The order of operations is carefully crafted to ensure that
 * if the system panics or loses power at any time, the state on disk
 * is still transactionally consistent. The in-line comments below
 * describe the failure semantics at each stage.
 *
 * Moreover, vdev_config_sync() is designed to be idempotent: if it fails
 * at any time, you can just call it again, and it will resume its work.
 */
int
vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
{
	spa_t *spa = svd[0]->vdev_spa;
	uberblock_t *ub = &spa->spa_uberblock;
	int error = 0;
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;

	ASSERT(svdcount != 0);
retry:
	/*
	 * Normally, we don't want to try too hard to write every label and
	 * uberblock. If there is a flaky disk, we don't want the rest of the
	 * sync process to block while we retry.
But if we can't write a * single label out, we should retry with ZIO_FLAG_TRYHARD before * bailing out and declaring the pool faulted. */ if (error != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) return (error); flags |= ZIO_FLAG_TRYHARD; } ASSERT(ub->ub_txg <= txg); /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, * and multihost protection isn't enabled, * and the vdev configuration hasn't changed, * then there's nothing to do. */ if (ub->ub_txg < txg) { boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); if (!changed && list_is_empty(&spa->spa_config_dirty_list) && !spa_multihost(spa)) return (0); } if (txg > spa_freeze_txg(spa)) return (0); ASSERT(txg <= spa->spa_final_txg); /* * Flush the write cache of every disk that's been written to * in this transaction group. This ensures that all blocks * written in this txg will be committed to stable storage * before any uberblock that references them. */ zio_t *zio = zio_root(spa, NULL, NULL, flags); for (vdev_t *vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) zio_flush(zio, vd); (void) zio_wait(zio); /* * Sync out the even labels (L0, L2) for every dirty vdev. If the * system dies in the middle of this process, that's OK: all of the * even labels that made it to disk will be newer than any uberblock, * and will therefore be considered invalid. The odd labels (L1, L3), * which have not yet been touched, will still be valid. We flush * the new labels to disk to ensure that all even-label updates * are committed to stable storage before the uberblock update. 
*/ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the even labels " "of dirty vdevs", error, spa_name(spa)); } goto retry; } /* * Sync the uberblocks to all vdevs in svd[]. * If the system dies in the middle of this step, there are two cases * to consider, and the on-disk state is consistent either way: * * (1) If none of the new uberblocks made it to disk, then the * previous uberblock will be the newest, and the odd labels * (which had not yet been touched) will be valid with respect * to that uberblock. * * (2) If one or more new uberblocks made it to disk, then they * will be the newest, and the even labels (which had all * been successfully committed) will be valid with respect * to the new uberblocks. */ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_uberblock_sync_list() returned error " "%d for pool '%s'", error, spa_name(spa)); } goto retry; } if (spa_multihost(spa)) mmp_update_uberblock(spa, ub); /* * Sync out odd labels for every dirty vdev. If the system dies * in the middle of this process, the even labels and the new * uberblocks will suffice to open the pool. The next time * the pool is opened, the first thing we'll do -- before any * user data is modified -- is mark every vdev dirty so that * all labels will be brought up to date. We flush the new labels * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. 
*/ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the odd labels of " "dirty vdevs", error, spa_name(spa)); } goto retry; } return (0); } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 6bac2241c6d8..59225e766ba1 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1,5123 +1,5123 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden NeÅ¡ković. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ #endif /* * Virtual device vector for RAID-Z. * * This vdev supports single, double, and triple parity. For single parity, * we use a simple XOR of all the data columns. For double or triple parity, * we use a special case of Reed-Solomon coding. 
This extends the * technique described in "The mathematics of RAID-6" by H. Peter Anvin by * drawing on the system described in "A Tutorial on Reed-Solomon Coding for * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the * former is also based. The latter is designed to provide higher performance * for writes. * * Note that the Plank paper claimed to support arbitrary N+M, but was then * amended six years later identifying a critical flaw that invalidates its * claims. Nevertheless, the technique can be adapted to work for up to * triple parity. For additional parity, the amendment "Note: Correction to * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding * is viable, but the additional complexity means that write performance will * suffer. * * All of the methods above operate on a Galois field, defined over the * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements * can be expressed with a single byte. Briefly, the operations on the * field are defined as follows: * * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B * o multiplication of A by 2 is defined by the following bitwise expression: * * (A * 2)_7 = A_6 * (A * 2)_6 = A_5 * (A * 2)_5 = A_4 * (A * 2)_4 = A_3 + A_7 * (A * 2)_3 = A_2 + A_7 * (A * 2)_2 = A_1 + A_7 * (A * 2)_1 = A_0 * (A * 2)_0 = A_7 * * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). * As an aside, this multiplication is derived from the error correcting * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. * * Observe that any number in the field (except for 0) can be expressed as a * power of 2 -- a generator for the field. We store a table of the powers of * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather * than field addition). 
The inverse of a field element A (A^-1) is therefore * A ^ (255 - 1) = A^254. * * The up-to-three parity columns, P, Q, R over several data columns, * D_0, ... D_n-1, can be expressed by field operations: * * P = D_0 + D_1 + ... + D_n-2 + D_n-1 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial * XOR operation, and 2 and 4 can be computed quickly and generate linearly- * independent coefficients. (There are no additional coefficients that have * this property which is why the uncorrected Plank method breaks down.) * * See the reconstruction code below for how P, Q and R can be used individually * or in concert to recover missing data columns. */ #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 #define VDEV_RAIDZ_R 2 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) /* * We provide a mechanism to perform the field multiplication operation on a * 64-bit value all at once rather than a byte at a time. This works by * creating a mask from the top bit in each byte and using that to * conditionally apply the XOR of 0x1d. */ #define VDEV_RAIDZ_64MUL_2(x, mask) \ { \ (mask) = (x) & 0x8080808080808080ULL; \ (mask) = ((mask) << 1) - ((mask) >> 7); \ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ } #define VDEV_RAIDZ_64MUL_4(x, mask) \ { \ VDEV_RAIDZ_64MUL_2((x), mask); \ VDEV_RAIDZ_64MUL_2((x), mask); \ } /* * Big Theory Statement for how a RAIDZ VDEV is expanded * * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs * that have been previously expanded can be expanded again.
* * The RAIDZ VDEV must be healthy (must be able to write to all the drives in * the VDEV) when an expansion starts. And the expansion will pause if any * disk in the VDEV fails, and resume once the VDEV is healthy again. All other * operations on the pool can continue while an expansion is in progress (e.g. * read/write, snapshot, zpool add, etc), except for zpool checkpoint, zpool trim, * and zpool initialize, which can't be run during an expansion. Following a * reboot or export/import, the expansion resumes where it left off. * * == Reflowing the Data == * * The expansion involves reflowing (copying) the data from the current set * of disks to spread it across the new set which now has one more disk. This * reflow operation is similar to reflowing text when the column width of a * text editor window is expanded. The text doesn't change but the location of * the text changes to accommodate the new width. An example reflow result for * a 4-wide RAIDZ1 to a 5-wide is shown below. * * Reflow End State * Each letter indicates a parity group (logical stripe) * * Before expansion After Expansion * D1 D2 D3 D4 D1 D2 D3 D4 D5 * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | A | A | A | A | | A | A | A | A | B | * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | B | B | C | C | | B | C | C | C | C | * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | C | C | D | D | | D | D | E | E | E | * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | E | E | E | E | --> | E | F | F | G | G | * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | F | F | G | G | | G | G | H | H | H | * | 17| 18| 19| 20| | 21| 22| 23|
24| 25| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | G | G | H | H | | H | I | I | J | J | * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | H | H | I | I | | J | J | | | K | * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| * +------+------+------+------+ +------+------+------+------+------+ * * This reflow approach has several advantages. There is no need to read or * modify the block pointers or recompute any block checksums. The reflow * doesn't need to know where the parity sectors reside. We can read and write * data sequentially and the copy can occur in a background thread in open * context. The design also allows for fast discovery of what data to copy. * * The VDEV metaslabs are processed, one at a time, to copy the block data to * have it flow across all the disks. The metaslab is disabled for allocations * during the copy. As an optimization, we only copy the allocated data which * can be determined by looking at the metaslab range tree. During the copy we * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still * need to be able to survive losing parity count disks). This means we * cannot overwrite data during the reflow that would be needed if a disk is * lost. * * After the reflow completes, all newly-written blocks will have the new * layout, i.e., they will have the parity to data ratio implied by the new * number of disks in the RAIDZ group. Even though the reflow copies all of * the allocated space (data and parity), it is only rearranged, not changed. * * This act of reflowing the data has a few implications about blocks * that were written before the reflow completes: * * - Old blocks will still use the same amount of space (i.e., they will have * the parity to data ratio implied by the old number of disks in the RAIDZ * group).
* - Reading old blocks will be slightly slower than before the reflow, for * two reasons. First, we will have to read from all disks in the RAIDZ * VDEV, rather than being able to skip the children that contain only * parity of this block (because the data of a single block is now spread * out across all the disks). Second, in most cases there will be an extra * bcopy, needed to rearrange the data back to its original layout in memory. * * == Scratch Area == * * As we copy the block data, we can only progress to the point that writes * will not overlap with blocks whose progress has not yet been recorded on * disk. Since partially-copied rows are always read from the old location, * we need to stop one row before the sector-wise overlap, to prevent any * row-wise overlap. For example, in the diagram above, when we reflow sector * B6 it will overwrite the original location for B5. * * To get around this, a scratch space is used so that we can start copying * without risking data loss by overlapping the row. As an added benefit, it * improves performance at the beginning of the reflow, but that small perf * boost wouldn't be worth the complexity on its own. * * Ideally we want to copy at least 2 * (new_width)^2 so that we have a * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice * the widths will likely be single digits so we can get a substantial chunk * size using only a few MB of scratch per disk. * * The scratch area is persisted to disk which holds a large amount of reflowed * state. We can always read the partially written stripes when a disk fails or * the copy is interrupted (crash) during the initial copying phase and also * get past a small chunk size restriction. At a minimum, the scratch space * must be large enough to get us to the point that one row does not overlap * itself when moved (i.e. new_width^2). But going larger is even better.
We * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels * as our scratch space to handle overwriting the initial part of the VDEV. * * 0 256K 512K 4M * +------+------+-----------------------+----------------------------- * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... * | L0 | L1 | Reserved | (Metaslabs) * +------+------+-----------------------+------------------------------- * Scratch Area * * == Reflow Progress Updates == * After the initial scratch-based reflow, the expansion process works * similarly to device removal. We create a new open context thread which * reflows the data, and periodically kicks off sync tasks to update logical * state. In this case, state is the committed progress (offset of next data * to copy). We need to persist the completed offset on disk, so that if we * crash we know which format each VDEV offset is in. * * == Time Dependent Geometry == * * In non-expanded RAIDZ, blocks are read from disk in a column by column * fashion. For a multi-row block, the second sector is in the first column * not in the second column. This allows us to issue full reads for each * column directly into the request buffer. The block data is thus laid out * sequentially in a column-by-column fashion. * * For example, in the before expansion diagram above, one logical block might * be sectors G19-H26. The parity is in G19,H23; and the data is in * G20,H24,G21,H25,G22,H26. * * After a block is reflowed, the sectors that were all in the original column * data can now reside in different columns. When reading from an expanded * VDEV, we need to know the logical stripe width for each block so we can * reconstitute the block's data after the reads are completed. Likewise, * when we perform the combinatorial reconstruction we need to know the * original width so we can retry combinations from the past layouts. * * Time dependent geometry is what we call having blocks with different layouts * (stripe widths) in the same VDEV.
This time-dependent geometry uses the * block's birth time (+ the time expansion ended) to establish the correct * width for a given block. After an expansion completes, we record the time * for blocks written with a particular width (geometry). * * == On Disk Format Changes == * * New pool feature flag, 'raidz_expansion' whose reference count is the number * of RAIDZ VDEVs that have been expanded. * * The blocks on expanded RAIDZ VDEV can have different logical stripe widths. * * Since the uberblock can point to arbitrary blocks, which might be on the * expanding RAIDZ, and might or might not have been expanded, we need to know * which way a block is laid out before reading it. This info is the next * offset that needs to be reflowed and we persist that in the uberblock, in * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. * After the expansion is complete, we then use the raidz_expand_txgs array * (see below) to determine how to read a block and the ub_raidz_reflow_info * field is no longer required. * * The uberblock's ub_raidz_reflow_info field also holds the scratch space * state (i.e., active or not) which is also required before reading a block * during the initial phase of reflowing the data. * * The top-level RAIDZ VDEV has two new entries in the nvlist: * * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here * and used after the expansion is complete to * determine how to read a raidz block * 'raidz_expanding' boolean: present during reflow and removed after completion * used during a spa import to resume an unfinished * expansion * * And finally the VDEV's top zap adds the following informational entries: * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED */ /* * For testing only: pause the raidz expansion after reflowing this amount.
* (accessed by ZTS and ztest) */ #ifdef _KERNEL static #endif /* _KERNEL */ unsigned long raidz_expand_max_reflow_bytes = 0; /* * For testing only: pause the raidz expansion at a certain point. */ uint_t raidz_expand_pause_point = 0; /* * Maximum amount of copy io's outstanding at once. */ #ifdef _ILP32 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE; #else static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; #endif /* * Apply raidz map abds aggregation if the number of rows in the map is equal * or greater than the value below. */ static unsigned long raidz_io_aggregate_rows = 4; /* * Automatically start a pool scrub when a RAIDZ expansion completes in * order to verify the checksums of all blocks which have been copied * during the expansion. Automatic scrubbing is enabled by default and * is strongly recommended. */ static int zfs_scrub_after_expand = 1; /* * Release one raidz_row_t. For each of the first rr_cols columns, rc_abd is * freed when rc_size is non-zero and rc_orig_data is freed when it is set; * rr_abd_empty is freed when set. The row itself is freed with a size * computed from rr_scols, which vdev_raidz_row_alloc() sets to the column * count used for the allocation. */ static void vdev_raidz_row_free(raidz_row_t *rr) { for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size != 0) abd_free(rc->rc_abd); if (rc->rc_orig_data != NULL) abd_free(rc->rc_orig_data); } if (rr->rr_abd_empty != NULL) abd_free(rr->rr_abd_empty); kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); } /* * Free a raidz_map_t: every row, then the rm_phys_col array (when * rm_nphys_cols is non-zero) together with any ABD attached to its entries, * and finally the map itself. rm_lr is asserted to be NULL on entry; this * function does not release it. */ void vdev_raidz_map_free(raidz_map_t *rm) { for (int i = 0; i < rm->rm_nrows; i++) vdev_raidz_row_free(rm->rm_row[i]); if (rm->rm_nphys_cols) { for (int i = 0; i < rm->rm_nphys_cols; i++) { if (rm->rm_phys_col[i].rc_abd != NULL) abd_free(rm->rm_phys_col[i].rc_abd); } kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * rm->rm_nphys_cols); } ASSERT3P(rm->rm_lr, ==, NULL); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } /* vsd_free callback: frees the raidz_map_t stored in zio->io_vsd. */ static void vdev_raidz_map_free_vsd(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; vdev_raidz_map_free(rm); } /* Comparator ordering reflow_node_t entries by re_txg using TREE_CMP(). */ static int vdev_raidz_reflow_compare(const void *x1, const void *x2) { const reflow_node_t *l = x1; const reflow_node_t *r = x2; return (TREE_CMP(l->re_txg, r->re_txg)); } /* zio vsd operations for RAID-Z maps; only vsd_free is set here. */ const zio_vsd_ops_t
vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, }; /* * Allocate a zeroed raidz_row_t with room for `cols` columns; rr_cols and * rr_scols both start out equal to cols. Every column's shadow location is * initialised to the INT_MAX / UINT64_MAX sentinels, and rc_allow_repair is * set unless the zio carries ZIO_FLAG_DIO_READ. Only zio->io_flags is read * from the zio. */ raidz_row_t * vdev_raidz_row_alloc(int cols, zio_t *zio) { raidz_row_t *rr = kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); rr->rr_cols = cols; rr->rr_scols = cols; for (int c = 0; c < cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; rc->rc_shadow_devidx = INT_MAX; rc->rc_shadow_offset = UINT64_MAX; /* * We can not allow self healing to take place for Direct I/O * reads. There is nothing that stops the buffer contents from * being manipulated while the I/O is in flight. It is possible * that the checksum could be verified on the buffer and then * the contents of that buffer are manipulated afterwards. This * could lead to bad data being written out during self * healing. */ if (!(zio->io_flags & ZIO_FLAG_DIO_READ)) rc->rc_allow_repair = 1; } return (rr); } /* * Attach ABDs to the columns of the single row of a write map. Parity * columns get newly allocated linear ABDs; data columns reference successive * rc_size-byte slices of zio->io_abd. A column that absorbs a skip sector * gets one extra (1 << ashift)-byte zeroed region (appended to the linear ABD * for parity, added through a gang ABD for data) while rc_size is left * unchanged. The closing asserts check that the data slices add up to * io_size and that exactly rm_nskip skip sectors were accounted for. */ static void vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) { int c; int nwrapped = 0; uint64_t off = 0; raidz_row_t *rr = rm->rm_row[0]; ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(rm->rm_nrows, ==, 1); /* * Pad any parity columns with additional space to account for skip * sectors. */ if (rm->rm_skipstart < rr->rr_firstdatacol) { ASSERT0(rm->rm_skipstart); nwrapped = rm->rm_nskip; } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { nwrapped = (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; } /* * Optional single skip sectors (rc_size == 0) will be handled in * vdev_raidz_io_start_write(). */ int skipped = rr->rr_scols - rr->rr_cols; /* Allocate buffers for the parity columns */ for (c = 0; c < rr->rr_firstdatacol; c++) { raidz_col_t *rc = &rr->rr_col[c]; /* * Parity columns will pad out a linear ABD to account for * the skip sector. A linear ABD is used here because * parity calculations use the ABD buffer directly to calculate * parity. This avoids doing a memcpy back to the ABD after the * parity has been calculated.
By issuing the parity column * with the skip sector we can reduce contention on the child * VDEV queue locks (vq_lock). */ if (c < nwrapped) { rc->rc_abd = abd_alloc_linear( rc->rc_size + (1ULL << ashift), B_FALSE); abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); skipped++; } else { rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); } } /* * c carries on from rr_firstdatacol (it is not reset); off tracks the * byte offset of the current data column within zio->io_abd. */ for (off = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, off, rc->rc_size); /* * Generate I/O for skip sectors to improve aggregation * continuity. We will use gang ABD's to reduce contention * on the child VDEV queue locks (vq_lock) by issuing * a single I/O that contains the data and skip sector. * * It is important to make sure that rc_size is not updated * even though we are adding a skip sector to the ABD. When * calculating the parity in vdev_raidz_generate_parity_row() * the rc_size is used to iterate through the ABD's. We can * not have zero'd out skip sectors used for calculating * parity for raidz, because those same sectors are not used * during reconstruction.
*/ if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { rc->rc_abd = abd_alloc_gang(); abd_gang_add(rc->rc_abd, abd, B_TRUE); abd_gang_add(rc->rc_abd, abd_get_zeros(1ULL << ashift), B_TRUE); skipped++; } else { rc->rc_abd = abd; } off += rc->rc_size; } ASSERT3U(off, ==, zio->io_size); ASSERT3S(skipped, ==, rm->rm_nskip); } /* * Attach ABDs to the columns of the single row of a read map: parity * columns get newly allocated linear ABDs of rc_size bytes, and data columns * reference consecutive rc_size-byte slices of zio->io_abd. */ static void vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) { int c; raidz_row_t *rr = rm->rm_row[0]; ASSERT3U(rm->rm_nrows, ==, 1); /* Allocate buffers for the parity columns */ for (c = 0; c < rr->rr_firstdatacol; c++) rr->rr_col[c].rc_abd = abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); for (uint64_t off = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, off, rc->rc_size); off += rc->rc_size; } } /* * Divides the IO evenly across all child vdevs; usually, dcols is * the number of children in the target vdev. * * Avoid inlining the function to keep vdev_raidz_io_start(), which * is this function's only caller, as small as possible on the stack. * * zio: the I/O being mapped; io_offset, io_size and io_type are read here. * ashift: shift converting between bytes and sectors (sector = 1 << ashift). * dcols: number of columns the stripe is laid out across. * nparity: number of parity columns; stored as rr_firstdatacol. * * Returns a newly allocated single-row raidz_map_t with the column ABDs * attached (write or read flavour, chosen by io_type) and rm_ops set. */ noinline raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t nparity) { raidz_row_t *rr; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = zio->io_size >> ashift; /* The first column for this stripe. */ uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << ashift; uint64_t acols, scols; raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); rm->rm_nrows = 1; /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. */ uint64_t q = s / (dcols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs.
*/ uint64_t r = s - q * (dcols - nparity); /* The number of "big columns" - those which contain remainder data. */ uint64_t bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* * acols: The columns that will be accessed. * scols: The columns that will be accessed or skipped. */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. */ acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { acols = dcols; scols = dcols; } ASSERT3U(acols, <=, scols); rr = vdev_raidz_row_alloc(scols, zio); rm->rm_row[0] = rr; rr->rr_cols = acols; rr->rr_bigcols = bc; rr->rr_firstdatacol = nparity; #ifdef ZFS_DEBUG rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif uint64_t asize = 0; for (uint64_t c = 0; c < scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; uint64_t col = f + c; uint64_t coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } rc->rc_devidx = col; rc->rc_offset = coff; if (c >= acols) rc->rc_size = 0; else if (c < bc) rc->rc_size = (q + 1) << ashift; else rc->rc_size = q << ashift; asize += rc->rc_size; } ASSERT3U(asize, ==, tot << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; rm->rm_skipstart = bc; /* * If all data stored spans all columns, there's a danger that parity * will always be on the same device and, since parity isn't read * during normal operation, that device's I/O bandwidth won't be * used effectively. We therefore switch the parity every 1MB. * * ... at least that was, ostensibly, the theory. As a practical * matter unless we juggle the parity between all devices evenly, we * won't see any benefit. Further, occasional writes that aren't a * multiple of the LCM of the number of children and the minimum * stripe width are sufficient to avoid pessimal behavior.
* Unfortunately, this decision created an implicit on-disk format * requirement that we need to support for all eternity, but only * for single-parity RAID-Z. * * If we intend to skip a sector in the zeroth column for padding * we must make sure to note this swap. We will never intend to * skip the first column since at least one data and one parity * column must appear in each row. */ ASSERT(rr->rr_cols >= 2); ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { uint64_t devidx = rr->rr_col[0].rc_devidx; /* o is reused as the swap temporary; its earlier value is not read again. */ o = rr->rr_col[0].rc_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } if (zio->io_type == ZIO_TYPE_WRITE) { vdev_raidz_map_alloc_write(zio, rm, ashift); } else { vdev_raidz_map_alloc_read(zio, rm); } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); return (rm); } /* * Everything before reflow_offset_synced should have been moved to the new * location (read and write completed). However, this may not yet be reflected * in the on-disk format (e.g. raidz_reflow_sync() has been called but the * uberblock has not yet been written). If reflow is not in progress, * reflow_offset_synced should be UINT64_MAX. For each row, if the row is * entirely before reflow_offset_synced, it will come from the new location. * Otherwise this row will come from the old location. Therefore, rows that * straddle the reflow_offset_synced will come from the old location. * * For writes, reflow_offset_next is the next offset to copy. If a sector has * been copied, but not yet reflected in the on-disk progress * (reflow_offset_synced), it will also be written to the new (already copied) * offset.
*/
noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t *zio,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset_synced,
    uint64_t reflow_offset_next, boolean_t use_scratch)
{
	abd_t *abd = zio->io_abd;
	uint64_t offset = zio->io_offset;
	uint64_t size = zio->io_size;

	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	uint64_t q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	int cols = MIN(tot, logical_cols);

	/* The map is allocated with one rm_row[] slot per row. */
	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;
	/* Running total of column bytes, checked against tot below. */
	uint64_t asize = 0;

	for (uint64_t row = 0; row < rows; row++) {
		boolean_t row_use_scratch = B_FALSE;
		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and the copying has
		 * not yet completed for any part of this row, then use the
		 * old location of this row.  Note that reflow_offset_synced
		 * reflects the i/o that's been completed, because it's
		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
		 * This is sufficient for our check, even if that progress
		 * has not yet been recorded to disk (reflected in
		 * spa_ubsync).  Also note that we consider the last row to
		 * be "full width" (`cols`-wide rather than `bc`-wide) for
		 * this calculation. This causes a tiny bit of unnecessary
		 * double-writes but is safe and simpler to calculate.
		 */
		int row_phys_cols = physical_cols;
		if (b + cols > reflow_offset_synced >> ashift)
			row_phys_cols--;
		else if (use_scratch)
			row_use_scratch = B_TRUE;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * Note, rr_cols is the entire width of the block, even
		 * if this row is shorter.  This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width".  Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
		/*
		 * note: rr_size is PSIZE, not ASIZE
		 */
		rr->rr_offset = b << ashift;
		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
#endif

		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			/* Wrap to the first child, one sector further in. */
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			raidz_col_t *rc = &rr->rr_col[c];
			rc->rc_devidx = child_id;
			rc->rc_offset = child_offset;

			/*
			 * Get this from the scratch space if appropriate.
			 * This only happens if we crashed in the middle of
			 * raidz_reflow_scratch_sync() (while it's running,
			 * the rangelock prevents us from doing concurrent
			 * io), and even then only during zpool import or
			 * when the pool is imported readonly.
			 */
			if (row_use_scratch)
				rc->rc_offset -= VDEV_BOOT_SIZE;

			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rc->rc_size = 1ULL << ashift;

				/*
				 * Parity sectors' rc_abd's are set below
				 * after determining if this is an aggregation.
				 */
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block (even including
				 * skip sectors).  This sector is part of the
				 * map so that we have full rows for p/q parity
				 * generation.
				 */
				rc->rc_size = 0;
				rc->rc_abd = NULL;
			} else {
				/* "data column" (col excluding parity) */
				uint64_t off;

				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rc->rc_size = 1ULL << ashift;
				rc->rc_abd = abd_get_offset_struct(
				    &rc->rc_abdstruct, abd, off << ashift,
				    rc->rc_size);
			}

			if (rc->rc_size == 0)
				continue;

			/*
			 * If any part of this row is in both old and new
			 * locations, the primary location is the old
			 * location. If this sector was already copied to the
			 * new location, we need to also write to the new,
			 * "shadow" location.
			 *
			 * Note, `row_phys_cols != physical_cols` indicates
			 * that the primary location is the old location.
			 * `b+c < reflow_offset_next` indicates that the copy
			 * to the new location has been initiated. We know
			 * that the copy has completed because we have the
			 * rangelock, which is held exclusively while the
			 * copy is in progress.
			 */
			if (row_use_scratch ||
			    (row_phys_cols != physical_cols &&
			    b + c < reflow_offset_next >> ashift)) {
				rc->rc_shadow_devidx = (b + c) % physical_cols;
				rc->rc_shadow_offset =
				    ((b + c) / physical_cols) << ashift;
				if (row_use_scratch)
					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
			}

			asize += rc->rc_size;
		}

		/*
		 * See comment in vdev_raidz_map_alloc()
		 *
		 * For single-parity rows with bit 20 of the I/O offset set,
		 * exchange the device index and offset (both primary and
		 * shadow) of columns 0 and 1.
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

			int devidx0 = rr->rr_col[0].rc_devidx;
			uint64_t offset0 = rr->rr_col[0].rc_offset;
			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
			uint64_t shadow_offset0 =
			    rr->rr_col[0].rc_shadow_offset;

			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[0].rc_shadow_devidx =
			    rr->rr_col[1].rc_shadow_devidx;
			rr->rr_col[0].rc_shadow_offset =
			    rr->rr_col[1].rc_shadow_offset;

			rr->rr_col[1].rc_devidx = devidx0;
			rr->rr_col[1].rc_offset = offset0;
			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
		}
	}
	ASSERT3U(asize, ==, tot << ashift);

	/*
	 * Determine if the block is contiguous, in which case we can use
	 * an aggregation.
	 */
	if (rows >= raidz_io_aggregate_rows) {
		rm->rm_nphys_cols = physical_cols;
		rm->rm_phys_col =
		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
		    KM_SLEEP);

		/*
		 * Determine the aggregate io's offset and size, and check
		 * that the io is contiguous.
		 *
		 * The outer loop also stops once rm_phys_col has been freed
		 * (set to NULL) by the non-contiguous case below; the inner
		 * `break` alone only leaves the column loop.
		 */
		for (int i = 0;
		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];

				if (rc->rc_size == 0)
					continue;

				if (prc->rc_size == 0) {
					ASSERT0(prc->rc_offset);
					prc->rc_offset = rc->rc_offset;
				} else if (prc->rc_offset + prc->rc_size !=
				    rc->rc_offset) {
					/*
					 * This block is not contiguous and
					 * therefore can't be aggregated.
					 * This is expected to be rare, so
					 * the cost of allocating and then
					 * freeing rm_phys_col is not
					 * significant.
					 */
					kmem_free(rm->rm_phys_col,
					    sizeof (raidz_col_t) *
					    rm->rm_nphys_cols);
					rm->rm_phys_col = NULL;
					rm->rm_nphys_cols = 0;
					break;
				}
				prc->rc_size += rc->rc_size;
			}
		}
	}
	if (rm->rm_phys_col != NULL) {
		/*
		 * Allocate aggregate ABD's.
		 */
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			raidz_col_t *prc = &rm->rm_phys_col[i];

			prc->rc_devidx = i;

			if (prc->rc_size == 0)
				continue;

			prc->rc_abd =
			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
			    B_FALSE);
		}

		/*
		 * Point the parity abd's into the aggregate abd's.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];
				rc->rc_abd =
				    abd_get_offset_struct(&rc->rc_abdstruct,
				    prc->rc_abd,
				    rc->rc_offset - prc->rc_offset,
				    rc->rc_size);
			}
		}
	} else {
		/*
		 * Allocate new abd's for the parity sectors.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				rc->rc_abd =
				    abd_alloc_linear(rc->rc_size,
				    B_TRUE);
			}
		}
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}

/*
 * Output cursors into the P, Q and R parity buffers, handed to the
 * vdev_raidz_*_func callbacks through their `private` argument.  The
 * callbacks advance these pointers as they consume input, and each one
 * asserts which of the three are expected to be non-NULL.
 */
struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

/*
 * P-only parity callback: XOR `size` bytes of `buf`, taken as 64-bit words,
 * into the buffer at pqr->p, advancing pqr->p past the words written.
 * Any trailing bytes beyond a whole number of words are not processed.
 * Always returns 0.
 */
static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (int i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

/*
 * P+Q parity callback: for each 64-bit word of `buf`, XOR it into P, then
 * apply VDEV_RAIDZ_64MUL_2 to the current Q word and XOR the source word
 * into it.  (VDEV_RAIDZ_64MUL_2 is defined outside this view; by its name
 * it multiplies each byte of the word by 2.)  Advances pqr->p and pqr->q.
 * Always returns 0.
 */
static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}
/*
 * P+Q+R parity callback: for each 64-bit word of `buf`, XOR it into P,
 * apply VDEV_RAIDZ_64MUL_2 to the Q word and VDEV_RAIDZ_64MUL_4 to the R
 * word, then XOR the source word into each.  Advances all three cursors in
 * the pqr_struct passed via `private`.  Always returns 0.
 */
static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;

		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;

		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}

/*
 * Generate P parity for one row: P starts as a copy of the first data
 * column and every later data column is XORed into it.
 */
static void
vdev_raidz_generate_parity_p(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		if (c == rr->rr_firstdatacol) {
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

/*
 * Generate P and Q parity for one row.  The first data column seeds both
 * buffers (zero-filled past its end if it is shorter than the parity
 * column); each later column is folded in by vdev_raidz_pq_func().
 */
static void
vdev_raidz_generate_parity_pq(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	/* Parity column length in 64-bit words. */
	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		/* This data column's length in 64-bit words. */
		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

		if (c == rr->rr_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
			(void) memcpy(q, p, rr->rr_col[c].rc_size);

			for (uint64_t i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, NULL };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			uint64_t mask;
			for (uint64_t i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

/*
 * Generate P, Q and R parity for one row.  Same structure as
 * vdev_raidz_generate_parity_pq(), with R maintained alongside Q using
 * VDEV_RAIDZ_64MUL_4.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
	/* Parity column length in 64-bit words. */
	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_R].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		/* This data column's length in 64-bit words. */
		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

		if (c == rr->rr_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
			(void) memcpy(q, p, rr->rr_col[c].rc_size);
			(void) memcpy(r, p, rr->rr_col[c].rc_size);

			for (uint64_t i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, r };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			uint64_t mask;
			for (uint64_t i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
void
vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
{
	if (rr->rr_cols == 0) {
		/*
		 * We are handling this block one row at a time (because
		 * this block has a different logical vs physical width,
		 * due to RAIDZ expansion), and this is a pad-only row,
		 * which has no parity.
		 */
		return;
	}

	/* Generate using the new math implementation */
	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
		return;

	/* Fall back to the original implementation, chosen by parity count. */
	switch (rr->rr_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rr);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rr);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rr);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/*
 * Generate parity for every row of the map.
 */
void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		vdev_raidz_generate_parity_row(rm, rr);
	}
}

/*
 * Two-buffer callback for P reconstruction: XOR `size` bytes of sbuf into
 * dbuf, 64 bits at a time.  `private` is unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/*
 * Two-buffer callback for the first phase of Q reconstruction: apply
 * VDEV_RAIDZ_64MUL_2 to each destination word, then XOR in the matching
 * source word.  `private` is unused.  Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/*
 * Single-buffer variant of vdev_raidz_reconst_q_pre_func() for the part of
 * the destination that extends past a shorter source column: only the
 * VDEV_RAIDZ_64MUL_2 step is applied.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

/*
 * State for vdev_raidz_reconst_q_post_func(): a cursor into the Q parity
 * buffer and the exponent passed to vdev_raidz_exp2() for every byte.
 */
struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

/*
 * Final phase of Q reconstruction: XOR the stored Q word into each
 * destination word, then run each of its 8 bytes through
 * vdev_raidz_exp2(byte, rq->exp).  Advances rq->q.  Always returns 0.
 */
static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		int j;
		uint8_t *b;

		*dst ^= *rq->q;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

/*
 * State for the P+Q reconstruction callbacks: byte cursors into the saved
 * parity (p, q) and the parity recomputed without the two missing columns
 * (pxy, qxy), plus the two exponents passed to vdev_raidz_exp2().
 */
struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

/*
 * Two-buffer callback for P+Q reconstruction: computes one byte of the x
 * column and one byte of the y column per iteration, advancing all four
 * parity cursors.  Always returns 0.
 */
static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

/*
 * Single-buffer variant of vdev_raidz_reconst_pq_func() for the part of the
 * x column that extends past the shorter y column: only x is written.
 */
static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

/*
 * Reconstruct the single data column tgts[0] from P: start from a copy of
 * P and XOR in every other data column.
 */
static void
vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
{
	int x = tgts[0];
	abd_t *dst, *src;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);

	ASSERT3U(ntgts, ==, 1);
	ASSERT3U(x, >=, rr->rr_firstdatacol);
	ASSERT3U(x, <, rr->rr_cols);

	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);

	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	dst = rr->rr_col[x].rc_abd;

	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		/* Only the overlap of the two columns contributes. */
		uint64_t size = MIN(rr->rr_col[x].rc_size,
		    rr->rr_col[c].rc_size);

		src = rr->rr_col[c].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}
}

/*
 * Reconstruct the single data column tgts[0] from Q.  The column is first
 * rebuilt as the Q contribution of all other data columns (the target
 * itself is treated as zero length), then combined with the stored Q and
 * scaled by vdev_raidz_reconst_q_post_func().
 */
static void
vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);

	ASSERT(ntgts == 1);

	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
		    rr->rr_col[c].rc_size);

		src = rr->rr_col[c].rc_abd;
		dst = rr->rr_col[x].rc_abd;

		if (c == rr->rr_firstdatacol) {
			abd_copy(dst, src, size);
			if (rr->rr_col[x].rc_size > size) {
				abd_zero_off(dst, size,
				    rr->rr_col[x].rc_size - size);
			}
		} else {
			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rr->rr_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rr->rr_col[x].rc_abd;
	exp = 255 - (rr->rr_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);
}

/*
 * Reconstruct the two data columns tgts[0] (x) and tgts[1] (y) from P and
 * Q.  Requires x < y and column x at least as long as column y (asserted).
 */
static void
vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rr->rr_firstdatacol);
	ASSERT(y < rr->rr_cols);

	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rr->rr_col[x].rc_size;
	ysize = rr->rr_col[y].rc_size;

	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rr->rr_col[x].rc_size = 0;
	rr->rr_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rr);

	rr->rr_col[x].rc_size = xsize;
	rr->rr_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rr->rr_col[x].rc_abd;
	yd = rr->rr_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };

	/* Both columns over their common length, then the tail of x only. */
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
}

/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks.
This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __                     __                 __     __
 *      |   1   ..  1   1   1   |                 |  p_0  |
 *      | 2^n-1 ..  4   2   1   |   __     __     |   :   |
 *      | 4^n-1 ..  16  4   1   |   |  D_0  |     | p_m-1 |
 *      |   1   ..  0   0   0   |   |  D_1  |     |  d_0  |
 *      |   0   ..  0   0   0   | x |  D_2  |  =  |  d_1  |
 *      |   :       :   :   :   |   |   :   |     |  d_2  |
 *      |   0   ..  1   0   0   |   | D_n-1 |     |   :   |
 *      |   0   ..  0   1   0   |   ~~     ~~     |   :   |
 *      |   0   ..  0   0   1   |                 | d_n-1 |
 *      ~~                     ~~                 ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *           |  0   1   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *  (V|I)'^-1      = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */

static void
vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Exponent of the first coefficient, reduced modulo 255. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

/*
 * Run the elimination on the `nmissing` rows in `rows`, producing the
 * matching rows of the inverse in `invrows`.  `missing[]` holds the data
 * column index each row solves for; `used[]` lists the n columns supplying
 * input (parity first, then surviving data -- asserted below).  `rows` is
 * modified in place and is left holding identity rows on return.
 */
static void
vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rr->rr_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
			jj = used[j] - rr->rr_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

/*
 * Apply the inverse rows to the contents of the `used` columns, byte by
 * byte, accumulating the result into the buffers of the missing data
 * columns.  Each product is formed with the log/pow tables as
 * pow2[(log2[src] + log2[coefficient]) mod 255]; a zero source byte
 * contributes zero.  The first used column (i == 0) overwrites the
 * destination, later columns are XORed in.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* One allocation holds the log of every inverse coefficient. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rr->rr_cols);

		ccount = rr->rr_col[c].rc_size;
		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
		if (ccount == 0)
			continue;
		src = abd_to_buf(rr->rr_col[c].rc_abd);
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rr->rr_firstdatacol;
			ASSERT3U(cc, >=, rr->rr_firstdatacol);
			ASSERT3U(cc, <, rr->rr_cols);
			ASSERT3U(cc, !=, c);

			dcount[j] = rr->rr_col[cc].rc_size;
			if (dcount[j] != 0)
				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
		}

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Skip bytes past the end of a short target. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

/*
 * General-case reconstruction of the data columns named in tgts[] (which
 * may also name parity columns to be avoided).  Picks the non-targeted
 * parity columns, builds and inverts the coefficient rows, and rebuilds
 * the missing data.  Non-linear ABDs are temporarily replaced with linear
 * copies and copied back afterwards.
 */
static void
vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
	int i, c, t, tt;
	unsigned int n;
	unsigned int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;
	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs if any non-linear ABDs are found.
	 */
	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
		ASSERT(rr->rr_col[i].rc_abd != NULL);
		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
			    KM_PUSHPAGE);

			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
				raidz_col_t *col = &rr->rr_col[c];

				bufs[c] = col->rc_abd;
				if (bufs[c] != NULL) {
					col->rc_abd = abd_alloc_linear(
					    col->rc_size, B_TRUE);
					abd_copy(col->rc_abd, bufs[c],
					    col->rc_size);
				}
			}

			break;
		}
	}

	n = rr->rr_cols - rr->rr_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rr->rr_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rr->rr_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rr->rr_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		parity_map[i] = c;
		i++;
	}

	/* One allocation: the rows[] and invrows[] storage, then used[]. */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* Append every data column that is not being reconstructed. */
	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rr->rr_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *col = &rr->rr_col[c];

			if (bufs[c] != NULL) {
				abd_copy(bufs[c], col->rc_abd, col->rc_size);
				abd_free(col->rc_abd);
			}
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
	}
}

/*
 * Reconstruct the bad columns of one row.  A column is a target if it is
 * listed in t[] (nt entries) or has rc_error != 0.  The walk over t[] uses
 * a single forward cursor, so t[] is presumably expected in ascending
 * column order -- confirm against callers.  Tries the math implementation
 * first, then the optimized P / Q / PQ routines, then the general solver.
 */
static void
vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
    const int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c, ret;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
		    (int)rr->rr_missingparity);
	}

	/* Start by assuming every column is bad; count down as good ones appear. */
	nbadparity = rr->rr_firstdatacol;
	nbaddata = rr->rr_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rr->rr_cols; c++) {
		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
			    "offset=%llx error=%u)",
			    rr, c, (int)rr->rr_col[c].rc_devidx,
			    (long long)rr->rr_col[c].rc_offset,
			    (int)rr->rr_col[c].rc_error);
		}
		if (c < rr->rr_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rr->rr_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rr->rr_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* Bad parity columns come first in tgts[]; dt points at the bad data. */
	dt = &tgts[nbadparity];

	/* Reconstruct using the new math implementation */
	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
	if (ret != RAIDZ_ORIGINAL_IMPL)
		return;

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	switch (nbaddata) {
	case 1:
		if (parity_valid[VDEV_RAIDZ_P]) {
			vdev_raidz_reconstruct_p(rr, dt, 1);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_Q]) {
			vdev_raidz_reconstruct_q(rr, dt, 1);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 2);
		break;

	case 2:
		ASSERT(rr->rr_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_P] &&
		    parity_valid[VDEV_RAIDZ_Q]) {
			vdev_raidz_reconstruct_pq(rr, dt, 2);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 2);

		break;
	}

	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
}

/*
 * Open a RAID-Z vdev: open all children and derive the parent's sizes and
 * ashifts from those that opened successfully.
 *
 * Returns EINVAL (with vs_aux = VDEV_AUX_BAD_LABEL) if nparity exceeds
 * VDEV_RAIDZ_MAXPARITY or there are fewer than nparity + 1 children; the
 * last child open error (with vs_aux = VDEV_AUX_NO_REPLICAS) if more than
 * nparity children failed to open; 0 otherwise.
 */
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t nparity = vdrz->vd_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		/*
		 * The -1/+1 makes an incoming 0 wrap to the maximum value,
		 * so a zero *asize / *max_asize is replaced by the child's
		 * size rather than winning the MIN.
		 */
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
	}
	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0)
			continue;
		*physical_ashift = vdev_best_ashift(*logical_ashift,
		    *physical_ashift, cvd->vdev_physical_ashift);
	}

	/* While expanding, size the vdev as if it had one fewer child. */
	if (vd->vdev_rz_expanding) {
		*asize *= vd->vdev_children - 1;
		*max_asize *= vd->vdev_children - 1;

		vd->vdev_min_asize = *asize;
	} else {
		*asize *= vd->vdev_children;
		*max_asize *= vd->vdev_children;
	}

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

/*
 * Close every non-NULL child of the RAID-Z vdev.
 */
static void
vdev_raidz_close(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++) {
		if (vd->vdev_child[c] != NULL)
			vdev_close(vd->vdev_child[c]);
	}
}

/*
 * Return the logical width to use, given the txg in which the allocation
 * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
 * BP was allocated.  Remapped BP's (that were relocated due to device
 * removal, see remap_blkptr_cb()), will have a more recent physical birth
 * which reflects when the BP was relocated, but we can ignore these because
 * they can't be on RAIDZ (device removal doesn't support RAIDZ).
 */
static uint64_t
vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
{
	reflow_node_t lookup = {
		.re_txg = txg,
	};
	avl_index_t where;

	uint64_t width;
	mutex_enter(&vdrz->vd_expand_lock);
	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
	if (re != NULL) {
		/* Exact match on txg. */
		width = re->re_logical_width;
	} else {
		/* Otherwise use the nearest earlier entry, if there is one. */
		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
		if (re != NULL)
			width = re->re_logical_width;
		else
			width = vdrz->vd_original_width;
	}
	mutex_exit(&vdrz->vd_expand_lock);
	return (width);
}

/*
 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
 * more space due to the lower data-to-parity ratio.  In this case it's
 * important to pass in the correct txg.  Note that vdev_gang_header_asize()
 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
 * regardless of txg.
This is assured because for a single data sector, we
 * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
 */
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	const uint64_t shift = vd->vdev_top->vdev_ashift;
	const uint64_t parity = vdrz->vd_nparity;
	/* Width that applies to an allocation made in txg. */
	const uint64_t width = vdev_raidz_get_logical_width(vdrz, txg);
	/* Data sectors needed for psize, rounded up to a whole sector. */
	const uint64_t data_sectors = ((psize - 1) >> shift) + 1;
	/* One group of parity sectors per (width - parity) data sectors. */
	const uint64_t parity_groups =
	    (data_sectors + width - parity - 1) / (width - parity);
	uint64_t asize = data_sectors + parity * parity_groups;

	/* Round the sector count up to a multiple of parity + 1. */
	asize = roundup(asize, parity + 1) << shift;

#ifdef ZFS_DEBUG
	/*
	 * Repeat the computation at the current physical width and check
	 * that it never needs more space than the value being returned.
	 */
	const uint64_t phys_width = vdrz->vd_physical_width;
	uint64_t asize_new = ((psize - 1) >> shift) + 1;

	asize_new += parity *
	    ((asize_new + phys_width - parity - 1) / (phys_width - parity));
	asize_new = roundup(asize_new, parity + 1) << shift;
	VERIFY3U(asize_new, <=, asize);
#endif

	return (asize);
}

/*
 * The allocatable space for a raidz vdev is N * sizeof(smallest child)
 * so each child must provide at least 1/Nth of its asize.
*/ static uint64_t vdev_raidz_min_asize(vdev_t *vd) { return ((vd->vdev_min_asize + vd->vdev_children - 1) / vd->vdev_children); } void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; ASSERT3P(rc->rc_abd, !=, NULL); rc->rc_error = zio->io_error; rc->rc_tried = 1; rc->rc_skipped = 0; } static void vdev_raidz_shadow_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; rc->rc_shadow_error = zio->io_error; } static void vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) { (void) rm; #ifdef ZFS_DEBUG - range_seg64_t logical_rs, physical_rs, remain_rs; + zfs_range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_asize(zio->io_vd, rr->rr_size, BP_GET_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); if (vdev_xlate_is_empty(&physical_rs)) { /* * If we are in the middle of expansion, the * physical->logical mapping is changing so vdev_xlate() * can't give us a reliable answer. */ return; } ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* * It would be nice to assert that rs_end is equal * to rc_offset + rc_size but there might be an * optional I/O at the end that is not accounted in * rc_size. 
*/ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } #endif } static void vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; vdev_raidz_generate_parity_row(rm, rr); for (int c = 0; c < rr->rr_scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; /* Verify physical to logical translation */ vdev_raidz_io_verify(zio, rm, rr, c); if (rc->rc_size == 0) continue; ASSERT3U(rc->rc_offset + rc->rc_size, <, cvd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3P(rc->rc_abd, !=, NULL); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, abd_get_size(rc->rc_abd), zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); if (rc->rc_shadow_devidx != INT_MAX) { vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; ASSERT3U( rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, cvd2->vdev_psize - VDEV_LABEL_END_SIZE); zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, rc->rc_shadow_offset, rc->rc_abd, abd_get_size(rc->rc_abd), zio->io_type, zio->io_priority, 0, vdev_raidz_shadow_child_done, rc)); } } } /* * Generate optional I/Os for skip sectors to improve aggregation contiguity. * This only works for vdev_raidz_map_alloc() (not _expanded()). 
*/ static void raidz_start_skip_writes(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t ashift = vd->vdev_top->vdev_ashift; raidz_map_t *rm = zio->io_vsd; ASSERT3U(rm->rm_nrows, ==, 1); raidz_row_t *rr = rm->rm_row[0]; for (int c = 0; c < rr->rr_scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (rc->rc_size != 0) continue; ASSERT3P(rc->rc_abd, ==, NULL); ASSERT3U(rc->rc_offset, <, cvd->vdev_psize - VDEV_LABEL_END_SIZE); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, NULL, 1ULL << ashift, zio->io_type, zio->io_priority, ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } } static void vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. */ for (int c = rr->rr_cols - 1; c >= 0; c--) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size == 0) continue; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (!vdev_readable(cvd)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ENXIO); rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; continue; } if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } } } static void vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) { vdev_t *vd = zio->io_vd; for (int i = 0; i < rm->rm_nphys_cols; i++) { raidz_col_t *prc = &rm->rm_phys_col[i]; if (prc->rc_size == 0) continue; 
ASSERT3U(prc->rc_devidx, ==, i); vdev_t *cvd = vd->vdev_child[i]; if (!vdev_readable(cvd)) { prc->rc_error = SET_ERROR(ENXIO); prc->rc_tried = 1; /* don't even try */ prc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { prc->rc_error = SET_ERROR(ESTALE); prc->rc_skipped = 1; continue; } zio_nowait(zio_vdev_child_io(zio, NULL, cvd, prc->rc_offset, prc->rc_abd, prc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, prc)); } } static void vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) { /* * If there are multiple rows, we will be hitting * all disks, so go ahead and read the parity so * that we are reading in decent size chunks. */ boolean_t forceparity = rm->rm_nrows > 1; if (rm->rm_phys_col) { vdev_raidz_io_start_read_phys_cols(zio, rm); } else { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_start_read_row(zio, rr, forceparity); } } } /* * Start an IO operation on a RAIDZ VDev * * Outline: * - For write operations: * 1. Generate the parity data * 2. Create child zio write operations to each column's vdev, for both * data and parity. * 3. If the column skips any sectors for padding, create optional dummy * write zio children for those areas to improve aggregation continuity. * - For read operations: * 1. Create child zio read operations to each data column's vdev to read * the range of data required for zio. * 2. If this is a scrub or resilver operation, or if any of the data * vdevs have had errors, then create zio read operations to the parity * columns' VDevs as well. 
*/
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	raidz_map_t *rm;

	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
	    BP_GET_BIRTH(zio->io_bp));
	/*
	 * When the width that applied at the BP's birth differs from the
	 * current physical width, the expanded map allocator is used.
	 */
	if (logical_width != vdrz->vd_physical_width) {
		zfs_locked_range_t *lr = NULL;
		uint64_t synced_offset = UINT64_MAX;
		uint64_t next_offset = UINT64_MAX;
		boolean_t use_scratch = B_FALSE;
		/*
		 * Note: when the expansion is completing, we set
		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
		 * in a later txg than when we last update spa_ubsync's state
		 * (see the end of spa_raidz_expand_thread()).  Therefore we
		 * may see vre_state!=SCANNING before
		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
		 * on disk, but the copying progress has been synced to disk
		 * (and reflected in spa_ubsync).  In this case it's fine to
		 * treat the expansion as completed, since if we crash there's
		 * no additional copying to do.
		 */
		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
			    &vdrz->vn_vre);
			/* Take the expansion range lock as a reader for this I/O. */
			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
			    zio->io_offset, zio->io_size, RL_READER);
			use_scratch =
			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
			    RRSS_SCRATCH_VALID);
			synced_offset =
			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
			next_offset = vdrz->vn_vre.vre_offset;
			/*
			 * If we haven't resumed expanding since importing the
			 * pool, vre_offset won't have been set yet.  In
			 * this case the next offset to be copied is the same
			 * as what was synced.
			 */
			if (next_offset == UINT64_MAX) {
				next_offset = synced_offset;
			}
		}
		if (use_scratch) {
			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
			    "%lld next_offset=%lld use_scratch=%u",
			    zio,
			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
			    (long long)zio->io_offset,
			    (long long)synced_offset,
			    (long long)next_offset,
			    use_scratch);
		}

		rm = vdev_raidz_map_alloc_expanded(zio,
		    tvd->vdev_ashift, vdrz->vd_physical_width,
		    logical_width, vdrz->vd_nparity,
		    synced_offset, next_offset, use_scratch);
		/* The map carries the range lock (NULL when none was taken). */
		rm->rm_lr = lr;
	} else {
		rm = vdev_raidz_map_alloc(zio,
		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
	}
	rm->rm_original_width = vdrz->vd_original_width;

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	if (zio->io_type == ZIO_TYPE_WRITE) {
		for (int i = 0; i < rm->rm_nrows; i++) {
			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
		}

		/* Skip writes are only issued for maps that are not expanded. */
		if (logical_width == vdrz->vd_physical_width) {
			raidz_start_skip_writes(zio);
		}
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_READ);
		vdev_raidz_io_start_read(zio, rm);
	}

	zio_execute(zio);
}

/*
 * Report a checksum error for a child of a RAID-Z device.
 *
 * Nothing is counted or posted for speculative I/O or for I/O at
 * ZIO_PRIORITY_REBUILD.  Otherwise the child's vs_checksum_errors is
 * incremented under vdev_stat_lock and a checksum ereport is posted with
 * rc->rc_abd as the good data and bad_data as the bad data.
 */
void
vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);
		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
		    rc->rc_abd, bad_data, &zbc);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 *
 * Returns the result of zio_checksum_error(), except that an ECKSUM on a
 * Direct I/O read is recorded on the zio and reported as 0 to the caller.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc = {0};
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	/*
	 * Any Direct I/O read that has a checksum error must be treated as
	 * suspicious as the contents of the buffer could be getting
	 * manipulated while the I/O is taking place. The checksum verify error
	 * will be reported to the top-level RAIDZ VDEV.
	 */
	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
		zio->io_error = ret;
		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
		zio_dio_chksum_verify_error_report(zio);
		zio_checksum_verified(zio);
		return (0);
	}

	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
{
	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	/*
	 * For each parity column that was read without error, set the buffer
	 * that was read aside in orig[] and give the column a fresh buffer
	 * for the regenerated parity.
	 */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		rc = &rr->rr_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;

		orig[c] = rc->rc_abd;
		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
	}

	/*
	 * Verify any empty sectors are zero filled to ensure the parity
	 * is calculated correctly even if these non-data sectors are damaged.
	 */
	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
		ret += vdev_draid_map_verify_empty(zio, rr);

	/*
	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
	 * isn't harmful but it does have the side effect of fixing stuff
	 * we didn't realize was necessary (i.e. even if we return 0).
	 */
	vdev_raidz_generate_parity_row(rm, rr);

	/* Compare what was read with what was regenerated, then free the copy. */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		rc = &rr->rr_col[c];

		if (!rc->rc_tried || rc->rc_error != 0)
			continue;

		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
			    c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
			vdev_raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		abd_free(orig[c]);
	}

	return (ret);
}

/*
 * Fold the normal and shadow errors of every column in the row through
 * zio_worst_error() and return the result (0 when no column has an error).
 */
static int
vdev_raidz_worst_error(raidz_row_t *rr)
{
	int error = 0;

	for (int c = 0; c < rr->rr_cols; c++) {
		error = zio_worst_error(error, rr->rr_col[c].rc_error);
		error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
	}

	return (error);
}

/*
 * Post-processing for one row of a read (the zio type is asserted to be
 * READ): tally the row's errors, verify parity where that is possible or
 * when resilvering, issue repair writes to columns that need them, and for
 * scrub/resilver I/O rewrite any shadow locations.
 */
static void
vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
{
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_error) {
			if (c < rr->rr_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			/* Errors on columns that were not skipped are unexpected. */
			if (!rc->rc_skipped)
				unexpected_errors++;
		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}

		if (rc->rc_force_repair)
			unexpected_errors++;
	}

	/*
	 * If we read more parity disks than were used for
	 * reconstruction, confirm that the other parity disks produced
	 * correct data.
	 *
	 * Note that we also regenerate parity when resilvering so we
	 * can write it out to failed devices later.
	 */
	if (parity_errors + parity_untried <
	    rr->rr_firstdatacol - data_errors ||
	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
		int n = raidz_parity_verify(zio, rr);
		unexpected_errors += n;
	}

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *vd = zio->io_vd;
			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

			/*
			 * Skip columns that may not be repaired, and columns
			 * that neither failed nor were flagged for a forced
			 * repair (zero-size columns are skipped unless forced).
			 */
			if (!rc->rc_allow_repair) {
				continue;
			} else if (!rc->rc_force_repair &&
			    (rc->rc_error == 0 || rc->rc_size == 0)) {
				continue;
			}
			/*
			 * We do not allow self healing for Direct I/O reads.
			 * See comment in vdev_raid_row_alloc().
			 */
			ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);

			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
			    "offset=%llx",
			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE,
			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}

	/*
	 * Scrub or resilver i/o's: overwrite any shadow locations with the
	 * good data.  This ensures that if we've already copied this sector,
	 * it will be corrected if it was damaged.  This writes more than is
	 * necessary, but since expansion is paused during scrub/resilver, at
	 * most a single row will have a shadow location.
	 */
	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *vd = zio->io_vd;

			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
				continue;
			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];

			/*
			 * Note: We don't want to update the repair stats
			 * because that would incorrectly indicate that there
			 * was bad data to repair, which we aren't sure about.
			 * By clearing the SCAN_THREAD flag, we prevent this
			 * from happening, despite having the REPAIR flag set.
			 * We need to set SELF_HEAL so that this i/o can't be
			 * bypassed by zio_vdev_io_start().
			 */
			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
			    NULL, NULL);
			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
			zio_nowait(cio);
		}
	}
}

/*
 * Undo simulated failures: for every column in the map flagged with
 * rc_need_orig_restore, copy rc_orig_data back over rc_abd and clear the
 * flag.  The saved copy itself is not freed here.
 */
static void
raidz_restore_orig_data(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_need_orig_restore) {
				abd_copy(rc->rc_abd,
				    rc->rc_orig_data, rc->rc_size);
				rc->rc_need_orig_restore = B_FALSE;
			}
		}
	}
}

/*
 * During raidz_reconstruct() for expanded VDEV, we need special consideration
 * for failure simulations.  See note in raidz_reconstruct() on simulating
 * failure of a pre-expansion device.
 *
 * Treating logical child i as failed, return TRUE if the given column should
 * be treated as failed.  The idea of logical children allows us to imagine
 * that a disk silently failed before a RAIDZ expansion (reads from this disk
 * succeed but return the wrong data).  Since the expansion doesn't verify
 * checksums, the incorrect data will be moved to new locations spread among
 * the children (going diagonally across them).
 *
 * Higher "logical child failures" (values of `i`) indicate these
 * "pre-expansion failures".  The first physical_width values imagine that a
 * current child failed; the next physical_width-1 values imagine that a
 * child failed before the most recent expansion; the next physical_width-2
 * values imagine a child failed in the expansion before that, etc.
*/
static boolean_t
raidz_simulate_failure(int physical_width, int original_width, int ashift,
    int i, raidz_col_t *rc)
{
	/* Linear sector index: sector row on the child times width, plus child. */
	uint64_t sector_id =
	    physical_width * (rc->rc_offset >> ashift) +
	    rc->rc_devidx;

	/*
	 * Walk the widths from the current one down to the original one.
	 * Logical child ids are numbered consecutively across widths, so
	 * subtract each width's count until i falls inside one.
	 */
	for (int w = physical_width; w >= original_width; w--) {
		if (i < w) {
			return (sector_id % w == i);
		} else {
			i -= w;
		}
	}
	ASSERT(!"invalid logical child id");
	return (B_FALSE);
}

/*
 * returns EINVAL if reconstruction of the block will not be possible
 * returns ECKSUM if this specific reconstruction failed
 * returns 0 on successful reconstruction
 *
 * ltgts holds ntgts logical child ids (see raidz_simulate_failure()) to
 * treat as failed in addition to the columns that already have an error.
 */
static int
raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
{
	raidz_map_t *rm = zio->io_vsd;
	int physical_width = zio->io_vd->vdev_children;
	/* An rm_original_width of 0 is treated as "same as physical width". */
	int original_width = (rm->rm_original_width != 0) ?
	    rm->rm_original_width : physical_width;
	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;

	if (dbgmsg) {
		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
	}

	/* Reconstruct each row */
	for (int r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
		int t = 0;
		int dead = 0;
		int dead_data = 0;

		if (dbgmsg)
			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			ASSERT0(rc->rc_need_orig_restore);
			/* Known errors count as dead but are not added to my_tgts. */
			if (rc->rc_error != 0) {
				dead++;
				if (c >= nparity)
					dead_data++;
				continue;
			}
			if (rc->rc_size == 0)
				continue;
			for (int lt = 0; lt < ntgts; lt++) {
				if (raidz_simulate_failure(physical_width,
				    original_width,
				    zio->io_vd->vdev_top->vdev_ashift,
				    ltgts[lt], rc)) {
					/*
					 * Save the column's contents once so
					 * they can be restored if this
					 * attempt does not pan out.
					 */
					if (rc->rc_orig_data == NULL) {
						rc->rc_orig_data =
						    abd_alloc_linear(
						    rc->rc_size, B_TRUE);
						abd_copy(rc->rc_orig_data,
						    rc->rc_abd, rc->rc_size);
					}
					rc->rc_need_orig_restore = B_TRUE;

					dead++;
					if (c >= nparity)
						dead_data++;
					/*
					 * Note: simulating failure of a
					 * pre-expansion device can hit more
					 * than one column, in which case we
					 * might try to simulate more failures
					 * than can be reconstructed, which is
					 * also more than the size of my_tgts.
					 * This check prevents accessing past
					 * the end of my_tgts.  The "dead >
					 * nparity" check below will fail this
					 * reconstruction attempt.
					 */
					if (t < VDEV_RAIDZ_MAXPARITY) {
						my_tgts[t++] = c;
						if (dbgmsg) {
							zfs_dbgmsg("simulating "
							    "failure of col %u "
							    "devidx %u", c,
							    (int)rc->rc_devidx);
						}
					}
					break;
				}
			}
		}
		if (dead > nparity) {
			/* reconstruction not possible */
			if (dbgmsg) {
				zfs_dbgmsg("reconstruction not possible; "
				    "too many failures");
			}
			raidz_restore_orig_data(rm);
			return (EINVAL);
		}
		/* Only rows that lost a data column need rebuilding. */
		if (dead_data > 0)
			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
	}

	/* Check for success */
	if (raidz_checksum_verify(zio) == 0) {
		/* A Direct I/O checksum error was recorded on the zio; stop here. */
		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
			return (0);

		/* Reconstruction succeeded - report errors */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];

			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				if (rc->rc_need_orig_restore) {
					/*
					 * Note: if this is a parity column,
					 * we don't really know if it's wrong.
					 * We need to let
					 * vdev_raidz_io_done_verified() check
					 * it, and if we set rc_error, it will
					 * think that it is a "known" error
					 * that doesn't need to be checked
					 * or corrected.
					 */
					if (rc->rc_error == 0 &&
					    c >= rr->rr_firstdatacol) {
						vdev_raidz_checksum_error(zio,
						    rc, rc->rc_orig_data);
						rc->rc_error =
						    SET_ERROR(ECKSUM);
					}
					rc->rc_need_orig_restore = B_FALSE;
				}
			}

			vdev_raidz_io_done_verified(zio, rr);
		}

		zio_checksum_verified(zio);

		if (dbgmsg) {
			zfs_dbgmsg("reconstruction successful "
			    "(checksum verified)");
		}
		return (0);
	}

	/* Reconstruction failed - restore original data */
	raidz_restore_orig_data(rm);
	if (dbgmsg) {
		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
		    "failed", zio);
	}
	return (ECKSUM);
}

/*
 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * The order that we find the various possible combinations of failed
 * disks is dictated by these rules:
 * - Examine each "slot" (the "i" in tgts[i])
 *   - Try to increment this slot (tgts[i] += 1)
 *   - if we can't increment because it runs into the next slot,
 *     reset our slot to the minimum, and examine the next slot
 *
 *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
 *  3 columns to reconstruct), we will generate the following sequence:
 *
 *  STATE        ACTION
 *  0 1 2        special case: skip since these are all parity
 *  0 1   3      first slot: reset to 0; middle slot: increment to 2
 *  0   2 3      first slot: increment to 1
 *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
 *  0 1     4    first: reset to 0; middle: increment to 2
 *  0   2   4    first: increment to 1
 *    1 2   4    first: reset to 0; middle: increment to 3
 *  0     3 4    first: increment to 1
 *    1   3 4    first: increment to 2
 *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
 *  0 1       5  first: reset to 0; middle: increment to 2
 *  0   2     5  first: increment to 1
 *    1 2     5  first: reset to 0; middle: increment to 3
 *  0     3   5  first: increment to 1
 *    1   3   5  first: increment to 2
 *      2 3   5  first: reset to 0; middle: increment to 4
 *  0       4 5  first: increment to 1
 *    1     4 5  first: increment to 2
 *      2   4 5  first: increment to 3
 *        3 4 5  done
 *
 * This strategy works for dRAID but is less efficient when there are a large
 * number of child vdevs and therefore permutations to check. Furthermore,
 * since the raidz_map_t rows likely do not overlap, reconstruction would be
 * possible as long as there are no more than nparity data errors per row.
 * These additional permutations are not currently checked but could be as
 * a future improvement.
 *
 * Returns 0 on success, ECKSUM on failure.
*/
static int
vdev_raidz_combrec(zio_t *zio)
{
	int nparity = vdev_get_nparity(zio->io_vd);
	raidz_map_t *rm = zio->io_vsd;
	int physical_width = zio->io_vd->vdev_children;
	int original_width = (rm->rm_original_width != 0) ?
	    rm->rm_original_width : physical_width;

	/* Give up right away if any row already has more errors than parity. */
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		int total_errors = 0;

		for (int c = 0; c < rr->rr_cols; c++) {
			if (rr->rr_col[c].rc_error)
				total_errors++;
		}

		if (total_errors > nparity)
			return (vdev_raidz_worst_error(rr));
	}

	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
		/*
		 * tstore has one slot before and one after the targets so
		 * that ltgts[-1] and ltgts[num_failures] can act as sentinels.
		 */
		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
		int *ltgts = &tstore[1]; /* value is logical child ID */


		/*
		 * Determine number of logical children, n.  See comment
		 * above raidz_simulate_failure().
		 */
		int n = 0;
		for (int w = physical_width;
		    w >= original_width; w--) {
			n += w;
		}

		ASSERT3U(num_failures, <=, nparity);
		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);

		/* Handle corner cases in combrec logic */
		ltgts[-1] = -1;
		for (int i = 0; i < num_failures; i++) {
			ltgts[i] = i;
		}
		ltgts[num_failures] = n;

		for (;;) {
			int err = raidz_reconstruct(zio, ltgts, num_failures,
			    nparity);
			if (err == EINVAL) {
				/*
				 * Reconstruction not possible with this #
				 * failures; try more failures.
				 */
				break;
			} else if (err == 0)
				return (0);

			/* Compute next targets to try */
			for (int t = 0; ; t++) {
				ASSERT3U(t, <, num_failures);
				ltgts[t]++;
				if (ltgts[t] == n) {
					/* try more failures */
					ASSERT3U(t, ==, num_failures - 1);
					if (zfs_flags &
					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
						zfs_dbgmsg("reconstruction "
						    "failed for num_failures="
						    "%u; tried all "
						    "combinations",
						    num_failures);
					}
					break;
				}

				ASSERT3U(ltgts[t], <, n);
				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);

				/*
				 * If that spot is available, we're done here.
				 * Try the next combination.
				 */
				if (ltgts[t] != ltgts[t + 1])
					break; // found next combination

				/*
				 * Otherwise, reset this tgt to the minimum,
				 * and move on to the next tgt.
				 */
				ltgts[t] = ltgts[t - 1] + 1;
				ASSERT3U(ltgts[t], ==, t);
			}

			/* Increase the number of failures and keep trying. */
			if (ltgts[num_failures - 1] == n)
				break;
		}
	}
	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruction failed for all num_failures");
	return (ECKSUM);
}

/*
 * Reconstruct the columns listed in t[0..nt-1] in every row of the map by
 * calling vdev_raidz_reconstruct_row() on each row in turn.
 */
void
vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
{
	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
		raidz_row_t *rr = rm->rm_row[row];
		vdev_raidz_reconstruct_row(rm, rr, t, nt);
	}
}

/*
 * Complete a write IO operation on a RAIDZ VDev
 *
 * Outline:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 */
static void
vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
{
	int normal_errors = 0;
	int shadow_errors = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);

	/* Normal and shadow write failures are counted separately. */
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_error != 0) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
			normal_errors++;
		}
		if (rc->rc_shadow_error != 0) {
			ASSERT(rc->rc_shadow_error != ECKSUM);
			shadow_errors++;
		}
	}

	/*
	 * Treat partial writes as a success. If we couldn't write enough
	 * columns to reconstruct the data, the I/O failed.  Otherwise, good
	 * enough.  Note that in the case of a shadow write (during raidz
	 * expansion), depending on if we crash, either the normal (old) or
	 * shadow (new) location may become the "real" version of the block,
	 * so both locations must have sufficient redundancy.
	 *
	 * Now that we support write reallocation, it would be better
	 * to treat partial failure as real failure unless there are
	 * no non-degraded top-level vdevs left, and not update DTLs
	 * if we intend to reallocate.
	 */
	if (normal_errors > rr->rr_firstdatacol ||
	    shadow_errors > rr->rr_firstdatacol) {
		zio->io_error = zio_worst_error(zio->io_error,
		    vdev_raidz_worst_error(rr));
	}
}

/*
 * For one row of a read, reconstruct the data columns that are already
 * known to have failed, provided the row's errors are correctable with the
 * parity that was read.  Columns reporting ECKSUM are reported, have their
 * error cleared and are flagged for a forced repair first.
 */
static void
vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
    raidz_row_t *rr)
{
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * If scrubbing and a replacing/sparing child vdev determined
		 * that not all of its children have an identical copy of the
		 * data, then clear the error so the column is treated like
		 * any other read and force a repair to correct the damage.
		 */
		if (rc->rc_error == ECKSUM) {
			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
			rc->rc_force_repair = 1;
			rc->rc_error = 0;
		}

		if (rc->rc_error) {
			if (c < rr->rr_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			total_errors++;
		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * If there were data errors and the number of errors we saw was
	 * correctable -- less than or equal to the number of parity disks read
	 * -- reconstruct based on the missing data.
	 */
	if (data_errors != 0 &&
	    total_errors <= rr->rr_firstdatacol - parity_untried) {
		/*
		 * We either attempt to read all the parity columns or
		 * none of them. If we didn't try to read parity, we
		 * wouldn't be here in the correctable case. There must
		 * also have been fewer parity errors than parity
		 * columns or, again, we wouldn't be in this code path.
		 */
		ASSERT(parity_untried == 0);
		ASSERT(parity_errors < rr->rr_firstdatacol);

		/*
		 * Identify the data columns that reported an error.
		 */
		int n = 0;
		int tgts[VDEV_RAIDZ_MAXPARITY];
		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_error != 0) {
				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
				tgts[n++] = c;
			}
		}

		ASSERT(rr->rr_firstdatacol >= n);

		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
	}
}

/*
 * Return the number of reads issued.
 *
 * Resets the row's missing-data/parity counters and issues a read for
 * every non-empty column that has not been tried yet.
 */
static int
vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;
	int nread = 0;

	rr->rr_missingdata = 0;
	rr->rr_missingparity = 0;

	/*
	 * If this row contains empty sectors which are not required
	 * for a normal read then allocate an ABD for them now so they
	 * may be read, verified, and any needed repairs performed.
	 */
	if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
		vdev_draid_map_alloc_empty(zio, rr);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		if (rc->rc_tried || rc->rc_size == 0)
			continue;

		zio_nowait(zio_vdev_child_io(zio, NULL,
		    vd->vdev_child[rc->rc_devidx],
		    rc->rc_offset, rc->rc_abd, rc->rc_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_raidz_child_done, rc));
		nread++;
	}
	return (nread);
}

/*
 * We're here because either there were too many errors to even attempt
 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
 * failed. In either case, there is enough bad data to prevent reconstruction.
 * Start checksum ereports for all children which haven't failed.
*/
static void
vdev_raidz_io_done_unrecoverable(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];

			/* Columns that already carry an error are not reported again. */
			if (rc->rc_error != 0)
				continue;

			/*
			 * Zero the whole structure before filling in the two
			 * fields used here, as raidz_checksum_verify() does,
			 * so no indeterminate bytes are handed to the ereport
			 * code through &zbc.
			 */
			zio_bad_cksum_t zbc = {0};
			zbc.zbc_has_cksum = 0;
			zbc.zbc_injected = rm->rm_ecksuminjected;

			/* Count the checksum error against the child itself. */
			mutex_enter(&cvd->vdev_stat_lock);
			cvd->vdev_stat.vs_checksum_errors++;
			mutex_exit(&cvd->vdev_stat_lock);
			(void) zfs_ereport_start_checksum(zio->io_spa,
			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
			    rc->rc_size, &zbc);
		}
	}
}

void
vdev_raidz_io_done(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT(zio->io_bp != NULL);
	if (zio->io_type == ZIO_TYPE_WRITE) {
		for (int i = 0; i < rm->rm_nrows; i++) {
			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
		}
	} else {
		if (rm->rm_phys_col) {
			/*
			 * This is an aggregated read.  Copy the data and status
			 * from the aggregate abd's to the individual rows.
			 */
			for (int i = 0; i < rm->rm_nrows; i++) {
				raidz_row_t *rr = rm->rm_row[i];

				for (int c = 0; c < rr->rr_cols; c++) {
					raidz_col_t *rc = &rr->rr_col[c];
					if (rc->rc_tried || rc->rc_size == 0)
						continue;

					raidz_col_t *prc =
					    &rm->rm_phys_col[rc->rc_devidx];
					rc->rc_error = prc->rc_error;
					rc->rc_tried = prc->rc_tried;
					rc->rc_skipped = prc->rc_skipped;
					if (c >= rr->rr_firstdatacol) {
						/*
						 * Note: this is slightly faster
						 * than using abd_copy_off().
*/ char *physbuf = abd_to_buf( prc->rc_abd); void *physloc = physbuf + rc->rc_offset - prc->rc_offset; abd_copy_from_buf(rc->rc_abd, physloc, rc->rc_size); } } } } for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, rm, rr); } if (raidz_checksum_verify(zio) == 0) { if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) goto done; for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); } zio_checksum_verified(zio); } else { /* * A sequential resilver has no checksum which makes * combinatoral reconstruction impossible. This code * path is unreachable since raidz_checksum_verify() * has no checksum to verify and must succeed. */ ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); /* * This isn't a typical situation -- either we got a * read error or a child silently returned bad data. * Read every block so we can try again with as much * data and parity as we can track down. If we've * already been through once before, all children will * be marked as tried so we'll proceed to combinatorial * reconstruction. */ int nread = 0; for (int i = 0; i < rm->rm_nrows; i++) { nread += vdev_raidz_read_all(zio, rm->rm_row[i]); } if (nread != 0) { /* * Normally our stage is VDEV_IO_DONE, but if * we've already called redone(), it will have * changed to VDEV_IO_START, in which case we * don't want to call redone() again. */ if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) zio_vdev_io_redone(zio); return; } /* * It would be too expensive to try every possible * combination of failed sectors in every row, so * instead we try every combination of failed current or * past physical disk. This means that if the incorrect * sectors were all on Nparity disks at any point in the * past, we will find the correct data. The only known * case where this is less durable than a non-expanded * RAIDZ, is if we have a silent failure during * expansion. 
In that case, one block could be * partially in the old format and partially in the * new format, so we'd lost some sectors from the old * format and some from the new format. * * e.g. logical_width=4 physical_width=6 * the 15 (6+5+4) possible failed disks are: * width=6 child=0 * width=6 child=1 * width=6 child=2 * width=6 child=3 * width=6 child=4 * width=6 child=5 * width=5 child=0 * width=5 child=1 * width=5 child=2 * width=5 child=3 * width=5 child=4 * width=4 child=0 * width=4 child=1 * width=4 child=2 * width=4 child=3 * And we will try every combination of Nparity of these * failing. * * As a first pass, we can generate every combo, * and try reconstructing, ignoring any known * failures. If any row has too many known + simulated * failures, then we bail on reconstructing with this * number of simulated failures. As an improvement, * we could detect the number of whole known failures * (i.e. we have known failures on these disks for * every row; the disks never succeeded), and * subtract that from the max # failures to simulate. * We could go even further like the current * combrec code, but that doesn't seem like it * gains us very much. If we simulate a failure * that is also a known failure, that's fine. 
*/ zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { vdev_raidz_io_done_unrecoverable(zio); } } } done: if (rm->rm_lr != NULL) { zfs_rangelock_exit(rm->rm_lr); rm->rm_lr = NULL; } } static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { vdev_raidz_t *vdrz = vd->vdev_tsd; if (faulted > vdrz->vd_nparity) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } /* * Determine if any portion of the provided block resides on a child vdev * with a dirty DTL and therefore needs to be resilvered. The function * assumes that at least one DTL is dirty which implies that full stripe * width blocks must be resilvered. */ static boolean_t vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_raidz_t *vdrz = vd->vdev_tsd; /* * If we're in the middle of a RAIDZ expansion, this block may be in * the old and/or new location. For simplicity, always resilver it. */ if (vdrz->vn_vre.vre_state == DSS_SCANNING) return (B_TRUE); uint64_t dcols = vd->vdev_children; uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = DVA_GET_OFFSET(dva) >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = ((psize - 1) >> ashift) + 1; /* The first column for this stripe. */ uint64_t f = b % dcols; /* Unreachable by sequential resilver. 
*/ ASSERT3U(phys_birth, !=, TXG_UNKNOWN); if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) return (B_FALSE); if (s + nparity >= dcols) return (B_TRUE); for (uint64_t c = 0; c < s + nparity; c++) { uint64_t devidx = (f + c) % dcols; vdev_t *cvd = vd->vdev_child[devidx]; /* * dsl_scan_need_resilver() already checked vd with * vdev_dtl_contains(). So here just check cvd with * vdev_dtl_empty(), cheaper and a good approximation. */ if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) return (B_TRUE); } return (B_FALSE); } static void -vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs, range_seg64_t *remain_rs) +vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { (void) remain_rs; vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); vdev_raidz_t *vdrz = raidvd->vdev_tsd; if (vdrz->vn_vre.vre_state == DSS_SCANNING) { /* * We're in the middle of expansion, in which case the * translation is in flux. Any answer we give may be wrong * by the time we return, so it isn't safe for the caller to * act on it. Therefore we say that this range isn't present * on any children. The only consumers of this are "zpool * initialize" and trimming, both of which are "best effort" * anyway. 
*/ physical_rs->rs_start = physical_rs->rs_end = 0; remain_rs->rs_start = remain_rs->rs_end = 0; return; } uint64_t width = vdrz->vd_physical_width; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; /* make sure the offsets are block-aligned */ ASSERT0(logical_rs->rs_start % (1 << ashift)); ASSERT0(logical_rs->rs_end % (1 << ashift)); uint64_t b_start = logical_rs->rs_start >> ashift; uint64_t b_end = logical_rs->rs_end >> ashift; uint64_t start_row = 0; if (b_start > tgt_col) /* avoid underflow */ start_row = ((b_start - tgt_col - 1) / width) + 1; uint64_t end_row = 0; if (b_end > tgt_col) end_row = ((b_end - tgt_col - 1) / width) + 1; physical_rs->rs_start = start_row << ashift; physical_rs->rs_end = end_row << ashift; ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, logical_rs->rs_end - logical_rs->rs_start); } static void raidz_reflow_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = arg; int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; vdev_raidz_expand_t *vre = spa->spa_raidz_expand; /* * Ensure there are no i/os to the range that is being committed. */ uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); mutex_enter(&vre->vre_lock); uint64_t new_offset = MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); /* * We should not have committed anything that failed. */ VERIFY3U(vre->vre_failed_offset, >=, old_offset); mutex_exit(&vre->vre_lock); zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, old_offset, new_offset - old_offset, RL_WRITER); /* * Update the uberblock that will be written when this txg completes. 
*/ RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); vre->vre_offset_pertxg[txgoff] = 0; zfs_rangelock_exit(lr); mutex_enter(&vre->vre_lock); vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; vre->vre_bytes_copied_pertxg[txgoff] = 0; mutex_exit(&vre->vre_lock); vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); VERIFY0(zap_update(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); } static void raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = arg; vdev_raidz_expand_t *vre = spa->spa_raidz_expand; vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); vdev_raidz_t *vdrz = raidvd->vdev_tsd; for (int i = 0; i < TXG_SIZE; i++) VERIFY0(vre->vre_offset_pertxg[i]); reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; re->re_logical_width = vdrz->vd_physical_width; mutex_enter(&vdrz->vd_expand_lock); avl_add(&vdrz->vd_expand_txgs, re); mutex_exit(&vdrz->vd_expand_lock); vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); /* * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS * will get written (based on vd_expand_txgs). */ vdev_config_dirty(vd); /* * Before we change vre_state, the on-disk state must reflect that we * have completed all copying, so that vdev_raidz_io_start() can use * vre_state to determine if the reflow is in progress. See also the * end of spa_raidz_expand_thread(). 
*/ VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, raidvd->vdev_ms_count << raidvd->vdev_ms_shift); vre->vre_end_time = gethrestime_sec(); vre->vre_state = DSS_FINISHED; uint64_t state = vre->vre_state; VERIFY0(zap_update(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, sizeof (state), 1, &state, tx)); uint64_t end_time = vre->vre_end_time; VERIFY0(zap_update(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, sizeof (end_time), 1, &end_time, tx)); spa->spa_uberblock.ub_raidz_reflow_info = 0; spa_history_log_internal(spa, "raidz vdev expansion completed", tx, "%s vdev %llu new width %llu", spa_name(spa), (unsigned long long)vd->vdev_id, (unsigned long long)vd->vdev_children); spa->spa_raidz_expand = NULL; raidvd->vdev_rz_expanding = B_FALSE; spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); spa_notify_waiters(spa); /* * While we're in syncing context take the opportunity to * setup a scrub. All the data has been sucessfully copied * but we have not validated any checksums. */ setup_sync_arg_t setup_sync_arg = { .func = POOL_SCAN_SCRUB, .txgstart = 0, .txgend = 0, }; if (zfs_scrub_after_expand && dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) { dsl_scan_setup_sync(&setup_sync_arg, tx); } } /* * State of one copy batch. */ typedef struct raidz_reflow_arg { vdev_raidz_expand_t *rra_vre; /* Global expantion state. */ zfs_locked_range_t *rra_lr; /* Range lock of this batch. */ uint64_t rra_txg; /* TXG of this batch. */ uint_t rra_ashift; /* Ashift of the vdev. */ uint32_t rra_tbd; /* Number of in-flight ZIOs. */ uint32_t rra_writes; /* Number of write ZIOs. */ zio_t *rra_zio[]; /* Write ZIO pointers. */ } raidz_reflow_arg_t; /* * Write of the new location on one child is done. Once all of them are done * we can unlock and free everything. 
*/ static void raidz_reflow_write_done(zio_t *zio) { raidz_reflow_arg_t *rra = zio->io_private; vdev_raidz_expand_t *vre = rra->rra_vre; abd_free(zio->io_abd); mutex_enter(&vre->vre_lock); if (zio->io_error != 0) { /* Force a reflow pause on errors */ vre->vre_failed_offset = MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); } ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); vre->vre_outstanding_bytes -= zio->io_size; if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < vre->vre_failed_offset) { vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += zio->io_size; } cv_signal(&vre->vre_cv); boolean_t done = (--rra->rra_tbd == 0); mutex_exit(&vre->vre_lock); if (!done) return; spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); zfs_rangelock_exit(rra->rra_lr); kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes); } /* * Read of the old location on one child is done. Once all of them are done * writes should have all the data and we can issue them. */ static void raidz_reflow_read_done(zio_t *zio) { raidz_reflow_arg_t *rra = zio->io_private; vdev_raidz_expand_t *vre = rra->rra_vre; /* Reads of only one block use write ABDs. For bigger free gangs. */ if (zio->io_size > (1 << rra->rra_ashift)) abd_free(zio->io_abd); /* * If the read failed, or if it was done on a vdev that is not fully * healthy (e.g. a child that has a resilver in progress), we may not * have the correct data. Note that it's OK if the write proceeds. * It may write garbage but the location is otherwise unused and we * will retry later due to vre_failed_offset. 
*/ if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", (long long)rra->rra_lr->lr_offset, (long long)rra->rra_lr->lr_length, (long long)rra->rra_txg, zio->io_error, vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), vdev_dtl_empty(zio->io_vd, DTL_MISSING)); mutex_enter(&vre->vre_lock); /* Force a reflow pause on errors */ vre->vre_failed_offset = MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); mutex_exit(&vre->vre_lock); } if (atomic_dec_32_nv(&rra->rra_tbd) > 0) return; uint32_t writes = rra->rra_tbd = rra->rra_writes; for (uint64_t i = 0; i < writes; i++) zio_nowait(rra->rra_zio[i]); } static void raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, dmu_tx_t *tx) { int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (offset == 0) return; mutex_enter(&vre->vre_lock); ASSERT3U(vre->vre_offset, <=, offset); vre->vre_offset = offset; mutex_exit(&vre->vre_lock); if (vre->vre_offset_pertxg[txgoff] == 0) { dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, spa, tx); } vre->vre_offset_pertxg[txgoff] = offset; } static boolean_t vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) { for (int i = 0; i < raidz_vd->vdev_children; i++) { /* Quick check if a child is being replaced */ if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) return (B_TRUE); } return (B_FALSE); } static boolean_t raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint_t ashift = vd->vdev_top->vdev_ashift; zfs_range_seg_t *rs = zfs_range_tree_first(rt); if (rt == NULL) return (B_FALSE); uint64_t offset = zfs_rs_get_start(rs, rt); ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); uint64_t size = zfs_rs_get_end(rs, rt) - offset; ASSERT3U(size, >=, 1 << ashift); ASSERT(IS_P2ALIGNED(size, 1 << ashift)); uint64_t blkid = offset >> ashift; uint_t old_children = 
vd->vdev_children - 1; /* * We can only progress to the point that writes will not overlap * with blocks whose progress has not yet been recorded on disk. * Since partially-copied rows are still read from the old location, * we need to stop one row before the sector-wise overlap, to prevent * row-wise overlap. * * Note that even if we are skipping over a large unallocated region, * we can't move the on-disk progress to `offset`, because concurrent * writes/allocations could still use the currently-unallocated * region. */ uint64_t ubsync_blkid = RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; uint64_t next_overwrite_blkid = ubsync_blkid + ubsync_blkid / old_children - old_children; VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); if (blkid >= next_overwrite_blkid) { raidz_reflow_record_progress(vre, next_overwrite_blkid << ashift, tx); return (B_TRUE); } size = MIN(size, raidz_expand_max_copy_bytes); size = MIN(size, (uint64_t)old_children * MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE)); size = MAX(size, 1 << ashift); uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid); size = (uint64_t)blocks << ashift; zfs_range_tree_remove(rt, offset, size); uint_t reads = MIN(blocks, old_children); uint_t writes = MIN(blocks, vd->vdev_children); raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) + sizeof (zio_t *) * writes, KM_SLEEP); rra->rra_vre = vre; rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, offset, size, RL_WRITER); rra->rra_txg = dmu_tx_get_txg(tx); rra->rra_ashift = ashift; rra->rra_tbd = reads; rra->rra_writes = writes; raidz_reflow_record_progress(vre, offset + size, tx); /* * SCL_STATE will be released when the read and write are done, * by raidz_reflow_write_done(). 
*/ spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* check if a replacing vdev was added, if so treat it as an error */ if (vdev_raidz_expand_child_replacing(vd)) { zfs_dbgmsg("replacing vdev encountered, reflow paused at " "offset=%llu txg=%llu", (long long)rra->rra_lr->lr_offset, (long long)rra->rra_txg); mutex_enter(&vre->vre_lock); vre->vre_failed_offset = MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); cv_signal(&vre->vre_cv); mutex_exit(&vre->vre_lock); /* drop everything we acquired */ spa_config_exit(spa, SCL_STATE, spa); zfs_rangelock_exit(rra->rra_lr); kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes); return (B_TRUE); } mutex_enter(&vre->vre_lock); vre->vre_outstanding_bytes += size; mutex_exit(&vre->vre_lock); /* Allocate ABD and ZIO for each child we write. */ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; zio_t *pio = spa->spa_txg_zio[txgoff]; uint_t b = blocks / vd->vdev_children; uint_t bb = blocks % vd->vdev_children; for (uint_t i = 0; i < writes; i++) { uint_t n = b + (i < bb); abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE); rra->rra_zio[i] = zio_vdev_child_io(pio, NULL, vd->vdev_child[(blkid + i) % vd->vdev_children], ((blkid + i) / vd->vdev_children) << ashift, abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra); } /* * Allocate and issue ZIO for each child we read. For reads of only * one block we can use respective writer ABDs, since they will also * have only one block. For bigger reads create gang ABDs and fill * them with respective blocks from writer ABDs. 
*/ b = blocks / old_children; bb = blocks % old_children; for (uint_t i = 0; i < reads; i++) { uint_t n = b + (i < bb); abd_t *abd; if (n > 1) { abd = abd_alloc_gang(); for (uint_t j = 0; j < n; j++) { uint_t b = j * old_children + i; abd_t *cabd = abd_get_offset_size( rra->rra_zio[b % vd->vdev_children]->io_abd, (b / vd->vdev_children) << ashift, 1 << ashift); abd_gang_add(abd, cabd, B_TRUE); } } else { abd = rra->rra_zio[i]->io_abd; } zio_nowait(zio_vdev_child_io(pio, NULL, vd->vdev_child[(blkid + i) % old_children], ((blkid + i) / old_children) << ashift, abd, n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra)); } return (B_FALSE); } /* * For testing (ztest specific) */ static void raidz_expand_pause(uint_t pause_point) { while (raidz_expand_pause_point != 0 && raidz_expand_pause_point <= pause_point) delay(hz); } static void raidz_scratch_child_done(zio_t *zio) { zio_t *pio = zio->io_private; mutex_enter(&pio->io_lock); pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); } /* * Reflow the beginning portion of the vdev into an intermediate scratch area * in memory and on disk. This operation must be persisted on disk before we * proceed to overwrite the beginning portion with the reflowed data. * * This multi-step task can fail to complete if disk errors are encountered * and we can return here after a pause (waiting for disk to become healthy). 
*/
static void
raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
{
	vdev_raidz_expand_t *vre = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	zio_t *pio;
	int error;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	int ashift = raidvd->vdev_ashift;
	/* Per-child bytes written to (and later read back from) scratch. */
	uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
	    uint64_t);
	uint64_t logical_size = write_size * raidvd->vdev_children;
	/* Per-child bytes read from the old, one-child-narrower layout. */
	uint64_t read_size =
	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
	    1 << ashift);

	/*
	 * The scratch space must be large enough to get us to the point
	 * that one row does not overlap itself when moved.  This is checked
	 * by vdev_raidz_attach_check().
	 */
	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
	VERIFY3U(write_size, <=, read_size);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
	    0, logical_size, RL_WRITER);

	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
	    KM_SLEEP);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		abds[i] = abd_alloc_linear(read_size, B_FALSE);
	}

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);

	/*
	 * If we have already written the scratch area then we must read from
	 * there, since new writes were redirected there while we were paused
	 * or the original location may have been partially overwritten with
	 * reflowed data.
	 */
	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
		/*
		 * Read from scratch space.
		 */
		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		for (int i = 0; i < raidvd->vdev_children; i++) {
			/*
			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
			 * to the offset to calculate the physical offset to
			 * write to.  Passing in a negative offset makes us
			 * access the scratch area.
			 */
			zio_nowait(zio_vdev_child_io(pio, NULL,
			    raidvd->vdev_child[i],
			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
		}
		error = zio_wait(pio);
		if (error != 0) {
			zfs_dbgmsg("reflow: error %d reading scratch location",
			    error);
			goto io_error_exit;
		}
		goto overwrite;
	}

	/*
	 * Read from original location.
	 */
	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    0, abds[i], read_size, ZIO_TYPE_READ,
		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
		    raidz_scratch_child_done, pio));
	}
	error = zio_wait(pio);
	if (error != 0) {
		zfs_dbgmsg("reflow: error %d reading original location", error);
		/*
		 * Shared failure exit, also reached by goto from the other
		 * I/O error paths: free the buffers and drop both locks.
		 */
io_error_exit:
		for (int i = 0; i < raidvd->vdev_children; i++)
			abd_free(abds[i]);
		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
		zfs_rangelock_exit(lr);
		spa_config_exit(spa, SCL_STATE, FTAG);
		return;
	}

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);

	/*
	 * Reflow in memory.
	 */
	uint64_t logical_sectors = logical_size >> ashift;
	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
		int oldchild = i % (raidvd->vdev_children - 1);
		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;

		int newchild = i % raidvd->vdev_children;
		uint64_t newoff = (i / raidvd->vdev_children) << ashift;

		/* a single sector should not be copying over itself */
		ASSERT(!(newchild == oldchild && newoff == oldoff));

		abd_copy_off(abds[newchild], abds[oldchild],
		    newoff, oldoff, 1 << ashift);
	}

	/*
	 * Verify that we filled in everything we intended to (write_size on
	 * each child).
	 */
	VERIFY0(logical_sectors % raidvd->vdev_children);
	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
	    write_size);

	/*
	 * Write to scratch location (boot area).
	 */
	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		/*
		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
		 * the offset to calculate the physical offset to write to.
		 * Passing in a negative offset lets us access the boot area.
		 */
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
	}
	error = zio_wait(pio);
	if (error != 0) {
		zfs_dbgmsg("reflow: error %d writing scratch location", error);
		goto io_error_exit;
	}
	pio = zio_root(spa, NULL, NULL, 0);
	zio_flush(pio, raidvd);
	zio_wait(pio);

	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
	    (long long)logical_size);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);

	/*
	 * Update uberblock to indicate that scratch space is valid.  This is
	 * needed because after this point, the real location may be
	 * overwritten.  If we crash, we need to get the data from the
	 * scratch space, rather than the real location.
	 *
	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
	 * will prefer this uberblock.
	 *
	 * The sync is wrapped in VERIFY0() rather than ASSERT0() so that the
	 * call is not dropped where assertions are compiled out; this also
	 * matches vdev_raidz_reflow_copy_scratch().
	 */
	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
	spa->spa_ubsync.ub_timestamp++;
	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
	if (spa_multihost(spa))
		mmp_update_uberblock(spa, &spa->spa_ubsync);

	zfs_dbgmsg("reflow: uberblock updated "
	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
	    (long long)spa->spa_ubsync.ub_txg,
	    (long long)logical_size,
	    (long long)spa->spa_ubsync.ub_timestamp);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);

	/*
	 * Overwrite with reflow'ed data.
	 */
overwrite:
	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    0, abds[i], write_size, ZIO_TYPE_WRITE,
		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
		    raidz_scratch_child_done, pio));
	}
	error = zio_wait(pio);
	if (error != 0) {
		/*
		 * When we exit early here and drop the range lock, new
		 * writes will go into the scratch area so we'll need to
		 * read from there when we return after pausing.
		 */
		zfs_dbgmsg("reflow: error %d writing real location", error);
		/*
		 * Update the uberblock that is written when this txg completes.
		 */
		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
		    logical_size);
		goto io_error_exit;
	}
	pio = zio_root(spa, NULL, NULL, 0);
	zio_flush(pio, raidvd);
	zio_wait(pio);

	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
	    (long long)logical_size);
	for (int i = 0; i < raidvd->vdev_children; i++)
		abd_free(abds[i]);
	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);

	/*
	 * Update uberblock to indicate that the initial part has been
	 * reflow'ed.  This is needed because after this point (when we exit
	 * the rangelock), we allow regular writes to this region, which will
	 * be written to the new location only (because reflow_offset_next ==
	 * reflow_offset_synced).  If we crashed and re-copied from the
	 * scratch space, we would lose the regular writes.
	 *
	 * As above, VERIFY0() keeps the sync call in every build.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
	    logical_size);
	spa->spa_ubsync.ub_timestamp++;
	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
	if (spa_multihost(spa))
		mmp_update_uberblock(spa, &spa->spa_ubsync);

	zfs_dbgmsg("reflow: uberblock updated "
	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
	    (long long)spa->spa_ubsync.ub_txg,
	    (long long)logical_size,
	    (long long)spa->spa_ubsync.ub_timestamp);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);

	/*
	 * Update progress.
	 */
	vre->vre_offset = logical_size;
	zfs_rangelock_exit(lr);
	spa_config_exit(spa, SCL_STATE, FTAG);

	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
	/*
	 * Note - raidz_reflow_sync() will update the uberblock state to
	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
	 */
	raidz_reflow_sync(spa, tx);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
}

/*
 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
 * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
 */
void
vdev_raidz_reflow_copy_scratch(spa_t *spa)
{
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	ASSERT0(logical_size % raidvd->vdev_children);
	uint64_t write_size = logical_size / raidvd->vdev_children;

	zio_t *pio;

	/*
	 * Read from scratch space.
*/ abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), KM_SLEEP); for (int i = 0; i < raidvd->vdev_children; i++) { abds[i] = abd_alloc_linear(write_size, B_FALSE); } pio = zio_root(spa, NULL, NULL, 0); for (int i = 0; i < raidvd->vdev_children; i++) { /* * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to * the offset to calculate the physical offset to write to. * Passing in a negative offset lets us access the boot area. */ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0, raidz_scratch_child_done, pio)); } zio_wait(pio); /* * Overwrite real location with reflow'ed data. */ pio = zio_root(spa, NULL, NULL, 0); for (int i = 0; i < raidvd->vdev_children; i++) { zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 0, abds[i], write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 0, raidz_scratch_child_done, pio)); } zio_wait(pio); pio = zio_root(spa, NULL, NULL, 0); zio_flush(pio, raidvd); zio_wait(pio); zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " "to real location", (long long)logical_size); for (int i = 0; i < raidvd->vdev_children; i++) abd_free(abds[i]); kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); /* * Update uberblock. 
*/
	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
	spa->spa_ubsync.ub_timestamp++;
	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
	if (spa_multihost(spa))
		mmp_update_uberblock(spa, &spa->spa_ubsync);

	zfs_dbgmsg("reflow recovery: uberblock updated "
	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
	    (long long)spa->spa_ubsync.ub_txg,
	    (long long)logical_size,
	    (long long)spa->spa_ubsync.ub_timestamp);

	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
	    spa_first_txg(spa));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	vre->vre_offset = logical_size;
	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
	/*
	 * Note that raidz_reflow_sync() will update the uberblock once more
	 */
	raidz_reflow_sync(spa, tx);

	dmu_tx_commit(tx);

	spa_config_exit(spa, SCL_STATE, FTAG);
}

/*
 * Check callback handed to zthr_create() for the expansion thread.
 * Reports B_TRUE only when the pool has an expansion registered
 * (spa_raidz_expand != NULL) and that expansion is not parked waiting
 * for a resilver.  The zthr argument is unused.
 */
static boolean_t
spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
{
	(void) zthr;
	spa_t *spa = arg;

	return (spa->spa_raidz_expand != NULL &&
	    !spa->spa_raidz_expand->vre_waiting_for_resilver);
}

/*
 * RAIDZ expansion background thread
 *
 * Can be called multiple times if the reflow is paused
 *
 * Outline:
 *  - Pick the starting offset from the reflow state in spa_ubsync
 *    (0 if the scratch area is still valid, the recorded offset
 *    otherwise).
 *  - If starting from 0, run raidz_reflow_scratch_sync() as a sync task;
 *    if the offset is still 0 afterwards, set vre_waiting_for_resilver
 *    and return.
 *  - Walk the remaining metaslabs, copying everything that is not
 *    allocatable, until done, cancelled, or vre_failed_offset is set.
 *  - Either run raidz_reflow_complete_sync(), or log a pause and, after
 *    a failure, rewind vre_offset to the failed offset.
 */
static void
spa_raidz_expand_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
		vre->vre_offset = 0;
	else
		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);

	/* Reflow the beginning portion using the scratch area */
	if (vre->vre_offset == 0) {
		VERIFY0(dsl_sync_task(spa_name(spa),
		    NULL, raidz_reflow_scratch_sync,
		    vre, 0, ZFS_SPACE_CHECK_NONE));

		/* if we encountered errors then pause */
		if (vre->vre_offset == 0) {
			mutex_enter(&vre->vre_lock);
			vre->vre_waiting_for_resilver = B_TRUE;
			mutex_exit(&vre->vre_lock);
			return;
		}
	}

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

	/* Saved now; used for vdev_online() once the reflow completes. */
	uint64_t guid = raidvd->vdev_guid;

	/* Iterate over all the remaining metaslabs */
	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
	    i < raidvd->vdev_ms_count &&
	    !zthr_iscancelled(zthr) &&
	    vre->vre_failed_offset == UINT64_MAX; i++) {
		metaslab_t *msp = raidvd->vdev_ms[i];

		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);

		/*
		 * The metaslab may be newly created (for the expanded
		 * space), in which case its trees won't exist yet,
		 * so we need to bail out early.
		 */
		if (msp->ms_new) {
			mutex_exit(&msp->ms_lock);
			metaslab_enable(msp, B_FALSE, B_FALSE);
			continue;
		}

		VERIFY0(metaslab_load(msp));

		/*
		 * We want to copy everything except the free (allocatable)
		 * space.  Note that there may be a little bit more free
		 * space (e.g. in ms_defer), and it's fine to copy that too.
		 */
		uint64_t shift, start;
		zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
		    raidvd, msp, &start, &shift);
		zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL,
		    start, shift);
		/* Start with the whole metaslab, then subtract free space. */
		zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
		zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
		    rt);
		mutex_exit(&msp->ms_lock);

		/*
		 * Force the last sector of each metaslab to be copied.  This
		 * ensures that we advance the on-disk progress to the end of
		 * this metaslab while the metaslab is disabled.  Otherwise, we
		 * could move past this metaslab without advancing the on-disk
		 * progress, and then an allocation to this metaslab would not
		 * be copied.
		 */
		int sectorsz = 1 << raidvd->vdev_ashift;
		uint64_t ms_last_offset = msp->ms_start +
		    msp->ms_size - sectorsz;
		if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) {
			zfs_range_tree_add(rt, ms_last_offset, sectorsz);
		}

		/*
		 * When we are resuming from a paused expansion (i.e.
		 * when importing a pool with an expansion in progress),
		 * discard any state that we have already processed.
		 */
		if (vre->vre_offset > msp->ms_start) {
			zfs_range_tree_clear(rt, msp->ms_start,
			    vre->vre_offset - msp->ms_start);
		}

		while (!zthr_iscancelled(zthr) &&
		    !zfs_range_tree_is_empty(rt) &&
		    vre->vre_failed_offset == UINT64_MAX) {

			/*
			 * We need to periodically drop the config lock so that
			 * writers can get in.  Additionally, we can't wait
			 * for a txg to sync while holding a config lock
			 * (since a waiting writer could cause a 3-way deadlock
			 * with the sync thread, which also gets a config
			 * lock for reader).  So we can't hold the config lock
			 * while calling dmu_tx_assign().
			 */
			spa_config_exit(spa, SCL_CONFIG, FTAG);

			/*
			 * If requested, pause the reflow when the amount
			 * specified by raidz_expand_max_reflow_bytes is reached
			 *
			 * This pause is only used during testing or debugging.
			 */
			while (raidz_expand_max_reflow_bytes != 0 &&
			    raidz_expand_max_reflow_bytes <=
			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
				delay(hz);
			}

			/*
			 * Throttle: block on vre_cv while more than
			 * raidz_expand_max_copy_bytes are outstanding.
			 */
			mutex_enter(&vre->vre_lock);
			while (vre->vre_outstanding_bytes >
			    raidz_expand_max_copy_bytes) {
				cv_wait(&vre->vre_cv, &vre->vre_lock);
			}
			mutex_exit(&vre->vre_lock);

			dmu_tx_t *tx =
			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);

			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
			uint64_t txg = dmu_tx_get_txg(tx);

			/*
			 * Reacquire the vdev_config lock.  Theoretically, the
			 * vdev_t that we're expanding may have changed.
			 */
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

			boolean_t needsync =
			    raidz_reflow_impl(raidvd, vre, rt, tx);

			dmu_tx_commit(tx);

			/*
			 * When raidz_reflow_impl() asks for it, wait for this
			 * txg to sync, with the config lock dropped meanwhile.
			 */
			if (needsync) {
				spa_config_exit(spa, SCL_CONFIG, FTAG);
				txg_wait_synced(spa->spa_dsl_pool, txg);
				spa_config_enter(spa, SCL_CONFIG, FTAG,
				    RW_READER);
			}
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);

		metaslab_enable(msp, B_FALSE, B_FALSE);
		zfs_range_tree_vacate(rt, NULL, NULL);
		zfs_range_tree_destroy(rt);

		/* Re-take the lock and re-resolve raidvd for the next pass. */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * The txg_wait_synced() here ensures that all reflow zio's have
	 * completed, and vre_failed_offset has been set if necessary.  It
	 * also ensures that the progress of the last raidz_reflow_sync() is
	 * written to disk before raidz_reflow_complete_sync() changes the
	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
	 * determine if a reflow is in progress, in which case we may need to
	 * write to both old and new locations.  Therefore we can only change
	 * vre_state once this is not necessary, which is once the on-disk
	 * progress (in spa_ubsync) has been set past any possible writes (to
	 * the end of the last metaslab).
	 */
	txg_wait_synced(spa->spa_dsl_pool, 0);

	if (!zthr_iscancelled(zthr) &&
	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
		/*
		 * We are not being canceled or paused, so the reflow must be
		 * complete. In that case also mark it as completed on disk.
		 */
		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
		    raidz_reflow_complete_sync, spa,
		    0, ZFS_SPACE_CHECK_NONE));
		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
	} else {
		/*
		 * Wait for all copy zio's to complete and for all the
		 * raidz_reflow_sync() synctasks to be run.
		 */
		spa_history_log_internal(spa, "reflow pause",
		    NULL, "offset=%llu failed_offset=%lld",
		    (long long)vre->vre_offset,
		    (long long)vre->vre_failed_offset);
		mutex_enter(&vre->vre_lock);
		if (vre->vre_failed_offset != UINT64_MAX) {
			/*
			 * Reset progress so that we will retry everything
			 * after the point that something failed.
			 */
			vre->vre_offset = vre->vre_failed_offset;
			vre->vre_failed_offset = UINT64_MAX;
			vre->vre_waiting_for_resilver = B_TRUE;
		}
		mutex_exit(&vre->vre_lock);
	}
}

/*
 * Create the "raidz_expand" zthr for this pool and store it in
 * spa_raidz_expand_zthr.  Asserts that no such zthr exists yet.
 */
void
spa_start_raidz_expansion_thread(spa_t *spa)
{
	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
	spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
	    spa, defclsyspri);
}

/*
 * DTL-reassessment notification.  If the pool has an expansion whose
 * top-level vdev is vd's top-level vdev, none of that vdev's children is
 * being replaced, and the expansion is flagged as waiting for a resilver,
 * clear the flag and wake the expansion zthr.
 */
void
raidz_dtl_reassessed(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	if (spa->spa_raidz_expand != NULL) {
		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
		/*
		 * we get called often from vdev_dtl_reassess() so make
		 * sure it's our vdev and any replacing is complete
		 */
		if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
		    !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
			mutex_enter(&vre->vre_lock);
			if (vre->vre_waiting_for_resilver) {
				vdev_dbgmsg(vd, "DTL reassessed, "
				    "continuing raidz expansion");
				vre->vre_waiting_for_resilver = B_FALSE;
				zthr_wakeup(spa->spa_raidz_expand_zthr);
			}
			mutex_exit(&vre->vre_lock);
		}
	}
}

/*
 * Check whether new_child's parent raidz vdev can be expanded.
 * Returns 0 if allowed, or EINVAL if the scratch space would be too small.
 */
int
vdev_raidz_attach_check(vdev_t *new_child)
{
	vdev_t *raidvd = new_child->vdev_parent;
	uint64_t new_children = raidvd->vdev_children;

	/*
	 * We use the "boot" space as scratch space to handle overwriting the
	 * initial part of the vdev.  If it is too small, then this expansion
	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
	 * >200 children).
*/
	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
		return (EINVAL);
	}
	return (0);
}

/*
 * Sync-context half of attaching a new child to a raidz vdev.
 *
 * arg is the new child vdev; its parent is the raidz vdev being expanded.
 * Bumps the RAIDZ_EXPANSION feature refcount and the physical width,
 * initializes the in-core expansion state, publishes it in
 * spa_raidz_expand and wakes the expansion zthr.  It then writes the
 * state and start time to the top-level vdev ZAP, and removes any end-time
 * and bytes-copied entries.
 */
void
vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
{
	vdev_t *new_child = arg;
	spa_t *spa = new_child->vdev_spa;
	vdev_t *raidvd = new_child->vdev_parent;
	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
	ASSERT3P(raidvd->vdev_top, ==, raidvd);
	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
	    new_child);

	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);

	vdrz->vd_physical_width++;

	/* No reflow info may be recorded in the uberblock at this point. */
	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
	vdrz->vn_vre.vre_offset = 0;
	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
	spa->spa_raidz_expand = &vdrz->vn_vre;
	zthr_wakeup(spa->spa_raidz_expand_zthr);

	/*
	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
	 * written to the config.
	 */
	vdev_config_dirty(raidvd);

	vdrz->vn_vre.vre_start_time = gethrestime_sec();
	vdrz->vn_vre.vre_end_time = 0;
	vdrz->vn_vre.vre_state = DSS_SCANNING;
	vdrz->vn_vre.vre_bytes_copied = 0;

	/* Copied into a uint64_t local, the width passed to zap_update(). */
	uint64_t state = vdrz->vn_vre.vre_state;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
	    sizeof (state), 1, &state, tx));

	uint64_t start_time = vdrz->vn_vre.vre_start_time;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
	    sizeof (start_time), 1, &start_time, tx));

	/* Return values deliberately ignored for these two removals. */
	(void) zap_remove(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
	(void) zap_remove(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);

	spa_history_log_internal(spa, "raidz vdev expansion started", tx,
	    "%s vdev %llu new width %llu", spa_name(spa),
	    (unsigned long long)raidvd->vdev_id,
	    (unsigned long long)raidvd->vdev_children);
}

/*
 * Load the persisted expansion state (state, start time, end time, bytes
 * copied) of a raidz vdev from its top-level ZAP into vn_vre.
 *
 * A missing entry (ENOENT) keeps the default (DSS_NONE / 0).  Any other
 * lookup error is returned to the caller.  Returns 0 on success.
 */
int
vdev_raidz_load(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	int err;

	uint64_t state = DSS_NONE;
	uint64_t start_time = 0;
	uint64_t end_time = 0;
	uint64_t bytes_copied = 0;

	/* With no top-level ZAP there is nothing to read. */
	if (vd->vdev_top_zap != 0) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
		    sizeof (state), 1, &state);
		if (err != 0 && err != ENOENT)
			return (err);

		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
		    sizeof (start_time), 1, &start_time);
		if (err != 0 && err != ENOENT)
			return (err);

		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
		    sizeof (end_time), 1, &end_time);
		if (err != 0 && err != ENOENT)
			return (err);

		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
		    sizeof (bytes_copied), 1, &bytes_copied);
		if (err != 0 && err != ENOENT)
			return (err);
	}

	/*
	 * If we are in the middle of expansion, vre_state should have
	 * already been set by
vdev_raidz_init().
	 */
	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
	vdrz->vn_vre.vre_start_time = start_time;
	vdrz->vn_vre.vre_end_time = end_time;
	vdrz->vn_vre.vre_bytes_copied = bytes_copied;

	return (0);
}

/*
 * Fill *pres with statistics for the pool's raidz expansion.
 *
 * Uses the active expansion (spa_raidz_expand) when there is one.
 * Otherwise uses the top-level raidz vdev with the latest non-zero
 * vre_end_time.  Returns ENOENT when neither exists, 0 otherwise.
 */
int
spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
{
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

	if (vre == NULL) {
		/* no expansion in progress; find most recent completed */
		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
			vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
			if (vd->vdev_ops == &vdev_raidz_ops) {
				vdev_raidz_t *vdrz = vd->vdev_tsd;

				if (vdrz->vn_vre.vre_end_time != 0 &&
				    (vre == NULL ||
				    vdrz->vn_vre.vre_end_time >
				    vre->vre_end_time)) {
					vre = &vdrz->vn_vre;
				}
			}
		}
	}

	if (vre == NULL) {
		return (SET_ERROR(ENOENT));
	}

	pres->pres_state = vre->vre_state;
	pres->pres_expanding_vdev = vre->vre_vdev_id;

	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;

	/*
	 * Reflowed bytes = the base count plus every per-txg count,
	 * summed under vre_lock.
	 */
	mutex_enter(&vre->vre_lock);
	pres->pres_reflowed = vre->vre_bytes_copied;
	for (int i = 0; i < TXG_SIZE; i++)
		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
	mutex_exit(&vre->vre_lock);

	pres->pres_start_time = vre->vre_start_time;
	pres->pres_end_time = vre->vre_end_time;
	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;

	return (0);
}

/*
 * Initialize private RAIDZ specific fields from the nvlist.
 *
 * On success stores a newly allocated vdev_raidz_t in *tsd and returns 0.
 * Returns EINVAL when the children array is missing, or when the parity
 * is missing or invalid for the pool version.
 */
static int
vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	uint_t children;
	nvlist_t **child;
	int error = nvlist_lookup_nvlist_array(nv,
	    ZPOOL_CONFIG_CHILDREN, &child, &children);
	if (error != 0)
		return (SET_ERROR(EINVAL));

	uint64_t nparity;
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
			return (SET_ERROR(EINVAL));

		/*
		 * Previous versions could only support 1 or 2 parity
		 * device.
		 */
		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));
		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
			return (SET_ERROR(EINVAL));
	} else {
		/*
		 * We require the parity to be specified for SPAs that
		 * support multiple parity levels.
		 */
		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));

		/*
		 * Otherwise, we default to 1 parity device for RAID-Z.
		 */
		nparity = 1;
	}

	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
	vdrz->vn_vre.vre_vdev_id = -1;
	vdrz->vn_vre.vre_offset = UINT64_MAX;
	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));

	vdrz->vd_physical_width = children;
	vdrz->vd_nparity = nparity;

	/* note, the ID does not exist when creating a pool */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
	    &vdrz->vn_vre.vre_vdev_id);

	boolean_t reflow_in_progress =
	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
	if (reflow_in_progress) {
		spa->spa_raidz_expand = &vdrz->vn_vre;
		vdrz->vn_vre.vre_state = DSS_SCANNING;
	}

	vdrz->vd_original_width = children;
	uint64_t *txgs;
	unsigned int txgs_size = 0;
	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
	    &txgs, &txgs_size);
	if (error == 0) {
		/*
		 * Walk the txg array from its last entry to its first.  The
		 * entry i steps from the end gets logical width
		 * (physical width - i), minus one more while a reflow is in
		 * progress.
		 */
		for (int i = 0; i < txgs_size; i++) {
			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
			re->re_txg = txgs[txgs_size - i - 1];
			re->re_logical_width = vdrz->vd_physical_width - i;

			if (reflow_in_progress)
				re->re_logical_width--;

			avl_add(&vdrz->vd_expand_txgs, re);
		}

		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
	}
	if (reflow_in_progress) {
		vdrz->vd_original_width--;
		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
		    children, txgs_size);
	}

	*tsd = vdrz;

	return (0);
}

/*
 * Tear down the per-vdev RAIDZ state: clear spa_raidz_expand if it points
 * at this vdev's expansion state, free every node of the expansion-txg
 * AVL tree, destroy the locks, condvar and rangelock, and free the
 * vdev_raidz_t itself.
 */
static void
vdev_raidz_fini(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
		vd->vdev_spa->spa_raidz_expand = NULL;
	reflow_node_t *re;
	void *cookie = NULL;
	avl_tree_t *tree = &vdrz->vd_expand_txgs;
	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
		kmem_free(re, sizeof (*re));
	avl_destroy(&vdrz->vd_expand_txgs);
	mutex_destroy(&vdrz->vd_expand_lock);
	mutex_destroy(&vdrz->vn_vre.vre_lock);
	cv_destroy(&vdrz->vn_vre.vre_cv);
	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
	kmem_free(vdrz, sizeof (*vdrz));
}

/*
 * Add RAIDZ specific fields to the config nvlist.
 */
static void
vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
{
	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	/*
	 * Make sure someone hasn't managed to sneak a fancy new vdev
	 * into a crufty old storage pool.
	 */
	ASSERT(vdrz->vd_nparity == 1 ||
	    (vdrz->vd_nparity <= 2 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
	    (vdrz->vd_nparity <= 3 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));

	/*
	 * Note that we'll add these even on storage pools where they
	 * aren't strictly required -- older software will just ignore
	 * it.
*/ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); if (vdrz->vn_vre.vre_state == DSS_SCANNING) { fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); } mutex_enter(&vdrz->vd_expand_lock); if (!avl_is_empty(&vdrz->vd_expand_txgs)) { uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, KM_SLEEP); uint64_t i = 0; for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { txgs[i++] = re->re_txg; } fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, txgs, count); kmem_free(txgs, sizeof (uint64_t) * count); } mutex_exit(&vdrz->vd_expand_lock); } static uint64_t vdev_raidz_nparity(vdev_t *vd) { vdev_raidz_t *vdrz = vd->vdev_tsd; return (vdrz->vd_nparity); } static uint64_t vdev_raidz_ndisks(vdev_t *vd) { return (vd->vdev_children); } vdev_ops_t vdev_raidz_ops = { .vdev_op_init = vdev_raidz_init, .vdev_op_fini = vdev_raidz_fini, .vdev_op_open = vdev_raidz_open, .vdev_op_close = vdev_raidz_close, .vdev_op_asize = vdev_raidz_asize, .vdev_op_min_asize = vdev_raidz_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_raidz_io_start, .vdev_op_io_done = vdev_raidz_io_done, .vdev_op_state_change = vdev_raidz_state_change, .vdev_op_need_resilver = vdev_raidz_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_raidz_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = vdev_raidz_config_generate, .vdev_op_nparity = vdev_raidz_nparity, .vdev_op_ndisks = vdev_raidz_ndisks, .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, "For testing, pause RAIDZ expansion after reflowing this many bytes"); ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, "Max amount of concurrent i/o for RAIDZ 
expansion"); ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, "For expanded RAIDZ, aggregate reads that have more rows than this"); ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index e1819448a98a..1970c5425854 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1,2571 +1,2571 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This file contains the necessary logic to remove vdevs from a * storage pool. Currently, the only devices that can be removed * are log, cache, and spare devices; and top level vdevs from a pool * w/o raidz or mirrors. (Note that members of a mirror can be removed * by the detach operation.) 
*
 * Log vdevs are removed by evacuating them and then turning the vdev
 * into a hole vdev while holding spa config locks.
 *
 * Top level vdevs are removed and converted into an indirect vdev via
 * a multi-step process:
 *
 *  - Disable allocations from this device (spa_vdev_remove_top).
 *
 *  - From a new thread (spa_vdev_remove_thread), copy data from
 *    the removing vdev to a different vdev.  The copy happens in open
 *    context (spa_vdev_copy_impl) and issues a sync task
 *    (vdev_mapping_sync) so the sync thread can update the partial
 *    indirect mappings in core and on disk.
 *
 *  - If a free happens during a removal, it is freed from the
 *    removing vdev, and if it has already been copied, from the new
 *    location as well (free_from_removing_vdev).
 *
 *  - After the removal is completed, the copy thread converts the vdev
 *    into an indirect vdev (vdev_remove_complete) before instructing
 *    the sync thread to destroy the space maps and finish the removal
 *    (spa_finish_removal).
 */

/*
 * Bookkeeping for the copy phase of a removal.
 * NOTE(review): the field notes below are inferred from the field names
 * only; none of the code that uses this structure is visible here.
 */
typedef struct vdev_copy_arg {
	metaslab_t	*vca_msp;	/* presumably the metaslab in use */
	uint64_t	vca_outstanding_bytes;	/* presumably in-flight bytes */
	uint64_t	vca_read_error_bytes;	/* presumably failed reads */
	uint64_t	vca_write_error_bytes;	/* presumably failed writes */
	kcondvar_t	vca_cv;
	kmutex_t	vca_lock;
} vdev_copy_arg_t;

/*
 * The maximum amount of memory we can use for outstanding i/o while
 * doing a device removal.  This determines how much i/o we can have
 * in flight concurrently.
 */
static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024;

/*
 * The largest contiguous segment that we will attempt to allocate when
 * removing a device.  This can be no larger than SPA_MAXBLOCKSIZE.  If
 * there is a performance problem with attempting to allocate large blocks,
 * consider decreasing this.
 *
 * See also the accessor function spa_remove_max_segment().
 */
uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE;

/*
 * Ignore hard IO errors during device removal.  When set, if a device
 * encounters a hard IO error during the removal process, the removal will
 * not be cancelled.
This can result in a normally recoverable block * becoming permanently damaged and is not recommended. */ static int zfs_removal_ignore_errors = 0; /* * Allow a remap segment to span free chunks of at most this size. The main * impact of a larger span is that we will read and write larger, more * contiguous chunks, with more "unnecessary" data -- trading off bandwidth * for iops. The value here was chosen to align with * zfs_vdev_read_gap_limit, which is a similar concept when doing regular * reads (but there's no reason it has to be the same). * * Additionally, a higher span will have the following relatively minor * effects: * - the mapping will be smaller, since one entry can cover more allocated * segments * - more of the fragmentation in the removing device will be preserved * - we'll do larger allocations, which may fail and fall back on smaller * allocations */ uint_t vdev_removal_max_span = 32 * 1024; /* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. */ int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg); static int spa_vdev_remove_cancel_impl(spa_t *spa); static void spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) { VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REMOVING, sizeof (uint64_t), sizeof (spa->spa_removing_phys) / sizeof (uint64_t), &spa->spa_removing_phys, tx)); } static nvlist_t * spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) { for (int i = 0; i < count; i++) { uint64_t guid = fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); if (guid == target_guid) return (nvpp[i]); } return (NULL); } static void vdev_activate(vdev_t *vd) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; uint64_t vdev_space = spa_deflate(spa) ? 
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; ASSERT(!vd->vdev_islog); ASSERT(vd->vdev_noalloc); metaslab_group_activate(mg); metaslab_group_activate(vd->vdev_log_mg); ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space); spa->spa_nonallocating_dspace -= vdev_space; vd->vdev_noalloc = B_FALSE; } static int vdev_passivate(vdev_t *vd, uint64_t *txg) { spa_t *spa = vd->vdev_spa; int error; ASSERT(!vd->vdev_noalloc); vdev_t *rvd = spa->spa_root_vdev; metaslab_group_t *mg = vd->vdev_mg; metaslab_class_t *normal = spa_normal_class(spa); if (mg->mg_class == normal) { /* * We must check that this is not the only allocating device in * the pool before passivating, otherwise we will not be able * to make progress because we can't allocate from any vdevs. */ boolean_t last = B_TRUE; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; if (cvd == vd || cvd->vdev_ops == &vdev_indirect_ops) continue; metaslab_class_t *mc = cvd->vdev_mg->mg_class; if (mc != normal) continue; if (!cvd->vdev_noalloc) { last = B_FALSE; break; } } if (last) return (SET_ERROR(EINVAL)); } metaslab_group_passivate(mg); ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * We must ensure that no "stubby" log blocks are allocated * on the device to be removed. These blocks could be * written at any time, including while we are in the middle * of copying them. */ error = spa_reset_logs(spa); *txg = spa_vdev_config_enter(spa); if (error != 0) { metaslab_group_activate(mg); ASSERT(!vd->vdev_islog); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); return (error); } spa->spa_nonallocating_dspace += spa_deflate(spa) ? 
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; vd->vdev_noalloc = B_TRUE; return (0); } /* * Turn off allocations for a top-level device from the pool. * * Turning off allocations for a top-level device can take a significant * amount of time. As a result we use the spa_vdev_config_[enter/exit] * functions which allow us to grab and release the spa_config_lock while * still holding the namespace lock. During each step the configuration * is synced out. */ int spa_vdev_noalloc(spa_t *spa, uint64_t guid) { vdev_t *vd; uint64_t txg; int error = 0; ASSERT(!MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) error = SET_ERROR(ENOENT); else if (vd->vdev_mg == NULL) error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP); else if (!vd->vdev_noalloc) error = vdev_passivate(vd, &txg); if (error == 0) { vdev_dirty_leaves(vd, VDD_DTL, txg); vdev_config_dirty(vd); } error = spa_vdev_exit(spa, NULL, txg, error); return (error); } int spa_vdev_alloc(spa_t *spa, uint64_t guid) { vdev_t *vd; uint64_t txg; int error = 0; ASSERT(!MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) error = SET_ERROR(ENOENT); else if (vd->vdev_mg == NULL) error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP); else if (!vd->vdev_removing) vdev_activate(vd); if (error == 0) { vdev_dirty_leaves(vd, VDD_DTL, txg); vdev_config_dirty(vd); } (void) spa_vdev_exit(spa, NULL, txg, error); return (error); } static void spa_vdev_remove_aux(nvlist_t *config, const char *name, nvlist_t **dev, int count, nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; if (count > 1) newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); for (int i = 0, j = 0; i < count; i++) { if (dev[i] == dev_to_remove) continue; VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); } 
VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); fnvlist_add_nvlist_array(config, name, (const nvlist_t * const *)newdev, count - 1); for (int i = 0; i < count - 1; i++) nvlist_free(newdev[i]); if (count > 1) kmem_free(newdev, (count - 1) * sizeof (void *)); } static spa_vdev_removal_t * spa_vdev_removal_create(vdev_t *vd) { spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); svr->svr_allocd_segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { svr->svr_frees[i] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); } return (svr); } void spa_vdev_removal_destroy(spa_vdev_removal_t *svr) { for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(svr->svr_bytes_done[i]); ASSERT0(svr->svr_max_offset_to_sync[i]); zfs_range_tree_destroy(svr->svr_frees[i]); list_destroy(&svr->svr_new_segments[i]); } zfs_range_tree_destroy(svr->svr_allocd_segs); mutex_destroy(&svr->svr_lock); cv_destroy(&svr->svr_cv); kmem_free(svr, sizeof (*svr)); } /* * This is called as a synctask in the txg in which we will mark this vdev * as removing (in the config stored in the MOS). 
*
 * It begins the evacuation of a toplevel vdev by:
 * - initializing the spa_removing_phys which tracks this removal
 * - computing the amount of space to remove for accounting purposes
 * - dirtying all dbufs in the spa_config_object
 * - creating the spa_vdev_removal
 * - starting the spa_vdev_remove_thread
 *
 * arg carries the id of the top-level vdev to remove, cast to a pointer.
 */
static void
vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
	spa_vdev_removal_t *svr = NULL;
	uint64_t txg __maybe_unused = dmu_tx_get_txg(tx);

	ASSERT0(vdev_get_nparity(vd));
	svr = spa_vdev_removal_create(vd);

	ASSERT(vd->vdev_removing);
	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);

	spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		/*
		 * By activating the OBSOLETE_COUNTS feature, we prevent
		 * the pool from being downgraded and ensure that the
		 * refcounts are precise.
		 */
		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		uint64_t one = 1;
		VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
		    &one, tx));
		boolean_t are_precise __maybe_unused;
		ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
		ASSERT3B(are_precise, ==, B_TRUE);
	}

	/* Allocate and open the indirect mapping and births objects. */
	vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
	vd->vdev_indirect_mapping =
	    vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
	vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
	vd->vdev_indirect_births =
	    vdev_indirect_births_open(mos, vic->vic_births_object);
	spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
	spa->spa_removing_phys.sr_start_time = gethrestime_sec();
	spa->spa_removing_phys.sr_end_time = 0;
	spa->spa_removing_phys.sr_state = DSS_SCANNING;
	spa->spa_removing_phys.sr_to_copy = 0;
	spa->spa_removing_phys.sr_copied = 0;

	/*
	 * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
	 * there may be space in the defer tree, which is free, but still
	 * counted in vs_alloc.
	 */
	for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
		metaslab_t *ms = vd->vdev_ms[i];
		/* Metaslabs without a space map contribute nothing. */
		if (ms->ms_sm == NULL)
			continue;

		spa->spa_removing_phys.sr_to_copy +=
		    metaslab_allocated_space(ms);

		/*
		 * Space which we are freeing this txg does not need to
		 * be copied.
		 */
		spa->spa_removing_phys.sr_to_copy -=
		    zfs_range_tree_space(ms->ms_freeing);

		ASSERT0(zfs_range_tree_space(ms->ms_freed));
		for (int t = 0; t < TXG_SIZE; t++)
			ASSERT0(zfs_range_tree_space(ms->ms_allocating[t]));
	}

	/*
	 * Sync tasks are called before metaslab_sync(), so there should
	 * be no already-synced metaslabs in the TXG_CLEAN list.
	 */
	ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);

	spa_sync_removing_state(spa, tx);

	/*
	 * All blocks that we need to read the most recent mapping must be
	 * stored on concrete vdevs.  Therefore, we must dirty anything that
	 * is read before spa_remove_init().  Specifically, the
	 * spa_config_object.  (Note that although we already modified the
	 * spa_config_object in spa_sync_removing_state, that may not have
	 * modified all blocks of the object.)
	 *
	 * NOTE(review): the loop below walks DMU_POOL_DIRECTORY_OBJECT,
	 * which is also the object spa_sync_removing_state() updates;
	 * confirm that "spa_config_object" above refers to that object.
	 */
	dmu_object_info_t doi;
	VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));

	/* offset advances by each held buffer's db_size. */
	for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
		dmu_buf_t *dbuf;
		VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
		    offset, FTAG, &dbuf, 0));
		dmu_buf_will_dirty(dbuf, tx);
		offset += dbuf->db_size;
		dmu_buf_rele(dbuf, FTAG);
	}

	/*
	 * Now that we've allocated the im_object, dirty the vdev to ensure
	 * that the object gets written to the config on disk.
	 */
	vdev_config_dirty(vd);

	zfs_dbgmsg("starting removal thread for vdev %llu (%px) in txg %llu "
	    "im_obj=%llu", (u_longlong_t)vd->vdev_id, vd,
	    (u_longlong_t)dmu_tx_get_txg(tx),
	    (u_longlong_t)vic->vic_mapping_object);

	spa_history_log_internal(spa, "vdev remove started", tx,
	    "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id,
	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");
	/*
	 * Setting spa_vdev_removal causes subsequent frees to call
	 * free_from_removing_vdev().  Note that we don't need any locking
	 * because we are the sync thread, and metaslab_free_impl() is only
	 * called from syncing context (potentially from a zio taskq thread,
	 * but in any case only when there are outstanding free i/os, which
	 * there are not).
	 */
	ASSERT3P(spa->spa_vdev_removal, ==, NULL);
	spa->spa_vdev_removal = svr;
	svr->svr_thread = thread_create(NULL, 0,
	    spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
}

/*
 * When we are opening a pool, we must read the mapping for each
 * indirect vdev in order from most recently removed to least
 * recently removed.  We do this because the blocks for the mapping
 * of older indirect vdevs may be stored on more recently removed vdevs.
 * In order to read each indirect mapping object, we must have
 * initialized all more recently removed vdevs.
*/
int
spa_remove_init(spa_t *spa)
{
	int error;

	/* Read the persistent removal state from the pool directory. */
	error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_REMOVING, sizeof (uint64_t),
	    sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
	    &spa->spa_removing_phys);

	if (error == ENOENT) {
		/*
		 * No DMU_POOL_REMOVING entry: initialize the in-core state
		 * to "no removal, no previous indirect vdev" and report the
		 * indirect vdevs as loaded.
		 */
		spa->spa_removing_phys.sr_state = DSS_NONE;
		spa->spa_removing_phys.sr_removing_vdev = -1;
		spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
		spa->spa_indirect_vdevs_loaded = B_TRUE;
		return (0);
	} else if (error != 0) {
		return (error);
	}

	if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
		/*
		 * We are currently removing a vdev.  Create and
		 * initialize a spa_vdev_removal_t from the bonus
		 * buffer of the removing vdev's vdev_im_object, and
		 * initialize its partial mapping.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		vdev_t *vd = vdev_lookup_top(spa,
		    spa->spa_removing_phys.sr_removing_vdev);

		/* The recorded removing vdev does not exist: fail the load. */
		if (vd == NULL) {
			spa_config_exit(spa, SCL_STATE, FTAG);
			return (EINVAL);
		}

		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

		ASSERT(vdev_is_concrete(vd));
		spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
		ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
		ASSERT(vd->vdev_removing);

		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
		    spa->spa_meta_objset, vic->vic_mapping_object);
		vd->vdev_indirect_births = vdev_indirect_births_open(
		    spa->spa_meta_objset, vic->vic_births_object);
		spa_config_exit(spa, SCL_STATE, FTAG);

		spa->spa_vdev_removal = svr;
	}

	/*
	 * Walk the chain of previously removed vdevs, following
	 * vic_prev_indirect_vdev until the UINT64_MAX terminator, and open
	 * the mapping and births objects of each one.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	uint64_t indirect_vdev_id =
	    spa->spa_removing_phys.sr_prev_indirect_vdev;
	while (indirect_vdev_id != UINT64_MAX) {
		vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
		    spa->spa_meta_objset, vic->vic_mapping_object);
		vd->vdev_indirect_births = vdev_indirect_births_open(
		    spa->spa_meta_objset, vic->vic_births_object);

		indirect_vdev_id = vic->vic_prev_indirect_vdev;
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	/*
	 * Now that we've loaded all the indirect mappings, we can allow
	 * reads from other blocks (e.g. via predictive prefetch).
	 */
	spa->spa_indirect_vdevs_loaded = B_TRUE;
	return (0);
}

/*
 * Start the removal thread for the removal recorded in
 * spa->spa_vdev_removal.  Does nothing when there is no removal in
 * progress, when a removal thread is already attached, or when the pool
 * is not writeable.
 */
void
spa_restart_removal(spa_t *spa)
{
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;

	if (svr == NULL)
		return;

	/*
	 * In general when this function is called there is no
	 * removal thread running. The only scenario where this
	 * is not true is during spa_import() where this function
	 * is called twice [once from spa_import_impl() and
	 * spa_async_resume()]. Thus, in the scenario where we
	 * import a pool that has an ongoing removal we don't
	 * want to spawn a second thread.
	 */
	if (svr->svr_thread != NULL)
		return;

	if (!spa_writeable(spa))
		return;

	zfs_dbgmsg("restarting removal of %llu",
	    (u_longlong_t)svr->svr_vdev_id);
	svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
	    0, &p0, TS_RUN, minclsyspri);
}

/*
 * Process freeing from a device which is in the middle of being removed.
 * We must handle this carefully so that we attempt to copy freed data,
 * and we correctly free already-copied data.
 *
 * The range [offset, offset + size) is consumed front to back in up to
 * three pieces: the part whose mapping is already synced, the part(s)
 * whose mapping is in flight in one of the open txgs, and the part the
 * removal thread has not visited yet.
 */
void
free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
{
	spa_t *spa = vd->vdev_spa;
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	uint64_t txg = spa_syncing_txg(spa);
	/* Highest boundary seen so far; only used by the ordering ASSERTs. */
	uint64_t max_offset_yet = 0;

	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
	    vdev_indirect_mapping_object(vim));
	ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);

	mutex_enter(&svr->svr_lock);

	/*
	 * Remove the segment from the removing vdev's spacemap.  This
	 * ensures that we will not attempt to copy this space (if the
	 * removal thread has not yet visited it), and also ensures
	 * that we know what is actually allocated on the new vdevs
	 * (needed if we cancel the removal).
	 *
	 * Note: we must do the metaslab_free_concrete() with the svr_lock
	 * held, so that the remove_thread can not load this metaslab and then
	 * visit this offset between the time that we metaslab_free_concrete()
	 * and when we check to see if it has been visited.
	 *
	 * Note: The checkpoint flag is set to false as having/taking
	 * a checkpoint and removing a device can't happen at the same
	 * time.
	 */
	ASSERT(!spa_has_checkpoint(spa));
	metaslab_free_concrete(vd, offset, size, B_FALSE);

	uint64_t synced_size = 0;
	uint64_t synced_offset = 0;
	uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
	if (offset < max_offset_synced) {
		/*
		 * The mapping for this offset is already on disk.
		 * Free from the new location.
		 *
		 * Note that we use svr_max_synced_offset because it is
		 * updated atomically with respect to the in-core mapping.
		 * By contrast, vim_max_offset is not.
		 *
		 * NOTE(review): the code here reads
		 * vdev_indirect_mapping_max_offset(vim), not a field named
		 * svr_max_synced_offset; the paragraph above may be stale --
		 * confirm.
		 *
		 * This block may be split between a synced entry and an
		 * in-flight or unvisited entry.  Only process the synced
		 * portion of it here.
		 */
		synced_size = MIN(size, max_offset_synced - offset);
		synced_offset = offset;

		ASSERT3U(max_offset_yet, <=, max_offset_synced);
		max_offset_yet = max_offset_synced;

		DTRACE_PROBE3(remove__free__synced,
		    spa_t *, spa,
		    uint64_t, offset,
		    uint64_t, synced_size);

		size -= synced_size;
		offset += synced_size;
	}

	/*
	 * Look at all in-flight txgs starting from the currently syncing one
	 * and see if a section of this free is being copied.  By starting from
	 * this txg and iterating forward, we might find that this region
	 * was copied in two different txgs and handle it appropriately.
	 */
	for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
		int txgoff = (txg + i) & TXG_MASK;
		if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
			/*
			 * The mapping for this offset is in flight, and
			 * will be synced in txg+i.
			 */
			uint64_t inflight_size = MIN(size,
			    svr->svr_max_offset_to_sync[txgoff] - offset);

			DTRACE_PROBE4(remove__free__inflight,
			    spa_t *, spa,
			    uint64_t, offset,
			    uint64_t, inflight_size,
			    uint64_t, txg + i);

			/*
			 * We copy data in order of increasing offset.
			 * Therefore the max_offset_to_sync[] must increase
			 * (or be zero, indicating that nothing is being
			 * copied in that txg).
			 */
			if (svr->svr_max_offset_to_sync[txgoff] != 0) {
				ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
				    >=, max_offset_yet);
				max_offset_yet =
				    svr->svr_max_offset_to_sync[txgoff];
			}

			/*
			 * We've already committed to copying this segment:
			 * we have allocated space elsewhere in the pool for
			 * it and have an IO outstanding to copy the data. We
			 * cannot free the space before the copy has
			 * completed, or else the copy IO might overwrite any
			 * new data. To free that space, we record the
			 * segment in the appropriate svr_frees tree and free
			 * the mapped space later, in the txg where we have
			 * completed the copy and synced the mapping (see
			 * vdev_mapping_sync).
			 */
			zfs_range_tree_add(svr->svr_frees[txgoff],
			    offset, inflight_size);
			size -= inflight_size;
			offset += inflight_size;

			/*
			 * This space is already accounted for as being
			 * done, because it is being copied in txg+i.
			 * However, if i!=0, then it is being copied in
			 * a future txg.  If we crash after this txg
			 * syncs but before txg+i syncs, then the space
			 * will be free.  Therefore we must account
			 * for the space being done in *this* txg
			 * (when it is freed) rather than the future txg
			 * (when it will be copied).
			 */
			ASSERT3U(svr->svr_bytes_done[txgoff], >=,
			    inflight_size);
			svr->svr_bytes_done[txgoff] -= inflight_size;
			svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
		}
	}
	ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);

	if (size > 0) {
		/*
		 * The copy thread has not yet visited this offset.  Ensure
		 * that it doesn't.
		 */

		DTRACE_PROBE3(remove__free__unvisited,
		    spa_t *, spa,
		    uint64_t, offset,
		    uint64_t, size);

		if (svr->svr_allocd_segs != NULL)
			zfs_range_tree_clear(svr->svr_allocd_segs,
			    offset, size);

		/*
		 * Since we now do not need to copy this data, for
		 * accounting purposes we have done our job and can count
		 * it as completed.
		 */
		svr->svr_bytes_done[txg & TXG_MASK] += size;
	}
	mutex_exit(&svr->svr_lock);

	/*
	 * Now that we have dropped svr_lock, process the synced portion
	 * of this free.
	 */
	if (synced_size > 0) {
		vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);

		/*
		 * Note: this can only be called from syncing context,
		 * and the vdev_indirect_mapping is only changed from the
		 * sync thread, so we don't need svr_lock while doing
		 * metaslab_free_impl_cb.
		 */
		boolean_t checkpoint = B_FALSE;
		vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
		    metaslab_free_impl_cb, &checkpoint);
	}
}

/*
 * Stop an active removal and update the spa_removing phys.
 *
 * "state" must be DSS_FINISHED or DSS_CANCELED.  On DSS_FINISHED the
 * removed vdev is linked at the head of the chain of indirect vdevs
 * (sr_prev_indirect_vdev / vic_prev_indirect_vdev).  In both cases the
 * in-core spa_vdev_removal_t is destroyed.
 */
static void
spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
{
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));

	/* Ensure the removal thread has completed before we free the svr. */
	spa_vdev_remove_suspend(spa);

	ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);

	if (state == DSS_FINISHED) {
		spa_removing_phys_t *srp = &spa->spa_removing_phys;
		vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

		/* pvd is only looked up to feed the ASSERT below. */
		if (srp->sr_prev_indirect_vdev != -1) {
			vdev_t *pvd;
			pvd = vdev_lookup_top(spa,
			    srp->sr_prev_indirect_vdev);
			ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops);
		}

		vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev;
		srp->sr_prev_indirect_vdev = vd->vdev_id;
	}
	spa->spa_removing_phys.sr_state = state;
	spa->spa_removing_phys.sr_end_time = gethrestime_sec();

	spa->spa_vdev_removal = NULL;
	spa_vdev_removal_destroy(svr);

	spa_sync_removing_state(spa, tx);
	spa_notify_waiters(spa);

	vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * Range-tree callback (passed to zfs_range_tree_vacate() by
 * vdev_mapping_sync()).  "arg" is the removing vdev.  Marks
 * [offset, offset + size) obsolete on that vdev and remaps it through
 * vdev_indirect_ops, passing metaslab_free_impl_cb as the remap callback
 * with a checkpoint flag of B_FALSE.
 */
static void
free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
{
	vdev_t *vd = arg;
	vdev_indirect_mark_obsolete(vd, offset, size);
	boolean_t checkpoint = B_FALSE;
	vdev_indirect_ops.vdev_op_remap(vd, offset, size,
	    metaslab_free_impl_cb, &checkpoint);
}

/*
 * On behalf of the removal thread, syncs an incremental bit more of
 * the indirect mapping to disk and updates the in-memory mapping.
 * Called as a sync task in every txg that the removal thread makes progress.
*/ static void vdev_mapping_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; uint64_t txg = dmu_tx_get_txg(tx); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT(vic->vic_mapping_object != 0); ASSERT3U(txg, ==, spa_syncing_txg(spa)); vdev_indirect_mapping_add_entries(vim, &svr->svr_new_segments[txg & TXG_MASK], tx); vdev_indirect_births_add_entry(vd->vdev_indirect_births, vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); /* * Free the copied data for anything that was freed while the * mapping entries were in flight. */ mutex_enter(&svr->svr_lock); zfs_range_tree_vacate(svr->svr_frees[txg & TXG_MASK], free_mapped_segment_cb, vd); ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, vdev_indirect_mapping_max_offset(vim)); svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; mutex_exit(&svr->svr_lock); spa_sync_removing_state(spa, tx); } typedef struct vdev_copy_segment_arg { spa_t *vcsa_spa; dva_t *vcsa_dest_dva; uint64_t vcsa_txg; zfs_range_tree_t *vcsa_obsolete_segs; } vdev_copy_segment_arg_t; static void unalloc_seg(void *arg, uint64_t start, uint64_t size) { vdev_copy_segment_arg_t *vcsa = arg; spa_t *spa = vcsa->vcsa_spa; blkptr_t bp = { { { {0} } } }; BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(&bp, size); BP_SET_PSIZE(&bp, size); BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(&bp, DMU_OT_NONE); BP_SET_LEVEL(&bp, 0); BP_SET_DEDUP(&bp, 0); BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva)); DVA_SET_OFFSET(&bp.blk_dva[0], DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start); DVA_SET_ASIZE(&bp.blk_dva[0], size); zio_free(spa, vcsa->vcsa_txg, &bp); } /* * All reads and writes associated with a call to spa_vdev_copy_segment() * are done. 
*/
static void
spa_vdev_copy_segment_done(zio_t *zio)
{
	vdev_copy_segment_arg_t *vcsa = zio->io_private;
	zfs_range_tree_t *obsolete = vcsa->vcsa_obsolete_segs;
	spa_t *spa = zio->io_spa;

	/* Free every unneeded range, then tear down the bookkeeping. */
	zfs_range_tree_vacate(obsolete, unalloc_seg, vcsa);
	zfs_range_tree_destroy(obsolete);
	kmem_free(vcsa, sizeof (*vcsa));

	/* The spa itself is used as the config-lock tag. */
	spa_config_exit(spa, SCL_STATE, spa);
}

/*
 * The write of the new location is done.  Release the data buffer,
 * subtract the write from the outstanding byte count (adding it to the
 * write error count if it failed), and signal vca_cv.
 */
static void
spa_vdev_copy_segment_write_done(zio_t *zio)
{
	vdev_copy_arg_t *vca = zio->io_private;
	uint64_t nbytes = zio->io_size;
	int err = zio->io_error;

	abd_free(zio->io_abd);

	mutex_enter(&vca->vca_lock);
	vca->vca_outstanding_bytes -= nbytes;
	if (err != 0)
		vca->vca_write_error_bytes += nbytes;
	cv_signal(&vca->vca_cv);
	mutex_exit(&vca->vca_lock);
}

/*
 * The read of the old location is done.  The parent zio is the write to
 * the new location.  Allow it to start.  A failed read is added to the
 * read error byte count first.
 */
static void
spa_vdev_copy_segment_read_done(zio_t *zio)
{
	vdev_copy_arg_t *vca = zio->io_private;

	if (zio->io_error != 0) {
		mutex_enter(&vca->vca_lock);
		vca->vca_read_error_bytes += zio->io_size;
		mutex_exit(&vca->vca_lock);
	}

	zio_t *write_zio = zio_unique_parent(zio);
	zio_nowait(write_zio);
}

/*
 * If the old and new vdevs are mirrors, we will read both sides of the old
 * mirror, and write each copy to the corresponding side of the new mirror.
 * If the old and new vdevs have a different number of children, we will do
 * this as best as possible.  Since we aren't verifying checksums, this
 * ensures that as long as there's a good copy of the data, we'll have a
 * good copy after the removal, even if there's silent damage to one side
 * of the mirror. If we're removing a mirror that has some silent damage,
 * we'll have exactly the same damage in the new location (assuming that
 * the new location is also a mirror).
 *
 * We accomplish this by creating a tree of zio_t's, with as many writes as
 * there are "children" of the new vdev (a non-redundant vdev counts as one
 * child, a 2-way mirror has 2 children, etc). Each write has an associated
 * read from a child of the old vdev.
Typically there will be the same
 * number of children of the old and new vdevs.  However, if there are more
 * children of the new vdev, some child(ren) of the old vdev will be issued
 * multiple reads.  If there are more children of the old vdev, some copies
 * will be dropped.
 *
 * For example, the tree of zio_t's for a 2-way mirror is:
 *
 *                                 null
 *                           /               \
 *    write(new vdev, child 0)      write(new vdev, child 1)
 *                |                                  |
 *     read(old vdev, child 0)        read(old vdev, child 1)
 *
 * Child zio's complete before their parents complete.  However, zio's
 * created with zio_vdev_child_io() may be issued before their children
 * complete.  In this case we need to make sure that the children (reads)
 * complete before the parents (writes) are *issued*.  We do this by not
 * calling zio_nowait() on each write until its corresponding read has
 * completed.
 *
 * The spa_config_lock must be held while zio's created by
 * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
 * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
 * zio is needed to release the spa_config_lock after all the reads and
 * writes complete. (Note that we can't grab the config lock for each read,
 * because it is not reentrant - we could deadlock with a thread waiting
 * for a write lock.)
 */
static void
spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
    vdev_t *source_vd, uint64_t source_offset,
    vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
{
	ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);

	/*
	 * If the destination child is unwritable then there is no point
	 * in issuing the source reads which cannot be written.
	 */
	if (!vdev_writeable(dest_child_vd))
		return;

	/* Account for the copy before any I/O is created. */
	mutex_enter(&vca->vca_lock);
	vca->vca_outstanding_bytes += size;
	mutex_exit(&vca->vca_lock);

	/* One buffer is shared by the read and the write that follows it. */
	abd_t *abd = abd_alloc_for_io(size, B_FALSE);

	vdev_t *source_child_vd = NULL;
	if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
		/*
		 * Source and dest are both mirrors.  Copy from the same
		 * child id as we are copying to (wrapping around if there
		 * are more dest children than source children).  If the
		 * preferred source child is unreadable select another.
		 */
		for (int i = 0; i < source_vd->vdev_children; i++) {
			source_child_vd = source_vd->vdev_child[
			    (dest_id + i) % source_vd->vdev_children];
			if (vdev_readable(source_child_vd))
				break;
		}
	} else {
		source_child_vd = source_vd;
	}

	/*
	 * There should always be at least one readable source child or
	 * the pool would be in a suspended state.  Somehow selecting an
	 * unreadable child would result in IO errors, the removal process
	 * being cancelled, and the pool reverting to its pre-removal state.
	 */
	ASSERT3P(source_child_vd, !=, NULL);

	/* The write is created here but only issued by the read's callback. */
	zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
	    dest_child_vd, dest_offset, abd, size,
	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
	    ZIO_FLAG_CANFAIL,
	    spa_vdev_copy_segment_write_done, vca);

	zio_nowait(zio_vdev_child_io(write_zio, NULL,
	    source_child_vd, source_offset, abd, size,
	    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
	    ZIO_FLAG_CANFAIL,
	    spa_vdev_copy_segment_read_done, vca));
}

/*
 * Allocate a new location for this segment, and create the zio_t's to
 * read from the old location and write to the new location.
 *
 * At most "maxalloc" bytes, starting at the lowest offset in "segs", are
 * copied; the copied span is cleared from "segs".  Returns 0 on success
 * or the error from metaslab_alloc_dva(), in which case "segs" is left
 * untouched.
 */
static int
spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs,
    uint64_t maxalloc, uint64_t txg,
    vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
{
	metaslab_group_t *mg = vd->vdev_mg;
	spa_t *spa = vd->vdev_spa;
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_indirect_mapping_entry_t *entry;
	dva_t dst = {{ 0 }};
	uint64_t start = zfs_range_tree_min(segs);
	ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift));

	ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
	ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift));

	uint64_t size = zfs_range_tree_span(segs);
	if (zfs_range_tree_span(segs) > maxalloc) {
		/*
		 * We can't allocate all the segments.  Prefer to end
		 * the allocation at the end of a segment, thus avoiding
		 * additional split blocks.
		 */
		/*
		 * NOTE(review): the next two lines are unified-diff hunk
		 * markers (removed / added declaration) that are part of
		 * this text; they are kept verbatim.  Confirm which side
		 * is intended if this is ever compiled as C.
		 */
-		range_seg_max_t search;
+		zfs_range_seg_max_t search;
		zfs_btree_index_t where;
		zfs_rs_set_start(&search, segs, start + maxalloc);
		zfs_rs_set_end(&search, segs, start + maxalloc);
		(void) zfs_btree_find(&segs->rt_root, &search, &where);
		zfs_range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where,
		    &where);
		if (rs != NULL) {
			size = zfs_rs_get_end(rs, segs) - start;
		} else {
			/*
			 * There are no segments that end before maxalloc.
			 * I.e. the first segment is larger than maxalloc,
			 * so we must split it.
			 */
			size = maxalloc;
		}
	}
	ASSERT3U(size, <=, maxalloc);
	ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift));

	/*
	 * An allocation class might not have any remaining vdevs or space
	 */
	metaslab_class_t *mc = mg->mg_class;
	if (mc->mc_groups == 0)
		mc = spa_normal_class(spa);
	int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg,
	    METASLAB_DONT_THROTTLE, zal, 0);
	/* Retry in the normal class when the preferred class is full. */
	if (error == ENOSPC && mc != spa_normal_class(spa)) {
		error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
		    &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0);
	}
	if (error != 0)
		return (error);

	/*
	 * Determine the ranges that are not actually needed.  Offsets are
	 * relative to the start of the range to be copied (i.e. relative to the
	 * local variable "start").
	 */
	zfs_range_tree_t *obsolete_segs = zfs_range_tree_create(NULL,
	    ZFS_RANGE_SEG64, NULL, 0, 0);

	zfs_btree_index_t where;
	zfs_range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
	ASSERT3U(zfs_rs_get_start(rs, segs), ==, start);
	uint64_t prev_seg_end = zfs_rs_get_end(rs, segs);
	/* Each gap between consecutive segments inside the span is obsolete. */
	while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) {
		if (zfs_rs_get_start(rs, segs) >= start + size) {
			break;
		} else {
			zfs_range_tree_add(obsolete_segs,
			    prev_seg_end - start,
			    zfs_rs_get_start(rs, segs) - prev_seg_end);
		}
		prev_seg_end = zfs_rs_get_end(rs, segs);
	}
	/* We don't end in the middle of an obsolete range */
	ASSERT3U(start + size, <=, prev_seg_end);

	zfs_range_tree_clear(segs, start, size);

	/*
	 * We can't have any padding of the allocated size, otherwise we will
	 * misunderstand what's allocated, and the size of the mapping. We
	 * prevent padding by ensuring that all devices in the pool have the
	 * same ashift, and the allocation size is a multiple of the ashift.
	 */
	VERIFY3U(DVA_GET_ASIZE(&dst), ==, size);

	entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
	DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
	entry->vime_mapping.vimep_dst = dst;
	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		entry->vime_obsolete_count =
		    zfs_range_tree_space(obsolete_segs);
	}

	/* Freed by spa_vdev_copy_segment_done() once all child I/O is done. */
	vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
	vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
	vcsa->vcsa_obsolete_segs = obsolete_segs;
	vcsa->vcsa_spa = spa;
	vcsa->vcsa_txg = txg;

	/*
	 * See comment before spa_vdev_copy_one_child().
	 */
	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
	zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
	    spa_vdev_copy_segment_done, vcsa, 0);
	vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
	if (dest_vd->vdev_ops == &vdev_mirror_ops) {
		for (int i = 0; i < dest_vd->vdev_children; i++) {
			vdev_t *child = dest_vd->vdev_child[i];
			spa_vdev_copy_one_child(vca, nzio, vd, start,
			    child, DVA_GET_OFFSET(&dst), i, size);
		}
	} else {
		/* dest_id of -1 tells the callee the destination has no children. */
		spa_vdev_copy_one_child(vca, nzio, vd, start,
		    dest_vd, DVA_GET_OFFSET(&dst), -1, size);
	}
	zio_nowait(nzio);

	list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
	ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
	vdev_dirty(vd, 0, NULL, txg);

	return (0);
}

/*
 * Complete the removal of a toplevel vdev. This is called as a
 * synctask in the same txg that we will sync out the new config (to the
 * MOS object) which indicates that this vdev is indirect.
 */
static void
vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_vdev_removal_t *svr = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT0(svr->svr_bytes_done[i]);
	}

	ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
	    spa->spa_removing_phys.sr_to_copy);

	vdev_destroy_spacemaps(vd, tx);

	/* destroy leaf zaps, if any */
	ASSERT3P(svr->svr_zaplist, !=, NULL);
	for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL);
	    pair != NULL;
	    pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) {
		vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx);
	}
	fnvlist_free(svr->svr_zaplist);

	/* This destroys svr; it must not be used past this point. */
	spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx);
	/* vd->vdev_path is not available here */
	spa_history_log_internal(spa, "vdev remove completed", tx,
	    "%s vdev %llu", spa_name(spa), (u_longlong_t)vd->vdev_id);
}

/*
 * Recursively add the leaf ZAP object number of "vd" and of every vdev
 * below it to "zlist".  Each entry is keyed
 * "<VDEV_REMOVAL_ZAP_OBJS>-<object number>"; vdevs whose vdev_leaf_zap
 * is 0 contribute nothing.
 */
static void
vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
{
	ASSERT3P(zlist, !=, NULL);
	ASSERT0(vdev_get_nparity(vd));

	if (vd->vdev_leaf_zap != 0) {
		char zkey[32];
		(void) snprintf(zkey, sizeof (zkey), "%s-%llu",
		    VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)vd->vdev_leaf_zap);
		fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap);
	}

	for (uint64_t id = 0; id < vd->vdev_children; id++) {
		vdev_remove_enlist_zaps(vd->vdev_child[id], zlist);
	}
}

/*
 * Replace "vd" in the vdev tree with a new indirect vdev, clear
 * svr_thread, and schedule vdev_remove_complete_sync() as a sync task
 * for "txg".
 */
static void
vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
{
	vdev_t *ivd;
	dmu_tx_t *tx;
	spa_t *spa = vd->vdev_spa;
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;

	/*
	 * First, build a list of leaf zaps to be destroyed.
	 * This is passed to the sync context thread,
	 * which does the actual unlinking.
	 */
	svr->svr_zaplist = fnvlist_alloc();
	vdev_remove_enlist_zaps(vd, svr->svr_zaplist);

	/* Insert the indirect vdev above vd, then unhook vd from beneath it. */
	ivd = vdev_add_parent(vd, &vdev_indirect_ops);
	ivd->vdev_removing = 0;

	vd->vdev_leaf_zap = 0;

	vdev_remove_child(ivd, vd);
	vdev_compact_children(ivd);

	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/* Wake anyone waiting on svr_cv for the thread pointer to clear. */
	mutex_enter(&svr->svr_lock);
	svr->svr_thread = NULL;
	cv_broadcast(&svr->svr_cv);
	mutex_exit(&svr->svr_lock);

	/* After this, we can not use svr. */
	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	dsl_sync_task_nowait(spa->spa_dsl_pool,
	    vdev_remove_complete_sync, svr, tx);
	dmu_tx_commit(tx);
}

/*
 * Complete the removal of a toplevel vdev. This is called in open
 * context by the removal thread after we have copied all vdev's data.
 */
static void
vdev_remove_complete(spa_t *spa)
{
	uint64_t txg;

	/*
	 * Wait for any deferred frees to be synced before we call
	 * vdev_metaslab_fini()
	 */
	txg_wait_synced(spa->spa_dsl_pool, 0);
	txg = spa_vdev_enter(spa);
	vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
	vdev_rebuild_stop_wait(vd);
	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
	uint64_t vdev_space = spa_deflate(spa) ?
	    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;

	/* Created now, but posted only at the very end of this function. */
	sysevent_t *ev = spa_event_create(spa, vd, NULL,
	    ESC_ZFS_VDEV_REMOVE_DEV);

	zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)txg);

	ASSERT3U(0, !=, vdev_space);
	ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);

	/* the vdev is no longer part of the dspace */
	spa->spa_nonallocating_dspace -= vdev_space;

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
		vd->vdev_mg = NULL;
	}
	if (vd->vdev_log_mg != NULL) {
		ASSERT0(vd->vdev_ms_count);
		metaslab_group_destroy(vd->vdev_log_mg);
		vd->vdev_log_mg = NULL;
	}
	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);

	vdev_remove_replace_with_indirect(vd, txg);

	/*
	 * We now release the locks, allowing spa_sync to run and finish the
	 * removal via vdev_remove_complete_sync in syncing context.
	 *
	 * Note that we hold on to the vdev_t that has been replaced.  Since
	 * it isn't part of the vdev tree any longer, it can't be concurrently
	 * manipulated, even while we don't have the config lock.
	 */
	(void) spa_vdev_exit(spa, NULL, txg, 0);

	/*
	 * Top ZAP should have been transferred to the indirect vdev in
	 * vdev_remove_replace_with_indirect.
	 */
	ASSERT0(vd->vdev_top_zap);

	/*
	 * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect.
	 */
	ASSERT0(vd->vdev_leaf_zap);

	txg = spa_vdev_enter(spa);
	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
	/*
	 * Request to update the config and the config cachefile.
	 */
	vdev_config_dirty(spa->spa_root_vdev);
	(void) spa_vdev_exit(spa, vd, txg, 0);

	if (ev != NULL)
		spa_event_post(ev);
}

/*
 * Evacuates a segment of size at most max_alloc from the vdev
 * via repeated calls to spa_vdev_copy_segment. If an allocation
 * fails, the pool is probably too fragmented to handle such a
 * large size, so decrease max_alloc so that the caller will not try
 * this size again this txg.
*/
static void
spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
    uint64_t *max_alloc, dmu_tx_t *tx)
{
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	mutex_enter(&svr->svr_lock);

	/*
	 * Determine how big of a chunk to copy.  We can allocate up
	 * to max_alloc bytes, and we can span up to vdev_removal_max_span
	 * bytes of unallocated space at a time.  "segs" will track the
	 * allocated segments that we are copying.  We may also be copying
	 * free segments (of up to vdev_removal_max_span bytes).
	 */
	zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
	    NULL, 0, 0);
	for (;;) {
		zfs_range_tree_t *rt = svr->svr_allocd_segs;
		zfs_range_seg_t *rs = zfs_range_tree_first(rt);

		/* Nothing left to copy in svr_allocd_segs. */
		if (rs == NULL)
			break;

		uint64_t seg_length;

		if (zfs_range_tree_is_empty(segs)) {
			/* need to truncate the first seg based on max_alloc */
			seg_length = MIN(zfs_rs_get_end(rs, rt) -
			    zfs_rs_get_start(rs, rt), *max_alloc);
		} else {
			if (zfs_rs_get_start(rs, rt) - zfs_range_tree_max(segs)
			    > vdev_removal_max_span) {
				/*
				 * Including this segment would cause us to
				 * copy a larger unneeded chunk than is allowed.
				 */
				break;
			} else if (zfs_rs_get_end(rs, rt) -
			    zfs_range_tree_min(segs) > *max_alloc) {
				/*
				 * This additional segment would extend past
				 * max_alloc. Rather than splitting this
				 * segment, leave it for the next mapping.
				 */
				break;
			} else {
				seg_length = zfs_rs_get_end(rs, rt) -
				    zfs_rs_get_start(rs, rt);
			}
		}

		/* Move the chosen piece from svr_allocd_segs into segs. */
		zfs_range_tree_add(segs, zfs_rs_get_start(rs, rt), seg_length);
		zfs_range_tree_remove(svr->svr_allocd_segs,
		    zfs_rs_get_start(rs, rt), seg_length);
	}

	if (zfs_range_tree_is_empty(segs)) {
		mutex_exit(&svr->svr_lock);
		zfs_range_tree_destroy(segs);
		return;
	}

	/*
	 * A zero slot means no chunk has been queued for this txg yet (the
	 * slot is set just below), so schedule vdev_mapping_sync for it.
	 */
	if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
		dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
		    svr, tx);
	}

	svr->svr_max_offset_to_sync[txg & TXG_MASK] = zfs_range_tree_max(segs);

	/*
	 * Note: this is the amount of *allocated* space
	 * that we are taking care of each txg.
	 */
	svr->svr_bytes_done[txg & TXG_MASK] += zfs_range_tree_space(segs);

	mutex_exit(&svr->svr_lock);

	zio_alloc_list_t zal;
	metaslab_trace_init(&zal);
	uint64_t thismax = SPA_MAXBLOCKSIZE;
	/* Each successful call clears the span it copied from segs. */
	while (!zfs_range_tree_is_empty(segs)) {
		int error = spa_vdev_copy_segment(vd,
		    segs, thismax, txg, vca, &zal);

		if (error == ENOSPC) {
			/*
			 * Cut our segment in half, and don't try this
			 * segment size again this txg.  Note that the
			 * allocation size must be aligned to the highest
			 * ashift in the pool, so that the allocation will
			 * not be padded out to a multiple of the ashift,
			 * which could cause us to think that this mapping
			 * is larger than we intended.
			 */
			ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
			ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
			uint64_t attempted =
			    MIN(zfs_range_tree_span(segs), thismax);
			thismax = P2ROUNDUP(attempted / 2,
			    1 << spa->spa_max_ashift);
			/*
			 * The minimum-size allocation can not fail.
			 */
			ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
			*max_alloc = attempted - (1 << spa->spa_max_ashift);
		} else {
			/* Any error other than ENOSPC is unexpected here. */
			ASSERT0(error);

			/*
			 * We've performed an allocation, so reset the
			 * alloc trace list.
			 */
			metaslab_trace_fini(&zal);
			metaslab_trace_init(&zal);
		}
	}
	metaslab_trace_fini(&zal);
	zfs_range_tree_destroy(segs);
}

/*
 * The size of each removal mapping is limited by the tunable
 * zfs_remove_max_segment, but we must adjust this to be a multiple of the
 * pool's ashift, so that we don't try to split individual sectors regardless
 * of the tunable value.  (Note that device removal requires that all devices
 * have the same ashift, so there's no difference between spa_min_ashift and
 * spa_max_ashift.) The raw tunable should not be used elsewhere.
 *
 * Returns zfs_remove_max_segment rounded up to a multiple of
 * (1 << spa->spa_max_ashift).
 */
uint64_t
spa_remove_max_segment(spa_t *spa)
{
	return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift));
}

/*
 * The removal thread operates in open context.  It iterates over all
 * allocated space in the vdev, by loading each metaslab's spacemap.
* For each contiguous segment of allocated space (capping the segment
 * size at SPA_MAXBLOCKSIZE), we:
 *    - Allocate space for it on another vdev.
 *    - Create a new mapping from the old location to the new location
 *      (as a record in svr_new_segments).
 *    - Initiate a physical read zio to get the data off the removing disk.
 *    - In the read zio's done callback, initiate a physical write zio to
 *      write it to the new vdev.
 * Note that all of this will take effect when a particular TXG syncs.
 * The sync thread ensures that all the phys reads and writes for the syncing
 * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
 * (see vdev_mapping_sync()).
 */
static __attribute__((noreturn)) void
spa_vdev_remove_thread(void *arg)
{
	spa_t *spa = arg;
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_copy_arg_t vca;
	uint64_t max_alloc = spa_remove_max_segment(spa);
	uint64_t last_txg = 0;

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);

	ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
	ASSERT(vdev_is_concrete(vd));
	ASSERT(vd->vdev_removing);
	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	ASSERT(vim != NULL);

	/* vca lives on this thread's stack and is torn down before it exits. */
	mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
	vca.vca_outstanding_bytes = 0;
	vca.vca_read_error_bytes = 0;
	vca.vca_write_error_bytes = 0;

	mutex_enter(&svr->svr_lock);

	/*
	 * Start from vim_max_offset so we pick up where we left off
	 * if we are restarting the removal after opening the pool.
	 */
	uint64_t msi;
	for (msi = start_offset >> vd->vdev_ms_shift;
	    msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
		metaslab_t *msp = vd->vdev_ms[msi];
		ASSERT3U(msi, <=, vd->vdev_ms_count);

		ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));

		mutex_enter(&msp->ms_sync_lock);
		mutex_enter(&msp->ms_lock);

		/*
		 * Assert nothing in flight -- ms_*tree is empty.
		 */
		for (int i = 0; i < TXG_SIZE; i++) {
			ASSERT0(zfs_range_tree_space(msp->ms_allocating[i]));
		}

		/*
		 * If the metaslab has ever been allocated from (ms_sm!=NULL),
		 * read the allocated segments from the space map object
		 * into svr_allocd_segs. Since we do this while holding
		 * svr_lock and ms_sync_lock, concurrent frees (which
		 * would have modified the space map) will wait for us
		 * to finish loading the spacemap, and then take the
		 * appropriate action (see free_from_removing_vdev()).
		 */
		if (msp->ms_sm != NULL) {
			VERIFY0(space_map_load(msp->ms_sm,
			    svr->svr_allocd_segs, SM_ALLOC));

			/*
			 * Adjust the loaded map: add the unflushed
			 * allocations, then remove the unflushed frees and
			 * the ranges in ms_freeing.
			 */
			zfs_range_tree_walk(msp->ms_unflushed_allocs,
			    zfs_range_tree_add, svr->svr_allocd_segs);
			zfs_range_tree_walk(msp->ms_unflushed_frees,
			    zfs_range_tree_remove, svr->svr_allocd_segs);
			zfs_range_tree_walk(msp->ms_freeing,
			    zfs_range_tree_remove, svr->svr_allocd_segs);

			/*
			 * When we are resuming from a paused removal (i.e.
			 * when importing a pool with a removal in progress),
			 * discard any state that we have already processed.
			 */
			zfs_range_tree_clear(svr->svr_allocd_segs,
			    0, start_offset);
		}
		mutex_exit(&msp->ms_lock);
		mutex_exit(&msp->ms_sync_lock);

		vca.vca_msp = msp;
		zfs_dbgmsg("copying %llu segments for metaslab %llu",
		    (u_longlong_t)zfs_btree_numnodes(
		    &svr->svr_allocd_segs->rt_root),
		    (u_longlong_t)msp->ms_id);

		while (!svr->svr_thread_exit &&
		    !zfs_range_tree_is_empty(svr->svr_allocd_segs)) {

			mutex_exit(&svr->svr_lock);

			/*
			 * We need to periodically drop the config lock so that
			 * writers can get in.  Additionally, we can't wait
			 * for a txg to sync while holding a config lock
			 * (since a waiting writer could cause a 3-way deadlock
			 * with the sync thread, which also gets a config
			 * lock for reader).  So we can't hold the config lock
			 * while calling dmu_tx_assign().
			 */
			spa_config_exit(spa, SCL_CONFIG, FTAG);

			/*
			 * This delay will pause the removal around the point
			 * specified by zfs_removal_suspend_progress. We do this
			 * solely from the test suite or during debugging.
			 */
			while (zfs_removal_suspend_progress &&
			    !svr->svr_thread_exit)
				delay(hz);

			/* Wait while too many copy bytes are outstanding. */
			mutex_enter(&vca.vca_lock);
			while (vca.vca_outstanding_bytes >
			    zfs_remove_max_copy_bytes) {
				cv_wait(&vca.vca_cv, &vca.vca_lock);
			}
			mutex_exit(&vca.vca_lock);

			dmu_tx_t *tx =
			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);

			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
			uint64_t txg = dmu_tx_get_txg(tx);

			/*
			 * Reacquire the vdev_config lock.  The vdev_t
			 * that we're removing may have changed, e.g. due
			 * to a vdev_attach or vdev_detach.
			 */
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			vd = vdev_lookup_top(spa, svr->svr_vdev_id);

			/* A new txg resets any max_alloc reduction. */
			if (txg != last_txg)
				max_alloc = spa_remove_max_segment(spa);
			last_txg = txg;

			spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);

			dmu_tx_commit(tx);
			mutex_enter(&svr->svr_lock);
		}

		/* Stop after this metaslab if any copy I/O failed. */
		mutex_enter(&vca.vca_lock);
		if (zfs_removal_ignore_errors == 0 &&
		    (vca.vca_read_error_bytes > 0 ||
		    vca.vca_write_error_bytes > 0)) {
			svr->svr_thread_exit = B_TRUE;
		}
		mutex_exit(&vca.vca_lock);
	}

	mutex_exit(&svr->svr_lock);

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * Wait for all copies to finish before cleaning up the vca.
	 */
	txg_wait_synced(spa->spa_dsl_pool, 0);
	ASSERT0(vca.vca_outstanding_bytes);

	mutex_destroy(&vca.vca_lock);
	cv_destroy(&vca.vca_cv);

	if (svr->svr_thread_exit) {
		/* Asked to stop (or stopping on error): detach and wake waiters. */
		mutex_enter(&svr->svr_lock);
		zfs_range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
		svr->svr_thread = NULL;
		cv_broadcast(&svr->svr_cv);
		mutex_exit(&svr->svr_lock);

		/*
		 * During the removal process an unrecoverable read or write
		 * error was encountered.  The removal process must be
		 * cancelled or this damage may become permanent.
		 */
		if (zfs_removal_ignore_errors == 0 &&
		    (vca.vca_read_error_bytes > 0 ||
		    vca.vca_write_error_bytes > 0)) {
			zfs_dbgmsg("canceling removal due to IO errors: "
			    "[read_error_bytes=%llu] [write_error_bytes=%llu]",
			    (u_longlong_t)vca.vca_read_error_bytes,
			    (u_longlong_t)vca.vca_write_error_bytes);
			spa_vdev_remove_cancel_impl(spa);
		}
	} else {
		ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));
		vdev_remove_complete(spa);
	}

	thread_exit();
}

/*
 * Ask the removal thread (if any) to exit and wait until it has cleared
 * svr_thread.  svr_thread_exit is reset to B_FALSE before returning.
 * Does nothing when no removal is in progress.
 */
void
spa_vdev_remove_suspend(spa_t *spa)
{
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;

	if (svr == NULL)
		return;

	mutex_enter(&svr->svr_lock);
	svr->svr_thread_exit = B_TRUE;
	while (svr->svr_thread != NULL)
		cv_wait(&svr->svr_cv, &svr->svr_lock);
	svr->svr_thread_exit = B_FALSE;
	mutex_exit(&svr->svr_lock);
}

/*
 * Return true if the "allocating" property has been set to "off"
 */
static boolean_t
vdev_prop_allocating_off(vdev_t *vd)
{
	uint64_t objid = vd->vdev_top_zap;
	/* Default used when the property cannot be read. */
	uint64_t allocating = 1;

	/* no vdev property object => no props */
	if (objid != 0) {
		spa_t *spa = vd->vdev_spa;
		objset_t *mos = spa->spa_meta_objset;

		/*
		 * The lookup result is deliberately ignored; "allocating"
		 * keeps its initial value of 1 unless the lookup stores a
		 * value into it.
		 */
		mutex_enter(&spa->spa_props_lock);
		(void) zap_lookup(mos, objid, "allocating", sizeof (uint64_t),
		    1, &allocating);
		mutex_exit(&spa->spa_props_lock);
	}
	return (allocating == 0);
}

/*
 * Check function for cancelling a removal: returns ENOTACTIVE when the
 * pool has no removal in progress, 0 otherwise.  "arg" is unused.
 */
static int
spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
{
	(void) arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (spa->spa_vdev_removal == NULL)
		return (ENOTACTIVE);
	return (0);
}

/*
 * Cancel a removal by freeing all entries from the partial mapping
 * and marking the vdev as no longer being removing.
*/ static void spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) { (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; objset_t *mos = spa->spa_meta_objset; ASSERT3P(svr->svr_thread, ==, NULL); spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); boolean_t are_precise; VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (are_precise) { spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); } uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); space_map_free(vd->vdev_obsolete_sm, tx); VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&svr->svr_new_segments[i])); ASSERT3U(svr->svr_max_offset_to_sync[i], <=, vdev_indirect_mapping_max_offset(vim)); } for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) break; ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); mutex_enter(&msp->ms_lock); /* * Assert nothing in flight -- ms_*tree is empty. 
*/ for (int i = 0; i < TXG_SIZE; i++) ASSERT0(zfs_range_tree_space(msp->ms_allocating[i])); for (int i = 0; i < TXG_DEFER_SIZE; i++) ASSERT0(zfs_range_tree_space(msp->ms_defer[i])); ASSERT0(zfs_range_tree_space(msp->ms_freed)); if (msp->ms_sm != NULL) { mutex_enter(&svr->svr_lock); VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); zfs_range_tree_walk(msp->ms_unflushed_allocs, zfs_range_tree_add, svr->svr_allocd_segs); zfs_range_tree_walk(msp->ms_unflushed_frees, zfs_range_tree_remove, svr->svr_allocd_segs); zfs_range_tree_walk(msp->ms_freeing, zfs_range_tree_remove, svr->svr_allocd_segs); /* * Clear everything past what has been synced, * because we have not allocated mappings for it yet. */ uint64_t syncd = vdev_indirect_mapping_max_offset(vim); uint64_t sm_end = msp->ms_sm->sm_start + msp->ms_sm->sm_size; if (sm_end > syncd) zfs_range_tree_clear(svr->svr_allocd_segs, syncd, sm_end - syncd); mutex_exit(&svr->svr_lock); } mutex_exit(&msp->ms_lock); mutex_enter(&svr->svr_lock); zfs_range_tree_vacate(svr->svr_allocd_segs, free_mapped_segment_cb, vd); mutex_exit(&svr->svr_lock); } /* * Note: this must happen after we invoke free_mapped_segment_cb, * because it adds to the obsolete_segments. 
*/ zfs_range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); ASSERT3U(vic->vic_mapping_object, ==, vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = 0; ASSERT3U(vic->vic_births_object, ==, vdev_indirect_births_object(vd->vdev_indirect_births)); vdev_indirect_births_close(vd->vdev_indirect_births); vd->vdev_indirect_births = NULL; vdev_indirect_births_free(mos, vic->vic_births_object, tx); vic->vic_births_object = 0; /* * We may have processed some frees from the removing vdev in this * txg, thus increasing svr_bytes_done; discard that here to * satisfy the assertions in spa_vdev_removal_destroy(). * Note that future txg's can not have any bytes_done, because * future TXG's are only modified from open context, and we have * already shut down the copying thread. */ svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; spa_finish_removal(spa, DSS_CANCELED, tx); vd->vdev_removing = B_FALSE; if (!vdev_prop_allocating_off(vd)) { spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); vdev_activate(vd); spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); } vdev_config_dirty(vd); zfs_dbgmsg("canceled device removal for vdev %llu in %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx)); spa_history_log_internal(spa, "vdev remove canceled", tx, "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id, (vd->vdev_path != NULL) ? 
vd->vdev_path : "-"); } /* * Run spa_vdev_remove_cancel_check() and spa_vdev_remove_cancel_sync() * through dsl_sync_task() and return its result. */ static int spa_vdev_remove_cancel_impl(spa_t *spa) { int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED); return (error); } /* * Cancel a removal: first stop the removal thread, then run the cancel * sync task. Returns ENOTACTIVE if the pool has no removal state. */ int spa_vdev_remove_cancel(spa_t *spa) { spa_vdev_remove_suspend(spa); if (spa->spa_vdev_removal == NULL) return (ENOTACTIVE); return (spa_vdev_remove_cancel_impl(spa)); } /* * Add the bytes recorded for this txg (svr_bytes_done[txg & TXG_MASK]) * to spa_removing_phys.sr_copied, reset the slot, and sync the removing * state. Does nothing when there is no removal or no bytes were done. */ void svr_sync(spa_t *spa, dmu_tx_t *tx) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; if (svr == NULL) return; /* * This check is necessary so that we do not dirty the * DIRECTORY_OBJECT via spa_sync_removing_state() when there * is nothing to do. Dirtying it every time would prevent us * from syncing-to-convergence. */ if (svr->svr_bytes_done[txgoff] == 0) return; /* * Update progress accounting. */ spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; svr->svr_bytes_done[txgoff] = 0; spa_sync_removing_state(spa, tx); } /* * Free vd and add a hole vdev with the same id as a child of the root * vdev in its place. The caller must hold spa_namespace_lock and all * config locks as writer (both are asserted). 
*/ static void vdev_remove_make_hole_and_free(vdev_t *vd) { uint64_t id = vd->vdev_id; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); vdev_free(vd); vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); vdev_add_child(rvd, vd); vdev_config_dirty(rvd); /* * Reassess the health of our root vdev. */ vdev_reopen(rvd); } /* * Remove a log device. The config lock is held for the specified TXG. */ static int spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; int error = 0; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3P(vd->vdev_log_mg, ==, NULL); ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Stop allocating from this vdev. */ metaslab_group_passivate(mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. 
*/ spa_vdev_config_exit(spa, NULL, *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * Cancel any initialize or TRIM which was in progress. */ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED); vdev_autotrim_stop_wait(vd); /* * Evacuate the device. We don't hold the config lock as * writer since we need to do I/O but we do keep the * spa_namespace_lock held. Once this completes the device * should no longer have any blocks allocated on it. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (vd->vdev_stat.vs_alloc != 0) error = spa_reset_logs(spa); *txg = spa_vdev_config_enter(spa); if (error != 0) { metaslab_group_activate(mg); ASSERT3P(vd->vdev_log_mg, ==, NULL); return (error); } ASSERT0(vd->vdev_stat.vs_alloc); /* * The evacuation succeeded. Remove any remaining MOS metadata * associated with this vdev, and wait for these changes to sync. */ vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); /* * When the log space map feature is enabled we look at * the vdev's top_zap to find the on-disk flush data of * the metaslab we just flushed. Thus, while removing a * log vdev we make sure to call vdev_metaslab_fini() * first, which removes all metaslabs of this vdev from * spa_metaslabs_by_flushed before vdev_remove_empty() * destroys the top_zap of this log vdev. * * This avoids the scenario where we flush a metaslab * from the log vdev being removed that doesn't have a * top_zap and end up failing to lookup its on-disk flush * data. 
* * We don't call metaslab_group_destroy() right away * though (it will be called in vdev_free() later) as * during metaslab_sync() of metaslabs from other vdevs * we may touch the metaslab group of this vdev through * metaslab_class_histogram_verify() */ vdev_metaslab_fini(vd); spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); *txg = spa_vdev_config_enter(spa); sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* The top ZAP should have been destroyed by vdev_remove_empty. */ ASSERT0(vd->vdev_top_zap); /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */ ASSERT0(vd->vdev_leaf_zap); (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); if (list_link_active(&vd->vdev_state_dirty_node)) vdev_state_clean(vd); if (list_link_active(&vd->vdev_config_dirty_node)) vdev_config_clean(vd); ASSERT0(vd->vdev_stat.vs_alloc); /* * Clean up the vdev namespace. */ vdev_remove_make_hole_and_free(vd); if (ev != NULL) spa_event_post(ev); return (0); } /* * Check whether removal of the top-level vdev vd may be initiated. * Returns 0 if so, otherwise an errno identifying the failed check * (ENOTSUP, EALREADY, ENOSPC, EBUSY, EIO or EINVAL). */ static int spa_vdev_remove_top_check(vdev_t *vd) { spa_t *spa = vd->vdev_spa; if (vd != vd->vdev_top) return (SET_ERROR(ENOTSUP)); if (!vdev_is_concrete(vd)) return (SET_ERROR(ENOTSUP)); if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) return (SET_ERROR(ENOTSUP)); /* * This device is already being removed */ if (vd->vdev_removing) return (SET_ERROR(EALREADY)); metaslab_class_t *mc = vd->vdev_mg->mg_class; metaslab_class_t *normal = spa_normal_class(spa); if (mc != normal) { /* * Space allocated from the special (or dedup) class is * included in the DMU's space usage, but it's not included * in spa_dspace (or dsl_pool_adjustedsize()). 
Therefore * there is always at least as much free space in the normal * class, as is allocated from the special (and dedup) class. * As a backup check, we will return ENOSPC if this is * violated. See also spa_update_dspace(). 
*/ uint64_t available = metaslab_class_get_space(normal) - metaslab_class_get_alloc(normal); ASSERT3U(available, >=, vd->vdev_stat.vs_alloc); if (available < vd->vdev_stat.vs_alloc) return (SET_ERROR(ENOSPC)); } else if (!vd->vdev_noalloc) { /* available space in the pool's normal class */ uint64_t available = dsl_dir_space_available( spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); if (available < vd->vdev_stat.vs_dspace) return (SET_ERROR(ENOSPC)); } /* * There can not be a removal in progress. */ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(EBUSY)); /* * The device must have all its data. */ if (!vdev_dtl_empty(vd, DTL_MISSING) || !vdev_dtl_empty(vd, DTL_OUTAGE)) return (SET_ERROR(EBUSY)); /* * The device must be healthy. */ if (!vdev_readable(vd)) return (SET_ERROR(EIO)); /* * All vdevs in normal class must have the same ashift. */ if (spa->spa_max_ashift != spa->spa_min_ashift) { return (SET_ERROR(EINVAL)); } /* * A removed special/dedup vdev must have same ashift as normal class. */ ASSERT(!vd->vdev_islog); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE && vd->vdev_ashift != spa->spa_max_ashift) { return (SET_ERROR(EINVAL)); } /* * All vdevs in normal class must have the same ashift * and not be raidz or draid. */ vdev_t *rvd = spa->spa_root_vdev; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; /* * A removed special/dedup vdev must have the same ashift * across all vdevs in its class. 
*/ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE && cvd->vdev_alloc_bias == vd->vdev_alloc_bias && cvd->vdev_ashift != vd->vdev_ashift) { return (SET_ERROR(EINVAL)); } if (cvd->vdev_ashift != 0 && cvd->vdev_alloc_bias == VDEV_BIAS_NONE) ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); if (!vdev_is_concrete(cvd)) continue; if (vdev_get_nparity(cvd) != 0) return (SET_ERROR(EINVAL)); /* * Need the mirror to be mirror of leaf vdevs only */ if (cvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < cvd->vdev_children; cid++) { if (!cvd->vdev_child[cid]->vdev_ops-> vdev_op_leaf) return (SET_ERROR(EINVAL)); } } } return (0); } /* * Initiate removal of a top-level vdev, reducing the total space in the pool. * The config lock is held for the specified TXG. Once initiated, * evacuation of all allocated space (copying it to other vdevs) happens * in the background (see spa_vdev_remove_thread()), and can be canceled * (see spa_vdev_remove_cancel()). If successful, the vdev will * be transformed to an indirect vdev (see vdev_remove_complete()). */ static int spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) { spa_t *spa = vd->vdev_spa; boolean_t set_noalloc = B_FALSE; int error; /* * Check for errors up-front, so that we don't waste time * passivating the metaslab group and clearing the ZIL if there * are errors. */ error = spa_vdev_remove_top_check(vd); /* * Stop allocating from this vdev. Note that we must check * that this is not the only device in the pool before * passivating, otherwise we will not be able to make * progress because we can't allocate from any vdevs. * The above check for sufficient free space serves this * purpose. */ if (error == 0 && !vd->vdev_noalloc) { set_noalloc = B_TRUE; error = vdev_passivate(vd, txg); } if (error != 0) return (error); /* * We stop any initializing and TRIM that is currently in progress * but leave the state as "active". This will allow the process to * resume if the removal is canceled sometime later. 
*/ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_wait(vd); *txg = spa_vdev_config_enter(spa); /* * Things might have changed while the config lock was dropped * (e.g. space usage). Check for errors again. */ error = spa_vdev_remove_top_check(vd); if (error != 0) { if (set_noalloc) vdev_activate(vd); spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); return (error); } vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx); dmu_tx_commit(tx); return (0); } /* * Remove a device from the pool. * * Removing a device from the vdev namespace requires several steps * and can take a significant amount of time. As a result we use * the spa_vdev_config_[enter/exit] functions which allow us to * grab and release the spa_config_lock while still holding the namespace * lock. During each step the configuration is synced out. */ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; nvlist_t **spares, **l2cache, *nv; uint64_t txg = 0; uint_t nspares, nl2cache; int error = 0, error_log; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); sysevent_t *ev = NULL; const char *vd_type = NULL; char *vd_path = NULL; ASSERT(spa_writeable(spa)); if (!locked) txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? 
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; if (!locked) return (spa_vdev_exit(spa, NULL, txg, error)); return (error); } vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (spa->spa_spares.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { /* * Only remove the hot spare if it's not currently in use * in this pool. */ if (vd == NULL || unspare) { const char *type; boolean_t draid_spare = B_FALSE; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) draid_spare = B_TRUE; if (vd == NULL && draid_spare) { error = SET_ERROR(ENOTSUP); } else { if (vd == NULL) vd = spa_lookup_by_guid(spa, guid, B_TRUE); ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); vd_type = VDEV_TYPE_SPARE; vd_path = spa_strdup(fnvlist_lookup_string( nv, ZPOOL_CONFIG_PATH)); spa_vdev_remove_aux(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares, nv); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } } else { error = SET_ERROR(EBUSY); } } else if (spa->spa_l2cache.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { vd_type = VDEV_TYPE_L2CACHE; vd_path = spa_strdup(fnvlist_lookup_string( nv, ZPOOL_CONFIG_PATH)); /* * Cache devices can always be removed. */ vd = spa_lookup_by_guid(spa, guid, B_TRUE); /* * Stop trimming the cache device. We need to release the * config lock to allow the syncing of TRIM transactions * without releasing the spa_namespace_lock. The same * strategy is employed in spa_vdev_remove_top(). 
*/ spa_vdev_config_exit(spa, NULL, txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); mutex_exit(&vd->vdev_trim_lock); txg = spa_vdev_config_enter(spa); ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } else if (vd != NULL && vd->vdev_islog) { ASSERT(!locked); vd_type = VDEV_TYPE_LOG; vd_path = spa_strdup((vd->vdev_path != NULL) ? vd->vdev_path : "-"); error = spa_vdev_remove_log(vd, &txg); } else if (vd != NULL) { ASSERT(!locked); error = spa_vdev_remove_top(vd, &txg); } else { /* * There is no vdev of any kind with the specified guid. */ error = SET_ERROR(ENOENT); } error_log = error; if (!locked) error = spa_vdev_exit(spa, NULL, txg, error); /* * Logging must be done outside the spa config lock. Otherwise, * this code path could end up holding the spa config lock while * waiting for a txg_sync so it can write to the internal log. * Doing that would prevent the txg sync from actually happening, * causing a deadlock. 
*/ if (error_log == 0 && vd_type != NULL && vd_path != NULL) { spa_history_log_internal(spa, "vdev remove", NULL, "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path); } if (vd_path != NULL) spa_strfree(vd_path); if (ev != NULL) spa_event_post(ev); return (error); } /* * Fill prs from spa_removing_phys. Returns ENOENT when the removal * state is DSS_NONE, 0 otherwise. prs_mapping_memory is the sum of * vdev_indirect_mapping_size() over the chain of indirect vdevs that * starts at sr_prev_indirect_vdev and follows vic_prev_indirect_vdev. */ int spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) { prs->prs_state = spa->spa_removing_phys.sr_state; if (prs->prs_state == DSS_NONE) return (SET_ERROR(ENOENT)); prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; prs->prs_start_time = spa->spa_removing_phys.sr_start_time; prs->prs_end_time = spa->spa_removing_phys.sr_end_time; prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; prs->prs_copied = spa->spa_removing_phys.sr_copied; prs->prs_mapping_memory = 0; uint64_t indirect_vdev_id = spa->spa_removing_phys.sr_prev_indirect_vdev; /* An id of -1 (all bits set as uint64_t) ends the chain. 
*/ while (indirect_vdev_id != -1) { vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); indirect_vdev_id = vic->vic_prev_indirect_vdev; } return (0); } ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW, "Ignore hard IO errors when removing device"); ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, UINT, ZMOD_RW, "Largest contiguous segment to allocate when removing device"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, UINT, ZMOD_RW, "Largest span of free chunks a remap segment can span"); ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, UINT, ZMOD_RW, "Pause device removal after this many bytes are copied " "(debug use only - causes removal to hang)"); EXPORT_SYMBOL(free_from_removing_vdev); EXPORT_SYMBOL(spa_removal_get_stats); EXPORT_SYMBOL(spa_remove_init); EXPORT_SYMBOL(spa_restart_removal); EXPORT_SYMBOL(spa_vdev_removal_destroy); EXPORT_SYMBOL(spa_vdev_remove); 
EXPORT_SYMBOL(spa_vdev_remove_cancel); EXPORT_SYMBOL(spa_vdev_remove_suspend); EXPORT_SYMBOL(svr_sync); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index d13753f81a69..1ca0b23c0ee4 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -1,1791 +1,1791 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2016, 2024 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. * Copyright (c) 2021 Hewlett Packard Enterprise Development LP * Copyright 2023 RackTop Systems, Inc. */ #include #include #include #include #include #include #include #include #include #include /* * TRIM is a feature which is used to notify a SSD that some previously * written space is no longer allocated by the pool. This is useful because * writes to a SSD must be performed to blocks which have first been erased. * Ensuring the SSD always has a supply of erased blocks for new writes * helps prevent the performance from deteriorating. * * There are two supported TRIM methods; manual and automatic. * * Manual TRIM: * * A manual TRIM is initiated by running the 'zpool trim' command. 
A single * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for * managing that vdev TRIM process. This involves iterating over all the * metaslabs, calculating the unallocated space ranges, and then issuing the * required TRIM I/Os. * * While a metaslab is being actively trimmed it is not eligible to perform * new allocations. After traversing all of the metaslabs the thread is * terminated. Finally, both the requested options and current progress of * the TRIM are regularly written to the pool. This allows the TRIM to be * suspended and resumed as needed. * * Automatic TRIM: * * An automatic TRIM is enabled by setting the 'autotrim' pool property * to 'on'. When enabled, a 'vdev_autotrim' thread is created for each * top-level (not leaf) vdev in the pool. These threads perform the same * core TRIM process as a manual TRIM, but with a few key differences. * * 1) Automatic TRIM happens continuously in the background and operates * solely on recently freed blocks (ms_trim not ms_allocatable). * * 2) Each thread is associated with a top-level (not leaf) vdev. This has * the benefit of simplifying the threading model, it makes it easier * to coordinate administrative commands, and it ensures only a single * metaslab is disabled at a time. Unlike manual TRIM, this means each * 'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its * children. * * 3) There is no automatic TRIM progress information stored on disk, nor * is it reported by 'zpool status'. * * While the automatic TRIM process is highly effective it is more likely * than a manual TRIM to encounter tiny ranges. Ranges less than or equal to * 'zfs_trim_extent_bytes_min' (32k) are considered too small to efficiently * TRIM and are skipped. This means small amounts of freed space may not * be automatically trimmed. * * Furthermore, devices with attached hot spares and devices being actively * replaced are skipped. 
This is done to avoid adding additional stress to * a potentially unhealthy device and to minimize the required rebuild time. * * For this reason it may be beneficial to occasionally manually TRIM a pool * even when automatic TRIM is enabled. */ /* * Maximum size of TRIM I/O, ranges will be chunked into 128MiB lengths. */ static unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024; /* * Minimum size of TRIM I/O, extents smaller than 32KiB will be skipped. */ static unsigned int zfs_trim_extent_bytes_min = 32 * 1024; /* * Skip uninitialized metaslabs during the TRIM process. This option is * useful for pools constructed from large thinly-provisioned devices where * TRIM operations are slow. As a pool ages an increasing fraction of * the pool's metaslabs will be initialized progressively degrading the * usefulness of this option. This setting is stored when starting a * manual TRIM and will persist for the duration of the requested TRIM. */ unsigned int zfs_trim_metaslab_skip = 0; /* * Maximum number of queued TRIM I/Os per leaf vdev. The number of * concurrent TRIM I/Os issued to the device is controlled by the * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options. */ static unsigned int zfs_trim_queue_limit = 10; /* * The minimum number of transaction groups between automatic trims of a * metaslab. This setting represents a trade-off between issuing more * efficient TRIM operations, by allowing them to be aggregated longer, * and issuing them promptly so the trimmed space is available. Note * that this value is a minimum; metaslabs can be trimmed less frequently * when there are a large number of ranges which need to be trimmed. * * Increasing this value will allow frees to be aggregated for a longer * time. This can result in larger TRIM operations, and increased memory * usage in order to track the ranges to be trimmed. Decreasing this value * has the opposite effect. 
The default value of 32 was determined though * testing to be a reasonable compromise. */ static unsigned int zfs_trim_txg_batch = 32; /* * The trim_args are a control structure which describe how a leaf vdev * should be trimmed. The core elements are the vdev, the metaslab being * trimmed and a range tree containing the extents to TRIM. All provided * ranges must be within the metaslab. */ typedef struct trim_args { /* * These fields are set by the caller of vdev_trim_ranges(). */ vdev_t *trim_vdev; /* Leaf vdev to TRIM */ metaslab_t *trim_msp; /* Disabled metaslab */ zfs_range_tree_t *trim_tree; /* TRIM ranges (in metaslab) */ trim_type_t trim_type; /* Manual or auto TRIM */ uint64_t trim_extent_bytes_max; /* Maximum TRIM I/O size */ uint64_t trim_extent_bytes_min; /* Minimum TRIM I/O size */ enum trim_flag trim_flags; /* TRIM flags (secure) */ /* * These fields are updated by vdev_trim_ranges(). */ hrtime_t trim_start_time; /* Start time */ uint64_t trim_bytes_done; /* Bytes trimmed */ } trim_args_t; /* * Determines whether a vdev_trim_thread() should be stopped. */ static boolean_t vdev_trim_should_stop(vdev_t *vd) { return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) || vd->vdev_detached || vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding); } /* * Determines whether a vdev_autotrim_thread() should be stopped. */ static boolean_t vdev_autotrim_should_stop(vdev_t *tvd) { return (tvd->vdev_autotrim_exit_wanted || !vdev_writeable(tvd) || tvd->vdev_removing || tvd->vdev_rz_expanding || spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } /* * Wait for given number of kicks, return true if the wait is aborted due to * vdev_autotrim_exit_wanted. 
*/ static boolean_t vdev_autotrim_wait_kick(vdev_t *vd, int num_of_kick) { mutex_enter(&vd->vdev_autotrim_lock); /* Each return from the wait on vdev_autotrim_kick_cv counts as one kick. */ for (int i = 0; i < num_of_kick; i++) { if (vd->vdev_autotrim_exit_wanted) break; cv_wait_idle(&vd->vdev_autotrim_kick_cv, &vd->vdev_autotrim_lock); } boolean_t exit_wanted = vd->vdev_autotrim_exit_wanted; mutex_exit(&vd->vdev_autotrim_lock); return (exit_wanted); } /* * The sync task for updating the on-disk state of a manual TRIM. This * is scheduled by vdev_trim_change_state(). */ static void vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx) { /* * We pass in the guid instead of the vdev_t since the vdev may * have been freed prior to the sync task being processed. This * happens when a vdev is detached as we call spa_config_vdev_exit(), * stop the trimming thread, schedule the sync task, and free * the vdev. Later when the scheduled sync task is invoked, it would * find that the vdev has been freed. */ uint64_t guid = *(uint64_t *)arg; uint64_t txg = dmu_tx_get_txg(tx); kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK]; vd->vdev_trim_offset[txg & TXG_MASK] = 0; VERIFY3U(vd->vdev_leaf_zap, !=, 0); objset_t *mos = vd->vdev_spa->spa_meta_objset; /* * A vdev_trim_last_offset of UINT64_MAX is the reset sentinel set by * vdev_trim_change_state(); it is written out as offset 0. 
*/ if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) { if (vd->vdev_trim_last_offset == UINT64_MAX) last_offset = 0; vd->vdev_trim_last_offset = last_offset; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET, sizeof (last_offset), 1, &last_offset, tx)); } if (vd->vdev_trim_action_time > 0) { uint64_t val = (uint64_t)vd->vdev_trim_action_time; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val), 1, &val, tx)); } /* * For rate, partial and secure the UINT64_MAX sentinel is likewise * stored as 0. */ if (vd->vdev_trim_rate > 0) { uint64_t rate = (uint64_t)vd->vdev_trim_rate; if (rate == UINT64_MAX) rate = 
0; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx)); } uint64_t partial = vd->vdev_trim_partial; if (partial == UINT64_MAX) partial = 0; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL, sizeof (partial), 1, &partial, tx)); uint64_t secure = vd->vdev_trim_secure; if (secure == UINT64_MAX) secure = 0; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE, sizeof (secure), 1, &secure, tx)); uint64_t trim_state = vd->vdev_trim_state; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE, sizeof (trim_state), 1, &trim_state, tx)); } /* * Update the on-disk state of a manual TRIM. This is called to request * that a TRIM be started/suspended/canceled, or to change one of the * TRIM options (partial, secure, rate). The caller must hold * vdev_trim_lock (asserted); a request for the current state is a no-op. */ static void vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, uint64_t rate, boolean_t partial, boolean_t secure) { ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); spa_t *spa = vd->vdev_spa; if (new_state == vd->vdev_trim_state) return; /* * Copy the vd's guid, this will be freed by the sync task. */ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); *guid = vd->vdev_guid; /* * Record the time of this state change, unless we are leaving the * suspended state, in which case the previously recorded action * time is preserved. */ if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) { vd->vdev_trim_action_time = gethrestime_sec(); } /* * If we're activating, then preserve the requested rate and trim * method. Setting the last offset, rate, partial and secure fields to * UINT64_MAX is used as a sentinel to indicate they should be reset * to default values. 
*/ if (new_state == VDEV_TRIM_ACTIVE) { if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE || vd->vdev_trim_state == VDEV_TRIM_CANCELED) { vd->vdev_trim_last_offset = UINT64_MAX; vd->vdev_trim_rate = UINT64_MAX; vd->vdev_trim_partial = UINT64_MAX; vd->vdev_trim_secure = UINT64_MAX; } if (rate != 0) vd->vdev_trim_rate = rate; if (partial != 0) vd->vdev_trim_partial = partial; if (secure != 0) vd->vdev_trim_secure = secure; } vdev_trim_state_t old_state = vd->vdev_trim_state; boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED); vd->vdev_trim_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync, guid, tx); /* Post the matching event and history entry for the new state. */ switch (new_state) { case VDEV_TRIM_ACTIVE: spa_event_notify(spa, vd, NULL, resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START); spa_history_log_internal(spa, "trim", tx, "vdev=%s activated", vd->vdev_path); break; case VDEV_TRIM_SUSPENDED: spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND); spa_history_log_internal(spa, "trim", tx, "vdev=%s suspended", vd->vdev_path); break; case VDEV_TRIM_CANCELED: /* Only report a cancel when a TRIM was actually active or suspended. */ if (old_state == VDEV_TRIM_ACTIVE || old_state == VDEV_TRIM_SUSPENDED) { spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL); spa_history_log_internal(spa, "trim", tx, "vdev=%s canceled", vd->vdev_path); } break; case VDEV_TRIM_COMPLETE: spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH); spa_history_log_internal(spa, "trim", tx, "vdev=%s complete", vd->vdev_path); break; default: panic("invalid state %llu", (unsigned long long)new_state); } dmu_tx_commit(tx); if (new_state != VDEV_TRIM_ACTIVE) spa_notify_waiters(spa); } /* * The zio_done_func_t done callback for each manual TRIM issued. 
It is * responsible for updating the TRIM stats, reissuing failed TRIM I/Os, * and limiting the number of in flight TRIM I/Os. 
*/ static void vdev_trim_cb(zio_t *zio) { vdev_t *vd = zio->io_vd; mutex_enter(&vd->vdev_trim_io_lock); if (zio->io_error == ENXIO && !vdev_writeable(vd)) { /* * The I/O failed because the vdev was unavailable; roll the * last offset back. (This works because spa_sync waits on * spa_txg_zio before it runs sync tasks.) */ uint64_t *offset = &vd->vdev_trim_offset[zio->io_txg & TXG_MASK]; *offset = MIN(*offset, zio->io_offset); } else { if (zio->io_error != 0) { vd->vdev_stat.vs_trim_errors++; spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL, 0, 0, 0, 0, 1, zio->io_orig_size); } else { spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL, 1, zio->io_orig_size, 0, 0, 0, 0); } vd->vdev_trim_bytes_done += zio->io_orig_size; } ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0); vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--; cv_broadcast(&vd->vdev_trim_io_cv); mutex_exit(&vd->vdev_trim_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* * The zio_done_func_t done callback for each automatic TRIM issued. It * is responsible for updating the TRIM stats and limiting the number of * in flight TRIM I/Os. Automatic TRIM I/Os are best effort and are * never reissued on failure. */ static void vdev_autotrim_cb(zio_t *zio) { vdev_t *vd = zio->io_vd; mutex_enter(&vd->vdev_trim_io_lock); if (zio->io_error != 0) { vd->vdev_stat.vs_trim_errors++; spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO, 0, 0, 0, 0, 1, zio->io_orig_size); } else { spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO, 1, zio->io_orig_size, 0, 0, 0, 0); } ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0); vd->vdev_trim_inflight[TRIM_TYPE_AUTO]--; cv_broadcast(&vd->vdev_trim_io_cv); mutex_exit(&vd->vdev_trim_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* * The zio_done_func_t done callback for each TRIM issued via * vdev_trim_simple(). It is responsible for updating the TRIM stats and * limiting the number of in flight TRIM I/Os. 
Simple TRIM I/Os are best
 * effort and are never reissued on failure.
 */
static void
vdev_trim_simple_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	mutex_enter(&vd->vdev_trim_io_lock);
	if (zio->io_error != 0) {
		vd->vdev_stat.vs_trim_errors++;
		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
		    0, 0, 0, 0, 1, zio->io_orig_size);
	} else {
		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
		    1, zio->io_orig_size, 0, 0, 0, 0);
	}

	/* Retire the in-flight slot and wake anyone waiting on it. */
	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0);
	vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--;
	cv_broadcast(&vd->vdev_trim_io_cv);
	mutex_exit(&vd->vdev_trim_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * Returns the average trim rate in bytes/sec for the ta->trim_vdev.
 * The elapsed time is measured in milliseconds since ta->trim_start_time,
 * plus one so the divisor is never zero.
 */
static uint64_t
vdev_trim_calculate_rate(trim_args_t *ta)
{
	return (ta->trim_bytes_done * 1000 /
	    (NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1));
}

/*
 * Issues a physical TRIM and takes care of rate limiting (bytes/sec)
 * and number of concurrent TRIM I/Os.
 *
 * Returns 0 once the TRIM zio has been issued, or EINTR if the TRIM was
 * asked to stop before the zio was issued.
 */
static int
vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
{
	vdev_t *vd = ta->trim_vdev;
	spa_t *spa = vd->vdev_spa;
	void *cb;

	mutex_enter(&vd->vdev_trim_io_lock);

	/*
	 * Limit manual TRIM I/Os to the requested rate.  This does not
	 * apply to automatic TRIM since no per vdev rate can be specified.
	 */
	if (ta->trim_type == TRIM_TYPE_MANUAL) {
		while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) &&
		    vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) {
			cv_timedwait_idle(&vd->vdev_trim_io_cv,
			    &vd->vdev_trim_io_lock, ddi_get_lbolt() +
			    MSEC_TO_TICK(10));
		}
	}
	ta->trim_bytes_done += size;

	/* Limit in flight trimming I/Os (summed over all inflight slots) */
	while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] +
	    vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	vd->vdev_trim_inflight[ta->trim_type]++;
	mutex_exit(&vd->vdev_trim_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	/* Released by the zio done callback, or below on the EINTR path. */
	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_trim_lock);

	if (ta->trim_type == TRIM_TYPE_MANUAL &&
	    vd->vdev_trim_offset[txg & TXG_MASK] == 0) {
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_trim_zap_update_sync, guid, tx);
	}

	/*
	 * We know the vdev_t will still be around since all consumers of
	 * vdev_free must stop the trimming first.
	 */
	if ((ta->trim_type == TRIM_TYPE_MANUAL &&
	    vdev_trim_should_stop(vd)) ||
	    (ta->trim_type == TRIM_TYPE_AUTO &&
	    vdev_autotrim_should_stop(vd->vdev_top))) {
		/* Undo the in-flight accounting taken above and bail out. */
		mutex_enter(&vd->vdev_trim_io_lock);
		vd->vdev_trim_inflight[ta->trim_type]--;
		mutex_exit(&vd->vdev_trim_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_trim_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_trim_lock);

	/* Record how far this txg's manual TRIM has progressed. */
	if (ta->trim_type == TRIM_TYPE_MANUAL)
		vd->vdev_trim_offset[txg & TXG_MASK] = start + size;

	/* Select the done callback matching the TRIM type. */
	if (ta->trim_type == TRIM_TYPE_MANUAL) {
		cb = vdev_trim_cb;
	} else if (ta->trim_type == TRIM_TYPE_AUTO) {
		cb = vdev_autotrim_cb;
	} else {
		cb = vdev_trim_simple_cb;
	}

	zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
	    start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL,
	    ta->trim_flags));
	/*
	 * vdev_trim_cb, vdev_autotrim_cb and vdev_trim_simple_cb each
	 * release SCL_STATE_ALL
	 */

	dmu_tx_commit(tx);

	return (0);
}

/*
 * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree.
 * Additional parameters describing how the TRIM should be performed must
 * be set in the trim_args structure.  See the trim_args definition for
 * additional information.
 */
static int
vdev_trim_ranges(trim_args_t *ta)
{
	vdev_t *vd = ta->trim_vdev;
	zfs_btree_t *t = &ta->trim_tree->rt_root;
	zfs_btree_index_t idx;
	uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
	uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
	spa_t *spa = vd->vdev_spa;
	int error = 0;

	/* Reset the per-call rate accounting used by vdev_trim_range(). */
	ta->trim_start_time = gethrtime();
	ta->trim_bytes_done = 0;

	for (zfs_range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
	    rs = zfs_btree_next(t, &idx, &idx)) {
		uint64_t size = zfs_rs_get_end(rs, ta->trim_tree) -
		    zfs_rs_get_start(rs, ta->trim_tree);

		/* Ranges below the minimum are counted as skipped. */
		if (extent_bytes_min && size < extent_bytes_min) {
			spa_iostats_trim_add(spa, ta->trim_type,
			    0, 0, 1, size, 0, 0);
			continue;
		}

		/* Split range into legally-sized physical chunks */
		uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;

		for (uint64_t w = 0; w < writes_required; w++) {
			error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
			    zfs_rs_get_start(rs, ta->trim_tree) +
			    (w *extent_bytes_max), MIN(size -
			    (w * extent_bytes_max), extent_bytes_max));
			if (error != 0) {
				goto done;
			}
		}
	}

done:
	/*
	 * Make sure all TRIMs for this metaslab have completed before
	 * returning. TRIM zios have lower priority over regular or syncing
	 * zios, so all TRIM zios for this metaslab must complete before the
	 * metaslab is re-enabled. Otherwise it's possible write zios to
	 * this metaslab could cut ahead of still queued TRIM zios for this
	 * metaslab causing corruption if the ranges overlap.
	 *
	 * NOTE(review): only inflight slot 0 is waited on here, whatever
	 * ta->trim_type is -- confirm that is intended for non-manual TRIMs.
	 */
	mutex_enter(&vd->vdev_trim_io_lock);
	while (vd->vdev_trim_inflight[0] > 0) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	mutex_exit(&vd->vdev_trim_io_lock);

	return (error);
}

/*
 * vdev_xlate_walk() callback: arg points to a uint64_t that is raised to
 * the largest physical rs_end seen.
 */
static void
-vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+vdev_trim_xlate_last_rs_end(void *arg, zfs_range_seg64_t *physical_rs)
{
	uint64_t *last_rs_end = (uint64_t *)arg;

	if (physical_rs->rs_end > *last_rs_end)
		*last_rs_end = physical_rs->rs_end;
}

/*
 * vdev_xlate_walk() callback: arg is the vdev_t.  Adds the physical
 * segment to vdev_trim_bytes_est and adds to vdev_trim_bytes_done the part
 * of it that lies below vdev_trim_last_offset.
 */
static void
-vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs)
+vdev_trim_xlate_progress(void *arg, zfs_range_seg64_t *physical_rs)
{
	vdev_t *vd = (vdev_t *)arg;

	uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
	vd->vdev_trim_bytes_est += size;

	if (vd->vdev_trim_last_offset >= physical_rs->rs_end) {
		/* Segment lies entirely below the last trimmed offset. */
		vd->vdev_trim_bytes_done += size;
	} else if (vd->vdev_trim_last_offset > physical_rs->rs_start &&
	    vd->vdev_trim_last_offset <= physical_rs->rs_end) {
		/* The last trimmed offset falls inside this segment. */
		vd->vdev_trim_bytes_done +=
		    vd->vdev_trim_last_offset - physical_rs->rs_start;
	}
}

/*
 * Calculates the completion percentage of a manual TRIM.
 *
 * Recomputes vd->vdev_trim_bytes_est and vd->vdev_trim_bytes_done from
 * scratch by visiting every metaslab of the top-level vdev.
 */
static void
vdev_trim_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_trim_bytes_est = 0;
	vd->vdev_trim_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		/* Unallocated space in the metaslab, divided per disk. */
		uint64_t ms_free = (msp->ms_size -
		    metaslab_allocated_space(msp)) /
		    vdev_get_ndisks(vd->vdev_top);

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
		 */
-		range_seg64_t logical_rs, physical_rs, remain_rs;
+		zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;

		/* Metaslab space after this offset has not been trimmed. */
		vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
		if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
			vd->vdev_trim_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/* Metaslab space before this offset has been trimmed */
		uint64_t last_rs_end = physical_rs.rs_end;
		if (!vdev_xlate_is_empty(&remain_rs)) {
			vdev_xlate_walk(vd, &remain_rs,
			    vdev_trim_xlate_last_rs_end, &last_rs_end);
		}

		if (vd->vdev_trim_last_offset > last_rs_end) {
			vd->vdev_trim_bytes_done += ms_free;
			vd->vdev_trim_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of trimming this
		 * metaslab.  Load it and walk the free tree for more
		 * accurate progress estimation.
		 */
		VERIFY0(metaslab_load(msp));

		zfs_range_tree_t *rt = msp->ms_allocatable;
		zfs_btree_t *bt = &rt->rt_root;
		zfs_btree_index_t idx;
		for (zfs_range_seg_t *rs = zfs_btree_first(bt, &idx);
		    rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
			logical_rs.rs_start = zfs_rs_get_start(rs, rt);
			logical_rs.rs_end = zfs_rs_get_end(rs, rt);

			vdev_xlate_walk(vd, &logical_rs,
			    vdev_trim_xlate_progress, vd);
		}
		mutex_exit(&msp->ms_lock);
	}
}

/*
 * Load from disk the vdev's manual TRIM information.  This includes the
 * state, progress, and options provided when initiating the manual TRIM.
 */
static int
vdev_trim_load(vdev_t *vd)
{
	int err = 0;
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	/*
	 * The options are only read back for an active or suspended TRIM.
	 * A missing ZAP entry (ENOENT) is not an error; the field simply
	 * takes the value 0.  Any other error stops further lookups and is
	 * returned.
	 */
	if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE ||
	    vd->vdev_trim_state == VDEV_TRIM_SUSPENDED) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
		    sizeof (vd->vdev_trim_last_offset), 1,
		    &vd->vdev_trim_last_offset);
		if (err == ENOENT) {
			vd->vdev_trim_last_offset = 0;
			err = 0;
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE,
			    sizeof (vd->vdev_trim_rate), 1,
			    &vd->vdev_trim_rate);
			if (err == ENOENT) {
				vd->vdev_trim_rate = 0;
				err = 0;
			}
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
			    sizeof (vd->vdev_trim_partial), 1,
			    &vd->vdev_trim_partial);
			if (err == ENOENT) {
				vd->vdev_trim_partial = 0;
				err = 0;
			}
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
			    sizeof (vd->vdev_trim_secure), 1,
			    &vd->vdev_trim_secure);
			if (err == ENOENT) {
				vd->vdev_trim_secure = 0;
				err = 0;
			}
		}
	}

	/* Progress is recomputed regardless of state or lookup outcome. */
	vdev_trim_calculate_progress(vd);

	return (err);
}

/*
 * vdev_xlate_walk() callback: arg is the trim_args_t.  Adds the physical
 * segment to ta->trim_tree, clipping away what a manual TRIM has already
 * covered.
 */
static void
-vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+vdev_trim_xlate_range_add(void *arg, zfs_range_seg64_t *physical_rs)
{
	trim_args_t *ta = arg;
	vdev_t *vd = ta->trim_vdev;

	/*
	 * Only a manual trim will be traversing the vdev sequentially.
	 * For an auto trim all valid ranges should be added.
	 */
	if (ta->trim_type == TRIM_TYPE_MANUAL) {

		/* Only add segments that we have not visited yet */
		if (physical_rs->rs_end <= vd->vdev_trim_last_offset)
			return;

		/* Pick up where we left off mid-range. */
		if (vd->vdev_trim_last_offset > physical_rs->rs_start) {
			ASSERT3U(physical_rs->rs_end, >,
			    vd->vdev_trim_last_offset);
			physical_rs->rs_start = vd->vdev_trim_last_offset;
		}
	}

	ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);

	zfs_range_tree_add(ta->trim_tree, physical_rs->rs_start,
	    physical_rs->rs_end - physical_rs->rs_start);
}

/*
 * Convert the logical range into physical ranges and add them to the
 * range tree passed in the trim_args_t.
 */
static void
vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
{
	trim_args_t *ta = arg;
	vdev_t *vd = ta->trim_vdev;
-	range_seg64_t logical_rs;
+	zfs_range_seg64_t logical_rs;
	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	/*
	 * Every range to be trimmed must be part of ms_allocatable.
	 * When ZFS_DEBUG_TRIM is set load the metaslab to verify this
	 * is always the case.
	 */
	if (zfs_flags & ZFS_DEBUG_TRIM) {
		metaslab_t *msp = ta->trim_msp;
		VERIFY0(metaslab_load(msp));
		VERIFY3B(msp->ms_loaded, ==, B_TRUE);
		VERIFY(zfs_range_tree_contains(msp->ms_allocatable,
		    start, size));
	}

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg);
}

/*
 * Each manual TRIM thread is responsible for trimming the unallocated
 * space for each leaf vdev.  This is accomplished by sequentially iterating
 * over its top-level metaslabs and issuing TRIM I/O for the space described
 * by its ms_allocatable.  While a metaslab is undergoing trimming it is
 * not eligible for new allocations.
 */
static __attribute__((noreturn)) void
vdev_trim_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	trim_args_t ta;
	int error = 0;

	/*
	 * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by
	 * vdev_trim().  Wait for the updated values to be reflected
	 * in the zap in order to start with the requested settings.
	 */
	txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	/* Start from defaults, then overlay whatever is stored on disk. */
	vd->vdev_trim_last_offset = 0;
	vd->vdev_trim_rate = 0;
	vd->vdev_trim_partial = 0;
	vd->vdev_trim_secure = 0;

	VERIFY0(vdev_trim_load(vd));

	ta.trim_vdev = vd;
	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
	ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
	ta.trim_type = TRIM_TYPE_MANUAL;
	ta.trim_flags = 0;

	/*
	 * When a secure TRIM has been requested infer that the intent
	 * is that everything must be trimmed.  Override the default
	 * minimum TRIM size to prevent ranges from being skipped.
	 */
	if (vd->vdev_trim_secure) {
		ta.trim_flags |= ZIO_TRIM_SECURE;
		ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
	}

	uint64_t ms_count = 0;
	for (uint64_t i = 0; !vd->vdev_detached &&
	    i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];

		/*
		 * If we've expanded the top-level vdev or it's our
		 * first pass, calculate our progress.
		 */
		if (vd->vdev_top->vdev_ms_count != ms_count) {
			vdev_trim_calculate_progress(vd);
			ms_count = vd->vdev_top->vdev_ms_count;
		}

		/* SCL_CONFIG is dropped while the metaslab is processed. */
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));

		/*
		 * If a partial TRIM was requested skip metaslabs which have
		 * never been initialized and thus have never been written.
		 */
		if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
			mutex_exit(&msp->ms_lock);
			metaslab_enable(msp, B_FALSE, B_FALSE);
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			vdev_trim_calculate_progress(vd);
			continue;
		}

		/* Build the physical TRIM tree from the allocatable space. */
		ta.trim_msp = msp;
		zfs_range_tree_walk(msp->ms_allocatable, vdev_trim_range_add,
		    &ta);
		zfs_range_tree_vacate(msp->ms_trim, NULL, NULL);
		mutex_exit(&msp->ms_lock);

		error = vdev_trim_ranges(&ta);
		metaslab_enable(msp, B_TRUE, B_FALSE);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		zfs_range_tree_vacate(ta.trim_tree, NULL, NULL);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	zfs_range_tree_destroy(ta.trim_tree);

	mutex_enter(&vd->vdev_trim_lock);
	/*
	 * When no exit was requested the thread records the final state
	 * itself: COMPLETE if the vdev is writeable, CANCELED if faulted.
	 */
	if (!vd->vdev_trim_exit_wanted) {
		if (vdev_writeable(vd)) {
			vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
			    vd->vdev_trim_rate, vd->vdev_trim_partial,
			    vd->vdev_trim_secure);
		} else if (vd->vdev_faulted) {
			vdev_trim_change_state(vd, VDEV_TRIM_CANCELED,
			    vd->vdev_trim_rate, vd->vdev_trim_partial,
			    vd->vdev_trim_secure);
		}
	}
	ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0);

	/*
	 * Drop the vdev_trim_lock while we sync out the txg since it's
	 * possible that a device might be trying to come online and must
	 * check to see if it needs to restart a trim. That thread will be
	 * holding the spa_config_lock which would prevent the txg_wait_synced
	 * from completing.
	 */
	mutex_exit(&vd->vdev_trim_lock);

	txg_wait_synced(spa_get_dsl(spa), 0);
	mutex_enter(&vd->vdev_trim_lock);

	/* Announce thread exit to anyone blocked on vdev_trim_cv. */
	vd->vdev_trim_thread = NULL;
	cv_broadcast(&vd->vdev_trim_cv);
	mutex_exit(&vd->vdev_trim_lock);

	thread_exit();
}

/*
 * Initiates a manual TRIM for the vdev_t.  Callers must hold vdev_trim_lock,
 * the vdev_t must be a leaf and cannot already be manually trimming.
*/ void vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) { ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); ASSERT(!vd->vdev_rz_expanding); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); vd->vdev_trim_thread = thread_create(NULL, 0, vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri); } /* * Wait for the trimming thread to be terminated (canceled or stopped). */ static void vdev_trim_stop_wait_impl(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); while (vd->vdev_trim_thread != NULL) cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock); ASSERT3P(vd->vdev_trim_thread, ==, NULL); vd->vdev_trim_exit_wanted = B_FALSE; } /* * Wait for vdev trim threads which were listed to cleanly exit. */ void vdev_trim_stop_wait(spa_t *spa, list_t *vd_list) { (void) spa; vdev_t *vd; ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop_wait_impl(vd); mutex_exit(&vd->vdev_trim_lock); } } /* * Stop trimming a device, with the resultant trimming state being tgt_state. * For blocking behavior pass NULL for vd_list. Otherwise, when a list_t is * provided the stopping vdev is inserted in to the list. Callers are then * required to call vdev_trim_stop_wait() to block for all the trim threads * to exit. The caller must hold vdev_trim_lock and must not be writing to * the spa config, as the trimming thread may try to enter the config as a * reader before exiting. 
*/ void vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) { ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER)); ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); /* * Allow cancel requests to proceed even if the trim thread has * stopped. */ if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED) return; vdev_trim_change_state(vd, tgt_state, 0, 0, 0); vd->vdev_trim_exit_wanted = B_TRUE; if (vd_list == NULL) { vdev_trim_stop_wait_impl(vd); } else { ASSERT(MUTEX_HELD(&spa_namespace_lock) || vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } /* * Requests that all listed vdevs stop trimming. */ static void vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) { if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, tgt_state, vd_list); mutex_exit(&vd->vdev_trim_lock); return; } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_trim_stop_all_impl(vd->vdev_child[i], tgt_state, vd_list); } } /* * Convenience function to stop trimming of a vdev tree and set all trim * thread pointers to NULL. */ void vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) { spa_t *spa = vd->vdev_spa; list_t vd_list; vdev_t *vd_l2cache; ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); vdev_trim_stop_all_impl(vd, tgt_state, &vd_list); /* * Iterate over cache devices and request stop trimming the * whole device in case we export the pool or remove the cache * device prematurely. 
*/ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vd_l2cache = spa->spa_l2cache.sav_vdevs[i]; vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list); } vdev_trim_stop_wait(spa, &vd_list); if (vd->vdev_spa->spa_sync_on) { /* Make sure that our state has been synced to disk */ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); } list_destroy(&vd_list); } /* * Conditionally restarts a manual TRIM given its on-disk state. */ void vdev_trim_restart(vdev_t *vd) { ASSERT(MUTEX_HELD(&spa_namespace_lock) || vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { mutex_enter(&vd->vdev_trim_lock); uint64_t trim_state = VDEV_TRIM_NONE; int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE, sizeof (trim_state), 1, &trim_state); ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_state = trim_state; uint64_t timestamp = 0; err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (timestamp), 1, ×tamp); ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_action_time = timestamp; if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_trim_load(vd)); } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && !vd->vdev_top->vdev_rz_expanding && vd->vdev_trim_thread == NULL) { VERIFY0(vdev_trim_load(vd)); vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_trim_restart(vd->vdev_child[i]); } } /* * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that * every TRIM range is contained within ms_allocatable. 
*/ static void vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size) { trim_args_t *ta = arg; metaslab_t *msp = ta->trim_msp; VERIFY3B(msp->ms_loaded, ==, B_TRUE); VERIFY3U(msp->ms_disabled, >, 0); VERIFY(zfs_range_tree_contains(msp->ms_allocatable, start, size)); } /* * Each automatic TRIM thread is responsible for managing the trimming of a * top-level vdev in the pool. No automatic TRIM state is maintained on-disk. * * N.B. This behavior is different from a manual TRIM where a thread * is created for each leaf vdev, instead of each top-level vdev. */ static __attribute__((noreturn)) void vdev_autotrim_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; int shift = 0; mutex_enter(&vd->vdev_autotrim_lock); ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(vd->vdev_autotrim_thread, !=, NULL); mutex_exit(&vd->vdev_autotrim_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); while (!vdev_autotrim_should_stop(vd)) { int txgs_per_trim = MAX(zfs_trim_txg_batch, 1); uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; /* * All of the metaslabs are divided in to groups of size * num_metaslabs / zfs_trim_txg_batch. Each of these groups * is composed of metaslabs which are spread evenly over the * device. * * For example, when zfs_trim_txg_batch = 32 (default) then * group 0 will contain metaslabs 0, 32, 64, ...; * group 1 will contain metaslabs 1, 33, 65, ...; * group 2 will contain metaslabs 2, 34, 66, ...; and so on. * * On each pass through the while() loop one of these groups * is selected. This is accomplished by using a shift value * to select the starting metaslab, then striding over the * metaslabs using the zfs_trim_txg_batch size. This is * done to accomplish two things. * * 1) By dividing the metaslabs in to groups, and making sure * that each group takes a minimum of one txg to process. 
* Then zfs_trim_txg_batch controls the minimum number of * txgs which must occur before a metaslab is revisited. * * 2) Selecting non-consecutive metaslabs distributes the * TRIM commands for a group evenly over the entire device. * This can be advantageous for certain types of devices. */ for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count; i += txgs_per_trim) { metaslab_t *msp = vd->vdev_ms[i]; zfs_range_tree_t *trim_tree; boolean_t issued_trim = B_FALSE; boolean_t wait_aborted = B_FALSE; spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&msp->ms_lock); /* * Skip the metaslab when it has never been allocated * or when there are no recent frees to trim. */ if (msp->ms_sm == NULL || zfs_range_tree_is_empty(msp->ms_trim)) { mutex_exit(&msp->ms_lock); metaslab_enable(msp, B_FALSE, B_FALSE); continue; } /* * Skip the metaslab when it has already been disabled. * This may happen when a manual TRIM or initialize * operation is running concurrently. In the case * of a manual TRIM, the ms_trim tree will have been * vacated. Only ranges added after the manual TRIM * disabled the metaslab will be included in the tree. * These will be processed when the automatic TRIM * next revisits this metaslab. */ if (msp->ms_disabled > 1) { mutex_exit(&msp->ms_lock); metaslab_enable(msp, B_FALSE, B_FALSE); continue; } /* * Allocate an empty range tree which is swapped in * for the existing ms_trim tree while it is processed. */ trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); zfs_range_tree_swap(&msp->ms_trim, &trim_tree); ASSERT(zfs_range_tree_is_empty(msp->ms_trim)); /* * There are two cases when constructing the per-vdev * trim trees for a metaslab. If the top-level vdev * has no children then it is also a leaf and should * be trimmed. Otherwise our children are the leaves * and a trim tree should be constructed for each. 
*/ trim_args_t *tap; uint64_t children = vd->vdev_children; if (children == 0) { children = 1; tap = kmem_zalloc(sizeof (trim_args_t) * children, KM_SLEEP); tap[0].trim_vdev = vd; } else { tap = kmem_zalloc(sizeof (trim_args_t) * children, KM_SLEEP); for (uint64_t c = 0; c < children; c++) { tap[c].trim_vdev = vd->vdev_child[c]; } } for (uint64_t c = 0; c < children; c++) { trim_args_t *ta = &tap[c]; vdev_t *cvd = ta->trim_vdev; ta->trim_msp = msp; ta->trim_extent_bytes_max = extent_bytes_max; ta->trim_extent_bytes_min = extent_bytes_min; ta->trim_type = TRIM_TYPE_AUTO; ta->trim_flags = 0; if (cvd->vdev_detached || !vdev_writeable(cvd) || !cvd->vdev_has_trim || cvd->vdev_trim_thread != NULL) { continue; } /* * When a device has an attached hot spare, or * is being replaced it will not be trimmed. * This is done to avoid adding additional * stress to a potentially unhealthy device, * and to minimize the required rebuild time. */ if (!cvd->vdev_ops->vdev_op_leaf) continue; ta->trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); zfs_range_tree_walk(trim_tree, vdev_trim_range_add, ta); } mutex_exit(&msp->ms_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); /* * Issue the TRIM I/Os for all ranges covered by the * TRIM trees. These ranges are safe to TRIM because * no new allocations will be performed until the call * to metaslab_enabled() below. */ for (uint64_t c = 0; c < children; c++) { trim_args_t *ta = &tap[c]; /* * Always yield to a manual TRIM if one has * been started for the child vdev. */ if (ta->trim_tree == NULL || ta->trim_vdev->vdev_trim_thread != NULL) { continue; } /* * After this point metaslab_enable() must be * called with the sync flag set. This is done * here because vdev_trim_ranges() is allowed * to be interrupted (EINTR) before issuing all * of the required TRIM I/Os. 
*/ issued_trim = B_TRUE; int error = vdev_trim_ranges(ta); if (error) break; } /* * Verify every range which was trimmed is still * contained within the ms_allocatable tree. */ if (zfs_flags & ZFS_DEBUG_TRIM) { mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); VERIFY3P(tap[0].trim_msp, ==, msp); zfs_range_tree_walk(trim_tree, vdev_trim_range_verify, &tap[0]); mutex_exit(&msp->ms_lock); } zfs_range_tree_vacate(trim_tree, NULL, NULL); zfs_range_tree_destroy(trim_tree); /* * Wait for couples of kicks, to ensure the trim io is * synced. If the wait is aborted due to * vdev_autotrim_exit_wanted, we need to signal * metaslab_enable() to wait for sync. */ if (issued_trim) { wait_aborted = vdev_autotrim_wait_kick(vd, TXG_CONCURRENT_STATES + TXG_DEFER_SIZE); } metaslab_enable(msp, wait_aborted, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t c = 0; c < children; c++) { trim_args_t *ta = &tap[c]; if (ta->trim_tree == NULL) continue; zfs_range_tree_vacate(ta->trim_tree, NULL, NULL); zfs_range_tree_destroy(ta->trim_tree); } kmem_free(tap, sizeof (trim_args_t) * children); if (vdev_autotrim_should_stop(vd)) break; } spa_config_exit(spa, SCL_CONFIG, FTAG); vdev_autotrim_wait_kick(vd, 1); shift++; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); } for (uint64_t c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_trim_io_lock); while (cvd->vdev_trim_inflight[1] > 0) { cv_wait(&cvd->vdev_trim_io_cv, &cvd->vdev_trim_io_lock); } mutex_exit(&cvd->vdev_trim_io_lock); } spa_config_exit(spa, SCL_CONFIG, FTAG); /* * When exiting because the autotrim property was set to off, then * abandon any unprocessed ms_trim ranges to reclaim the memory. 
*/
	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) {
		for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
			metaslab_t *msp = vd->vdev_ms[i];

			/* ms_trim is emptied while holding the metaslab lock. */
			mutex_enter(&msp->ms_lock);
			zfs_range_tree_vacate(msp->ms_trim, NULL, NULL);
			mutex_exit(&msp->ms_lock);
		}
	}

	/*
	 * Clear the thread pointer under vdev_autotrim_lock and wake every
	 * thread blocked on vdev_autotrim_cv before exiting.
	 */
	mutex_enter(&vd->vdev_autotrim_lock);
	ASSERT(vd->vdev_autotrim_thread != NULL);
	vd->vdev_autotrim_thread = NULL;
	cv_broadcast(&vd->vdev_autotrim_cv);
	mutex_exit(&vd->vdev_autotrim_lock);

	thread_exit();
}

/*
 * Starts an autotrim thread, if needed, for each top-level vdev which can be
 * trimmed.  A top-level vdev which has been evacuated will never be trimmed.
 *
 * Walks the children of spa->spa_root_vdev.  For each child the check of
 * vdev_autotrim_thread and the thread creation both happen while that
 * child's vdev_autotrim_lock is held.
 */
void
vdev_autotrim(spa_t *spa)
{
	vdev_t *root_vd = spa->spa_root_vdev;

	for (uint64_t i = 0; i < root_vd->vdev_children; i++) {
		vdev_t *tvd = root_vd->vdev_child[i];

		mutex_enter(&tvd->vdev_autotrim_lock);
		/*
		 * Only create a thread when the vdev is writeable, is not
		 * flagged vdev_removing or vdev_rz_expanding, and does not
		 * already have an autotrim thread.
		 */
		if (vdev_writeable(tvd) && !tvd->vdev_removing &&
		    tvd->vdev_autotrim_thread == NULL &&
		    !tvd->vdev_rz_expanding) {
			/* Children of the root vdev are their own vdev_top. */
			ASSERT3P(tvd->vdev_top, ==, tvd);

			tvd->vdev_autotrim_thread = thread_create(NULL, 0,
			    vdev_autotrim_thread, tvd, 0, &p0, TS_RUN,
			    maxclsyspri);
			ASSERT(tvd->vdev_autotrim_thread != NULL);
		}
		mutex_exit(&tvd->vdev_autotrim_lock);
	}
}

/*
 * Wait for the vdev_autotrim_thread associated with the passed top-level
 * vdev to be terminated (canceled or stopped).
*/
void
vdev_autotrim_stop_wait(vdev_t *tvd)
{
	mutex_enter(&tvd->vdev_autotrim_lock);
	if (tvd->vdev_autotrim_thread != NULL) {
		/* Ask the thread to exit and wake it if it is idling. */
		tvd->vdev_autotrim_exit_wanted = B_TRUE;
		cv_broadcast(&tvd->vdev_autotrim_kick_cv);

		/*
		 * Re-check the predicate after every wakeup.  A condition
		 * variable wait may return without the autotrim thread
		 * having cleared vdev_autotrim_thread (spurious wakeup), so
		 * a single cv_wait() is not sufficient to guarantee the
		 * thread is gone.
		 */
		while (tvd->vdev_autotrim_thread != NULL) {
			cv_wait(&tvd->vdev_autotrim_cv,
			    &tvd->vdev_autotrim_lock);
		}

		ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL);
		tvd->vdev_autotrim_exit_wanted = B_FALSE;
	}
	mutex_exit(&tvd->vdev_autotrim_lock);
}

/*
 * Wake the autotrim thread of every top-level vdev that currently has one
 * by broadcasting on its vdev_autotrim_kick_cv.  The caller must hold
 * SCL_CONFIG as reader (asserted).
 */
void
vdev_autotrim_kick(spa_t *spa)
{
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	vdev_t *root_vd = spa->spa_root_vdev;

	for (uint64_t i = 0; i < root_vd->vdev_children; i++) {
		vdev_t *tvd = root_vd->vdev_child[i];

		/* The thread pointer is only inspected under the lock. */
		mutex_enter(&tvd->vdev_autotrim_lock);
		if (tvd->vdev_autotrim_thread != NULL)
			cv_broadcast(&tvd->vdev_autotrim_kick_cv);
		mutex_exit(&tvd->vdev_autotrim_lock);
	}
}

/*
 * Wait for all of the vdev_autotrim_thread associated with the pool to
 * be terminated (canceled or stopped).
 */
void
vdev_autotrim_stop_all(spa_t *spa)
{
	vdev_t *root_vd = spa->spa_root_vdev;

	/* Stops the top-level vdevs one at a time, in child order. */
	for (uint64_t i = 0; i < root_vd->vdev_children; i++)
		vdev_autotrim_stop_wait(root_vd->vdev_child[i]);
}

/*
 * Conditionally restart all of the vdev_autotrim_thread's for the pool.
*/
void
vdev_autotrim_restart(spa_t *spa)
{
	/* Callers hold spa_namespace_lock or are the pool's load thread. */
	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
	    spa->spa_load_thread == curthread);

	/* Nothing is started when spa_autotrim is zero. */
	if (spa->spa_autotrim)
		vdev_autotrim(spa);
}

/*
 * Thread body which TRIMs an entire cache device.  'arg' is the vdev_t of
 * the device.  A single range covering [0, vdev_get_min_asize(vd)) is
 * handed to vdev_trim_ranges() as TRIM_TYPE_MANUAL; the thread then waits
 * for the in-flight manual TRIMs to drain, records VDEV_TRIM_COMPLETE unless
 * an exit was requested or the vdev is no longer writeable, zeroes the
 * L2ARC device header and calls l2arc_dev_hdr_update(), clears
 * vd->vdev_trim_thread, and exits.  Never returns.
 */
static __attribute__((noreturn)) void
vdev_trim_l2arc_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	l2arc_dev_t *dev = l2arc_vdev_get(vd);
	trim_args_t ta = {0};
-	range_seg64_t physical_rs;
+	zfs_range_seg64_t physical_rs;

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	/* Reset the per-vdev TRIM bookkeeping before starting. */
	vd->vdev_trim_last_offset = 0;
	vd->vdev_trim_rate = 0;
	vd->vdev_trim_partial = 0;
	vd->vdev_trim_secure = 0;

	ta.trim_vdev = vd;
	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
	ta.trim_type = TRIM_TYPE_MANUAL;
	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
	ta.trim_flags = 0;

	/*
	 * The range to TRIM starts at 0 and ends at vdev_get_min_asize(vd);
	 * the same values seed vdev_trim_bytes_done and vdev_trim_bytes_est.
	 */
	physical_rs.rs_start = vd->vdev_trim_bytes_done = 0;
	physical_rs.rs_end = vd->vdev_trim_bytes_est =
	    vdev_get_min_asize(vd);

	zfs_range_tree_add(ta.trim_tree, physical_rs.rs_start,
	    physical_rs.rs_end - physical_rs.rs_start);

	mutex_enter(&vd->vdev_trim_lock);
	vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
	mutex_exit(&vd->vdev_trim_lock);

	/* The return value is deliberately ignored here. */
	(void) vdev_trim_ranges(&ta);

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/* Block until no TRIM_TYPE_MANUAL I/Os remain in flight. */
	mutex_enter(&vd->vdev_trim_io_lock);
	while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	mutex_exit(&vd->vdev_trim_io_lock);

	zfs_range_tree_vacate(ta.trim_tree, NULL, NULL);
	zfs_range_tree_destroy(ta.trim_tree);

	mutex_enter(&vd->vdev_trim_lock);
	/* Only report completion if nobody asked us to stop. */
	if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
		vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
		    vd->vdev_trim_rate, vd->vdev_trim_partial,
		    vd->vdev_trim_secure);
	}
	ASSERT(vd->vdev_trim_thread != NULL ||
	    vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0);

	/*
	 * Drop the vdev_trim_lock while we sync out the txg since it's
	 * possible that a device might be trying to come online and
	 * must check to see if it needs to restart a trim. That thread
	 * will be holding the spa_config_lock which would prevent the
	 * txg_wait_synced from completing. Same strategy as in
	 * vdev_trim_thread().
	 */
	mutex_exit(&vd->vdev_trim_lock);
	txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	mutex_enter(&vd->vdev_trim_lock);

	/*
	 * Update the header of the cache device here, before
	 * broadcasting vdev_trim_cv which may lead to the removal
	 * of the device. The same applies for setting l2ad_trim_all to
	 * false.
	 */
	spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd,
	    RW_READER);
	memset(dev->l2ad_dev_hdr, 0, dev->l2ad_dev_hdr_asize);
	l2arc_dev_hdr_update(dev);
	spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd);

	vd->vdev_trim_thread = NULL;
	/* l2ad_trim_all stays set unless the TRIM ran to completion. */
	if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE)
		dev->l2ad_trim_all = B_FALSE;

	cv_broadcast(&vd->vdev_trim_cv);
	mutex_exit(&vd->vdev_trim_lock);

	thread_exit();
}

/*
 * Punches out TRIM threads for the L2ARC devices in a spa and assigns them
 * to vd->vdev_trim_thread variable. This facilitates the management of
 * trimming the whole cache device using TRIM_TYPE_MANUAL upon addition
 * to a pool or pool creation or when the header of the device is invalid.
 *
 * The caller must hold spa_namespace_lock (asserted).
 */
void
vdev_trim_l2arc(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Locate the spa's l2arc devices and kick off TRIM threads.
	 */
	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_t *vd = spa->spa_l2cache.sav_vdevs[i];
		l2arc_dev_t *dev = l2arc_vdev_get(vd);

		if (dev == NULL || !dev->l2ad_trim_all) {
			/*
			 * Don't attempt TRIM if the vdev is UNAVAIL or if the
			 * cache device was not marked for whole device TRIM
			 * (ie l2arc_trim_ahead = 0, or the L2ARC device header
			 * is valid with trim_state = VDEV_TRIM_COMPLETE and
			 * l2ad_log_entries > 0).
			 */
			continue;
		}

		/*
		 * The state change and the assignment of vdev_trim_thread
		 * are both done with vdev_trim_lock held.
		 */
		mutex_enter(&vd->vdev_trim_lock);
		ASSERT(vd->vdev_ops->vdev_op_leaf);
		ASSERT(vdev_is_concrete(vd));
		ASSERT3P(vd->vdev_trim_thread, ==, NULL);
		ASSERT(!vd->vdev_detached);
		ASSERT(!vd->vdev_trim_exit_wanted);
		ASSERT(!vd->vdev_top->vdev_removing);
		vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
		vd->vdev_trim_thread = thread_create(NULL, 0,
		    vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
		mutex_exit(&vd->vdev_trim_lock);
	}
}

/*
 * A wrapper which calls vdev_trim_ranges().  It is intended to be called
 * on leaf vdevs.
 *
 * vd:    the vdev to TRIM; asserted to be a concrete leaf that is not
 *        detached and whose top-level vdev is neither vdev_removing nor
 *        vdev_rz_expanding.
 * start: first offset of the range.
 * size:  length of the range; when zero the range tree is left empty.
 *
 * Waits for all TRIM_TYPE_SIMPLE I/Os on the vdev to drain before
 * returning.  Returns the value returned by vdev_trim_ranges().
 */
int
vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
{
	trim_args_t ta = {0};
-	range_seg64_t physical_rs;
+	zfs_range_seg64_t physical_rs;
	int error;
	physical_rs.rs_start = start;
	physical_rs.rs_end = start + size;

	ASSERT(vdev_is_concrete(vd));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_top->vdev_removing);
	ASSERT(!vd->vdev_top->vdev_rz_expanding);

	ta.trim_vdev = vd;
	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
	ta.trim_type = TRIM_TYPE_SIMPLE;
	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
	ta.trim_flags = 0;

	/* rs_end < rs_start would mean start + size wrapped around. */
	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

	/* Only a non-empty range is added to the tree. */
	if (physical_rs.rs_end > physical_rs.rs_start) {
		zfs_range_tree_add(ta.trim_tree, physical_rs.rs_start,
		    physical_rs.rs_end - physical_rs.rs_start);
	} else {
		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
	}

	error = vdev_trim_ranges(&ta);

	/* Block until no TRIM_TYPE_SIMPLE I/Os remain in flight. */
	mutex_enter(&vd->vdev_trim_io_lock);
	while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	mutex_exit(&vd->vdev_trim_io_lock);

	zfs_range_tree_vacate(ta.trim_tree, NULL, NULL);
	zfs_range_tree_destroy(ta.trim_tree);

	return (error);
}

EXPORT_SYMBOL(vdev_trim);
EXPORT_SYMBOL(vdev_trim_stop);
EXPORT_SYMBOL(vdev_trim_stop_all);
EXPORT_SYMBOL(vdev_trim_stop_wait);
EXPORT_SYMBOL(vdev_trim_restart);
EXPORT_SYMBOL(vdev_autotrim);
EXPORT_SYMBOL(vdev_autotrim_stop_all);
EXPORT_SYMBOL(vdev_autotrim_stop_wait);
EXPORT_SYMBOL(vdev_autotrim_restart);
EXPORT_SYMBOL(vdev_trim_l2arc);
EXPORT_SYMBOL(vdev_trim_simple);

/*
 * TRIM tunables.  Each entry pairs a variable suffix with a one-line
 * description string.
 * NOTE(review): presumably these bind to the zfs_trim_<suffix> variables
 * and ZMOD_RW makes them writable at runtime -- confirm against the
 * ZFS_MODULE_PARAM definition.
 */
ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW,
	"Max size of TRIM commands, larger will be split");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW,
	"Min size of TRIM commands, smaller will be skipped");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW,
	"Skip metaslabs which have never been initialized");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW,
	"Min number of txgs to aggregate frees before issuing TRIM");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW,
	"Max queued TRIMs outstanding per leaf vdev");