diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 41c2b6765585..8e3b6972ae04 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1,9776 +1,9811 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. * Copyright (c) 2015, 2017, Intel Corporation. * Copyright (c) 2020 Datto Inc. * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome * Copyright (c) 2023, 2024, Klara Inc. * Copyright (c) 2023, Rob Norris */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdb.h" extern int reference_tracking_enable; extern int zfs_recover; extern uint_t zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern boolean_t spa_mode_readable_spacemaps; extern uint_t zfs_reconstruct_indirect_combinations_max; extern uint_t zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); static uint64_t *zopt_metaslab = NULL; static unsigned zopt_metaslab_args = 0; static zopt_object_range_t *zopt_object_ranges = NULL; static unsigned zopt_object_args = 0; static int flagbits[256]; static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ static int leaked_objects = 0; static range_tree_t *mos_refd_objs; static spa_t *spa; static objset_t *os; static boolean_t kernel_init_done; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, boolean_t); static void mos_obj_refd(uint64_t); static void mos_obj_refd_multiple(uint64_t); static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx); static void zdb_print_blkptr(const blkptr_t *bp, int flags); static void zdb_exit(int reason); typedef struct sublivelist_verify_block_refcnt { /* block pointer entry in livelist being verified */ blkptr_t svbr_blk; /* * Refcount gets incremented to 1 when we encounter the first * FREE entry for the svfbr block pointer and a node for it 
* is created in our ZDB verification/tracking metadata. * * As we encounter more FREE entries we increment this counter * and similarly decrement it whenever we find the respective * ALLOC entries for this block. * * When the refcount gets to 0 it means that all the FREE and * ALLOC entries of this block have paired up and we no longer * need to track it in our verification logic (e.g. the node * containing this struct in our verification data structure * should be freed). * * [refer to sublivelist_verify_blkptr() for the actual code] */ uint32_t svbr_refcnt; } sublivelist_verify_block_refcnt_t; static int sublivelist_block_refcnt_compare(const void *larg, const void *rarg) { const sublivelist_verify_block_refcnt_t *l = larg; const sublivelist_verify_block_refcnt_t *r = rarg; return (livelist_compare(&l->svbr_blk, &r->svbr_blk)); } static int sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx) { ASSERT3P(tx, ==, NULL); struct sublivelist_verify *sv = arg; sublivelist_verify_block_refcnt_t current = { .svbr_blk = *bp, /* * Start with 1 in case this is the first free entry. * This field is not used for our B-Tree comparisons * anyway. */ .svbr_refcnt = 1, }; zfs_btree_index_t where; sublivelist_verify_block_refcnt_t *pair = zfs_btree_find(&sv->sv_pair, ¤t, &where); if (free) { if (pair == NULL) { /* first free entry for this block pointer */ zfs_btree_add(&sv->sv_pair, ¤t); } else { pair->svbr_refcnt++; } } else { if (pair == NULL) { /* block that is currently marked as allocated */ for (int i = 0; i < SPA_DVAS_PER_BP; i++) { if (DVA_IS_EMPTY(&bp->blk_dva[i])) break; sublivelist_verify_block_t svb = { .svb_dva = bp->blk_dva[i], .svb_allocated_txg = BP_GET_LOGICAL_BIRTH(bp) }; if (zfs_btree_find(&sv->sv_leftover, &svb, &where) == NULL) { zfs_btree_add_idx(&sv->sv_leftover, &svb, &where); } } } else { /* alloc matches a free entry */ pair->svbr_refcnt--; if (pair->svbr_refcnt == 0) { /* all allocs and frees have been matched */ zfs_btree_remove_idx(&sv->sv_pair, &where); } } } return (0); } static int sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) { int err; struct sublivelist_verify *sv = args; zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL, sizeof (sublivelist_verify_block_refcnt_t)); err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, sv, NULL); sublivelist_verify_block_refcnt_t *e; zfs_btree_index_t *cookie = NULL; while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &e->svbr_blk, B_TRUE); (void) printf("\tERROR: %d unmatched FREE(s): %s\n", e->svbr_refcnt, blkbuf); } zfs_btree_destroy(&sv->sv_pair); return (err); } static int livelist_block_compare(const void *larg, const void *rarg) { const sublivelist_verify_block_t *l = larg; const sublivelist_verify_block_t *r = rarg; if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva)) return (-1); else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva)) return (+1); if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva)) return (-1); else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva)) return (+1); if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva)) return (-1); else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva)) return (+1); return (0); } /* * Check for errors in a livelist while tracking all unfreed ALLOCs in the * sublivelist_verify_t: sv->sv_leftover */ static void livelist_verify(dsl_deadlist_t *dl, 
void *arg) { sublivelist_verify_t *sv = arg; dsl_deadlist_iterate(dl, sublivelist_verify_func, sv); } /* * Check for errors in the livelist entry and discard the intermediary * data structures */ static int sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) { (void) args; sublivelist_verify_t sv; zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); int err = sublivelist_verify_func(&sv, dle); zfs_btree_clear(&sv.sv_leftover); zfs_btree_destroy(&sv.sv_leftover); return (err); } typedef struct metaslab_verify { /* * Tree containing all the leftover ALLOCs from the livelists * that are part of this metaslab. */ zfs_btree_t mv_livelist_allocs; /* * Metaslab information. */ uint64_t mv_vdid; uint64_t mv_msid; uint64_t mv_start; uint64_t mv_end; /* * What's currently allocated for this metaslab. */ range_tree_t *mv_allocated; } metaslab_verify_t; typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg); typedef struct unflushed_iter_cb_arg { spa_t *uic_spa; uint64_t uic_txg; void *uic_arg; zdb_log_sm_cb_t uic_cb; } unflushed_iter_cb_arg_t; static int iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) { unflushed_iter_cb_arg_t *uic = arg; return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); } static void iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { space_map_t *sm = NULL; VERIFY0(space_map_open(&sm, spa_meta_objset(spa), sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); unflushed_iter_cb_arg_t uic = { .uic_spa = spa, .uic_txg = sls->sls_txg, .uic_arg = arg, .uic_cb = cb }; VERIFY0(space_map_iterate(sm, space_map_length(sm), iterate_through_spacemap_logs_cb, &uic)); space_map_close(sm); } spa_config_exit(spa, SCL_CONFIG, FTAG); } static void verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, uint64_t offset, uint64_t size) { sublivelist_verify_block_t svb = {{{0}}}; DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid); DVA_SET_OFFSET(&svb.svb_dva, offset); DVA_SET_ASIZE(&svb.svb_dva, size); zfs_btree_index_t where; uint64_t end_offset = offset + size; /* * Look for an exact match for spacemap entry in the livelist entries. 
* Then, look for other livelist entries that fall within the range * of the spacemap entry as it may have been condensed */ sublivelist_verify_block_t *found = zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where); if (found == NULL) { found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where); } for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid && DVA_GET_OFFSET(&found->svb_dva) < end_offset; found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { if (found->svb_allocated_txg <= txg) { (void) printf("ERROR: Livelist ALLOC [%llx:%llx] " "from TXG %llx FREED at TXG %llx\n", (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva), (u_longlong_t)found->svb_allocated_txg, (u_longlong_t)txg); } } } static int metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) { metaslab_verify_t *mv = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; uint64_t txg = sme->sme_txg; if (sme->sme_type == SM_ALLOC) { if (range_tree_contains(mv->mv_allocated, offset, size)) { (void) printf("ERROR: DOUBLE ALLOC: " "%llu [%llx:%llx] " "%llu:%llu LOG_SM\n", (u_longlong_t)txg, (u_longlong_t)offset, (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); } else { range_tree_add(mv->mv_allocated, offset, size); } } else { if (!range_tree_contains(mv->mv_allocated, offset, size)) { (void) printf("ERROR: DOUBLE FREE: " "%llu [%llx:%llx] " "%llu:%llu LOG_SM\n", (u_longlong_t)txg, (u_longlong_t)offset, (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); } else { range_tree_remove(mv->mv_allocated, offset, size); } } if (sme->sme_type != SM_ALLOC) { /* * If something is freed in the spacemap, verify that * it is not listed as allocated in the livelist. */ verify_livelist_allocs(mv, txg, offset, size); } return (0); } static int spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { metaslab_verify_t *mv = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = vdev_lookup_top(spa, vdev_id); /* skip indirect vdevs */ if (!vdev_is_concrete(vd)) return (0); if (vdev_id != mv->mv_vdid) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; if (ms->ms_id != mv->mv_msid) return (0); if (txg < metaslab_unflushed_txg(ms)) return (0); ASSERT3U(txg, ==, sme->sme_txg); return (metaslab_spacemap_validation_cb(sme, mv)); } static void spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv) { iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv); } static void spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv) { if (sm == NULL) return; VERIFY0(space_map_iterate(sm, space_map_length(sm), metaslab_spacemap_validation_cb, mv)); } static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg); /* * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if * they are part of that metaslab (mv_msid). 
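 * Blocks whose DVA range only partially overlaps [mv_start, mv_end) are
 * reported as errors, since an allocation should never cross a metaslab
 * boundary.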
*/ static void mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) { zfs_btree_index_t where; sublivelist_verify_block_t *svb; ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0); for (svb = zfs_btree_first(&sv->sv_leftover, &where); svb != NULL; svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) { if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid) continue; if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start && (DVA_GET_OFFSET(&svb->svb_dva) + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) { (void) printf("ERROR: Found block that crosses " "metaslab boundary: <%llu:%llx:%llx>\n", (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); continue; } if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start) continue; if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end) continue; if ((DVA_GET_OFFSET(&svb->svb_dva) + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) { (void) printf("ERROR: Found block that crosses " "metaslab boundary: <%llu:%llx:%llx>\n", (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); continue; } zfs_btree_add(&mv->mv_livelist_allocs, svb); } for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where); svb != NULL; svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { zfs_btree_remove(&sv->sv_leftover, svb); } } /* * [Livelist Check] * Iterate through all the sublivelists and: * - report leftover frees (**) * - record leftover ALLOCs together with their TXG [see Cross Check] * * (**) Note: Double ALLOCs are valid in datasets that have dedup * enabled. Similarly double FREEs are allowed as well but * only if they pair up with a corresponding ALLOC entry once * we our done with our sublivelist iteration. * * [Spacemap Check] * for each metaslab: * - iterate over spacemap and then the metaslab's entries in the * spacemap log, then report any double FREEs and ALLOCs (do not * blow up). * * [Cross Check] * After finishing the Livelist Check phase and while being in the * Spacemap Check phase, we find all the recorded leftover ALLOCs * of the livelist check that are part of the metaslab that we are * currently looking at in the Spacemap Check. We report any entries * that are marked as ALLOCs in the livelists but have been actually * freed (and potentially allocated again) after their TXG stamp in * the spacemaps. Also report any ALLOCs from the livelists that * belong to indirect vdevs (e.g. their vdev completed removal). * * Note that this will miss Log Spacemap entries that cancelled each other * out before being flushed to the metaslab, so we are not guaranteed * to match all erroneous ALLOCs. 
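 * The three phases above are driven by livelist_metaslab_validate() below:
 * the Livelist Check runs once over all deleted livelists, while the
 * Spacemap and Cross Checks run per metaslab of each concrete vdev.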
*/ static void livelist_metaslab_validate(spa_t *spa) { (void) printf("Verifying deleted livelist entries\n"); sublivelist_verify_t sv; zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); iterate_deleted_livelists(spa, livelist_verify, &sv); (void) printf("Verifying metaslab entries\n"); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (!vdev_is_concrete(vd)) continue; for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) { metaslab_t *m = vd->vdev_ms[mid]; (void) fprintf(stderr, "\rverifying concrete vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)mid, (longlong_t)vd->vdev_ms_count); uint64_t shift, start; range_seg_type_t type = metaslab_calculate_range_tree_type(vd, m, &start, &shift); metaslab_verify_t mv; mv.mv_allocated = range_tree_create(NULL, type, NULL, start, shift); mv.mv_vdid = vd->vdev_id; mv.mv_msid = m->ms_id; mv.mv_start = m->ms_start; mv.mv_end = m->ms_start + m->ms_size; zfs_btree_create(&mv.mv_livelist_allocs, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); mv_populate_livelist_allocs(&mv, &sv); spacemap_check_ms_sm(m->ms_sm, &mv); spacemap_check_sm_log(spa, &mv); range_tree_vacate(mv.mv_allocated, NULL, NULL); range_tree_destroy(mv.mv_allocated); zfs_btree_clear(&mv.mv_livelist_allocs); zfs_btree_destroy(&mv.mv_livelist_allocs); } } (void) fprintf(stderr, "\n"); /* * If there are any segments in the leftover tree after we walked * through all the metaslabs in the concrete vdevs then this means * that we have segments in the livelists that belong to indirect * vdevs and are marked as allocated. */ if (zfs_btree_numnodes(&sv.sv_leftover) == 0) { zfs_btree_destroy(&sv.sv_leftover); return; } (void) printf("ERROR: Found livelist blocks marked as allocated " "for indirect vdevs:\n"); zfs_btree_index_t *where = NULL; sublivelist_verify_block_t *svb; while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) != NULL) { int vdev_id = DVA_GET_VDEV(&svb->svb_dva); ASSERT3U(vdev_id, <, rvd->vdev_children); vdev_t *vd = rvd->vdev_child[vdev_id]; ASSERT(!vdev_is_concrete(vd)); (void) printf("<%d:%llx:%llx> TXG %llx\n", vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva), (u_longlong_t)svb->svb_allocated_txg); } (void) printf("\n"); zfs_btree_destroy(&sv.sv_leftover); } /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void usage(void) { (void) fprintf(stderr, "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p ...]] " "[-I ]\n" "\t\t[-o =]... [-t ] [-U ] [-x ]\n" "\t\t[-K ]\n" "\t\t[[/] [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] [-K ]\n" "\t\t[[/] [ ...]\n" "\t%s -B [-e [-V] [-p ...]] [-I ]\n" "\t\t[-o =]... 
[-t ] [-U ] [-x ]\n" "\t\t[-K ] / []\n" "\t%s [-v] \n" "\t%s -C [-A] [-U ] []\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" "\t%s -O [-K ] \n" "\t%s -r [-K ] \n" "\t%s -R [-A] [-e [-V] [-p ...]] [-U ]\n" "\t\t ::[:]\n" "\t%s -E [-A] word0:word1:...:word15\n" "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); (void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n"); (void) fprintf(stderr, " If object numbers or object number " "ranges are specified, only those\n" " objects or ranges are dumped.\n\n"); (void) fprintf(stderr, " Object ranges take the form :[:]\n" " start Starting object number\n" " end Ending object number, or -1 for no upper bound\n" " flags Optional flags to select object types:\n" " A All objects (this is the default)\n" " d ZFS directories\n" " f ZFS files \n" " m SPA space maps\n" " z ZAPs\n" " - Negate effect of next flag\n\n"); (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b --block-stats " "block statistics\n"); (void) fprintf(stderr, " -B --backup " "backup stream\n"); (void) fprintf(stderr, " -c --checksum " "checksum all metadata (twice for all data) blocks\n"); (void) fprintf(stderr, " -C --config " "config (or cachefile if alone)\n"); (void) fprintf(stderr, " -d --datasets " "dataset(s)\n"); (void) fprintf(stderr, " -D --dedup-stats " "dedup statistics\n"); (void) fprintf(stderr, " -E --embedded-block-pointer=INTEGER\n" " decode and display block " "from an embedded block pointer\n"); (void) fprintf(stderr, " -h --history " "pool history\n"); (void) fprintf(stderr, " -i --intent-logs " "intent logs\n"); (void) fprintf(stderr, " -l --label " "read label contents\n"); (void) fprintf(stderr, " -k --checkpointed-state " "examine the checkpointed state of the pool\n"); (void) fprintf(stderr, " -L --disable-leak-tracking " "disable leak tracking (do not load spacemaps)\n"); (void) fprintf(stderr, " -m --metaslabs " "metaslabs\n"); (void) fprintf(stderr, " -M --metaslab-groups " "metaslab groups\n"); (void) fprintf(stderr, " -O --object-lookups " "perform object lookups by path\n"); (void) fprintf(stderr, " -r --copy-object " "copy an object by path to file\n"); (void) fprintf(stderr, " -R --read-block " "read and display block from a device\n"); (void) fprintf(stderr, " -s --io-stats " "report stats on zdb's I/O\n"); (void) fprintf(stderr, " -S --simulate-dedup " "simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v --verbose " "verbose (applies to all others)\n"); (void) fprintf(stderr, " -y --livelist " "perform livelist and metaslab validation on any livelists being " "deleted\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A --ignore-assertions " "ignore assertions (-A), enable panic recovery (-AA) or both " "(-AAA)\n"); (void) fprintf(stderr, " -e --exported " "pool is exported/destroyed/has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -F --automatic-rewind " "attempt automatic rewind within safe range of transaction " "groups\n"); (void) fprintf(stderr, " -G --dump-debug-msg " "dump zfs_dbgmsg buffer before exiting\n"); (void) fprintf(stderr, " -I --inflight=INTEGER " "specify the maximum number of checksumming I/Os " "[default is 
200]\n"); (void) fprintf(stderr, " -K --key=KEY " "decryption key for encrypted dataset\n"); (void) fprintf(stderr, " -o --option=\"OPTION=INTEGER\" " "set global variable to an unsigned 32-bit integer\n"); (void) fprintf(stderr, " -p --path==PATH " "use one or more with -e to specify path to vdev dir\n"); (void) fprintf(stderr, " -P --parseable " "print numbers in parseable form\n"); (void) fprintf(stderr, " -q --skip-label " "don't print label contents\n"); (void) fprintf(stderr, " -t --txg=INTEGER " "highest txg to use when searching for uberblocks\n"); (void) fprintf(stderr, " -T --brt-stats " "BRT statistics\n"); (void) fprintf(stderr, " -u --uberblock " "uberblock\n"); (void) fprintf(stderr, " -U --cachefile=PATH " "use alternate cachefile\n"); (void) fprintf(stderr, " -V --verbatim " "do verbatim import\n"); (void) fprintf(stderr, " -x --dump-blocks=PATH " "dump all read blocks into specified directory\n"); (void) fprintf(stderr, " -X --extreme-rewind " "attempt extreme rewind (does not work with dataset)\n"); (void) fprintf(stderr, " -Y --all-reconstruction " "attempt all reconstruction combinations for split blocks\n"); (void) fprintf(stderr, " -Z --zstd-headers " "show ZSTD headers \n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); zdb_exit(1); } static void dump_debug_buffer(void) { ssize_t ret __attribute__((unused)); if (!dump_opt['G']) return; /* * We use write() instead of printf() so that this function * is safe to call from a signal handler. */ ret = write(STDERR_FILENO, "\n", 1); zfs_dbgmsg_print(STDERR_FILENO, "zdb"); } static void sig_handler(int signo) { struct sigaction action; libspl_backtrace(STDERR_FILENO); dump_debug_buffer(); /* * Restore default action and re-raise signal so SIGSEGV and * SIGABRT can trigger a core dump. */ action.sa_handler = SIG_DFL; sigemptyset(&action.sa_mask); action.sa_flags = 0; (void) sigaction(signo, &action, NULL); raise(signo); } /* * Called for usage errors that are discovered after a call to spa_open(), * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. */ static void fatal(const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); (void) fprintf(stderr, "%s: ", cmdname); (void) vfprintf(stderr, fmt, ap); va_end(ap); (void) fprintf(stderr, "\n"); dump_debug_buffer(); zdb_exit(1); } static void dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) { (void) size; nvlist_t *nv; size_t nvsize = *(uint64_t *)data; char *packed = umem_alloc(nvsize, UMEM_NOFAIL); VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); umem_free(packed, nvsize); dump_nvlist(nv, 8); nvlist_free(nv); } static void dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) size; spa_history_phys_t *shp = data; if (shp == NULL) return; (void) printf("\t\tpool_create_len = %llu\n", (u_longlong_t)shp->sh_pool_create_len); (void) printf("\t\tphys_max_off = %llu\n", (u_longlong_t)shp->sh_phys_max_off); (void) printf("\t\tbof = %llu\n", (u_longlong_t)shp->sh_bof); (void) printf("\t\teof = %llu\n", (u_longlong_t)shp->sh_eof); (void) printf("\t\trecords_lost = %llu\n", (u_longlong_t)shp->sh_records_lost); } static void zdb_nicenum(uint64_t num, char *buf, size_t buflen) { if (dump_opt['P']) (void) snprintf(buf, buflen, "%llu", (longlong_t)num); else nicenum(num, buf, buflen); } static void zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen) { if (dump_opt['P']) (void) snprintf(buf, buflen, "%llu", (longlong_t)bytes); else zfs_nicebytes(bytes, buf, buflen); } static const char histo_stars[] = "****************************************"; static const uint64_t histo_width = sizeof (histo_stars) - 1; static void dump_histogram(const uint64_t *histo, int size, int offset) { int i; int minidx = size - 1; int maxidx = 0; uint64_t max = 0; for (i = 0; i < size; i++) { if (histo[i] == 0) continue; if (histo[i] > max) max = histo[i]; if (i > maxidx) maxidx = i; if (i < minidx) minidx = i; } if (max < histo_width) max = histo_width; for (i = minidx; i <= maxidx; i++) { (void) printf("\t\t\t%3u: %6llu %s\n", i + offset, (u_longlong_t)histo[i], &histo_stars[(max - histo[i]) * histo_width / max]); } } static void dump_zap_stats(objset_t *os, uint64_t object) { int error; zap_stats_t zs; error = zap_get_stats(os, object, &zs); if (error) return; if (zs.zs_ptrtbl_len == 0) { ASSERT(zs.zs_num_blocks == 1); (void) printf("\tmicrozap: %llu bytes, %llu entries\n", (u_longlong_t)zs.zs_blocksize, (u_longlong_t)zs.zs_num_entries); return; } (void) printf("\tFat ZAP stats:\n"); (void) printf("\t\tPointer table:\n"); (void) printf("\t\t\t%llu elements\n", (u_longlong_t)zs.zs_ptrtbl_len); (void) printf("\t\t\tzt_blk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_blk); (void) printf("\t\t\tzt_numblks: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_numblks); (void) printf("\t\t\tzt_shift: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_shift); (void) printf("\t\t\tzt_blks_copied: %llu\n", (u_longlong_t)zs.zs_ptrtbl_blks_copied); (void) printf("\t\t\tzt_nextblk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_nextblk); (void) printf("\t\tZAP entries: %llu\n", (u_longlong_t)zs.zs_num_entries); (void) printf("\t\tLeaf blocks: %llu\n", (u_longlong_t)zs.zs_num_leafs); (void) printf("\t\tTotal blocks: %llu\n", (u_longlong_t)zs.zs_num_blocks); (void) printf("\t\tzap_block_type: 0x%llx\n", (u_longlong_t)zs.zs_block_type); (void) printf("\t\tzap_magic: 0x%llx\n", (u_longlong_t)zs.zs_magic); (void) printf("\t\tzap_salt: 0x%llx\n", (u_longlong_t)zs.zs_salt); (void) printf("\t\tLeafs with 2^n pointers:\n"); dump_histogram(zs.zs_leafs_with_2n_pointers, 
ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks with n*5 entries:\n"); dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks n/10 full:\n"); dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tEntries with n chunks:\n"); dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBuckets with n entries:\n"); dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); } static void dump_none(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static void dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; (void) printf("\tUNKNOWN OBJECT TYPE\n"); } static void dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { uint64_t *arr; uint64_t oursize; if (dump_opt['d'] < 6) return; if (data == NULL) { dmu_object_info_t doi; VERIFY0(dmu_object_info(os, object, &doi)); size = doi.doi_max_offset; /* * We cap the size at 1 mebibyte here to prevent * allocation failures and nigh-infinite printing if the * object is extremely large. */ oursize = MIN(size, 1 << 20); arr = kmem_alloc(oursize, KM_SLEEP); int err = dmu_read(os, object, 0, oursize, arr, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(arr, oursize); return; } } else { /* * Even though the allocation is already done in this code path, * we still cap the size to prevent excessive printing. */ oursize = MIN(size, 1 << 20); arr = data; } if (size == 0) { if (data == NULL) kmem_free(arr, oursize); (void) printf("\t\t[]\n"); return; } (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]); for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) { if (i % 4 != 0) (void) printf(", %0llx", (u_longlong_t)arr[i]); else (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]); } if (oursize != size) (void) printf(", ... 
"); (void) printf("]\n"); if (data == NULL) kmem_free(arr, oursize); } static void dump_zap(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; zap_cursor_t zc; zap_attribute_t attr; void *prop; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { boolean_t key64 = !!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY); if (key64) (void) printf("\t\t0x%010lx = ", *(uint64_t *)attr.za_name); else (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } prop = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); if (key64) (void) zap_lookup_uint64(os, object, (const uint64_t *)attr.za_name, 1, attr.za_integer_length, attr.za_num_integers, prop); else (void) zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, prop); if (attr.za_integer_length == 1 && !key64) { if (strcmp(attr.za_name, DSL_CRYPTO_KEY_MASTER_KEY) == 0 || strcmp(attr.za_name, DSL_CRYPTO_KEY_HMAC_KEY) == 0 || strcmp(attr.za_name, DSL_CRYPTO_KEY_IV) == 0 || strcmp(attr.za_name, DSL_CRYPTO_KEY_MAC) == 0 || strcmp(attr.za_name, DMU_POOL_CHECKSUM_SALT) == 0) { uint8_t *u8 = prop; for (i = 0; i < attr.za_num_integers; i++) { (void) printf("%02x", u8[i]); } } else { (void) printf("%s", (char *)prop); } } else { for (i = 0; i < attr.za_num_integers; i++) { switch (attr.za_integer_length) { case 1: (void) printf("%u ", ((uint8_t *)prop)[i]); break; case 2: (void) printf("%u ", ((uint16_t *)prop)[i]); break; case 4: (void) printf("%u ", ((uint32_t *)prop)[i]); break; case 8: (void) printf("%lld ", (u_longlong_t)((int64_t *)prop)[i]); break; } } } (void) printf("\n"); umem_free(prop, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } static void dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) { bpobj_phys_t *bpop = data; uint64_t i; char bytes[32], comp[32], uncomp[32]; /* make sure the output won't get truncated */ _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated"); _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated"); if (bpop == NULL) return; zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); (void) printf("\t\tnum_blkptrs = %llu\n", (u_longlong_t)bpop->bpo_num_blkptrs); (void) printf("\t\tbytes = %s\n", bytes); if (size >= BPOBJ_SIZE_V1) { (void) printf("\t\tcomp = %s\n", comp); (void) printf("\t\tuncomp = %s\n", uncomp); } if (size >= BPOBJ_SIZE_V2) { (void) printf("\t\tsubobjs = %llu\n", (u_longlong_t)bpop->bpo_subobjs); (void) printf("\t\tnum_subobjs = %llu\n", (u_longlong_t)bpop->bpo_num_subobjs); } if (size >= sizeof (*bpop)) { (void) printf("\t\tnum_freed = %llu\n", (u_longlong_t)bpop->bpo_num_freed); } if (dump_opt['d'] < 5) return; for (i = 0; i < bpop->bpo_num_blkptrs; i++) { char blkbuf[BP_SPRINTF_LEN]; blkptr_t bp; int err = dmu_read(os, object, i * sizeof (bp), sizeof (bp), &bp, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); break; } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp, BP_GET_FREE(&bp)); (void) printf("\t%s\n", blkbuf); } } static void dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; dmu_object_info_t doi; int64_t i; VERIFY0(dmu_object_info(os, object, 
&doi)); uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(subobjs, doi.doi_max_offset); return; } int64_t last_nonzero = -1; for (i = 0; i < doi.doi_max_offset / 8; i++) { if (subobjs[i] != 0) last_nonzero = i; } for (i = 0; i <= last_nonzero; i++) { (void) printf("\t%llu\n", (u_longlong_t)subobjs[i]); } kmem_free(subobjs, doi.doi_max_offset); } static void dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; dump_zap_stats(os, object); /* contents are printed elsewhere, properly decoded */ } static void dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; zap_cursor_t zc; zap_attribute_t attr; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } (void) printf(" %llx : [%d:%d:%d]\n", (u_longlong_t)attr.za_first_integer, (int)ATTR_LENGTH(attr.za_first_integer), (int)ATTR_BSWAP(attr.za_first_integer), (int)ATTR_NUM(attr.za_first_integer)); } zap_cursor_fini(&zc); } static void dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; zap_cursor_t zc; zap_attribute_t attr; uint16_t *layout_attrs; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = [", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } VERIFY(attr.za_integer_length == 2); layout_attrs = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); VERIFY(zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, layout_attrs) == 0); for (i = 0; i != attr.za_num_integers; i++) (void) printf(" %d ", (int)layout_attrs[i]); (void) printf("]\n"); umem_free(layout_attrs, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } static void dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; zap_cursor_t zc; zap_attribute_t attr; const char *typenames[] = { /* 0 */ "not specified", /* 1 */ "FIFO", /* 2 */ "Character Device", /* 3 */ "3 (invalid)", /* 4 */ "Directory", /* 5 */ "5 (invalid)", /* 6 */ "Block Device", /* 7 */ "7 (invalid)", /* 8 */ "Regular File", /* 9 */ "9 (invalid)", /* 10 */ "Symbolic Link", /* 11 */ "11 (invalid)", /* 12 */ "Socket", /* 13 */ "Door", /* 14 */ "Event Port", /* 15 */ "15 (invalid)", }; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = %lld (type: %s)\n", attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); } zap_cursor_fini(&zc); } static int get_dtl_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_ops->vdev_op_leaf) { space_map_t *sm = vd->vdev_dtl_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) return (1); return (0); } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_dtl_refcount(vd->vdev_child[c]); return (refcount); } static int get_metaslab_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd) { for (uint64_t m = 0; m < 
vd->vdev_ms_count; m++) { space_map_t *sm = vd->vdev_ms[m]->ms_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) refcount++; } } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_metaslab_refcount(vd->vdev_child[c]); return (refcount); } static int get_obsolete_refcount(vdev_t *vd) { uint64_t obsolete_sm_object; int refcount = 0; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (vd->vdev_top == vd && obsolete_sm_object != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, obsolete_sm_object, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { refcount++; } } else { ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); ASSERT3U(obsolete_sm_object, ==, 0); } for (unsigned c = 0; c < vd->vdev_children; c++) { refcount += get_obsolete_refcount(vd->vdev_child[c]); } return (refcount); } static int get_prev_obsolete_spacemap_refcount(spa_t *spa) { uint64_t prev_obj = spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; if (prev_obj != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { return (1); } } return (0); } static int get_checkpoint_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && zap_contains(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) refcount++; for (uint64_t c = 0; c < vd->vdev_children; c++) refcount += get_checkpoint_refcount(vd->vdev_child[c]); return (refcount); } static int get_log_spacemap_refcount(spa_t *spa) { return (avl_numnodes(&spa->spa_sm_logs_by_txg)); } static int verify_spacemap_refcounts(spa_t *spa) { uint64_t expected_refcount = 0; uint64_t actual_refcount; (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], &expected_refcount); actual_refcount = get_dtl_refcount(spa->spa_root_vdev); actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); actual_refcount += get_prev_obsolete_spacemap_refcount(spa); actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); actual_refcount += get_log_spacemap_refcount(spa); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " "actual %lld\n", (longlong_t)expected_refcount, (longlong_t)actual_refcount); return (2); } return (0); } static void dump_spacemap(objset_t *os, space_map_t *sm) { const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", "INVALID", "INVALID", "INVALID", "INVALID" }; if (sm == NULL) return; (void) printf("space map object %llu:\n", (longlong_t)sm->sm_object); (void) printf(" smp_length = 0x%llx\n", (longlong_t)sm->sm_phys->smp_length); (void) printf(" smp_alloc = 0x%llx\n", (longlong_t)sm->sm_phys->smp_alloc); if (dump_opt['d'] < 6 && dump_opt['m'] < 4) return; /* * Print out the freelist entries in both encoded and decoded form. 
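 * An entry is either a single word, a double word (which additionally
 * encodes a vdev ID), or a debug word recording the txg and sync pass of
 * the entries that follow it (printed as PADDING when the txg is zero).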
*/ uint8_t mapshift = sm->sm_shift; int64_t alloc = 0; uint64_t word, entry_id = 0; for (uint64_t offset = 0; offset < space_map_length(sm); offset += sizeof (word)) { VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (word), &word, DMU_READ_PREFETCH)); if (sm_entry_is_debug(word)) { uint64_t de_txg = SM_DEBUG_TXG_DECODE(word); uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word); if (de_txg == 0) { (void) printf( "\t [%6llu] PADDING\n", (u_longlong_t)entry_id); } else { (void) printf( "\t [%6llu] %s: txg %llu pass %llu\n", (u_longlong_t)entry_id, ddata[SM_DEBUG_ACTION_DECODE(word)], (u_longlong_t)de_txg, (u_longlong_t)de_sync_pass); } entry_id++; continue; } uint8_t words; char entry_type; uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; if (sm_entry_is_single_word(word)) { entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM_OFFSET_DECODE(word) << mapshift) + sm->sm_start; entry_run = SM_RUN_DECODE(word) << mapshift; words = 1; } else { /* it is a two-word entry so we read another word */ ASSERT(sm_entry_is_double_word(word)); uint64_t extra_word; offset += sizeof (extra_word); VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (extra_word), &extra_word, DMU_READ_PREFETCH)); ASSERT3U(offset, <=, space_map_length(sm)); entry_run = SM2_RUN_DECODE(word) << mapshift; entry_vdev = SM2_VDEV_DECODE(word); entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM2_OFFSET_DECODE(extra_word) << mapshift) + sm->sm_start; words = 2; } (void) printf("\t [%6llu] %c range:" " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", (u_longlong_t)entry_id, entry_type, (u_longlong_t)entry_off, (u_longlong_t)(entry_off + entry_run), (u_longlong_t)entry_run, (u_longlong_t)entry_vdev, words); if (entry_type == 'A') alloc += entry_run; else alloc -= entry_run; entry_id++; } if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%lld) INCONSISTENT " "with space map summary (%lld)\n", (longlong_t)space_map_allocated(sm), (longlong_t)alloc); } } static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *t = &msp->ms_allocatable_by_size; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ _Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated"); zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; space_map_t *sm = msp->ms_sm; char freebuf[32]; zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, sizeof (freebuf)); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, (u_longlong_t)space_map_object(sm), freebuf); if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); range_tree_stat_verify(msp->ms_allocatable); dump_metaslab_stats(msp); metaslab_unload(msp); mutex_exit(&msp->ms_lock); } if (dump_opt['m'] > 1 && sm != NULL && spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { /* * The space map histogram represents free space in chunks * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). 
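 * The sm_shift offset is passed to dump_histogram() so that the printed
 * bucket indices correspond to the actual power-of-two segment sizes.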
*/ (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", (u_longlong_t)msp->ms_fragmentation); dump_histogram(sm->sm_phys->smp_histogram, SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } if (vd->vdev_ops == &vdev_draid_ops) ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); else ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", (u_longlong_t)metaslab_unflushed_txg(msp)); } } static void print_vdev_metaslab_header(vdev_t *vd) { vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *bias_str = ""; if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { bias_str = VDEV_ALLOC_BIAS_LOG; } else if (alloc_bias == VDEV_BIAS_SPECIAL) { bias_str = VDEV_ALLOC_BIAS_SPECIAL; } else if (alloc_bias == VDEV_BIAS_DEDUP) { bias_str = VDEV_ALLOC_BIAS_DEDUP; } uint64_t ms_flush_data_obj = 0; if (vd->vdev_top_zap != 0) { int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &ms_flush_data_obj); if (error != ENOENT) { ASSERT0(error); } } (void) printf("\tvdev %10llu %s", (u_longlong_t)vd->vdev_id, bias_str); if (ms_flush_data_obj != 0) { (void) printf(" ms_unflushed_phys object %llu", (u_longlong_t)ms_flush_data_obj); } (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %12s\n", "---------------", "-------------------", "---------------", "------------"); } static void dump_metaslab_groups(spa_t *spa, boolean_t show_special) { vdev_t *rvd = spa->spa_root_vdev; metaslab_class_t *mc = spa_normal_class(spa); metaslab_class_t *smc = spa_special_class(spa); uint64_t fragmentation; metaslab_class_histogram_verify(mc); for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || (mg->mg_class != mc && (!show_special || mg->mg_class != smc))) continue; metaslab_group_histogram_verify(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" "fragmentation", (u_longlong_t)tvd->vdev_id, (u_longlong_t)tvd->vdev_ms_count); if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { (void) printf("%3s\n", "-"); } else { (void) printf("%3llu%%\n", (u_longlong_t)mg->mg_fragmentation); } dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } (void) printf("\tpool %s\tfragmentation", spa_name(spa)); fragmentation = metaslab_class_fragmentation(mc); if (fragmentation == ZFS_FRAG_INVALID) (void) printf("\t%3s\n", "-"); else (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void print_vdev_indirect(vdev_t *vd) { vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib = vd->vdev_indirect_births; if (vim == NULL) { ASSERT3P(vib, ==, NULL); return; } ASSERT3U(vdev_indirect_mapping_object(vim), ==, vic->vic_mapping_object); ASSERT3U(vdev_indirect_births_object(vib), ==, vic->vic_births_object); (void) printf("indirect births obj %llu:\n", (longlong_t)vic->vic_births_object); (void) printf(" vib_count = %llu\n", (longlong_t)vdev_indirect_births_count(vib)); for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { vdev_indirect_birth_entry_phys_t *cur_vibe = 
&vib->vib_entries[i]; (void) printf("\toffset %llx -> txg %llu\n", (longlong_t)cur_vibe->vibe_offset, (longlong_t)cur_vibe->vibe_phys_birth_txg); } (void) printf("\n"); (void) printf("indirect mapping obj %llu:\n", (longlong_t)vic->vic_mapping_object); (void) printf(" vim_max_offset = 0x%llx\n", (longlong_t)vdev_indirect_mapping_max_offset(vim)); (void) printf(" vim_bytes_mapped = 0x%llx\n", (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); (void) printf(" vim_count = %llu\n", (longlong_t)vdev_indirect_mapping_num_entries(vim)); if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) return; uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; (void) printf("\t<%llx:%llx:%llx> -> " "<%llx:%llx:%llx> (%x obsolete)\n", (longlong_t)vd->vdev_id, (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), counts[i]); } (void) printf("\n"); uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; (void) printf("obsolete space map object %llu:\n", (u_longlong_t)obsolete_sm_object); ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, obsolete_sm_object); dump_spacemap(mos, vd->vdev_obsolete_sm); (void) printf("\n"); } } static void dump_metaslabs(spa_t *spa) { vdev_t *vd, *rvd = spa->spa_root_vdev; uint64_t m, c = 0, children = rvd->vdev_children; (void) printf("\nMetaslabs:\n"); if (!dump_opt['d'] && zopt_metaslab_args > 0) { c = zopt_metaslab[0]; if (c >= children) (void) fatal("bad vdev id: %llu", (u_longlong_t)c); if (zopt_metaslab_args > 1) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); for (m = 1; m < zopt_metaslab_args; m++) { if (zopt_metaslab[m] < vd->vdev_ms_count) dump_metaslab( vd->vdev_ms[zopt_metaslab[m]]); else (void) fprintf(stderr, "bad metaslab " "number %llu\n", (u_longlong_t)zopt_metaslab[m]); } (void) printf("\n"); return; } children = c + 1; } for (; c < children; c++) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); print_vdev_indirect(vd); for (m = 0; m < vd->vdev_ms_count; m++) dump_metaslab(vd->vdev_ms[m]); (void) printf("\n"); } } static void dump_log_spacemaps(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; (void) printf("\nLog Space Maps in Pool:\n"); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { space_map_t *sm = NULL; VERIFY0(space_map_open(&sm, spa_meta_objset(spa), sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); (void) printf("Log Spacemap object %llu txg %llu\n", (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); dump_spacemap(spa->spa_meta_objset, sm); space_map_close(sm); } (void) printf("\n"); } static void dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, uint64_t index) { const ddt_key_t *ddk = &ddlwe->ddlwe_key; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; int p; for (p = 0; p < DDT_NPHYS(ddt); p++) { const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); if (ddt_phys_birth(ddp, v) == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) 
printf("index %llx refcnt %llu phys %d %s\n", (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v), p, blkbuf); } } static void dump_dedup_ratio(const ddt_stat_t *dds) { double rL, rP, rD, D, dedup, compress, copies; if (dds->dds_blocks == 0) return; rL = (double)dds->dds_ref_lsize; rP = (double)dds->dds_ref_psize; rD = (double)dds->dds_ref_dsize; D = (double)dds->dds_dsize; dedup = rD / D; compress = rL / rP; copies = rD / rP; (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " "dedup * compress / copies = %.2f\n\n", dedup, compress, copies, dedup * compress / copies); } static void dump_ddt_log(ddt_t *ddt) { for (int n = 0; n < 2; n++) { ddt_log_t *ddl = &ddt->ddt_log[n]; uint64_t count = avl_numnodes(&ddl->ddl_tree); if (count == 0) continue; printf(DMU_POOL_DDT_LOG ": %lu log entries\n", zio_checksum_table[ddt->ddt_checksum].ci_name, n, count); if (dump_opt['D'] < 4) continue; ddt_lightweight_entry_t ddlwe; uint64_t index = 0; for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) { DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); dump_ddt_entry(ddt, &ddlwe, index++); } } } static void dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { char name[DDT_NAMELEN]; ddt_lightweight_entry_t ddlwe; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; int error; error = ddt_object_info(ddt, type, class, &doi); if (error == ENOENT) return; ASSERT(error == 0); error = ddt_object_count(ddt, type, class, &count); ASSERT(error == 0); if (count == 0) return; dspace = doi.doi_physical_blocks_512 << 9; mspace = doi.doi_fill_count * doi.doi_data_block_size; ddt_object_name(ddt, type, class, name); (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", name, (u_longlong_t)count, (u_longlong_t)dspace, (u_longlong_t)mspace); if (dump_opt['D'] < 3) return; zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); if (dump_opt['D'] < 4) return; if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) return; (void) printf("%s contents:\n\n", name); while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0) dump_ddt_entry(ddt, &ddlwe, walk); ASSERT3U(error, ==, ENOENT); (void) printf("\n"); } static void dump_all_ddts(spa_t *spa) { ddt_histogram_t ddh_total = {{{0}}}; ddt_stat_t dds_total = {0}; for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - if (!ddt) + if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) continue; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { dump_ddt(ddt, type, class); } } dump_ddt_log(ddt); } ddt_get_dedup_stats(spa, &dds_total); if (dds_total.dds_blocks == 0) { (void) printf("All DDTs are empty\n"); return; } (void) printf("\n"); if (dump_opt['D'] > 1) { (void) printf("DDT histogram (aggregated over all DDTs):\n"); ddt_get_dedup_histogram(spa, &ddh_total); zpool_dump_ddt(&dds_total, &ddh_total); } dump_dedup_ratio(&dds_total); + + /* + * Dump a histogram of unique class entry age + */ + if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) { + ddt_age_histo_t histogram; + + (void) printf("DDT walk unique, building age histogram...\n"); + ddt_prune_walk(spa, 0, &histogram); + + /* + * print out histogram for unique entry class birth + */ + if (histogram.dah_entries > 0) { + (void) printf("%5s %9s %4s\n", + "age", "blocks", "amnt"); + (void) printf("%5s %9s %4s\n", + "-----", "---------", "----"); + for (int i = 0; i < HIST_BINS; i++) { + (void) 
printf("%5d %9d %4d%%\n", 1 << i, + (int)histogram.dah_age_histo[i], + (int)((histogram.dah_age_histo[i] * 100) / + histogram.dah_entries)); + } + } + } } static void dump_brt(spa_t *spa) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) { printf("BRT: unsupported on this pool\n"); return; } if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { printf("BRT: empty\n"); return; } brt_t *brt = spa->spa_brt; VERIFY(brt); char count[32], used[32], saved[32]; zdb_nicebytes(brt_get_used(spa), used, sizeof (used)); zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved)); uint64_t ratio = brt_get_ratio(spa); printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved, (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100)); if (dump_opt['T'] < 2) return; for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; if (brtvd == NULL) continue; if (!brtvd->bv_initiated) { printf("BRT: vdev %" PRIu64 ": empty\n", vdevid); continue; } zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count)); zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used)); zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved)); printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n", vdevid, count, used, saved); } if (dump_opt['T'] < 3) return; char dva[64]; printf("\n%-16s %-10s\n", "DVA", "REFCNT"); for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; if (brtvd == NULL || !brtvd->bv_initiated) continue; zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t refcnt; VERIFY0(zap_lookup_uint64(brt->brt_mos, brtvd->bv_mos_entries, (const uint64_t *)za.za_name, 1, za.za_integer_length, za.za_num_integers, &refcnt)); uint64_t offset = *(const uint64_t *)za.za_name; snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid, (u_longlong_t)offset); printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt); } zap_cursor_fini(&zc); } } static void dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { char *prefix = arg; (void) printf("%s [%llu,%llu) length %llu\n", prefix, (u_longlong_t)start, (u_longlong_t)(start + size), (u_longlong_t)(size)); } static void dump_dtl(vdev_t *vd, int indent) { spa_t *spa = vd->vdev_spa; boolean_t required; const char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" }; char prefix[256]; spa_vdev_state_enter(spa, SCL_NONE); required = vdev_dtl_required(vd); (void) spa_vdev_state_exit(spa, NULL, 0); if (indent == 0) (void) printf("\nDirty time logs:\n\n"); (void) printf("\t%*s%s [%s]\n", indent, "", vd->vdev_path ? vd->vdev_path : vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), required ? 
"DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { range_tree_t *rt = vd->vdev_dtl[t]; if (range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); range_tree_walk(rt, dump_dtl_seg, prefix); if (dump_opt['d'] > 5 && vd->vdev_children == 0) dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } for (unsigned c = 0; c < vd->vdev_children; c++) dump_dtl(vd->vdev_child[c], indent + 4); } static void dump_history(spa_t *spa) { nvlist_t **events = NULL; char *buf; uint64_t resid, len, off = 0; uint_t num = 0; int error; char tbuf[30]; if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) { (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n", __func__); return; } do { len = SPA_OLD_MAXBLOCKSIZE; if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { (void) fprintf(stderr, "Unable to read history: " "error %d\n", error); free(buf); return; } if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) break; off -= resid; } while (len != 0); (void) printf("\nHistory:\n"); for (unsigned i = 0; i < num; i++) { boolean_t printed = B_FALSE; if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } else { tbuf[0] = '\0'; } if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) { (void) printf("%s %s\n", tbuf, fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD)); } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) { uint64_t ievent; ievent = fnvlist_lookup_uint64(events[i], ZPOOL_HIST_INT_EVENT); if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) goto next; (void) printf(" %s [internal %s txg:%ju] %s\n", tbuf, zfs_history_event_names[ievent], fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG), fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) { (void) printf("%s [txg:%ju] %s", tbuf, fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG), fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_NAME)); if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(events[i], ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64( events[i], ZPOOL_HIST_DSID)); } (void) printf(" %s\n", fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) { (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(events[i], ZPOOL_HIST_IOCTL)); if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(events[i], ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(events[i], ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(events[i], ZPOOL_HIST_ERRNO)); } } else { goto next; } printed = B_TRUE; next: if (dump_opt['h'] > 1) { if (!printed) (void) printf("unrecognized record:\n"); dump_nvlist(events[i], 2); } } free(buf); } static void dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static uint64_t blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { if (dnp == NULL) { ASSERT(zb->zb_level < 0); if (zb->zb_object == 0) return (zb->zb_blkid); return (zb->zb_blkid * BP_GET_LSIZE(bp)); } 
ASSERT(zb->zb_level >= 0); return ((zb->zb_blkid << (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } static void snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, const blkptr_t *bp) { static abd_t *pabd = NULL; void *buf; zio_t *zio; zfs_zstdhdr_t zstd_hdr; int error; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD) return; if (BP_IS_HOLE(bp)) return; if (BP_IS_EMBEDDED(bp)) { buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); zdb_exit(1); } decode_embedded_bp_compressed(bp, buf); memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); free(buf); zstd_hdr.c_len = BE_32(zstd_hdr.c_len); zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " ZSTD:size=%u:version=%u:level=%u:EMBEDDED", zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), zfs_get_hdrlevel(&zstd_hdr)); return; } if (!pabd) pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); zio = zio_root(spa, NULL, NULL, 0); /* Decrypt but don't decompress so we can read the compression header */ zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS, NULL)); error = zio_wait(zio); if (error) { (void) fprintf(stderr, "read failed: %d\n", error); return; } buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp)); memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); zstd_hdr.c_len = BE_32(zstd_hdr.c_len); zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " ZSTD:size=%u:version=%u:level=%u:NORMAL", zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), zfs_get_hdrlevel(&zstd_hdr)); abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp)); } static void snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, boolean_t bp_freed) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? 
BP_GET_NDVAS(bp) : 1; int i; if (dump_opt['b'] >= 6) { snprintf_blkptr(blkbuf, buflen, bp); if (bp_freed) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); } return; } if (BP_IS_EMBEDDED(bp)) { (void) sprintf(blkbuf, "EMBEDDED et=%u %llxL/%llxP B=%llu", (int)BPE_GET_ETYPE(bp), (u_longlong_t)BPE_GET_LSIZE(bp), (u_longlong_t)BPE_GET_PSIZE(bp), (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); return; } blkbuf[0] = '\0'; for (i = 0; i < ndvas; i++) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); if (BP_IS_HOLE(bp)) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL B=%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); } else { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), (u_longlong_t)BP_GET_BIRTH(bp)); if (bp_freed) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " cksum=%016llx:%016llx:%016llx:%016llx", (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], (u_longlong_t)bp->blk_cksum.zc_word[2], (u_longlong_t)bp->blk_cksum.zc_word[3]); } } static void print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; int l; if (!BP_IS_EMBEDDED(bp)) { ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); } (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { (void) printf("L%llx", (u_longlong_t)zb->zb_level); } else { (void) printf(" "); } } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD) snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_phys_t *zb) { int err = 0; if (BP_GET_LOGICAL_BIRTH(bp) == 0) return (0); print_indirect(spa, bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; ASSERT(!BP_IS_REDACTED(bp)); err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); ASSERT(buf->b_data); /* recursively visit blocks below this */ cbp = buf->b_data; for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = visit_indirect(spa, dnp, cbp, &czb); if (err) break; fill += BP_GET_FILL(cbp); } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); arc_buf_destroy(buf, &buf); } return (err); } static void dump_indirect(dnode_t *dn) { dnode_phys_t *dnp = dn->dn_phys; zbookmark_phys_t czb; (void) printf("Indirect blocks:\n"); SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), dn->dn_object, dnp->dn_nlevels - 1, 0); for (int j = 0; j < dnp->dn_nblkptr; j++) { czb.zb_blkid = j; (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, &dnp->dn_blkptr[j], &czb); 
} (void) printf("\n"); } static void dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object; dsl_dir_phys_t *dd = data; time_t crtime; char nice[32]; /* make sure nicenum has enough space */ _Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated"); if (dd == NULL) return; ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); crtime = dd->dd_creation_time; (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\thead_dataset_obj = %llu\n", (u_longlong_t)dd->dd_head_dataset_obj); (void) printf("\t\tparent_dir_obj = %llu\n", (u_longlong_t)dd->dd_parent_obj); (void) printf("\t\torigin_obj = %llu\n", (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", (u_longlong_t)dd->dd_child_dir_zapobj); zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); (void) printf("\t\tused_bytes = %s\n", nice); zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); (void) printf("\t\tcompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); (void) printf("\t\tuncompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); (void) printf("\t\tquota = %s\n", nice); zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); (void) printf("\t\tdeleg_zapobj = %llu\n", (u_longlong_t)dd->dd_deleg_zapobj); (void) printf("\t\tflags = %llx\n", (u_longlong_t)dd->dd_flags); #define DO(which) \ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ sizeof (nice)); \ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) DO(HEAD); DO(SNAP); DO(CHILD); DO(CHILD_RSRV); DO(REFRSRV); #undef DO (void) printf("\t\tclones = %llu\n", (u_longlong_t)dd->dd_clones); } static void dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object; dsl_dataset_phys_t *ds = data; time_t crtime; char used[32], compressed[32], uncompressed[32], unique[32]; char blkbuf[BP_SPRINTF_LEN]; /* make sure nicenum has enough space */ _Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated"); _Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ, "compressed truncated"); _Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ, "uncompressed truncated"); _Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated"); if (ds == NULL) return; ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, sizeof (uncompressed)); zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); (void) printf("\t\tprev_snap_obj = %llu\n", (u_longlong_t)ds->ds_prev_snap_obj); (void) printf("\t\tprev_snap_txg = %llu\n", (u_longlong_t)ds->ds_prev_snap_txg); (void) printf("\t\tnext_snap_obj = %llu\n", (u_longlong_t)ds->ds_next_snap_obj); (void) printf("\t\tsnapnames_zapobj = %llu\n", (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); (void) printf("\t\tuserrefs_obj = %llu\n", (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); (void) printf("\t\tdeadlist_obj = %llu\n", 
(u_longlong_t)ds->ds_deadlist_obj); (void) printf("\t\tused_bytes = %s\n", used); (void) printf("\t\tcompressed_bytes = %s\n", compressed); (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); (void) printf("\t\tunique = %s\n", unique); (void) printf("\t\tfsid_guid = %llu\n", (u_longlong_t)ds->ds_fsid_guid); (void) printf("\t\tguid = %llu\n", (u_longlong_t)ds->ds_guid); (void) printf("\t\tflags = %llx\n", (u_longlong_t)ds->ds_flags); (void) printf("\t\tnext_clones_obj = %llu\n", (u_longlong_t)ds->ds_next_clones_obj); (void) printf("\t\tprops_obj = %llu\n", (u_longlong_t)ds->ds_props_obj); (void) printf("\t\tbp = %s\n", blkbuf); } static int dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { (void) arg, (void) tx; char blkbuf[BP_SPRINTF_LEN]; if (BP_GET_LOGICAL_BIRTH(bp) != 0) { snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } return (0); } static void dump_bptree(objset_t *os, uint64_t obj, const char *name) { char bytes[32]; bptree_phys_t *bt; dmu_buf_t *db; /* make sure nicenum has enough space */ _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); if (dump_opt['d'] < 3) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); bt = db->db_data; zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); dmu_buf_rele(db, FTAG); if (dump_opt['d'] < 5) return; (void) printf("\n"); (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); } static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { (void) arg, (void) tx; char blkbuf[BP_SPRINTF_LEN]; ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0); snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); (void) printf("\t%s\n", blkbuf); return (0); } static void dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) { char bytes[32]; char comp[32]; char uncomp[32]; uint64_t i; /* make sure nicenum has enough space */ _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated"); _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated"); if (dump_opt['d'] < 3) return; zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); if (bpo->bpo_havefreed) { (void) printf(" %*s: object %llu, %llu local " "blkptrs, %llu freed, %llu subobjs in object %llu, " "%s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_freed, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); } else { (void) printf(" %*s: object %llu, %llu local " "blkptrs, %llu subobjs in object %llu, " "%s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); } for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", 
error, (u_longlong_t)subobj); continue; } dump_full_bpobj(&subbpo, "subobj", indent + 1); bpobj_close(&subbpo); } } else { if (bpo->bpo_havefreed) { (void) printf(" %*s: object %llu, %llu blkptrs, " "%llu freed, %s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_freed, bytes); } else { (void) printf(" %*s: object %llu, %llu blkptrs, " "%s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes); } } if (dump_opt['d'] < 5) return; if (indent == 0) { (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); (void) printf("\n"); } } static int dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact, boolean_t print_list) { int err = 0; zfs_bookmark_phys_t prop; objset_t *mos = dp->dp_spa->spa_meta_objset; err = dsl_bookmark_lookup(dp, name, NULL, &prop); if (err != 0) { return (err); } (void) printf("\t#%s: ", strchr(name, '#') + 1); (void) printf("{guid: %llx creation_txg: %llu creation_time: " "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid, (u_longlong_t)prop.zbm_creation_txg, (u_longlong_t)prop.zbm_creation_time, (u_longlong_t)prop.zbm_redaction_obj); IMPLY(print_list, print_redact); if (!print_redact || prop.zbm_redaction_obj == 0) return (0); redaction_list_t *rl; VERIFY0(dsl_redaction_list_hold_obj(dp, prop.zbm_redaction_obj, FTAG, &rl)); redaction_list_phys_t *rlp = rl->rl_phys; (void) printf("\tRedacted:\n\t\tProgress: "); if (rlp->rlp_last_object != UINT64_MAX || rlp->rlp_last_blkid != UINT64_MAX) { (void) printf("%llu %llu (incomplete)\n", (u_longlong_t)rlp->rlp_last_object, (u_longlong_t)rlp->rlp_last_blkid); } else { (void) printf("complete\n"); } (void) printf("\t\tSnapshots: ["); for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) { if (i > 0) (void) printf(", "); (void) printf("%0llu", (u_longlong_t)rlp->rlp_snaps[i]); } (void) printf("]\n\t\tLength: %llu\n", (u_longlong_t)rlp->rlp_num_entries); if (!print_list) { dsl_redaction_list_rele(rl, FTAG); return (0); } if (rlp->rlp_num_entries == 0) { dsl_redaction_list_rele(rl, FTAG); (void) printf("\t\tRedaction List: []\n\n"); return (0); } redact_block_phys_t *rbp_buf; uint64_t size; dmu_object_info_t doi; VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi)); size = doi.doi_max_offset; rbp_buf = kmem_alloc(size, KM_SLEEP); err = dmu_read(mos, prop.zbm_redaction_obj, 0, size, rbp_buf, 0); if (err != 0) { dsl_redaction_list_rele(rl, FTAG); kmem_free(rbp_buf, size); return (err); } (void) printf("\t\tRedaction List: [{object: %llx, offset: " "%llx, blksz: %x, count: %llx}", (u_longlong_t)rbp_buf[0].rbp_object, (u_longlong_t)rbp_buf[0].rbp_blkid, (uint_t)(redact_block_get_size(&rbp_buf[0])), (u_longlong_t)redact_block_get_count(&rbp_buf[0])); for (size_t i = 1; i < rlp->rlp_num_entries; i++) { (void) printf(",\n\t\t{object: %llx, offset: %llx, " "blksz: %x, count: %llx}", (u_longlong_t)rbp_buf[i].rbp_object, (u_longlong_t)rbp_buf[i].rbp_blkid, (uint_t)(redact_block_get_size(&rbp_buf[i])), (u_longlong_t)redact_block_get_count(&rbp_buf[i])); } dsl_redaction_list_rele(rl, FTAG); kmem_free(rbp_buf, size); (void) printf("]\n\n"); return (0); } static void dump_bookmarks(objset_t *os, int verbosity) { zap_cursor_t zc; zap_attribute_t attr; dsl_dataset_t *ds = dmu_objset_ds(os); dsl_pool_t *dp = spa_get_dsl(os->os_spa); objset_t *mos = os->os_spa->spa_meta_objset; if (verbosity < 4) return; dsl_pool_config_enter(dp, FTAG); for (zap_cursor_init(&zc, mos, 
ds->ds_bookmarks_obj); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { char osname[ZFS_MAX_DATASET_NAME_LEN]; char buf[ZFS_MAX_DATASET_NAME_LEN]; int len; dmu_objset_name(os, osname); len = snprintf(buf, sizeof (buf), "%s#%s", osname, attr.za_name); VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN); (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6); } zap_cursor_fini(&zc); dsl_pool_config_exit(dp, FTAG); } static void bpobj_count_refd(bpobj_t *bpo) { mos_obj_refd(bpo->bpo_object); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { mos_obj_refd(bpo->bpo_phys->bpo_subobjs); for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } bpobj_count_refd(&subbpo); bpobj_close(&subbpo); } } } static int dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle) { spa_t *spa = arg; uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dle->dle_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dle->dle_bpobj); return (0); } static int dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle) { ASSERT(arg == NULL); if (dump_opt['d'] >= 5) { char buf[128]; (void) snprintf(buf, sizeof (buf), "mintxg %llu -> obj %llu", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); dump_full_bpobj(&dle->dle_bpobj, buf, 0); } else { (void) printf("mintxg %llu -> obj %llu\n", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); } return (0); } static void dump_blkptr_list(dsl_deadlist_t *dl, const char *name) { char bytes[32]; char comp[32]; char uncomp[32]; char entries[32]; spa_t *spa = dmu_objset_spa(dl->dl_os); uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dl->dl_oldfmt) { if (dl->dl_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dl->dl_bpobj); } else { mos_obj_refd(dl->dl_object); dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa); } /* make sure nicenum has enough space */ _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated"); _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated"); _Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated"); if (dump_opt['d'] < 3) return; if (dl->dl_oldfmt) { dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); return; } zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries)); (void) printf("\n %s: %s (%s/%s comp), %s entries\n", name, bytes, comp, uncomp, entries); if (dump_opt['d'] < 4) return; (void) putchar('\n'); dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL); } static int verify_dd_livelist(objset_t *os) { uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp; dsl_pool_t *dp = spa_get_dsl(os->os_spa); dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; ASSERT(!dmu_objset_is_snapshot(os)); if (!dsl_deadlist_is_open(&dd->dd_livelist)) return (0); /* Iterate through the livelist to check for duplicates */ dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight, NULL); dsl_pool_config_enter(dp, FTAG); 
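/* Total up the livelist, then compare it against the space written since this dataset's origin; the two should account for the same blocks (modulo embedded block pointers, noted below). */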
dsl_deadlist_space(&dd->dd_livelist, &ll_used, &ll_comp, &ll_uncomp); dsl_dataset_t *origin_ds; ASSERT(dsl_pool_config_held(dp)); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds)); VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset, &used, &comp, &uncomp)); dsl_dataset_rele(origin_ds, FTAG); dsl_pool_config_exit(dp, FTAG); /* * It's possible that the dataset's uncomp space is larger than the * livelist's because livelists do not track embedded block pointers */ if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) { char nice_used[32], nice_comp[32], nice_uncomp[32]; (void) printf("Discrepancy in space accounting:\n"); zdb_nicenum(used, nice_used, sizeof (nice_used)); zdb_nicenum(comp, nice_comp, sizeof (nice_comp)); zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp)); (void) printf("dir: used %s, comp %s, uncomp %s\n", nice_used, nice_comp, nice_uncomp); zdb_nicenum(ll_used, nice_used, sizeof (nice_used)); zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp)); zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp)); (void) printf("livelist: used %s, comp %s, uncomp %s\n", nice_used, nice_comp, nice_uncomp); return (1); } return (0); } static char *key_material = NULL; static boolean_t zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out) { uint64_t keyformat, salt, iters; int i; unsigned char c; VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t), 1, &keyformat)); switch (keyformat) { case ZFS_KEYFORMAT_HEX: for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) { if (!isxdigit(key_material[i]) || !isxdigit(key_material[i+1])) return (B_FALSE); if (sscanf(&key_material[i], "%02hhx", &c) != 1) return (B_FALSE); key_out[i / 2] = c; } break; case ZFS_KEYFORMAT_PASSPHRASE: VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), sizeof (uint64_t), 1, &salt)); VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), sizeof (uint64_t), 1, &iters)); if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material), ((uint8_t *)&salt), sizeof (uint64_t), iters, WRAPPING_KEY_LEN, key_out) != 1) return (B_FALSE); break; default: fatal("no support for key format %u\n", (unsigned int) keyformat); } return (B_TRUE); } static char encroot[ZFS_MAX_DATASET_NAME_LEN]; static boolean_t key_loaded = B_FALSE; static void zdb_load_key(objset_t *os) { dsl_pool_t *dp; dsl_dir_t *dd, *rdd; uint8_t key[WRAPPING_KEY_LEN]; uint64_t rddobj; int err; dp = spa_get_dsl(os->os_spa); dd = os->os_dsl_dataset->ds_dir; dsl_pool_config_enter(dp, FTAG); VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj)); VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd)); dsl_dir_name(rdd, encroot); dsl_dir_rele(rdd, FTAG); if (!zdb_derive_key(dd, key)) fatal("couldn't derive encryption key"); dsl_pool_config_exit(dp, FTAG); ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE); dsl_crypto_params_t *dcp; nvlist_t *crypto_args; crypto_args = fnvlist_alloc(); fnvlist_add_uint8_array(crypto_args, "wkeydata", (uint8_t *)key, WRAPPING_KEY_LEN); VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, crypto_args, &dcp)); err = spa_keystore_load_wkey(encroot, dcp, B_FALSE); dsl_crypto_params_free(dcp, (err != 0)); fnvlist_free(crypto_args); if (err != 0) fatal( "couldn't load encryption key for %s: %s", encroot, err == 
ZFS_ERR_CRYPTO_NOTSUP ? "crypto params not supported" : strerror(err)); ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE); printf("Unlocked encryption root: %s\n", encroot); key_loaded = B_TRUE; } static void zdb_unload_key(void) { if (!key_loaded) return; VERIFY0(spa_keystore_unload_wkey(encroot)); key_loaded = B_FALSE; } static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; static objset_t *sa_os = NULL; static sa_attr_type_t *sa_attr_table = NULL; static int open_objset(const char *path, const void *tag, objset_t **osp) { int err; uint64_t sa_attrs = 0; uint64_t version = 0; VERIFY3P(sa_os, ==, NULL); /* * We can't own an objset if it's redacted. Therefore, we do this * dance: hold the objset, then acquire a long hold on its dataset, then * release the pool (which is held as part of holding the objset). */ if (dump_opt['K']) { /* decryption requested, try to load keys */ err = dmu_objset_hold(path, tag, osp); if (err != 0) { (void) fprintf(stderr, "failed to hold dataset " "'%s': %s\n", path, strerror(err)); return (err); } dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); dsl_pool_rele(dmu_objset_pool(*osp), tag); /* succeeds or dies */ zdb_load_key(*osp); /* release it all */ dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); dsl_dataset_rele(dmu_objset_ds(*osp), tag); } int ds_hold_flags = key_loaded ? DS_HOLD_FLAG_DECRYPT : 0; err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp); if (err != 0) { (void) fprintf(stderr, "failed to hold dataset '%s': %s\n", path, strerror(err)); return (err); } dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); dsl_pool_rele(dmu_objset_pool(*osp), tag); if (dmu_objset_type(*osp) == DMU_OST_ZFS && (key_loaded || !(*osp)->os_encrypted)) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &version); if (version >= ZPL_VERSION_SA) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_attrs); } err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, &sa_attr_table); if (err != 0) { (void) fprintf(stderr, "sa_setup failed: %s\n", strerror(err)); dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); dsl_dataset_rele_flags(dmu_objset_ds(*osp), ds_hold_flags, tag); *osp = NULL; } } sa_os = *osp; return (err); } static void close_objset(objset_t *os, const void *tag) { VERIFY3P(os, ==, sa_os); if (os->os_sa != NULL) sa_tear_down(os); dsl_dataset_long_rele(dmu_objset_ds(os), tag); dsl_dataset_rele_flags(dmu_objset_ds(os), key_loaded ? DS_HOLD_FLAG_DECRYPT : 0, tag); sa_attr_table = NULL; sa_os = NULL; zdb_unload_key(); } static void fuid_table_destroy(void) { if (fuid_table_loaded) { zfs_fuid_table_destroy(&idx_tree, &domain_tree); fuid_table_loaded = B_FALSE; } } /* * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on * a live pool are normally cleaned up during ddt_sync(). We can't do that (and * wouldn't want to anyway), but if we don't clean up the presence of stuff on * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves. * * Note that this is not a particularly efficient way to do this, but * ddt_remove() is the only public method that can do the work we need, and it * requires the right locks and etc to do the job. This is only ever called * during zdb shutdown so efficiency is not especially important. 
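* The walk below takes the SCL_CONFIG lock and ddt_enter() around each checksum's tree, which is what ddt_remove() expects to be held.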
*/ static void zdb_ddt_cleanup(spa_t *spa) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (!ddt) continue; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); ddt_enter(ddt); ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next; while (dde) { next = AVL_NEXT(&ddt->ddt_tree, dde); dde->dde_io = NULL; ddt_remove(ddt, dde); dde = next; } ddt_exit(ddt); spa_config_exit(spa, SCL_CONFIG, FTAG); } } static void zdb_exit(int reason) { if (spa != NULL) zdb_ddt_cleanup(spa); if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { spa_close(spa, FTAG); } fuid_table_destroy(); if (kernel_init_done) kernel_fini(); exit(reason); } /* * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. * For CIFS files with FUID the fuid is printed in hex followed by * the domain-rid string. */ static void print_idstr(uint64_t id, const char *id_type) { if (FUID_INDEX(id)) { const char *domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); (void) printf("\t%s %llx [%s-%d]\n", id_type, (u_longlong_t)id, domain, (int)FUID_RID(id)); } else { (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); } } static void dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) { uint32_t uid_idx, gid_idx; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); /* Load domain table, if not already loaded */ if (!fuid_table_loaded && (uid_idx || gid_idx)) { uint64_t fuid_obj; /* first find the fuid object. It lives in the master node */ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj) == 0); zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; } print_idstr(uid, "uid"); print_idstr(gid, "gid"); } static void dump_znode_sa_xattr(sa_handle_t *hdl) { nvlist_t *sa_xattr; nvpair_t *elem = NULL; int sa_xattr_size = 0; int sa_xattr_entries = 0; int error; char *sa_xattr_packed; error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size); if (error || sa_xattr_size == 0) return; sa_xattr_packed = malloc(sa_xattr_size); if (sa_xattr_packed == NULL) return; error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR], sa_xattr_packed, sa_xattr_size); if (error) { free(sa_xattr_packed); return; } error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0); if (error) { free(sa_xattr_packed); return; } while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) sa_xattr_entries++; (void) printf("\tSA xattrs: %d bytes, %d entries\n\n", sa_xattr_size, sa_xattr_entries); while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) { boolean_t can_print = !dump_opt['P']; uchar_t *value; uint_t cnt, idx; (void) printf("\t\t%s = ", nvpair_name(elem)); nvpair_value_byte_array(elem, &value, &cnt); for (idx = 0; idx < cnt; ++idx) { if (!isprint(value[idx])) { can_print = B_FALSE; break; } } for (idx = 0; idx < cnt; ++idx) { if (can_print) (void) putchar(value[idx]); else (void) printf("\\%3.3o", value[idx]); } (void) putchar('\n'); } nvlist_free(sa_xattr); free(sa_xattr_packed); } static void dump_znode_symlink(sa_handle_t *hdl) { int sa_symlink_size = 0; char linktarget[MAXPATHLEN]; int error; error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size); if (error || sa_symlink_size == 0) { return; } if (sa_symlink_size >= sizeof (linktarget)) { (void) printf("symlink size %d is too large\n", sa_symlink_size); return; } linktarget[sa_symlink_size] = '\0'; if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK], &linktarget, 
sa_symlink_size) == 0) (void) printf("\ttarget %s\n", linktarget); } static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ sa_handle_t *hdl; uint64_t xattr, rdev, gen; uint64_t uid, gid, mode, fsize, parent, links; uint64_t pflags; uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; time_t z_crtime, z_atime, z_mtime, z_ctime; sa_bulk_attr_t bulk[12]; int idx = 0; int error; VERIFY3P(os, ==, sa_os); if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { (void) printf("Failed to get handle for SA znode\n"); return; } SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, &fsize, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, acctm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, modtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, crtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, chgtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, &pflags, 8); if (sa_bulk_lookup(hdl, bulk, idx)) { (void) sa_handle_destroy(hdl); return; } z_crtime = (time_t)crtm[0]; z_atime = (time_t)acctm[0]; z_mtime = (time_t)modtm[0]; z_ctime = (time_t)chgtm[0]; if (dump_opt['d'] > 4) { error = zfs_obj_to_path(os, object, path, sizeof (path)); if (error == ESTALE) { (void) snprintf(path, sizeof (path), "on delete queue"); } else if (error != 0) { leaked_objects++; (void) snprintf(path, sizeof (path), "path not found, possibly leaked"); } (void) printf("\tpath %s\n", path); } if (S_ISLNK(mode)) dump_znode_symlink(hdl); dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); (void) printf("\tctime %s", ctime(&z_ctime)); (void) printf("\tcrtime %s", ctime(&z_crtime)); (void) printf("\tgen %llu\n", (u_longlong_t)gen); (void) printf("\tmode %llo\n", (u_longlong_t)mode); (void) printf("\tsize %llu\n", (u_longlong_t)fsize); (void) printf("\tparent %llu\n", (u_longlong_t)parent); (void) printf("\tlinks %llu\n", (u_longlong_t)links); (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) { uint64_t projid; if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid, sizeof (uint64_t)) == 0) (void) printf("\tprojid %llu\n", (u_longlong_t)projid); } if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, sizeof (uint64_t)) == 0) (void) printf("\txattr %llu\n", (u_longlong_t)xattr); if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, sizeof (uint64_t)) == 0) (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); dump_znode_sa_xattr(hdl); sa_handle_destroy(hdl); } static void dump_acl(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static void dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_none, /* unallocated */ dump_zap, /* object directory */ dump_uint64, /* object array */ 
dump_none, /* packed nvlist */ dump_packed_nvlist, /* packed nvlist size */ dump_none, /* bpobj */ dump_bpobj, /* bpobj header */ dump_none, /* SPA space map header */ dump_none, /* SPA space map */ dump_none, /* ZIL intent log */ dump_dnode, /* DMU dnode */ dump_dmu_objset, /* DMU objset */ dump_dsl_dir, /* DSL directory */ dump_zap, /* DSL directory child map */ dump_zap, /* DSL dataset snap map */ dump_zap, /* DSL props */ dump_dsl_dataset, /* DSL dataset */ dump_znode, /* ZFS znode */ dump_acl, /* ZFS V0 ACL */ dump_uint8, /* ZFS plain file */ dump_zpldir, /* ZFS directory */ dump_zap, /* ZFS master node */ dump_zap, /* ZFS delete queue */ dump_uint8, /* zvol object */ dump_zap, /* zvol prop */ dump_uint8, /* other uint8[] */ dump_uint64, /* other uint64[] */ dump_zap, /* other ZAP */ dump_zap, /* persistent error log */ dump_uint8, /* SPA history */ dump_history_offsets, /* SPA history offsets */ dump_zap, /* Pool properties */ dump_zap, /* DSL permissions */ dump_acl, /* ZFS ACL */ dump_uint8, /* ZFS SYSACL */ dump_none, /* FUID nvlist */ dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ dump_zap, /* ZFS user/group/project used */ dump_zap, /* ZFS user/group/project quota */ dump_zap, /* snapshot refcount tags */ dump_ddt_zap, /* DDT ZAP object */ dump_zap, /* DDT statistics */ dump_znode, /* SA object */ dump_zap, /* SA Master Node */ dump_sa_attrs, /* SA attribute registration */ dump_sa_layouts, /* SA attribute layouts */ dump_zap, /* DSL scrub translations */ dump_none, /* fake dedup BP */ dump_zap, /* deadlist */ dump_none, /* deadlist hdr */ dump_zap, /* dsl clones */ dump_bpobj_subobjs, /* bpobj subobjs */ dump_unknown, /* Unknown type, must be last */ }; static boolean_t match_object_type(dmu_object_type_t obj_type, uint64_t flags) { boolean_t match = B_TRUE; switch (obj_type) { case DMU_OT_DIRECTORY_CONTENTS: if (!(flags & ZOR_FLAG_DIRECTORY)) match = B_FALSE; break; case DMU_OT_PLAIN_FILE_CONTENTS: if (!(flags & ZOR_FLAG_PLAIN_FILE)) match = B_FALSE; break; case DMU_OT_SPACE_MAP: if (!(flags & ZOR_FLAG_SPACE_MAP)) match = B_FALSE; break; default: if (strcmp(zdb_ot_name(obj_type), "zap") == 0) { if (!(flags & ZOR_FLAG_ZAP)) match = B_FALSE; break; } /* * If all bits except some of the supported flags are * set, the user combined the all-types flag (A) with * a negated flag to exclude some types (e.g. A-f to * show all object types except plain files). 
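* The check below therefore keeps the match only when every bit of ZOR_FLAG_ALL_TYPES outside ZOR_SUPPORTED_FLAGS is already set in flags, i.e. when the all-types flag supplied them.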
*/ if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES) match = B_FALSE; break; } return (match); } static void dump_object(objset_t *os, uint64_t object, int verbosity, boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags) { dmu_buf_t *db = NULL; dmu_object_info_t doi; dnode_t *dn; boolean_t dnode_held = B_FALSE; void *bonus = NULL; size_t bsize = 0; char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; char bonus_size[32]; char aux[50]; int error; /* make sure nicenum has enough space */ _Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated"); _Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated"); _Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated"); _Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated"); _Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ, "bonus_size truncated"); if (*print_header) { (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", "lsize", "%full", "type"); *print_header = 0; } if (object == 0) { dn = DMU_META_DNODE(os); dmu_object_info_from_dnode(dn, &doi); } else { /* * Encrypted datasets will have sensitive bonus buffers * encrypted. Therefore we cannot hold the bonus buffer and * must hold the dnode itself instead. */ error = dmu_object_info(os, object, &doi); if (error) fatal("dmu_object_info() failed, errno %u", error); if (!key_loaded && os->os_encrypted && DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) { error = dnode_hold(os, object, FTAG, &dn); if (error) fatal("dnode_hold() failed, errno %u", error); dnode_held = B_TRUE; } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); bonus = db->db_data; bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } } /* * Default to showing all object types if no flags were specified. */ if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES && !match_object_type(doi.doi_type, flags)) goto out; if (dnode_slots_used) *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); (void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? 
DNODES_PER_BLOCK : 1) / doi.doi_max_offset); aux[0] = '\0'; if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); } if (doi.doi_compress == ZIO_COMPRESS_INHERIT && ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) { const char *compname = NULL; if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel), &compname) == 0) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s)", compname); } else { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s-unknown)", ZDB_COMPRESS_NAME(os->os_compress)); } } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress)); } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); } (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", (u_longlong_t)object, doi.doi_indirection, iblk, dblk, asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", "", "", "", "", "", "", bonus_size, "bonus", zdb_ot_name(doi.doi_bonus_type)); } if (verbosity >= 4) { (void) printf("\tdnode flags: %s%s%s%s\n", (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? "USED_BYTES " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? "USERUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? "USEROBJUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? "SPILL_BLKPTR" : ""); (void) printf("\tdnode maxblkid: %llu\n", (longlong_t)dn->dn_phys->dn_maxblkid); if (!dnode_held) { object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, bonus, bsize); } else { (void) printf("\t\t(bonus encrypted)\n"); } if (key_loaded || (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) { object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); } else { (void) printf("\t\t(object encrypted)\n"); } *print_header = B_TRUE; } if (verbosity >= 5) { if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE); (void) printf("\nSpill block: %s\n", blkbuf); } dump_indirect(dn); } if (verbosity >= 5) { /* * Report the list of segments that comprise the object. 
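* Each segment is bounded by a pair of dnode_next_offset() calls: the first finds the next allocated offset, the second (DNODE_FIND_HOLE) finds the hole that ends it.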
*/ uint64_t start = 0; uint64_t end; uint64_t blkfill = 1; int minlvl = 1; if (dn->dn_type == DMU_OT_DNODE) { minlvl = 0; blkfill = DNODES_PER_BLOCK; } for (;;) { char segsize[32]; /* make sure nicenum has enough space */ _Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ, "segsize truncated"); error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) break; end = start; error = dnode_next_offset(dn, DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); zdb_nicenum(end - start, segsize, sizeof (segsize)); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start, (u_longlong_t)end, segsize); if (error) break; start = end; } } out: if (db != NULL) dmu_buf_rele(db, FTAG); if (dnode_held) dnode_rele(dn, FTAG); } static void count_dir_mos_objects(dsl_dir_t *dd) { mos_obj_refd(dd->dd_object); mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_clones); /* * The dd_crypto_obj can be referenced by multiple dsl_dir's. * Ignore the references after the first one. */ mos_obj_refd_multiple(dd->dd_crypto_obj); } static void count_ds_mos_objects(dsl_dataset_t *ds) { mos_obj_refd(ds->ds_object); mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); mos_obj_refd(ds->ds_bookmarks_obj); if (!dsl_dataset_is_snapshot(ds)) { count_dir_mos_objects(ds->ds_dir); } } static const char *const objset_types[DMU_OST_NUMTYPES] = { "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; /* * Parse a string denoting a range of object IDs of the form * <start>[:<end>[:flags]], and store the results in zor. * Return 0 on success. On error, return 1 and update the msg * pointer to point to a descriptive error message.
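* For example, "10:1000:A-f" selects objects 10 through 1000 of every type except plain files; flag letters are looked up in flagbits[].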
*/ static int parse_object_range(char *range, zopt_object_range_t *zor, const char **msg) { uint64_t flags = 0; char *p, *s, *dup, *flagstr, *tmp = NULL; size_t len; int i; int rc = 0; if (strchr(range, ':') == NULL) { zor->zor_obj_start = strtoull(range, &p, 0); if (*p != '\0') { *msg = "Invalid characters in object ID"; rc = 1; } zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); zor->zor_obj_end = zor->zor_obj_start; return (rc); } if (strchr(range, ':') == range) { *msg = "Invalid leading colon"; rc = 1; return (rc); } len = strlen(range); if (range[len - 1] == ':') { *msg = "Invalid trailing colon"; rc = 1; return (rc); } dup = strdup(range); s = strtok_r(dup, ":", &tmp); zor->zor_obj_start = strtoull(s, &p, 0); if (*p != '\0') { *msg = "Invalid characters in start object ID"; rc = 1; goto out; } s = strtok_r(NULL, ":", &tmp); zor->zor_obj_end = strtoull(s, &p, 0); if (*p != '\0') { *msg = "Invalid characters in end object ID"; rc = 1; goto out; } if (zor->zor_obj_start > zor->zor_obj_end) { *msg = "Start object ID may not exceed end object ID"; rc = 1; goto out; } s = strtok_r(NULL, ":", &tmp); if (s == NULL) { zor->zor_flags = ZOR_FLAG_ALL_TYPES; goto out; } else if (strtok_r(NULL, ":", &tmp) != NULL) { *msg = "Invalid colon-delimited field after flags"; rc = 1; goto out; } flagstr = s; for (i = 0; flagstr[i]; i++) { int bit; boolean_t negation = (flagstr[i] == '-'); if (negation) { i++; if (flagstr[i] == '\0') { *msg = "Invalid trailing negation operator"; rc = 1; goto out; } } bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { *msg = "Invalid flag"; rc = 1; goto out; } if (negation) flags &= ~bit; else flags |= bit; } zor->zor_flags = flags; zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end); out: free(dup); return (rc); } static void dump_objset(objset_t *os) { dmu_objset_stats_t dds = { 0 }; uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[32]; char blkbuf[BP_SPRINTF_LEN + 20]; char osname[ZFS_MAX_DATASET_NAME_LEN]; const char *type = "UNKNOWN"; int verbosity = dump_opt['d']; boolean_t print_header; unsigned i; int error; uint64_t total_slots_used = 0; uint64_t max_slot_used = 0; uint64_t dnode_slots; uint64_t obj_start; uint64_t obj_end; uint64_t flags; /* make sure nicenum has enough space */ _Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated"); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); print_header = B_TRUE; if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; usedobjs = BP_GET_FILL(os->os_rootbp); refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); if (verbosity >= 4) { (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); (void) snprintf_blkptr(blkbuf + strlen(blkbuf), sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } dmu_objset_name(os, osname); (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " "%s, %llu objects%s%s\n", osname, type, (u_longlong_t)dmu_objset_id(os), (u_longlong_t)dds.dds_creation_txg, numbuf, (u_longlong_t)usedobjs, blkbuf, (dds.dds_inconsistent) ? 
" (inconsistent)" : ""); for (i = 0; i < zopt_object_args; i++) { obj_start = zopt_object_ranges[i].zor_obj_start; obj_end = zopt_object_ranges[i].zor_obj_end; flags = zopt_object_ranges[i].zor_flags; object = obj_start; if (object == 0 || obj_start == obj_end) dump_object(os, object, verbosity, &print_header, NULL, flags); else object--; while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) && object <= obj_end) { dump_object(os, object, verbosity, &print_header, NULL, flags); } } if (zopt_object_args > 0) { (void) printf("\n"); return; } if (dump_opt['i'] != 0 || verbosity >= 2) dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) { dsl_dataset_t *ds = dmu_objset_ds(os); dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && !dmu_objset_is_snapshot(os)) { dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); if (verify_dd_livelist(os) != 0) fatal("livelist is incorrect"); } if (dsl_dataset_remap_deadlist_exists(ds)) { (void) printf("ds_remap_deadlist:\n"); dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); } count_ds_mos_objects(ds); } if (dmu_objset_ds(os) != NULL) dump_bookmarks(os, verbosity); if (verbosity < 2) return; if (BP_IS_HOLE(os->os_rootbp)) return; dump_object(os, 0, verbosity, &print_header, NULL, 0); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, NULL, 0); dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, NULL, 0); } if (DMU_PROJECTUSED_DNODE(os) != NULL && DMU_PROJECTUSED_DNODE(os)->dn_type != 0) dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, &print_header, NULL, 0); object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { dump_object(os, object, verbosity, &print_header, &dnode_slots, 0); object_count++; total_slots_used += dnode_slots; max_slot_used = object + dnode_slots - 1; } (void) printf("\n"); (void) printf(" Dnode slots:\n"); (void) printf("\tTotal used: %10llu\n", (u_longlong_t)total_slots_used); (void) printf("\tMax used: %10llu\n", (u_longlong_t)max_slot_used); (void) printf("\tPercent empty: %10lf\n", (double)(max_slot_used - total_slots_used)*100 / (double)max_slot_used); (void) printf("\n"); if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); } ASSERT3U(object_count, ==, usedobjs); if (leaked_objects != 0) { (void) printf("%d potentially leaked objects detected\n", leaked_objects); leaked_objects = 0; } } static void dump_uberblock(uberblock_t *ub, const char *header, const char *footer) { time_t timestamp = ub->ub_timestamp; (void) printf("%s", header ? 
header : ""); (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, ctime(&timestamp)); (void) printf("\tmmp_magic = %016llx\n", (u_longlong_t)ub->ub_mmp_magic); if (MMP_VALID(ub)) { (void) printf("\tmmp_delay = %0llu\n", (u_longlong_t)ub->ub_mmp_delay); if (MMP_SEQ_VALID(ub)) (void) printf("\tmmp_seq = %u\n", (unsigned int) MMP_SEQ(ub)); if (MMP_FAIL_INT_VALID(ub)) (void) printf("\tmmp_fail = %u\n", (unsigned int) MMP_FAIL_INT(ub)); if (MMP_INTERVAL_VALID(ub)) (void) printf("\tmmp_write = %u\n", (unsigned int) MMP_INTERVAL(ub)); /* After MMP_* to make summarize_uberblock_mmp cleaner */ (void) printf("\tmmp_valid = %x\n", (unsigned int) ub->ub_mmp_config & 0xFF); } if (dump_opt['u'] >= 4) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf("\tcheckpoint_txg = %llu\n", (u_longlong_t)ub->ub_checkpoint_txg); (void) printf("\traidz_reflow state=%u off=%llu\n", (int)RRSS_GET_STATE(ub), (u_longlong_t)RRSS_GET_OFFSET(ub)); (void) printf("%s", footer ? footer : ""); } static void dump_config(spa_t *spa) { dmu_buf_t *db; size_t nvsize = 0; int error = 0; error = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object, FTAG, &db); if (error == 0) { nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); dump_packed_nvlist(spa->spa_meta_objset, spa->spa_config_object, (void *)&nvsize, 1); } else { (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", (u_longlong_t)spa->spa_config_object, error); } } static void dump_cachefile(const char *cachefile) { int fd; struct stat64 statbuf; char *buf; nvlist_t *config; if ((fd = open64(cachefile, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", cachefile, strerror(errno)); zdb_exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", cachefile, strerror(errno)); zdb_exit(1); } if ((buf = malloc(statbuf.st_size)) == NULL) { (void) fprintf(stderr, "failed to allocate %llu bytes\n", (u_longlong_t)statbuf.st_size); zdb_exit(1); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) fprintf(stderr, "failed to read %llu bytes\n", (u_longlong_t)statbuf.st_size); zdb_exit(1); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { (void) fprintf(stderr, "failed to unpack nvlist\n"); zdb_exit(1); } free(buf); dump_nvlist(config, 0); nvlist_free(config); } /* * ZFS label nvlist stats */ typedef struct zdb_nvl_stats { int zns_list_count; int zns_leaf_count; size_t zns_leaf_largest; size_t zns_leaf_total; nvlist_t *zns_string; nvlist_t *zns_uint64; nvlist_t *zns_boolean; } zdb_nvl_stats_t; static void collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats) { nvlist_t *list, **array; nvpair_t *nvp = NULL; const char *name; uint_t i, items; stats->zns_list_count++; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { name = nvpair_name(nvp); switch (nvpair_type(nvp)) { case DATA_TYPE_STRING: fnvlist_add_string(stats->zns_string, name, fnvpair_value_string(nvp)); break; case DATA_TYPE_UINT64: fnvlist_add_uint64(stats->zns_uint64, name, fnvpair_value_uint64(nvp)); break; case DATA_TYPE_BOOLEAN: fnvlist_add_boolean(stats->zns_boolean, name); break; case DATA_TYPE_NVLIST: if 
(nvpair_value_nvlist(nvp, &list) == 0) collect_nvlist_stats(list, stats); break; case DATA_TYPE_NVLIST_ARRAY: if (nvpair_value_nvlist_array(nvp, &array, &items) != 0) break; for (i = 0; i < items; i++) { collect_nvlist_stats(array[i], stats); /* collect stats on leaf vdev */ if (strcmp(name, "children") == 0) { size_t size; (void) nvlist_size(array[i], &size, NV_ENCODE_XDR); stats->zns_leaf_total += size; if (size > stats->zns_leaf_largest) stats->zns_leaf_largest = size; stats->zns_leaf_count++; } } break; default: (void) printf("skip type %d!\n", (int)nvpair_type(nvp)); } } } static void dump_nvlist_stats(nvlist_t *nvl, size_t cap) { zdb_nvl_stats_t stats = { 0 }; size_t size, sum = 0, total; size_t noise; /* requires nvlist with non-unique names for stat collection */ VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0)); VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0)); VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0)); VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR)); (void) printf("\n\nZFS Label NVList Config Stats:\n"); VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR)); (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n", (int)total, (int)(cap - total), 100.0 * total / cap); collect_nvlist_stats(nvl, &stats); VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:", (int)fnvlist_num_pairs(stats.zns_uint64), (int)size, 100.0 * size / total); VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:", (int)fnvlist_num_pairs(stats.zns_string), (int)size, 100.0 * size / total); VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:", (int)fnvlist_num_pairs(stats.zns_boolean), (int)size, 100.0 * size / total); size = total - sum; /* treat remainder as nvlist overhead */ (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:", stats.zns_list_count, (int)size, 100.0 * size / total); if (stats.zns_leaf_count > 0) { size_t average = stats.zns_leaf_total / stats.zns_leaf_count; (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:", stats.zns_leaf_count, (int)average); (void) printf("%24d bytes largest\n", (int)stats.zns_leaf_largest); if (dump_opt['l'] >= 3 && average > 0) (void) printf(" space for %d additional leaf vdevs\n", (int)((cap - total) / average)); } (void) printf("\n"); nvlist_free(stats.zns_string); nvlist_free(stats.zns_uint64); nvlist_free(stats.zns_boolean); } typedef struct cksum_record { zio_cksum_t cksum; boolean_t labels[VDEV_LABELS]; avl_node_t link; } cksum_record_t; static int cksum_record_compare(const void *x1, const void *x2) { const cksum_record_t *l = (cksum_record_t *)x1; const cksum_record_t *r = (cksum_record_t *)x2; int arraysize = ARRAY_SIZE(l->cksum.zc_word); int difference = 0; for (int i = 0; i < arraysize; i++) { difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); if (difference) break; } return (difference); } static cksum_record_t * cksum_record_alloc(zio_cksum_t *cksum, int l) { cksum_record_t *rec; rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); rec->cksum = *cksum; rec->labels[l] = B_TRUE; return (rec); } static cksum_record_t * cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum) { cksum_record_t lookup = { .cksum = *cksum }; avl_index_t where; return (avl_find(tree, &lookup, &where)); } static cksum_record_t * 
cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l) { cksum_record_t *rec; rec = cksum_record_lookup(tree, cksum); if (rec) { rec->labels[l] = B_TRUE; } else { rec = cksum_record_alloc(cksum, l); avl_add(tree, rec); } return (rec); } static int first_label(cksum_record_t *rec) { for (int i = 0; i < VDEV_LABELS; i++) if (rec->labels[i]) return (i); return (-1); } static void print_label_numbers(const char *prefix, const cksum_record_t *rec) { fputs(prefix, stdout); for (int i = 0; i < VDEV_LABELS; i++) if (rec->labels[i] == B_TRUE) printf("%d ", i); putchar('\n'); } #define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) typedef struct zdb_label { vdev_label_t label; uint64_t label_offset; nvlist_t *config_nv; cksum_record_t *config; cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; boolean_t header_printed; boolean_t read_failed; boolean_t cksum_valid; } zdb_label_t; static void print_label_header(zdb_label_t *label, int l) { if (dump_opt['q']) return; if (label->header_printed == B_TRUE) return; (void) printf("------------------------------------\n"); (void) printf("LABEL %d %s\n", l, label->cksum_valid ? "" : "(Bad label cksum)"); (void) printf("------------------------------------\n"); label->header_printed = B_TRUE; } static void print_l2arc_header(void) { (void) printf("------------------------------------\n"); (void) printf("L2ARC device header\n"); (void) printf("------------------------------------\n"); } static void print_l2arc_log_blocks(void) { (void) printf("------------------------------------\n"); (void) printf("L2ARC device log blocks\n"); (void) printf("------------------------------------\n"); } static void dump_l2arc_log_entries(uint64_t log_entries, l2arc_log_ent_phys_t *le, uint64_t i) { for (int j = 0; j < log_entries; j++) { dva_t dva = le[j].le_dva; (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " "vdev: %llu, offset: %llu\n", (u_longlong_t)i, j + 1, (u_longlong_t)DVA_GET_ASIZE(&dva), (u_longlong_t)DVA_GET_VDEV(&dva), (u_longlong_t)DVA_GET_OFFSET(&dva)); (void) printf("|\t\t\t\tbirth: %llu\n", (u_longlong_t)le[j].le_birth); (void) printf("|\t\t\t\tlsize: %llu\n", (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop)); (void) printf("|\t\t\t\tpsize: %llu\n", (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop)); (void) printf("|\t\t\t\tcompr: %llu\n", (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop)); (void) printf("|\t\t\t\tcomplevel: %llu\n", (u_longlong_t)(&le[j])->le_complevel); (void) printf("|\t\t\t\ttype: %llu\n", (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop)); (void) printf("|\t\t\t\tprotected: %llu\n", (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop)); (void) printf("|\t\t\t\tprefetch: %llu\n", (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); (void) printf("|\t\t\t\taddress: %llu\n", (u_longlong_t)le[j].le_daddr); (void) printf("|\t\t\t\tARC state: %llu\n", (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop)); (void) printf("|\n"); } (void) printf("\n"); } static void dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps) { (void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr); (void) printf("|\t\tpayload_asize: %llu\n", (u_longlong_t)lbps->lbp_payload_asize); (void) printf("|\t\tpayload_start: %llu\n", (u_longlong_t)lbps->lbp_payload_start); (void) printf("|\t\tlsize: %llu\n", (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop)); (void) printf("|\t\tasize: %llu\n", (u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop)); (void) printf("|\t\tcompralgo: %llu\n", (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop)); (void) 
printf("|\t\tcksumalgo: %llu\n", (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop)); (void) printf("|\n\n"); } static void dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr, l2arc_dev_hdr_phys_t *rebuild) { l2arc_log_blk_phys_t this_lb; uint64_t asize; l2arc_log_blkptr_t lbps[2]; zio_cksum_t cksum; int failed = 0; l2arc_dev_t dev; if (!dump_opt['q']) print_l2arc_log_blocks(); memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); dev.l2ad_evict = l2dhdr->dh_evict; dev.l2ad_start = l2dhdr->dh_start; dev.l2ad_end = l2dhdr->dh_end; if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) { /* no log blocks to read */ if (!dump_opt['q']) { (void) printf("No log blocks to read\n"); (void) printf("\n"); } return; } else { dev.l2ad_hand = lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); } dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); for (;;) { if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) break; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { if (!dump_opt['q']) { (void) printf("Error while reading next log " "block\n\n"); } break; } fletcher_4_native_varsize(&this_lb, asize, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { failed++; if (!dump_opt['q']) { (void) printf("Invalid cksum\n"); dump_l2arc_log_blkptr(&lbps[0]); } break; } switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { case ZIO_COMPRESS_OFF: break; default: { abd_t *abd = abd_alloc_linear(asize, B_TRUE); abd_copy_from_buf_off(abd, &this_lb, 0, asize); abd_t dabd; abd_get_from_buf_struct(&dabd, &this_lb, sizeof (this_lb)); int err = zio_decompress_data(L2BLK_GET_COMPRESS( (&lbps[0])->lbp_prop), abd, &dabd, asize, sizeof (this_lb), NULL); abd_free(&dabd); abd_free(abd); if (err != 0) { (void) printf("L2ARC block decompression " "failed\n"); goto out; } break; } } if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(&this_lb, sizeof (this_lb)); if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { if (!dump_opt['q']) (void) printf("Invalid log block magic\n\n"); break; } rebuild->dh_lb_count++; rebuild->dh_lb_asize += asize; if (dump_opt['l'] > 1 && !dump_opt['q']) { (void) printf("lb[%4llu]\tmagic: %llu\n", (u_longlong_t)rebuild->dh_lb_count, (u_longlong_t)this_lb.lb_magic); dump_l2arc_log_blkptr(&lbps[0]); } if (dump_opt['l'] > 2 && !dump_opt['q']) dump_l2arc_log_entries(l2dhdr->dh_log_entries, this_lb.lb_entries, rebuild->dh_lb_count); if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev.l2ad_evict) && !dev.l2ad_first) break; lbps[0] = lbps[1]; lbps[1] = this_lb.lb_prev_lbp; } out: if (!dump_opt['q']) { (void) printf("log_blk_count:\t %llu with valid cksum\n", (u_longlong_t)rebuild->dh_lb_count); (void) printf("\t\t %d with invalid cksum\n", failed); (void) printf("log_blk_asize:\t %llu\n\n", (u_longlong_t)rebuild->dh_lb_asize); } } static int dump_l2arc_header(int fd) { l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0}; int error = B_FALSE; if (pread64(fd, &l2dhdr, sizeof (l2dhdr), VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { error = B_TRUE; } else { if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr)); if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC) error = B_TRUE; } if (error) { (void) printf("L2ARC device header not found\n\n"); /* Do not return an error here for backward compatibility */ return (0); } else if (!dump_opt['q']) { print_l2arc_header(); (void) printf(" magic: 
%llu\n", (u_longlong_t)l2dhdr.dh_magic); (void) printf(" version: %llu\n", (u_longlong_t)l2dhdr.dh_version); (void) printf(" pool_guid: %llu\n", (u_longlong_t)l2dhdr.dh_spa_guid); (void) printf(" flags: %llu\n", (u_longlong_t)l2dhdr.dh_flags); (void) printf(" start_lbps[0]: %llu\n", (u_longlong_t) l2dhdr.dh_start_lbps[0].lbp_daddr); (void) printf(" start_lbps[1]: %llu\n", (u_longlong_t) l2dhdr.dh_start_lbps[1].lbp_daddr); (void) printf(" log_blk_ent: %llu\n", (u_longlong_t)l2dhdr.dh_log_entries); (void) printf(" start: %llu\n", (u_longlong_t)l2dhdr.dh_start); (void) printf(" end: %llu\n", (u_longlong_t)l2dhdr.dh_end); (void) printf(" evict: %llu\n", (u_longlong_t)l2dhdr.dh_evict); (void) printf(" lb_asize_refcount: %llu\n", (u_longlong_t)l2dhdr.dh_lb_asize); (void) printf(" lb_count_refcount: %llu\n", (u_longlong_t)l2dhdr.dh_lb_count); (void) printf(" trim_action_time: %llu\n", (u_longlong_t)l2dhdr.dh_trim_action_time); (void) printf(" trim_state: %llu\n\n", (u_longlong_t)l2dhdr.dh_trim_state); } dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild); /* * The total aligned size of log blocks and the number of log blocks * reported in the header of the device may be less than what zdb * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). * This happens because dump_l2arc_log_blocks() lacks the memory * pressure valve that l2arc_rebuild() has. Thus, if we are on a system * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize * and dh_lb_count will be lower to begin with than what exists on the * device. This is normal and zdb should not exit with an error. The * opposite case should never happen though, the values reported in the * header should never be higher than what dump_l2arc_log_blocks() and * l2arc_rebuild() report. If this happens there is a leak in the * accounting of log blocks. */ if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || l2dhdr.dh_lb_count > rebuild.dh_lb_count) return (1); return (0); } static void dump_config_from_label(zdb_label_t *label, size_t buflen, int l) { if (dump_opt['q']) return; if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) return; print_label_header(label, l); dump_nvlist(label->config_nv, 4); print_label_numbers(" labels = ", label->config); if (dump_opt['l'] >= 2) dump_nvlist_stats(label->config_nv, buflen); } #define ZDB_MAX_UB_HEADER_SIZE 32 static void dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) { vdev_t vd; char header[ZDB_MAX_UB_HEADER_SIZE]; vd.vdev_ashift = ashift; vd.vdev_top = &vd; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); uberblock_t *ub = (void *)((char *)&label->label + uoff); cksum_record_t *rec = label->uberblocks[i]; if (rec == NULL) { if (dump_opt['u'] >= 2) { print_label_header(label, label_num); (void) printf(" Uberblock[%d] invalid\n", i); } continue; } if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) continue; if ((dump_opt['u'] < 4) && (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) continue; print_label_header(label, label_num); (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, " Uberblock[%d]\n", i); dump_uberblock(ub, header, ""); print_label_numbers(" labels = ", rec); } } static char curpath[PATH_MAX]; /* * Iterate through the path components, recursively passing * current one's obj and remaining path until we find the obj * for the last one. 
*/ static int dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj) { int err; boolean_t header = B_TRUE; uint64_t child_obj; char *s; dmu_buf_t *db; dmu_object_info_t doi; if ((s = strchr(name, '/')) != NULL) *s = '\0'; err = zap_lookup(os, obj, name, 8, 1, &child_obj); (void) strlcat(curpath, name, sizeof (curpath)); if (err != 0) { (void) fprintf(stderr, "failed to lookup %s: %s\n", curpath, strerror(err)); return (err); } child_obj = ZFS_DIRENT_OBJ(child_obj); err = sa_buf_hold(os, child_obj, FTAG, &db); if (err != 0) { (void) fprintf(stderr, "failed to get SA dbuf for obj %llu: %s\n", (u_longlong_t)child_obj, strerror(err)); return (EINVAL); } dmu_object_info_from_db(db, &doi); sa_buf_rele(db, FTAG); if (doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) { (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", doi.doi_bonus_type, (u_longlong_t)child_obj); return (EINVAL); } if (dump_opt['v'] > 6) { (void) printf("obj=%llu %s type=%d bonustype=%d\n", (u_longlong_t)child_obj, curpath, doi.doi_type, doi.doi_bonus_type); } (void) strlcat(curpath, "/", sizeof (curpath)); switch (doi.doi_type) { case DMU_OT_DIRECTORY_CONTENTS: if (s != NULL && *(s + 1) != '\0') return (dump_path_impl(os, child_obj, s + 1, retobj)); zfs_fallthrough; case DMU_OT_PLAIN_FILE_CONTENTS: if (retobj != NULL) { *retobj = child_obj; } else { dump_object(os, child_obj, dump_opt['v'], &header, NULL, 0); } return (0); default: (void) fprintf(stderr, "object %llu has non-file/directory " "type %d\n", (u_longlong_t)obj, doi.doi_type); break; } return (EINVAL); } /* * Dump the blocks for the object specified by path inside the dataset. */ static int dump_path(char *ds, char *path, uint64_t *retobj) { int err; objset_t *os; uint64_t root_obj; err = open_objset(ds, FTAG, &os); if (err != 0) return (err); err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); if (err != 0) { (void) fprintf(stderr, "can't lookup root znode: %s\n", strerror(err)); close_objset(os, FTAG); return (EINVAL); } (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); err = dump_path_impl(os, root_obj, path, retobj); close_objset(os, FTAG); return (err); } static int dump_backup_bytes(objset_t *os, void *buf, int len, void *arg) { const char *p = (const char *)buf; ssize_t nwritten; (void) os; (void) arg; /* Write the data out, handling short writes and signals. 
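 * write(2) may transfer fewer bytes than requested or fail with
 * EINTR; in either case we retry, advancing the buffer only by what
 * was actually written.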
*/ while ((nwritten = write(STDOUT_FILENO, p, len)) < len) { if (nwritten < 0) { if (errno == EINTR) continue; return (errno); } p += nwritten; len -= nwritten; } return (0); } static void dump_backup(const char *pool, uint64_t objset_id, const char *flagstr) { boolean_t embed = B_FALSE; boolean_t large_block = B_FALSE; boolean_t compress = B_FALSE; boolean_t raw = B_FALSE; const char *c; for (c = flagstr; c != NULL && *c != '\0'; c++) { switch (*c) { case 'e': embed = B_TRUE; break; case 'L': large_block = B_TRUE; break; case 'c': compress = B_TRUE; break; case 'w': raw = B_TRUE; break; default: fprintf(stderr, "dump_backup: invalid flag " "'%c'\n", *c); return; } } if (isatty(STDOUT_FILENO)) { fprintf(stderr, "dump_backup: stream cannot be written " "to a terminal\n"); return; } offset_t off = 0; dmu_send_outparams_t out = { .dso_outfunc = dump_backup_bytes, .dso_dryrun = B_FALSE, }; int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed, large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO, &off, &out); if (err != 0) { fprintf(stderr, "dump_backup: dmu_send_obj: %s\n", strerror(err)); return; } } static int zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) { int err = 0; uint64_t size, readsize, oursize, offset; ssize_t writesize; sa_handle_t *hdl; (void) printf("Copying object %" PRIu64 " to file %s\n", srcobj, destfile); VERIFY3P(os, ==, sa_os); if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) { (void) printf("Failed to get handle for SA znode\n"); return (err); } if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) { (void) sa_handle_destroy(hdl); return (err); } (void) sa_handle_destroy(hdl); (void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj, size); if (size == 0) { return (EINVAL); } int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd == -1) return (errno); /* * We cap the size at 1 mebibyte here to prevent * allocation failures and nigh-infinite printing if the * object is extremely large. 
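 * For example, a multi-gigabyte object is still copied in full, just
 * through a single reusable 1 MiB buffer, one dmu_read() chunk at a
 * time.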
*/ oursize = MIN(size, 1 << 20); offset = 0; char *buf = kmem_alloc(oursize, KM_NOSLEEP); if (buf == NULL) { (void) close(fd); return (ENOMEM); } while (offset < size) { readsize = MIN(size - offset, 1 << 20); err = dmu_read(os, srcobj, offset, readsize, buf, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(buf, oursize); (void) close(fd); return (err); } if (dump_opt['v'] > 3) { (void) printf("Read offset=%" PRIu64 " size=%" PRIu64 " error=%d\n", offset, readsize, err); } writesize = write(fd, buf, readsize); if (writesize < 0) { err = errno; break; } else if (writesize != readsize) { /* Incomplete write */ (void) fprintf(stderr, "Short write, only wrote %llu of" " %" PRIu64 " bytes, exiting...\n", (u_longlong_t)writesize, readsize); break; } offset += readsize; } (void) close(fd); if (buf != NULL) kmem_free(buf, oursize); return (err); } static boolean_t label_cksum_valid(vdev_label_t *label, uint64_t offset) { zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; zio_cksum_t expected_cksum; zio_cksum_t actual_cksum; zio_cksum_t verifier; zio_eck_t *eck; int byteswap; void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys); eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1; offset += offsetof(vdev_label_t, vl_vdev_phys); ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE); ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum); abd_free(abd); if (byteswap) byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (B_TRUE); return (B_FALSE); } static int dump_label(const char *dev) { char path[MAXPATHLEN]; zdb_label_t labels[VDEV_LABELS] = {{{{0}}}}; uint64_t psize, ashift, l2cache; struct stat64 statbuf; boolean_t config_found = B_FALSE; boolean_t error = B_FALSE; boolean_t read_l2arc_header = B_FALSE; avl_tree_t config_tree; avl_tree_t uberblock_tree; void *node, *cookie; int fd; /* * Check if we were given absolute path and use it as is. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. */ (void) strlcpy(path, dev, sizeof (path)); if (dev[0] != '/' && stat64(path, &statbuf) != 0) { int error; error = zfs_resolve_shortname(dev, path, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(path)) { if (zfs_append_partition(path, MAXPATHLEN) == -1) error = ENOENT; } if (error || (stat64(path, &statbuf) != 0)) { (void) printf("failed to find device %s, try " "specifying absolute path instead\n", dev); return (1); } } if ((fd = open64(path, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", path, strerror(errno)); zdb_exit(1); } if (fstat64_blk(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", path, strerror(errno)); (void) close(fd); zdb_exit(1); } if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) (void) printf("failed to invalidate cache '%s' : %s\n", path, strerror(errno)); avl_create(&config_tree, cksum_record_compare, sizeof (cksum_record_t), offsetof(cksum_record_t, link)); avl_create(&uberblock_tree, cksum_record_compare, sizeof (cksum_record_t), offsetof(cksum_record_t, link)); psize = statbuf.st_size; psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t); ashift = SPA_MINBLOCKSHIFT; /* * 1. Read the label from disk * 2. 
Verify label cksum * 3. Unpack the configuration and insert in config tree. * 4. Traverse all uberblocks and insert in uberblock tree. */ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; char *buf = label->label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); nvlist_t *config; cksum_record_t *rec; zio_cksum_t cksum; vdev_t vd; label->label_offset = vdev_label_offset(psize, l, 0); if (pread64(fd, &label->label, sizeof (label->label), label->label_offset) != sizeof (label->label)) { if (!dump_opt['q']) (void) printf("failed to read label %d\n", l); label->read_failed = B_TRUE; error = B_TRUE; continue; } label->read_failed = B_FALSE; label->cksum_valid = label_cksum_valid(&label->label, label->label_offset); if (nvlist_unpack(buf, buflen, &config, 0) == 0) { nvlist_t *vdev_tree = NULL; size_t size; if ((nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) ashift = SPA_MINBLOCKSHIFT; if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) size = buflen; /* If the device is a cache device read the header. */ if (!read_l2arc_header) { if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && l2cache == POOL_STATE_L2CACHE) { read_l2arc_header = B_TRUE; } } fletcher_4_native_varsize(buf, size, &cksum); rec = cksum_record_insert(&config_tree, &cksum, l); label->config = rec; label->config_nv = config; config_found = B_TRUE; } else { error = B_TRUE; } vd.vdev_ashift = ashift; vd.vdev_top = &vd; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); uberblock_t *ub = (void *)((char *)label + uoff); if (uberblock_verify(ub)) continue; fletcher_4_native_varsize(ub, sizeof (*ub), &cksum); rec = cksum_record_insert(&uberblock_tree, &cksum, l); label->uberblocks[i] = rec; } } /* * Dump the label and uberblocks. */ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); if (label->read_failed == B_TRUE) continue; if (label->config_nv) { dump_config_from_label(label, buflen, l); } else { if (!dump_opt['q']) (void) printf("failed to unpack label %d\n", l); } if (dump_opt['u']) dump_label_uberblocks(label, ashift, l); nvlist_free(label->config_nv); } /* * Dump the L2ARC header, if existent. */ if (read_l2arc_header) error |= dump_l2arc_header(fd); cookie = NULL; while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); cookie = NULL; while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); avl_destroy(&config_tree); avl_destroy(&uberblock_tree); (void) close(fd); return (config_found == B_FALSE ? 2 : (error == B_TRUE ? 
1 : 0)); } static uint64_t dataset_feature_count[SPA_FEATURES]; static uint64_t global_feature_count[SPA_FEATURES]; static uint64_t remap_deadlist_count = 0; static int dump_one_objset(const char *dsname, void *arg) { (void) arg; int error; objset_t *os; spa_feature_t f; error = open_objset(dsname, FTAG, &os); if (error != 0) return (0); for (f = 0; f < SPA_FEATURES; f++) { if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) continue; ASSERT(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); dataset_feature_count[f]++; } if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { remap_deadlist_count++; } for (dsl_bookmark_node_t *dbn = avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); if (dbn->dbn_phys.zbm_redaction_obj != 0) { global_feature_count[ SPA_FEATURE_REDACTION_BOOKMARKS]++; objset_t *mos = os->os_spa->spa_meta_objset; dnode_t *rl; VERIFY0(dnode_hold(mos, dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl)); if (rl->dn_have_spill) { global_feature_count[ SPA_FEATURE_REDACTION_LIST_SPILL]++; } } if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; } if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && !dmu_objset_is_snapshot(os)) { global_feature_count[SPA_FEATURE_LIVELIST]++; } dump_objset(os); close_objset(os, FTAG); fuid_table_destroy(); return (0); } /* * Block statistics. */ #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { uint64_t zb_asize; uint64_t zb_lsize; uint64_t zb_psize; uint64_t zb_count; uint64_t zb_gangs; uint64_t zb_ditto_samevdev; uint64_t zb_ditto_same_ms; uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; /* * Extended object types to report deferred frees and dedup auto-ditto blocks. 
*/ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static const char *zdb_ot_extname[] = { "deferred free", "dedup ditto", "other", "Total", }; #define ZB_TOTAL DN_MAX_LEVELS #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) typedef struct zdb_brt_entry { dva_t zbre_dva; uint64_t zbre_refcount; avl_node_t zbre_node; } zdb_brt_entry_t; typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; uint64_t zcb_clone_asize; uint64_t zcb_clone_blocks; uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; uint64_t zcb_psize_len[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_len[SPA_MAX_FOR_16M]; uint64_t zcb_asize_len[SPA_MAX_FOR_16M]; uint64_t zcb_psize_total; uint64_t zcb_lsize_total; uint64_t zcb_asize_total; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] [BPE_PAYLOAD_SIZE + 1]; uint64_t zcb_start; hrtime_t zcb_lastprint; uint64_t zcb_totalasize; uint64_t zcb_errors[256]; int zcb_readfails; int zcb_haderrors; spa_t *zcb_spa; uint32_t **zcb_vd_obsolete_counts; avl_tree_t zcb_brt; boolean_t zcb_brt_is_active; } zdb_cb_t; /* test if two DVA offsets from same vdev are within the same metaslab */ static boolean_t same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) { vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t ms_shift = vd->vdev_ms_shift; return ((off1 >> ms_shift) == (off2 >> ms_shift)); } /* * Used to simplify reporting of the histogram data. */ typedef struct one_histo { const char *name; uint64_t *count; uint64_t *len; uint64_t cumulative; } one_histo_t; /* * The number of separate histograms processed for psize, lsize and asize. */ #define NUM_HISTO 3 /* * This routine will create a fixed column size output of three different * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M * the count, length and cumulative length of the psize, lsize and * asize blocks. * * All three types of blocks are listed on a single line * * By default the table is printed in nicenumber format (e.g. 123K) but * if the '-P' parameter is specified then the full raw number (parseable) * is printed out. */ static void dump_size_histograms(zdb_cb_t *zcb) { /* * A temporary buffer that allows us to convert a number into * a string using zdb_nicenumber to allow either raw or human * readable numbers to be output. */ char numbuf[32]; /* * Define titles which are used in the headers of the tables * printed by this routine. */ const char blocksize_title1[] = "block"; const char blocksize_title2[] = "size"; const char count_title[] = "Count"; const char length_title[] = "Size"; const char cumulative_title[] = "Cum."; /* * Setup the histogram arrays (psize, lsize, and asize). 
*/ one_histo_t parm_histo[NUM_HISTO]; parm_histo[0].name = "psize"; parm_histo[0].count = zcb->zcb_psize_count; parm_histo[0].len = zcb->zcb_psize_len; parm_histo[0].cumulative = 0; parm_histo[1].name = "lsize"; parm_histo[1].count = zcb->zcb_lsize_count; parm_histo[1].len = zcb->zcb_lsize_len; parm_histo[1].cumulative = 0; parm_histo[2].name = "asize"; parm_histo[2].count = zcb->zcb_asize_count; parm_histo[2].len = zcb->zcb_asize_len; parm_histo[2].cumulative = 0; (void) printf("\nBlock Size Histogram\n"); /* * Print the first line titles */ if (dump_opt['P']) (void) printf("\n%s\t", blocksize_title1); else (void) printf("\n%7s ", blocksize_title1); for (int j = 0; j < NUM_HISTO; j++) { if (dump_opt['P']) { if (j < NUM_HISTO - 1) { (void) printf("%s\t\t\t", parm_histo[j].name); } else { /* Don't print trailing spaces */ (void) printf(" %s", parm_histo[j].name); } } else { if (j < NUM_HISTO - 1) { /* Left aligned strings in the output */ (void) printf("%-7s ", parm_histo[j].name); } else { /* Don't print trailing spaces */ (void) printf("%s", parm_histo[j].name); } } } (void) printf("\n"); /* * Print the second line titles */ if (dump_opt['P']) { (void) printf("%s\t", blocksize_title2); } else { (void) printf("%7s ", blocksize_title2); } for (int i = 0; i < NUM_HISTO; i++) { if (dump_opt['P']) { (void) printf("%s\t%s\t%s\t", count_title, length_title, cumulative_title); } else { (void) printf("%7s%7s%7s", count_title, length_title, cumulative_title); } } (void) printf("\n"); /* * Print the rows */ for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) { /* * Print the first column showing the blocksize */ zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf)); if (dump_opt['P']) { printf("%s", numbuf); } else { printf("%7s:", numbuf); } /* * Print the remaining set of 3 columns per size: * for psize, lsize and asize */ for (int j = 0; j < NUM_HISTO; j++) { parm_histo[j].cumulative += parm_histo[j].len[i]; zdb_nicenum(parm_histo[j].count[i], numbuf, sizeof (numbuf)); if (dump_opt['P']) (void) printf("\t%s", numbuf); else (void) printf("%7s", numbuf); zdb_nicenum(parm_histo[j].len[i], numbuf, sizeof (numbuf)); if (dump_opt['P']) (void) printf("\t%s", numbuf); else (void) printf("%7s", numbuf); zdb_nicenum(parm_histo[j].cumulative, numbuf, sizeof (numbuf)); if (dump_opt['P']) (void) printf("\t%s", numbuf); else (void) printf("%7s", numbuf); } (void) printf("\n"); } } static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { int i; ASSERT(type < ZDB_OT_TOTAL); if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; /* * This flag controls if we will issue a claim for the block while * counting it, to ensure that all blocks are referenced in space maps. * We don't issue claims if we're not doing leak tracking, because it's * expensive if the user isn't interested. We also don't claim the * second or later occurences of cloned or dedup'd blocks, because we * already claimed them the first time. */ boolean_t do_claim = !dump_opt['L']; spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); blkptr_t tempbp; if (BP_GET_DEDUP(bp)) { /* * Dedup'd blocks are special. We need to count them, so we can * later uncount them when reporting leaked space, and we must * only claim them once. * * We use the existing dedup system to track what we've seen. * The first time we see a block, we do a ddt_lookup() to see * if it exists in the DDT. If we're doing leak tracking, we * claim the block at this time. 
* * Each time we see a block, we reduce the refcount in the * entry by one, and add to the size and count of dedup'd * blocks to report at the end. */ ddt_t *ddt = ddt_select(zcb->zcb_spa, bp); ddt_enter(ddt); /* * Find the block. This will create the entry in memory, but * we'll know if that happened by its refcount. */ ddt_entry_t *dde = ddt_lookup(ddt, bp); /* - * ddt_lookup() can only return NULL if this block didn't exist + * ddt_lookup() can return NULL if this block didn't exist * in the DDT and creating it would take the DDT over its * quota. Since we got the block from disk, it must exist in - * the DDT, so this can't happen. + * the DDT, so this can't happen. However, when unique entries + * are pruned, the dedup bit can be set with no corresponding + * entry in the DDT. */ - VERIFY3P(dde, !=, NULL); + if (dde == NULL) { + ddt_exit(ddt); + goto skipped; + } /* Get the phys for this variant */ ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); /* * This entry may have multiple sets of DVAs. We must claim * each set the first time we see them in a real block on disk, * or count them on subsequent occurences. We don't have a * convenient way to track the first time we see each variant, * so we repurpose dde_io as a set of "seen" flag bits. We can * do this safely in zdb because it never writes, so it will * never have a writing zio for this block in that pointer. */ boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v)); if (!seen) dde->dde_io = (void *)(((uintptr_t)dde->dde_io) | (1 << v)); /* Consume a reference for this block. */ - VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0); - ddt_phys_decref(dde->dde_phys, v); + if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0) + ddt_phys_decref(dde->dde_phys, v); /* * If this entry has a single flat phys, it may have been * extended with additional DVAs at some time in its life. * This block might be from before it was fully extended, and * so have fewer DVAs. * * If this is the first time we've seen this block, and we * claimed it as-is, then we would miss the claim on some * number of DVAs, which would then be seen as leaked. * * In all cases, if we've had fewer DVAs, then the asize would * be too small, and would lead to the pool apparently using * more space than allocated. * * To handle this, we copy the canonical set of DVAs from the * entry back to the block pointer before we claim it. */ if (v == DDT_PHYS_FLAT) { ASSERT3U(BP_GET_BIRTH(bp), ==, ddt_phys_birth(dde->dde_phys, v)); tempbp = *bp; ddt_bp_fill(dde->dde_phys, v, &tempbp, BP_GET_BIRTH(bp)); bp = &tempbp; } if (seen) { /* * The second or later time we see this block, * it's a duplicate and we count it. */ zcb->zcb_dedup_asize += BP_GET_ASIZE(bp); zcb->zcb_dedup_blocks++; /* Already claimed, don't do it again. */ do_claim = B_FALSE; } ddt_exit(ddt); } else if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { /* * Cloned blocks are special. We need to count them, so we can * later uncount them when reporting leaked space, and we must * only claim them once. * * To do this, we keep our own in-memory BRT. For each block * we haven't seen before, we look it up in the real BRT and * if its there, we note it and its refcount then proceed as * normal. If we see the block again, we count it as a clone * and then give it no further consideration. 
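 * For example, if the BRT reports a refcount of 3 for a block, zdb
 * expects to encounter its block pointer three more times after the
 * first sighting; each later sighting adds the block's asize to
 * zcb_clone_asize and consumes one reference, and the in-memory entry
 * is dropped once the count reaches zero.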
*/ zdb_brt_entry_t zbre_search, *zbre; avl_index_t where; zbre_search.zbre_dva = bp->blk_dva[0]; zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); if (zbre == NULL) { /* Not seen before; track it */ uint64_t refcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); if (refcnt > 0) { zbre = umem_zalloc(sizeof (zdb_brt_entry_t), UMEM_NOFAIL); zbre->zbre_dva = bp->blk_dva[0]; zbre->zbre_refcount = refcnt; avl_insert(&zcb->zcb_brt, zbre, where); } } else { /* * Second or later occurrence, count it and take a * refcount. */ zcb->zcb_clone_asize += BP_GET_ASIZE(bp); zcb->zcb_clone_blocks++; zbre->zbre_refcount--; if (zbre->zbre_refcount == 0) { avl_remove(&zcb->zcb_brt, zbre); umem_free(zbre, sizeof (zdb_brt_entry_t)); } /* Already claimed, don't do it again. */ do_claim = B_FALSE; } } +skipped: for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; int equal; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_count++; /* * The histogram is only big enough to record blocks up to * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, * "other", bucket. */ unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); zb->zb_psize_histogram[idx]++; zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) { zb->zb_ditto_samevdev++; if (same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[1]))) zb->zb_ditto_same_ms++; } break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal != 0) { zb->zb_ditto_samevdev++; if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[1]))) zb->zb_ditto_same_ms++; else if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[2]))) zb->zb_ditto_same_ms++; else if (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[1]), DVA_GET_OFFSET(&bp->blk_dva[1]), DVA_GET_OFFSET(&bp->blk_dva[2]))) zb->zb_ditto_same_ms++; } break; } } spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); if (BP_IS_EMBEDDED(bp)) { zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] [BPE_GET_PSIZE(bp)]++; return; } /* * The binning histogram bins by powers of two up to * SPA_MAXBLOCKSIZE rather than creating bins for * every possible blocksize found in the pool. 
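 * For example, a 12K psize is counted in the 8K bucket:
 * highbit64(12288) - 1 == 13, and 1 << 13 is 8K.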
*/ int bin = highbit64(BP_GET_PSIZE(bp)) - 1; zcb->zcb_psize_count[bin]++; zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); zcb->zcb_psize_total += BP_GET_PSIZE(bp); bin = highbit64(BP_GET_LSIZE(bp)) - 1; zcb->zcb_lsize_count[bin]++; zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); zcb->zcb_lsize_total += BP_GET_LSIZE(bp); bin = highbit64(BP_GET_ASIZE(bp)) - 1; zcb->zcb_asize_count[bin]++; zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); if (!do_claim) return; VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa, spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL))); } static void zdb_blkptr_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; int ioerr = zio->io_error; zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { char blkbuf[BP_SPRINTF_LEN]; zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); else blkbuf[0] = '\0'; (void) printf("zdb_blkptr_cb: " "Got error %d reading " "<%llu, %llu, %lld, %llx> %s -- skipping\n", ioerr, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, blkbuf); } mutex_exit(&spa->spa_scrub_lock); abd_free(zio->io_abd); } static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; dmu_object_type_t type; boolean_t is_metadata; if (zb->zb_level == ZB_DNODE_LEVEL) return (0); if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("objset %llu object %llu " "level %lld offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (u_longlong_t)blkid2offset(dnp, bp, zb), blkbuf); } if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return (0); type = BP_GET_TYPE(bp); zdb_count_block(zcb, zilog, bp, (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. 
*/ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_bytes > max_inflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; /* only call gethrtime() every 100 blocks */ static int iters; if (++iters > 100) iters = 0; else return (0); if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); char buf[10]; uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; uint64_t kb_per_sec = 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); uint64_t sec_remaining = (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; /* make sure nicenum has enough space */ _Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated"); zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "\r%5s completed (%4"PRIu64"MB/s) " "estimated time remaining: " "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec ", buf, kb_per_sec / 1024, sec_remaining / 60 / 60, sec_remaining / 60 % 60, sec_remaining % 60); zcb->zcb_lastprint = now; } return (0); } static void zdb_leak(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); } static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; static int load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { spa_vdev_removal_t *svr = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; /* skip vdevs we don't care about */ if (sme->sme_vdev != svr->svr_vdev_id) return (0); vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (sme->sme_type == SM_ALLOC) range_tree_add(svr->svr_allocd_segs, offset, size); else range_tree_remove(svr->svr_allocd_segs, offset, size); return (0); } static void claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner_offset, (void) arg; /* * This callback was called through a remap from * a device being removed. Therefore, the vdev that * this callback is applied to is a concrete * vdev. */ ASSERT(vdev_is_concrete(vd)); VERIFY0(metaslab_claim_impl(vd, offset, size, spa_min_claim_txg(vd->vdev_spa))); } static void claim_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; vdev_indirect_ops.vdev_op_remap(vd, offset, size, claim_segment_impl_cb, NULL); } /* * After accounting for all allocated blocks that are directly referenced, * we might have missed a reference to a block from a partially complete * (and thus unused) indirect mapping object. We perform a secondary pass * through the metaslabs we have already mapped and claim the destination * blocks. 
*/ static void zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) { if (dump_opt['L']) return; if (spa->spa_vdev_removal == NULL) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT0(range_tree_space(svr->svr_allocd_segs)); range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; ASSERT0(range_tree_space(allocs)); if (msp->ms_sm != NULL) VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); } range_tree_destroy(allocs); iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); /* * Clear everything past what has been synced, * because we have not allocated mappings for * it yet. */ range_tree_clear(svr->svr_allocd_segs, vdev_indirect_mapping_max_offset(vim), vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); spa_config_exit(spa, SCL_CONFIG, FTAG); } static int increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { (void) tx; zdb_cb_t *zcb = arg; spa_t *spa = zcb->zcb_spa; vdev_t *vd; const dva_t *dva = &bp->blk_dva[0]; ASSERT(!bp_freed); ASSERT(!dump_opt['L']); ASSERT3U(BP_GET_NDVAS(bp), ==, 1); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); vdev_indirect_mapping_increment_obsolete_count( vd->vdev_indirect_mapping, DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), zcb->zcb_vd_obsolete_counts[vd->vdev_id]); return (0); } static uint32_t * zdb_load_obsolete_counts(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint64_t obsolete_sm_object; uint32_t *counts; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL); counts = vdev_indirect_mapping_load_obsolete_counts(vim); if (vd->vdev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, vd->vdev_obsolete_sm); } if (scip->scip_vdev == vd->vdev_id && scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, prev_obsolete_sm); space_map_close(prev_obsolete_sm); } return (counts); } typedef struct checkpoint_sm_exclude_entry_arg { vdev_t *cseea_vd; uint64_t cseea_checkpoint_size; } checkpoint_sm_exclude_entry_arg_t; static int checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) { checkpoint_sm_exclude_entry_arg_t *cseea = arg; vdev_t *vd = cseea->cseea_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); /* * Since the vdev_checkpoint_sm exists in the vdev level * and the ms_sm space maps exist in the metaslab level, * an entry in the checkpoint space map could 
theoretically * cross the boundaries of the metaslab that it belongs. * * In reality, because of the way that we populate and * manipulate the checkpoint's space maps currently, * there shouldn't be any entries that cross metaslabs. * Hence the assertion below. * * That said, there is no fundamental requirement that * the checkpoint's space map entries should not cross * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * By removing the entry from the allocated segments we * also verify that the entry is there to begin with. */ mutex_enter(&ms->ms_lock); range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); cseea->cseea_checkpoint_size += sme->sme_run; return (0); } static void zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) { spa_t *spa = vd->vdev_spa; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; /* * If there is no vdev_top_zap, we are in a pool whose * version predates the pool checkpoint feature. */ if (vd->vdev_top_zap == 0) return; /* * If there is no reference of the vdev_checkpoint_sm in * the vdev_top_zap, then one of the following scenarios * is true: * * 1] There is no checkpoint * 2] There is a checkpoint, but no checkpointed blocks * have been freed yet * 3] The current vdev is indirect * * In these cases we return immediately. */ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) return; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); checkpoint_sm_exclude_entry_arg_t cseea; cseea.cseea_vd = vd; cseea.cseea_checkpoint_size = 0; VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); VERIFY0(space_map_iterate(checkpoint_sm, space_map_length(checkpoint_sm), checkpoint_sm_exclude_entry_cb, &cseea)); space_map_close(checkpoint_sm); zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; } static void zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) { ASSERT(!dump_opt['L']); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); } } static int count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { int64_t *ualloc_space = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = vdev_lookup_top(spa, vdev_id); if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (sme->sme_type == SM_ALLOC) *ualloc_space += sme->sme_run; else *ualloc_space -= sme->sme_run; return (0); } static int64_t get_unflushed_alloc_space(spa_t *spa) { if (dump_opt['L']) return (0); int64_t ualloc_space = 0; iterate_through_spacemap_logs(spa, count_unflushed_space_cb, &ualloc_space); return (ualloc_space); } static int load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { maptype_t *uic_maptype = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = vdev_lookup_top(spa, vdev_id); /* skip indirect vdevs */ if (!vdev_is_concrete(vd)) 
return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (*uic_maptype == sme->sme_type) range_tree_add(ms->ms_allocatable, offset, size); else range_tree_remove(ms->ms_allocatable, offset, size); return (0); } static void load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype) { iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); } static void load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; ASSERT3U(i, ==, vd->vdev_id); if (vd->vdev_ops == &vdev_indirect_ops) continue; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; (void) fprintf(stderr, "\rloading concrete vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)msp->ms_id, (longlong_t)vd->vdev_ms_count); mutex_enter(&msp->ms_lock); range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, msp->ms_allocatable, maptype)); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } } load_unflushed_to_ms_allocatables(spa, maptype); } /* * vm_idxp is an in-out parameter which (for indirect vdevs) is the * index in vim_entries that has the first entry in this metaslab. * On return, it will be set to the first entry after this metaslab. */ static void load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, uint64_t *vim_idxp) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; mutex_enter(&msp->ms_lock); range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); (*vim_idxp)++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[*vim_idxp]; uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); ASSERT3U(ent_offset, >=, msp->ms_start); if (ent_offset >= msp->ms_start + msp->ms_size) break; /* * Mappings do not cross metaslab boundaries, * because we create them by walking the metaslabs. */ ASSERT3U(ent_offset + ent_len, <=, msp->ms_start + msp->ms_size); range_tree_add(msp->ms_allocatable, ent_offset, ent_len); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } static void zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) { ASSERT(!dump_opt['L']); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; ASSERT3U(c, ==, vd->vdev_id); if (vd->vdev_ops != &vdev_indirect_ops) continue; /* * Note: we don't check for mapping leaks on * removing vdevs because their ms_allocatable's * are used to look for leaks in allocated space. */ zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); /* * Normally, indirect vdevs don't have any * metaslabs. We want to set them up for * zio_claim(). 
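 * vdev_metaslab_init() only creates an (empty) metaslab array here;
 * the loop below then fills each metaslab's ms_allocatable from the
 * indirect mapping entries that fall inside it.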
*/ vdev_metaslab_group_create(vd); VERIFY0(vdev_metaslab_init(vd, 0)); vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; uint64_t vim_idx = 0; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { (void) fprintf(stderr, "\rloading indirect vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vd->vdev_ms[m]->ms_id, (longlong_t)vd->vdev_ms_count); load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], &vim_idx); } ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); } } static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; if (dump_opt['L']) return; dsl_pool_t *dp = spa->spa_dsl_pool; vdev_t *rvd = spa->spa_root_vdev; /* * We are going to be changing the meaning of the metaslab's * ms_allocatable. Ensure that the allocator doesn't try to * use the tree. */ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; zcb->zcb_vd_obsolete_counts = umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), UMEM_NOFAIL); /* * For leak detection, we overload the ms_allocatable trees * to contain allocated segments instead of free segments. * As a result, we can't use the normal metaslab_load/unload * interfaces. */ zdb_leak_init_prepare_indirect_vdevs(spa, zcb); load_concrete_ms_allocatable_trees(spa, SM_ALLOC); /* * On load_concrete_ms_allocatable_trees() we loaded all the * allocated entries from the ms_sm to the ms_allocatable for * each metaslab. If the pool has a checkpoint or is in the * middle of discarding a checkpoint, some of these blocks * may have been freed but their ms_sm may not have been * updated because they are referenced by the checkpoint. In * order to avoid false-positives during leak-detection, we * go through the vdev's checkpoint space map and exclude all * its entries from their relevant ms_allocatable. * * We also aggregate the space held by the checkpoint and add * it to zcb_checkpoint_size. * * Note that at this point we are also verifying that all the * entries on the checkpoint_sm are marked as allocated in * the ms_sm of their relevant metaslab. * [see comment in checkpoint_sm_exclude_entry_cb()] */ zdb_leak_init_exclude_checkpoint(spa, zcb); ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); /* for cleaner progress output */ (void) fprintf(stderr, "\n"); if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } } static boolean_t zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) { boolean_t leaks = B_FALSE; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t total_leaked = 0; boolean_t are_precise = B_FALSE; ASSERT(vim != NULL); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; uint64_t obsolete_bytes = 0; uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; /* * This is not very efficient but it's easy to * verify correctness. 
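 * For example, with an ashift of 12 (4K) and a 128K mapping entry,
 * the loop below probes 32 separate 4K chunks of the range tree to
 * recompute how much of that entry is obsolete.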
*/ for (uint64_t inner_offset = 0; inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); inner_offset += 1ULL << vd->vdev_ashift) { if (range_tree_contains(msp->ms_allocatable, offset + inner_offset, 1ULL << vd->vdev_ashift)) { obsolete_bytes += 1ULL << vd->vdev_ashift; } } int64_t bytes_leaked = obsolete_bytes - zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) { (void) printf("obsolete indirect mapping count " "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (u_longlong_t)bytes_leaked); } total_leaked += ABS(bytes_leaked); } VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (!are_precise && total_leaked > 0) { int pct_leaked = total_leaked * 100 / vdev_indirect_mapping_bytes_mapped(vim); (void) printf("cannot verify obsolete indirect mapping " "counts of vdev %llu because precise feature was not " "enabled when it was removed: %d%% (%llx bytes) of mapping" "unreferenced\n", (u_longlong_t)vd->vdev_id, pct_leaked, (u_longlong_t)total_leaked); } else if (total_leaked > 0) { (void) printf("obsolete indirect mapping count mismatch " "for vdev %llu -- %llx total bytes mismatched\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)total_leaked); leaks |= B_TRUE; } vdev_indirect_mapping_free_obsolete_counts(vim, zcb->zcb_vd_obsolete_counts[vd->vdev_id]); zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; return (leaks); } static boolean_t zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) { if (dump_opt['L']) return (B_FALSE); boolean_t leaks = B_FALSE; vdev_t *rvd = spa->spa_root_vdev; for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (zcb->zcb_vd_obsolete_counts[c] != NULL) { leaks |= zdb_check_for_obsolete_leaks(vd, zcb); } for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == spa_embedded_log_class(spa)) ? vd->vdev_log_mg : vd->vdev_mg); /* * ms_allocatable has been overloaded * to contain allocated segments. Now that * we finished traversing all blocks, any * block that remains in the ms_allocatable * represents an allocated block that we * did not claim during the traversal. * Claimed blocks would have been removed * from the ms_allocatable. For indirect * vdevs, space remaining in the tree * represents parts of the mapping that are * not referenced, which is not a bug. 
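 * For concrete vdevs, zdb_leak() below prints one line per leftover
 * segment, reporting the leaked vdev id, offset, and size.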
*/ if (vd->vdev_ops == &vdev_indirect_ops) { range_tree_vacate(msp->ms_allocatable, NULL, NULL); } else { range_tree_vacate(msp->ms_allocatable, zdb_leak, vd); } if (msp->ms_loaded) { msp->ms_loaded = B_FALSE; } } } umem_free(zcb->zcb_vd_obsolete_counts, rvd->vdev_children * sizeof (uint32_t *)); zcb->zcb_vd_obsolete_counts = NULL; return (leaks); } static int count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { (void) tx; zdb_cb_t *zcb = arg; if (dump_opt['b'] >= 5) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("[%s] %s\n", "deferred free", blkbuf); } zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); return (0); } /* * Iterate over livelists which have been destroyed by the user but * are still present in the MOS, waiting to be freed */ static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) { objset_t *mos = spa->spa_meta_objset; uint64_t zap_obj; int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); if (err == ENOENT) return; ASSERT0(err); zap_cursor_t zc; zap_attribute_t attr; dsl_deadlist_t ll; /* NULL out os prior to dsl_deadlist_open in case it's garbage */ ll.dl_os = NULL; for (zap_cursor_init(&zc, mos, zap_obj); zap_cursor_retrieve(&zc, &attr) == 0; (void) zap_cursor_advance(&zc)) { dsl_deadlist_open(&ll, mos, attr.za_first_integer); func(&ll, arg); dsl_deadlist_close(&ll); } zap_cursor_fini(&zc); } static int bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (count_block_cb(arg, bp, tx)); } static int livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) { zdb_cb_t *zbc = args; bplist_t blks; bplist_create(&blks); /* determine which blocks have been alloc'd but not freed */ VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); /* count those blocks */ (void) bplist_iterate(&blks, count_block_cb, zbc, NULL); bplist_destroy(&blks); return (0); } static void livelist_count_blocks(dsl_deadlist_t *ll, void *arg) { dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); } /* * Count the blocks in the livelists that have been destroyed by the user * but haven't yet been freed. */ static void deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) { iterate_deleted_livelists(spa, livelist_count_blocks, zbc); } static void dump_livelist_cb(dsl_deadlist_t *ll, void *arg) { ASSERT3P(arg, ==, NULL); global_feature_count[SPA_FEATURE_LIVELIST]++; dump_blkptr_list(ll, "Deleted Livelist"); dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); } /* * Print out, register object references to, and increment feature counts for * livelists that have been destroyed by the user but haven't yet been freed. 
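 * The DMU_POOL_DELETED_CLONES ZAP in the MOS is the entry point; if it
 * does not exist there is nothing to dump.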
*/ static void deleted_livelists_dump_mos(spa_t *spa) { uint64_t zap_obj; objset_t *mos = spa->spa_meta_objset; int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); if (err == ENOENT) return; mos_obj_refd(zap_obj); iterate_deleted_livelists(spa, dump_livelist_cb, NULL); } static int zdb_brt_entry_compare(const void *zcn1, const void *zcn2) { const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva; const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva; int cmp; cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (cmp == 0) cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)); return (cmp); } static int dump_block_stats(spa_t *spa) { zdb_cb_t *zcb; zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT | TRAVERSE_HARD; boolean_t leaks = B_FALSE; int e, c, err; bp_embedded_type_t i; ddt_prefetch_all(spa); zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { avl_create(&zcb->zcb_brt, zdb_brt_entry_compare, sizeof (zdb_brt_entry_t), offsetof(zdb_brt_entry_t, zbre_node)); zcb->zcb_brt_is_active = B_TRUE; } (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", dump_opt['c'] ? "checksums " : "", (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", !dump_opt['L'] ? "nothing leaked " : ""); /* * When leak detection is enabled we load all space maps as SM_ALLOC * maps, then traverse the pool claiming each block we discover. If * the pool is perfectly consistent, the segment trees will be empty * when we're done. Anything left over is a leak; any block we can't * claim (because it's not part of any space map) is a double * allocation, reference to a freed block, or an unclaimed log block. * * When leak detection is disabled (-L option) we still traverse the * pool claiming each block we discover, but we skip opening any space * maps. */ zdb_leak_init(spa, zcb); /* * If there's a deferred-free bplist, process that first. */ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, bpobj_count_block_cb, zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, bpobj_count_block_cb, zcb, NULL); } zdb_claim_removing(spa, zcb); if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, zcb, NULL)); } deleted_livelists_count_blocks(spa, zcb); if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_embedded_log_class(spa)); zcb->zcb_start = zcb->zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb); /* * If we've traversed the data blocks then we need to wait for those * I/Os to complete. We leverage "The Godfather" zio to wait on * all async I/Os to complete. 
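 * After waiting, each per-CPU root zio is re-created so that any later
 * async I/O still has a godfather to attach to.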
*/ if (dump_opt['c']) { for (c = 0; c < max_ncpus; c++) { (void) zio_wait(spa->spa_async_zio_root[c]); spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } } ASSERT0(spa->spa_load_verify_bytes); /* * Done after zio_wait() since zcb_haderrors is modified in * zdb_blkptr_done() */ zcb->zcb_haderrors |= err; if (zcb->zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); for (e = 0; e < 256; e++) { if (zcb->zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb->zcb_errors[e]); } } } /* * Report any leaked segments. */ leaks |= zdb_leak_fini(spa, zcb); tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + metaslab_class_get_alloc(spa_embedded_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); total_found = tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize + zcb->zcb_removing_size + zcb->zcb_checkpoint_size; if (total_found == total_alloc && !dump_opt['L']) { (void) printf("\n\tNo leaks (block sum matches space" " maps exactly)\n"); } else if (!dump_opt['L']) { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", (u_longlong_t)total_found, (u_longlong_t)total_alloc, (dump_opt['L']) ? "unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); } if (tzb->zb_count == 0) { umem_free(zcb, sizeof (zdb_cb_t)); return (2); } (void) printf("\n"); (void) printf("\t%-16s %14llu\n", "bp count:", (u_longlong_t)tzb->zb_count); (void) printf("\t%-16s %14llu\n", "ganged count:", (longlong_t)tzb->zb_gangs); (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp physical:", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp allocated:", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, (u_longlong_t)zcb->zcb_dedup_blocks, (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); (void) printf("\t%-16s %14llu count: %6llu\n", "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize, (u_longlong_t)zcb->zcb_clone_blocks); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_special_class(spa)); uint64_t space = metaslab_class_get_space( spa_special_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Special class", (u_longlong_t)alloc, 100.0 * alloc / space); } if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_dedup_class(spa)); uint64_t space = metaslab_class_get_space( spa_dedup_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Dedup class", (u_longlong_t)alloc, 100.0 * alloc / space); } if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) 
{ uint64_t alloc = metaslab_class_get_alloc( spa_embedded_log_class(spa)); uint64_t space = metaslab_class_get_space( spa_embedded_log_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Embedded log class", (u_longlong_t)alloc, 100.0 * alloc / space); } for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb->zcb_embedded_blocks[i] == 0) continue; (void) printf("\n"); (void) printf("\tadditional, non-pointer bps of type %u: " "%10llu\n", i, (u_longlong_t)zcb->zcb_embedded_blocks[i]); if (dump_opt['b'] >= 3) { (void) printf("\t number of (compressed) bytes: " "number of bps\n"); dump_histogram(zcb->zcb_embedded_histogram[i], sizeof (zcb->zcb_embedded_histogram[i]) / sizeof (zcb->zcb_embedded_histogram[i][0]), 0); } } if (tzb->zb_ditto_samevdev != 0) { (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); } if (tzb->zb_ditto_same_ms != 0) { (void) printf("\tDittoed blocks in same metaslab: %llu\n", (longlong_t)tzb->zb_ditto_same_ms); } for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; if (vim == NULL) { continue; } char mem[32]; zdb_nicenum(vdev_indirect_mapping_num_entries(vim), mem, vdev_indirect_mapping_size(vim)); (void) printf("\tindirect vdev id %llu has %llu segments " "(%s in memory)\n", (longlong_t)vd->vdev_id, (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); } if (dump_opt['b'] >= 2) { int l, t, level; char csize[32], lsize[32], psize[32], asize[32]; char avg[32], gang[32]; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t), UMEM_NOFAIL); for (t = 0; t <= ZDB_OT_TOTAL; t++) { const char *typename; /* make sure nicenum has enough space */ _Static_assert(sizeof (csize) >= NN_NUMBUF_SZ, "csize truncated"); _Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated"); _Static_assert(sizeof (psize) >= NN_NUMBUF_SZ, "psize truncated"); _Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated"); _Static_assert(sizeof (avg) >= NN_NUMBUF_SZ, "avg truncated"); _Static_assert(sizeof (gang) >= NN_NUMBUF_SZ, "gang truncated"); if (t < DMU_OT_NUMTYPES) typename = dmu_ot[t].ot_name; else typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" "\t%5s\t%5s\t%6s\t%s\n", "-", "-", "-", "-", "-", "-", "-", typename); continue; } for (l = ZB_TOTAL - 1; l >= -1; l--) { level = (l == -1 ? 
ZB_TOTAL : l); zb = &zcb->zcb_type[level][t]; if (zb->zb_asize == 0) continue; if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES && (level > 0 || DMU_OT_IS_METADATA(t))) { mdstats->zb_count += zb->zb_count; mdstats->zb_lsize += zb->zb_lsize; mdstats->zb_psize += zb->zb_psize; mdstats->zb_asize += zb->zb_asize; mdstats->zb_gangs += zb->zb_gangs; } if (dump_opt['b'] < 3 && level != ZB_TOTAL) continue; if (level == 0 && zb->zb_asize == zcb->zcb_type[ZB_TOTAL][t].zb_asize) continue; zdb_nicenum(zb->zb_count, csize, sizeof (csize)); zdb_nicenum(zb->zb_lsize, lsize, sizeof (lsize)); zdb_nicenum(zb->zb_psize, psize, sizeof (psize)); zdb_nicenum(zb->zb_asize, asize, sizeof (asize)); zdb_nicenum(zb->zb_asize / zb->zb_count, avg, sizeof (avg)); zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)zb->zb_lsize / zb->zb_psize, 100.0 * zb->zb_asize / tzb->zb_asize); if (level == ZB_TOTAL) (void) printf("%s\n", typename); else (void) printf(" L%d %s\n", level, typename); if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { (void) printf("\t number of ganged " "blocks: %s\n", gang); } if (dump_opt['b'] >= 4) { (void) printf("psize " "(in 512-byte sectors): " "number of blocks\n"); dump_histogram(zb->zb_psize_histogram, PSIZE_HISTO_SIZE, 0); } } } zdb_nicenum(mdstats->zb_count, csize, sizeof (csize)); zdb_nicenum(mdstats->zb_lsize, lsize, sizeof (lsize)); zdb_nicenum(mdstats->zb_psize, psize, sizeof (psize)); zdb_nicenum(mdstats->zb_asize, asize, sizeof (asize)); zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg, sizeof (avg)); zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang)); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)mdstats->zb_lsize / mdstats->zb_psize, 100.0 * mdstats->zb_asize / tzb->zb_asize); (void) printf("%s\n", "Metadata Total"); /* Output a table summarizing block sizes in the pool */ if (dump_opt['b'] >= 2) { dump_size_histograms(zcb); } umem_free(mdstats, sizeof (zfs_blkstat_t)); } (void) printf("\n"); if (leaks) { umem_free(zcb, sizeof (zdb_cb_t)); return (2); } if (zcb->zcb_haderrors) { umem_free(zcb, sizeof (zdb_cb_t)); return (3); } umem_free(zcb, sizeof (zdb_cb_t)); return (0); } typedef struct zdb_ddt_entry { /* key must be first for ddt_key_compare */ ddt_key_t zdde_key; uint64_t zdde_ref_blocks; uint64_t zdde_ref_lsize; uint64_t zdde_ref_psize; uint64_t zdde_ref_dsize; avl_node_t zdde_node; } zdb_ddt_entry_t; static int zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { (void) zilog, (void) dnp; avl_tree_t *t = arg; avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { (void) printf("traversing objset %llu, %llu objects, " "%lu blocks so far\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)BP_GET_FILL(bp), avl_numnodes(t)); } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); zdde = avl_find(t, &zdde_search, &where); if (zdde == NULL) { zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); zdde->zdde_key = zdde_search.zdde_key; avl_insert(t, zdde, where); } zdde->zdde_ref_blocks += 1; zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); zdde->zdde_ref_psize += BP_GET_PSIZE(bp); zdde->zdde_ref_dsize += 
bp_get_dsize_sync(spa, bp); return (0); } static void dump_simulated_ddt(spa_t *spa) { avl_tree_t t; void *cookie = NULL; zdb_ddt_entry_t *zdde; ddt_histogram_t ddh_total = {{{0}}}; ddt_stat_t dds_total = {0}; avl_create(&t, ddt_key_compare, sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t); spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1]; dds->dds_blocks += zdde->zdde_ref_blocks / refcnt; dds->dds_lsize += zdde->zdde_ref_lsize / refcnt; dds->dds_psize += zdde->zdde_ref_psize / refcnt; dds->dds_dsize += zdde->zdde_ref_dsize / refcnt; dds->dds_ref_blocks += zdde->zdde_ref_blocks; dds->dds_ref_lsize += zdde->zdde_ref_lsize; dds->dds_ref_psize += zdde->zdde_ref_psize; dds->dds_ref_dsize += zdde->zdde_ref_dsize; umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); ddt_histogram_total(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); zpool_dump_ddt(&dds_total, &ddh_total); dump_dedup_ratio(&dds_total); } static int verify_device_removal_feature_counts(spa_t *spa) { uint64_t dr_feature_refcount = 0; uint64_t oc_feature_refcount = 0; uint64_t indirect_vdev_count = 0; uint64_t precise_vdev_count = 0; uint64_t obsolete_counts_object_count = 0; uint64_t obsolete_sm_count = 0; uint64_t obsolete_counts_count = 0; uint64_t scip_count = 0; uint64_t obsolete_bpobj_count = 0; int ret = 0; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; if (scip->scip_next_mapping_object != 0) { vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); (void) printf("Condensing indirect vdev %llu: new mapping " "object %llu, prev obsolete sm %llu\n", (u_longlong_t)scip->scip_vdev, (u_longlong_t)scip->scip_next_mapping_object, (u_longlong_t)scip->scip_prev_obsolete_sm_object); if (scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); (void) printf("\n"); space_map_close(prev_obsolete_sm); } scip_count += 2; } for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (vic->vic_mapping_object != 0) { ASSERT(vd->vdev_ops == &vdev_indirect_ops || vd->vdev_removing); indirect_vdev_count++; if (vd->vdev_indirect_mapping->vim_havecounts) { obsolete_counts_count++; } } boolean_t are_precise; VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (are_precise) { ASSERT(vic->vic_mapping_object != 0); precise_vdev_count++; } uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vic->vic_mapping_object != 0); obsolete_sm_count++; } } (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], &dr_feature_refcount); (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], &oc_feature_refcount); if (dr_feature_refcount != indirect_vdev_count) { ret = 1; (void) printf("Number of indirect vdevs (%llu) " \ 
"does not match feature count (%llu)\n", (u_longlong_t)indirect_vdev_count, (u_longlong_t)dr_feature_refcount); } else { (void) printf("Verified device_removal feature refcount " \ "of %llu is correct\n", (u_longlong_t)dr_feature_refcount); } if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ) == 0) { obsolete_bpobj_count++; } obsolete_counts_object_count = precise_vdev_count; obsolete_counts_object_count += obsolete_sm_count; obsolete_counts_object_count += obsolete_counts_count; obsolete_counts_object_count += scip_count; obsolete_counts_object_count += obsolete_bpobj_count; obsolete_counts_object_count += remap_deadlist_count; if (oc_feature_refcount != obsolete_counts_object_count) { ret = 1; (void) printf("Number of obsolete counts objects (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)obsolete_counts_object_count, (u_longlong_t)oc_feature_refcount); (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " "ob:%llu rd:%llu\n", (u_longlong_t)precise_vdev_count, (u_longlong_t)obsolete_sm_count, (u_longlong_t)obsolete_counts_count, (u_longlong_t)scip_count, (u_longlong_t)obsolete_bpobj_count, (u_longlong_t)remap_deadlist_count); } else { (void) printf("Verified indirect_refcount feature refcount " \ "of %llu is correct\n", (u_longlong_t)oc_feature_refcount); } return (ret); } static void zdb_set_skip_mmp(char *target) { spa_t *spa; /* * Disable the activity check to allow examination of * active pools. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL) { spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; } mutex_exit(&spa_namespace_lock); } #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" /* * Import the checkpointed state of the pool specified by the target * parameter as readonly. The function also accepts a pool config * as an optional parameter, else it attempts to infer the config by * the name of the target pool. * * Note that the checkpointed state's pool name will be the name of * the original pool with the above suffix appended to it. In addition, * if the target is not a pool name (e.g. a path to a dataset) then * the new_path parameter is populated with the updated path to * reflect the fact that we are looking into the checkpointed state. * * The function returns a newly-allocated copy of the name of the * pool containing the checkpointed state. When this copy is no * longer needed it should be freed with free(3C). Same thing * applies to the new_path parameter if allocated. 
*/ static char * import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) { int error = 0; char *poolname, *bogus_name = NULL; boolean_t freecfg = B_FALSE; /* If the target is not a pool, the extract the pool name */ char *path_start = strchr(target, '/'); if (path_start != NULL) { size_t poolname_len = path_start - target; poolname = strndup(target, poolname_len); } else { poolname = target; } if (cfg == NULL) { zdb_set_skip_mmp(poolname); error = spa_get_stats(poolname, &cfg, NULL, 0); if (error != 0) { fatal("Tried to read config of pool \"%s\" but " "spa_get_stats() failed with error %d\n", poolname, error); } freecfg = B_TRUE; } if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) { if (target != poolname) free(poolname); return (NULL); } fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); error = spa_import(bogus_name, cfg, NULL, ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | ZFS_IMPORT_SKIP_MMP); if (freecfg) nvlist_free(cfg); if (error != 0) { fatal("Tried to import pool \"%s\" but spa_import() failed " "with error %d\n", bogus_name, error); } if (new_path != NULL && path_start != NULL) { if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) { free(bogus_name); if (path_start != NULL) free(poolname); return (NULL); } } if (target != poolname) free(poolname); return (bogus_name); } typedef struct verify_checkpoint_sm_entry_cb_arg { vdev_t *vcsec_vd; /* the following fields are only used for printing progress */ uint64_t vcsec_entryid; uint64_t vcsec_num_entries; } verify_checkpoint_sm_entry_cb_arg_t; #define ENTRIES_PER_PROGRESS_UPDATE 10000 static int verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) { verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; vdev_t *vd = vcsec->vcsec_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { (void) fprintf(stderr, "\rverifying vdev %llu, space map entry %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vcsec->vcsec_entryid, (longlong_t)vcsec->vcsec_num_entries); } vcsec->vcsec_entryid++; /* * See comment in checkpoint_sm_exclude_entry_cb() */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * The entries in the vdev_checkpoint_sm should be marked as * allocated in the checkpointed state of the pool, therefore * their respective ms_allocateable trees should not contain them. */ mutex_enter(&ms->ms_lock); range_tree_verify_not_present(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); return (0); } /* * Verify that all segments in the vdev_checkpoint_sm are allocated * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's * ms_allocatable). * * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of * each vdev in the current state of the pool to the metaslab space maps * (ms_sm) of the checkpointed state of the pool. * * Note that the function changes the state of the ms_allocatable * trees of the current spa_t. The entries of these ms_allocatable * trees are cleared out and then repopulated from with the free * entries of their respective ms_sm space maps. 
*/ static void verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; vdev_t *current_vd = current_rvd->vdev_child[c]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * Since we don't allow device removal in a pool * that has a checkpoint, we expect that all removed * vdevs were removed from the pool before the * checkpoint. */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } /* * If the checkpoint space map doesn't exist, then nothing * here is checkpointed so there's nothing to verify. */ if (current_vd->vdev_top_zap == 0 || zap_contains(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), checkpoint_sm_obj, 0, current_vd->vdev_asize, current_vd->vdev_ashift)); verify_checkpoint_sm_entry_cb_arg_t vcsec; vcsec.vcsec_vd = ckpoint_vd; vcsec.vcsec_entryid = 0; vcsec.vcsec_num_entries = space_map_length(checkpoint_sm) / sizeof (uint64_t); VERIFY0(space_map_iterate(checkpoint_sm, space_map_length(checkpoint_sm), verify_checkpoint_sm_entry_cb, &vcsec)); if (dump_opt['m'] > 3) dump_spacemap(current->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } /* * If we've added vdevs since we took the checkpoint, ensure * that their checkpoint space maps are empty. */ if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { for (uint64_t c = ckpoint_rvd->vdev_children; c < current_rvd->vdev_children; c++) { vdev_t *current_vd = current_rvd->vdev_child[c]; VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } /* * Verifies that all space that's allocated in the checkpoint is * still allocated in the current version, by checking that everything * in checkpoint's ms_allocatable (which is actually allocated, not * allocatable/free) is not present in current's ms_allocatable. * * Note that the function changes the state of the ms_allocatable * trees of both spas when called. The entries of all ms_allocatable * trees are cleared out and then repopulated from their respective * ms_sm space maps. In the checkpointed state we load the allocated * entries, and in the current state we load the free entries. 
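 * A range present in both trees would mean that a block belonging to
 * the checkpoint was freed (and could be reused) in the current state,
 * which is exactly the corruption this check is meant to catch.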
*/ static void verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); load_concrete_ms_allocatable_trees(current, SM_FREE); for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; vdev_t *current_vd = current_rvd->vdev_child[i]; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * See comment in verify_checkpoint_vdev_spacemaps() */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; metaslab_t *current_msp = current_vd->vdev_ms[m]; (void) fprintf(stderr, "\rverifying vdev %llu of %llu, " "metaslab %llu of %llu ...", (longlong_t)current_vd->vdev_id, (longlong_t)current_rvd->vdev_children, (longlong_t)current_vd->vdev_ms[m]->ms_id, (longlong_t)current_vd->vdev_ms_count); /* * We walk through the ms_allocatable trees that * are loaded with the allocated blocks from the * ms_sm spacemaps of the checkpoint. For each * one of these ranges we ensure that none of them * exists in the ms_allocatable trees of the * current state which are loaded with the ranges * that are currently free. * * This way we ensure that none of the blocks that * are part of the checkpoint were freed by mistake. */ range_tree_walk(ckpoint_msp->ms_allocatable, (range_tree_func_t *)range_tree_verify_not_present, current_msp->ms_allocatable); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } static void verify_checkpoint_blocks(spa_t *spa) { ASSERT(!dump_opt['L']); spa_t *checkpoint_spa; char *checkpoint_pool; int error = 0; /* * We import the checkpointed state of the pool (under a different * name) so we can do verification on it against the current state * of the pool. */ checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, NULL); ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but spa_open() failed with " "error %d\n", checkpoint_pool, error); } /* * Ensure that ranges in the checkpoint space maps of each vdev * are allocated according to the checkpointed state's metaslab * space maps. */ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); /* * Ensure that allocated ranges in the checkpoint's metaslab * space maps remain allocated in the metaslab space maps of * the current state. */ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); /* * Once we are done, we get rid of the checkpointed state. 
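 * The checkpointed state was imported read-only under a bogus name, so
 * closing the spa and freeing the duplicated pool name is all that is
 * needed here.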
*/ spa_close(checkpoint_spa, FTAG); free(checkpoint_pool); } static void dump_leftover_checkpoint_blocks(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (vd->vdev_top_zap == 0) continue; if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); dump_spacemap(spa->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } } static int verify_checkpoint(spa_t *spa) { uberblock_t checkpoint; int error; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (0); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT && !dump_opt['L']) { /* * If the feature is active but the uberblock is missing * then we must be in the middle of discarding the * checkpoint. */ (void) printf("\nPartially discarded checkpoint " "state found:\n"); if (dump_opt['m'] > 3) dump_leftover_checkpoint_blocks(spa); return (0); } else if (error != 0) { (void) printf("lookup error %d when looking for " "checkpointed uberblock in MOS\n", error); return (error); } dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); if (checkpoint.ub_checkpoint_txg == 0) { (void) printf("\nub_checkpoint_txg not set in checkpointed " "uberblock\n"); error = 3; } if (error == 0 && !dump_opt['L']) verify_checkpoint_blocks(spa); return (error); } static void mos_leaks_cb(void *arg, uint64_t start, uint64_t size) { (void) arg; for (uint64_t i = start; i < size; i++) { (void) printf("MOS object %llu referenced but not allocated\n", (u_longlong_t)i); } } static void mos_obj_refd(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL) range_tree_add(mos_refd_objs, obj, 1); } /* * Call on a MOS object that may already have been referenced. 
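 * Unlike mos_obj_refd(), this checks for an existing reference first so
 * that the same object is not added to the range tree twice.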
*/ static void mos_obj_refd_multiple(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL && !range_tree_contains(mos_refd_objs, obj, 1)) range_tree_add(mos_refd_objs, obj, 1); } static void mos_leak_vdev_top_zap(vdev_t *vd) { uint64_t ms_flush_data_obj; int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); if (error == ENOENT) return; ASSERT0(error); mos_obj_refd(ms_flush_data_obj); } static void mos_leak_vdev(vdev_t *vd) { mos_obj_refd(vd->vdev_dtl_object); mos_obj_refd(vd->vdev_ms_array); mos_obj_refd(vd->vdev_indirect_config.vic_births_object); mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); mos_obj_refd(vd->vdev_leaf_zap); if (vd->vdev_checkpoint_sm != NULL) mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); if (vd->vdev_indirect_mapping != NULL) { mos_obj_refd(vd->vdev_indirect_mapping-> vim_phys->vimp_counts_object); } if (vd->vdev_obsolete_sm != NULL) mos_obj_refd(vd->vdev_obsolete_sm->sm_object); for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *ms = vd->vdev_ms[m]; mos_obj_refd(space_map_object(ms->ms_sm)); } if (vd->vdev_root_zap != 0) mos_obj_refd(vd->vdev_root_zap); if (vd->vdev_top_zap != 0) { mos_obj_refd(vd->vdev_top_zap); mos_leak_vdev_top_zap(vd); } for (uint64_t c = 0; c < vd->vdev_children; c++) { mos_leak_vdev(vd->vdev_child[c]); } } static void mos_leak_log_spacemaps(spa_t *spa) { uint64_t spacemap_zap; int error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); if (error == ENOENT) return; ASSERT0(error); mos_obj_refd(spacemap_zap); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) mos_obj_refd(sls->sls_sm_obj); } static void errorlog_count_refd(objset_t *mos, uint64_t errlog) { zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, mos, errlog); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { mos_obj_refd(za.za_first_integer); } zap_cursor_fini(&zc); } static int dump_mos_leaks(spa_t *spa) { int rv = 0; objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; /* Visit and mark all referenced objects in the MOS */ mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); mos_obj_refd(spa->spa_pool_props_object); mos_obj_refd(spa->spa_config_object); mos_obj_refd(spa->spa_ddt_stat_object); mos_obj_refd(spa->spa_feat_desc_obj); mos_obj_refd(spa->spa_feat_enabled_txg_obj); mos_obj_refd(spa->spa_feat_for_read_obj); mos_obj_refd(spa->spa_feat_for_write_obj); mos_obj_refd(spa->spa_history); mos_obj_refd(spa->spa_errlog_last); mos_obj_refd(spa->spa_errlog_scrub); if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { errorlog_count_refd(mos, spa->spa_errlog_last); errorlog_count_refd(mos, spa->spa_errlog_scrub); } mos_obj_refd(spa->spa_all_vdev_zaps); mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); bpobj_count_refd(&spa->spa_deferred_bpobj); mos_obj_refd(dp->dp_empty_bpobj); bpobj_count_refd(&dp->dp_obsolete_bpobj); bpobj_count_refd(&dp->dp_free_bpobj); mos_obj_refd(spa->spa_l2cache.sav_object); mos_obj_refd(spa->spa_spares.sav_object); if (spa->spa_syncing_log_sm != NULL) mos_obj_refd(spa->spa_syncing_log_sm->sm_object); mos_leak_log_spacemaps(spa); mos_obj_refd(spa->spa_condensing_indirect_phys. 
scip_next_mapping_object); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_prev_obsolete_sm_object); if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(mos, spa->spa_condensing_indirect_phys.scip_next_mapping_object); mos_obj_refd(vim->vim_phys->vimp_counts_object); vdev_indirect_mapping_close(vim); } deleted_livelists_dump_mos(spa); if (dp->dp_origin_snap != NULL) { dsl_dataset_t *ds; dsl_pool_config_enter(dp, FTAG); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, FTAG, &ds)); count_ds_mos_objects(ds); dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); count_ds_mos_objects(dp->dp_origin_snap); dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); } count_dir_mos_objects(dp->dp_mos_dir); if (dp->dp_free_dir != NULL) count_dir_mos_objects(dp->dp_free_dir); if (dp->dp_leak_dir != NULL) count_dir_mos_objects(dp->dp_leak_dir); mos_leak_vdev(spa->spa_root_vdev); for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - if (!ddt) + if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) continue; /* DDT store objects */ for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { mos_obj_refd(ddt->ddt_object[type][class]); } } /* FDT container */ - mos_obj_refd(ddt->ddt_dir_object); + if (ddt->ddt_version == DDT_VERSION_FDT) + mos_obj_refd(ddt->ddt_dir_object); /* FDT log objects */ - mos_obj_refd(ddt->ddt_log[0].ddl_object); - mos_obj_refd(ddt->ddt_log[1].ddl_object); + if (ddt->ddt_flags & DDT_FLAG_LOG) { + mos_obj_refd(ddt->ddt_log[0].ddl_object); + mos_obj_refd(ddt->ddt_log[1].ddl_object); + } } if (spa->spa_brt != NULL) { brt_t *brt = spa->spa_brt; for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; if (brtvd != NULL && brtvd->bv_initiated) { mos_obj_refd(brtvd->bv_mos_brtvdev); mos_obj_refd(brtvd->bv_mos_entries); } } } /* * Visit all allocated objects and make sure they are referenced. 
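 * Objects that are allocated but were never referenced are reported as
 * leaked; anything still left in mos_refd_objs afterwards was
 * referenced but never allocated and is reported by mos_leaks_cb().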
*/ uint64_t object = 0; while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { if (range_tree_contains(mos_refd_objs, object, 1)) { range_tree_remove(mos_refd_objs, object, 1); } else { dmu_object_info_t doi; const char *name; VERIFY0(dmu_object_info(mos, object, &doi)); if (doi.doi_type & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(doi.doi_type); name = dmu_ot_byteswap[bswap].ob_name; } else { name = dmu_ot[doi.doi_type].ot_name; } (void) printf("MOS object %llu (%s) leaked\n", (u_longlong_t)object, name); rv = 2; } } (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); if (!range_tree_is_empty(mos_refd_objs)) rv = 2; range_tree_vacate(mos_refd_objs, NULL, NULL); range_tree_destroy(mos_refd_objs); return (rv); } typedef struct log_sm_obsolete_stats_arg { uint64_t lsos_current_txg; uint64_t lsos_total_entries; uint64_t lsos_valid_entries; uint64_t lsos_sm_entries; uint64_t lsos_valid_sm_entries; } log_sm_obsolete_stats_arg_t; static int log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { log_sm_obsolete_stats_arg_t *lsos = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; if (lsos->lsos_current_txg == 0) { /* this is the first log */ lsos->lsos_current_txg = txg; } else if (lsos->lsos_current_txg < txg) { /* we just changed log - print stats and reset */ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", (u_longlong_t)lsos->lsos_valid_sm_entries, (u_longlong_t)lsos->lsos_sm_entries, (u_longlong_t)lsos->lsos_current_txg); lsos->lsos_valid_sm_entries = 0; lsos->lsos_sm_entries = 0; lsos->lsos_current_txg = txg; } ASSERT3U(lsos->lsos_current_txg, ==, txg); lsos->lsos_sm_entries++; lsos->lsos_total_entries++; vdev_t *vd = vdev_lookup_top(spa, vdev_id); if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); lsos->lsos_valid_sm_entries++; lsos->lsos_valid_entries++; return (0); } static void dump_log_spacemap_obsolete_stats(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; log_sm_obsolete_stats_arg_t lsos = {0}; (void) printf("Log Space Map Obsolete Entry Statistics:\n"); iterate_through_spacemap_logs(spa, log_spacemap_obsolete_stats_cb, &lsos); /* print stats for latest log */ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", (u_longlong_t)lsos.lsos_valid_sm_entries, (u_longlong_t)lsos.lsos_sm_entries, (u_longlong_t)lsos.lsos_current_txg); (void) printf("%-8llu valid entries out of %-8llu - total\n\n", (u_longlong_t)lsos.lsos_valid_entries, (u_longlong_t)lsos.lsos_total_entries); } static void dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; if (dump_opt['y']) { livelist_metaslab_validate(spa); } if (dump_opt['S']) { dump_simulated_ddt(spa); return; } if (!dump_opt['e'] && dump_opt['C'] > 1) { (void) printf("\nCached configuration:\n"); dump_nvlist(spa->spa_config, 8); } if (dump_opt['C']) dump_config(spa); if (dump_opt['u']) dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); if (dump_opt['D']) dump_all_ddts(spa); if (dump_opt['T']) dump_brt(spa); if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa, dump_opt['M'] > 1); if (dump_opt['d'] > 2 || dump_opt['m']) { dump_log_spacemaps(spa); dump_log_spacemap_obsolete_stats(spa); } if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; mos_refd_objs = 
range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dsl_pool_t *dp = spa->spa_dsl_pool; dump_full_bpobj(&spa->spa_deferred_bpobj, "Deferred frees", 0); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { dump_full_bpobj(&dp->dp_free_bpobj, "Pool snapshot frees", 0); } if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); dump_full_bpobj(&dp->dp_obsolete_bpobj, "Pool obsolete blocks", 0); } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { dump_bptree(spa->spa_meta_objset, dp->dp_bptree_obj, "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); } for (spa_feature_t f = 0; f < SPA_FEATURES; f++) global_feature_count[f] = UINT64_MAX; global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0; global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; global_feature_count[SPA_FEATURE_LIVELIST] = 0; (void) dmu_objset_find(spa_name(spa), dump_one_objset, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); if (rc == 0 && !dump_opt['L']) rc = dump_mos_leaks(spa); for (f = 0; f < SPA_FEATURES; f++) { uint64_t refcount; uint64_t *arr; if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) { if (global_feature_count[f] == UINT64_MAX) continue; if (!spa_feature_is_enabled(spa, f)) { ASSERT0(global_feature_count[f]); continue; } arr = global_feature_count; } else { if (!spa_feature_is_enabled(spa, f)) { ASSERT0(dataset_feature_count[f]); continue; } arr = dataset_feature_count; } if (feature_get_refcount(spa, &spa_feature_table[f], &refcount) == ENOTSUP) continue; if (arr[f] != refcount) { (void) printf("%s feature refcount mismatch: " "%lld consumers != %lld refcount\n", spa_feature_table[f].fi_uname, (longlong_t)arr[f], (longlong_t)refcount); rc = 2; } else { (void) printf("Verified %s feature refcount " "of %llu is correct\n", spa_feature_table[f].fi_uname, (longlong_t)refcount); } } if (rc == 0) rc = verify_device_removal_feature_counts(spa); } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); if (rc == 0) rc = verify_spacemap_refcounts(spa); if (dump_opt['s']) show_pool_stats(spa); if (dump_opt['h']) dump_history(spa); if (rc == 0) rc = verify_checkpoint(spa); if (rc != 0) { dump_debug_buffer(); zdb_exit(rc); } } #define ZDB_FLAG_CHECKSUM 0x0001 #define ZDB_FLAG_DECOMPRESS 0x0002 #define ZDB_FLAG_BSWAP 0x0004 #define ZDB_FLAG_GBH 0x0008 #define ZDB_FLAG_INDIRECT 0x0010 #define ZDB_FLAG_RAW 0x0020 #define ZDB_FLAG_PRINT_BLKPTR 0x0040 #define ZDB_FLAG_VERBOSE 0x0080 static int flagbits[256]; static char flagbitstr[16]; static void zdb_print_blkptr(const blkptr_t *bp, int flags) { char blkbuf[BP_SPRINTF_LEN]; if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static void zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) { int i; for (i = 0; i < nbps; i++) zdb_print_blkptr(&bp[i], flags); } static void zdb_dump_gbh(void *buf, int flags) { zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); } static void zdb_dump_block_raw(void *buf, uint64_t size, int flags) { if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array(buf, size); VERIFY(write(fileno(stdout), buf, size) == size); } static void zdb_dump_block(char *label, void *buf, uint64_t size, int flags) { uint64_t *d = (uint64_t *)buf; unsigned nwords = size / sizeof (uint64_t); int do_bswap = !!(flags & ZDB_FLAG_BSWAP); unsigned 
i, j; const char *hdr; char *c; if (do_bswap) hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; else hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); #ifdef _ZFS_LITTLE_ENDIAN /* correct the endianness */ do_bswap = !do_bswap; #endif for (i = 0; i < nwords; i += 2) { (void) printf("%06llx: %016llx %016llx ", (u_longlong_t)(i * sizeof (uint64_t)), (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); c = (char *)&d[i]; for (j = 0; j < 2 * sizeof (uint64_t); j++) (void) printf("%c", isprint(c[j]) ? c[j] : '.'); (void) printf("\n"); } } /* * There are two acceptable formats: * leaf_name - For example: c1t0d0 or /tmp/ztest.0a * child[.child]* - For example: 0.1.1 * * The second form can be used to specify arbitrary vdevs anywhere * in the hierarchy. For example, in a pool with a mirror of * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . */ static vdev_t * zdb_vdev_lookup(vdev_t *vdev, const char *path) { char *s, *p, *q; unsigned i; if (vdev == NULL) return (NULL); /* First, assume the x.x.x.x format */ i = strtoul(path, &s, 10); if (s == path || (s && *s != '.' && *s != '\0')) goto name; if (i >= vdev->vdev_children) return (NULL); vdev = vdev->vdev_child[i]; if (s && *s == '\0') return (vdev); return (zdb_vdev_lookup(vdev, s+1)); name: for (i = 0; i < vdev->vdev_children; i++) { vdev_t *vc = vdev->vdev_child[i]; if (vc->vdev_path == NULL) { vc = zdb_vdev_lookup(vc, path); if (vc == NULL) continue; else return (vc); } p = strrchr(vc->vdev_path, '/'); p = p ? p + 1 : vc->vdev_path; q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; if (strcmp(vc->vdev_path, path) == 0) return (vc); if (strcmp(p, path) == 0) return (vc); if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) return (vc); } return (NULL); } static int name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr) { dsl_dataset_t *ds; dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id, NULL, &ds); if (error != 0) { (void) fprintf(stderr, "failed to hold objset %llu: %s\n", (u_longlong_t)objset_id, strerror(error)); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); return (error); } dsl_dataset_name(ds, outstr); dsl_dataset_rele(ds, NULL); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); return (0); } static boolean_t zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize) { char *s0, *s1, *tmp = NULL; if (sizes == NULL) return (B_FALSE); s0 = strtok_r(sizes, "/", &tmp); if (s0 == NULL) return (B_FALSE); s1 = strtok_r(NULL, "/", &tmp); *lsize = strtoull(s0, NULL, 16); *psize = s1 ? strtoull(s1, NULL, 16) : *lsize; return (*lsize >= *psize && *psize > 0); } #define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg)) static boolean_t try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize, int flags, int cfunc, void *lbuf, void *lbuf2) { if (flags & ZDB_FLAG_VERBOSE) { (void) fprintf(stderr, "Trying %05llx -> %05llx (%s)\n", (u_longlong_t)psize, (u_longlong_t)lsize, zio_compress_table[cfunc].ci_name); } /* * We set lbuf to all zeros and lbuf2 to all * ones, then decompress to both buffers and * compare their contents. This way we can * know if decompression filled exactly to * lsize or if it left some bytes unwritten. 
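 * A decompressor that "succeeds" but fills fewer than lsize bytes
 * leaves the untouched tails (0x00 vs 0xff) differing, so that size
 * guess is rejected.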
*/ memset(lbuf, 0x00, lsize); memset(lbuf2, 0xff, lsize); abd_t labd, labd2; abd_get_from_buf_struct(&labd, lbuf, lsize); abd_get_from_buf_struct(&labd2, lbuf2, lsize); boolean_t ret = B_FALSE; if (zio_decompress_data(cfunc, pabd, &labd, psize, lsize, NULL) == 0 && zio_decompress_data(cfunc, pabd, &labd2, psize, lsize, NULL) == 0 && memcmp(lbuf, lbuf2, lsize) == 0) ret = B_TRUE; abd_free(&labd2); abd_free(&labd); return (ret); } static uint64_t zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, uint64_t psize, int flags) { (void) buf; uint64_t orig_lsize = lsize; boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL)); boolean_t found = B_FALSE; /* * We don't know how the data was compressed, so just try * every decompress function at every inflated blocksize. */ void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 }; int *cfuncp = cfuncs; uint64_t maxlsize = SPA_MAXBLOCKSIZE; uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) | ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) | ZIO_COMPRESS_MASK(ZLE); *cfuncp++ = ZIO_COMPRESS_LZ4; *cfuncp++ = ZIO_COMPRESS_LZJB; mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB); /* * Every gzip level has the same decompressor, no need to * run it 9 times per bruteforce attempt. */ mask |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3); mask |= ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5); mask |= ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7); mask |= ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9); for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) if (((1ULL << c) & mask) == 0) *cfuncp++ = c; /* * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this * could take a while and we should let the user know * we are not stuck. On the other hand, printing progress * info gets old after a while. User can specify 'v' flag * to see the progression. */ if (lsize == psize) lsize += SPA_MINBLOCKSIZE; else maxlsize = lsize; for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) { for (cfuncp = cfuncs; *cfuncp; cfuncp++) { if (try_decompress_block(pabd, lsize, psize, flags, *cfuncp, lbuf, lbuf2)) { found = B_TRUE; break; } } if (*cfuncp != 0) break; } if (!found && tryzle) { for (lsize = orig_lsize; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) { if (try_decompress_block(pabd, lsize, psize, flags, ZIO_COMPRESS_ZLE, lbuf, lbuf2)) { *cfuncp = ZIO_COMPRESS_ZLE; found = B_TRUE; break; } } } umem_free(lbuf2, SPA_MAXBLOCKSIZE); if (*cfuncp == ZIO_COMPRESS_ZLE) { printf("\nZLE decompression was selected. If you " "suspect the results are wrong,\ntry avoiding ZLE " "by setting and exporting ZDB_NO_ZLE=\"true\"\n"); } return (lsize > maxlsize ? -1 : lsize); } /* * Read a block from a pool and print it out. 
The syntax of the * block descriptor is: * * pool:vdev_specifier:offset:[lsize/]psize[:flags] * * pool - The name of the pool you wish to read from * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) * offset - offset, in hex, in bytes * size - Amount of data to read, in hex, in bytes * flags - A string of characters specifying options * b: Decode a blkptr at given offset within block * c: Calculate and display checksums * d: Decompress data before dumping * e: Byteswap data before dumping * g: Display data as a gang block header * i: Display as an indirect block * r: Dump raw data to stdout * v: Verbose * */ static void zdb_read_block(char *thing, spa_t *spa) { blkptr_t blk, *bp = &blk; dva_t *dva = bp->blk_dva; int flags = 0; uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; abd_t *pabd; void *lbuf, *buf; char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL; const char *vdev, *errmsg = NULL; int i, error; boolean_t borrowed = B_FALSE, found = B_FALSE; dup = strdup(thing); s = strtok_r(dup, ":", &tmp); vdev = s ?: ""; s = strtok_r(NULL, ":", &tmp); offset = strtoull(s ? s : "", NULL, 16); sizes = strtok_r(NULL, ":", &tmp); s = strtok_r(NULL, ":", &tmp); flagstr = strdup(s ?: ""); if (!zdb_parse_block_sizes(sizes, &lsize, &psize)) errmsg = "invalid size(s)"; if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE)) errmsg = "size must be a multiple of sector size"; if (!IS_P2ALIGNED(offset, DEV_BSIZE)) errmsg = "offset must be a multiple of sector size"; if (errmsg) { (void) printf("Invalid block specifier: %s - %s\n", thing, errmsg); goto done; } tmp = NULL; for (s = strtok_r(flagstr, ":", &tmp); s != NULL; s = strtok_r(NULL, ":", &tmp)) { for (i = 0; i < strlen(flagstr); i++) { int bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { (void) printf("***Ignoring flag: %c\n", (uchar_t)flagstr[i]); continue; } found = B_TRUE; flags |= bit; p = &flagstr[i + 1]; if (*p != ':' && *p != '\0') { int j = 0, nextbit = flagbits[(uchar_t)*p]; char *end, offstr[8] = { 0 }; if ((bit == ZDB_FLAG_PRINT_BLKPTR) && (nextbit == 0)) { /* look ahead to isolate the offset */ while (nextbit == 0 && strchr(flagbitstr, *p) == NULL) { offstr[j] = *p; j++; if (i + j > strlen(flagstr)) break; p++; nextbit = flagbits[(uchar_t)*p]; } blkptr_offset = strtoull(offstr, &end, 16); i += j; } else if (nextbit == 0) { (void) printf("***Ignoring flag arg:" " '%c'\n", (uchar_t)*p); } } } } if (blkptr_offset % sizeof (blkptr_t)) { printf("Block pointer offset 0x%llx " "must be divisible by 0x%x\n", (longlong_t)blkptr_offset, (int)sizeof (blkptr_t)); goto done; } if (found == B_FALSE && strlen(flagstr) > 0) { printf("Invalid flag arg: '%s'\n", flagstr); goto done; } vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { (void) printf("***Invalid vdev: %s\n", vdev); goto done; } else { if (vd->vdev_path) (void) fprintf(stderr, "Found vdev: %s\n", vd->vdev_path); else (void) fprintf(stderr, "Found vdev type: %s\n", vd->vdev_ops->vdev_op_type); } pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); DVA_SET_VDEV(&dva[0], vd->vdev_id); DVA_SET_OFFSET(&dva[0], offset); DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, lsize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); 
BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, 0); if (vd == vd->vdev_top) { /* * Treat this as a normal block read. */ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); if (error) { (void) printf("Read of %s failed, error: %d\n", thing, error); goto out; } uint64_t orig_lsize = lsize; buf = lbuf; if (flags & ZDB_FLAG_DECOMPRESS) { lsize = zdb_decompress_block(pabd, buf, lbuf, lsize, psize, flags); if (lsize == -1) { (void) printf("Decompress of %s failed\n", thing); goto out; } } else { buf = abd_borrow_buf_copy(pabd, lsize); borrowed = B_TRUE; } /* * Try to detect invalid block pointer. If invalid, try * decompressing. */ if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) && !(flags & ZDB_FLAG_DECOMPRESS)) { const blkptr_t *b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (zfs_blkptr_verify(spa, b, BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) { abd_return_buf_copy(pabd, buf, lsize); borrowed = B_FALSE; buf = lbuf; lsize = zdb_decompress_block(pabd, buf, lbuf, lsize, psize, flags); b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (lsize == -1 || zfs_blkptr_verify(spa, b, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) { printf("invalid block pointer at this DVA\n"); goto out; } } } if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); else if (flags & ZDB_FLAG_RAW) zdb_dump_block_raw(buf, lsize, flags); else if (flags & ZDB_FLAG_INDIRECT) zdb_dump_indirect((blkptr_t *)buf, orig_lsize / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) zdb_dump_gbh(buf, flags); else zdb_dump_block(thing, buf, lsize, flags); /* * If :c was specified, iterate through the checksum table to * calculate and display each checksum for our specified * DVA and length. 
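 * Checksums whose result is embedded in the block itself, as well as
 * ZIO_CHECKSUM_NOPARITY, are skipped; the block is re-read for each
 * remaining algorithm and the computed checksum printed.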
*/ if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) && !(flags & ZDB_FLAG_GBH)) { zio_t *czio; (void) printf("\n"); for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL; ck < ZIO_CHECKSUM_FUNCTIONS; ck++) { if ((zio_checksum_table[ck].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) || ck == ZIO_CHECKSUM_NOPARITY) { continue; } BP_SET_CHECKSUM(bp, ck); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); if (vd == vd->vdev_top) { zio_nowait(zio_read(czio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_DONT_RETRY, NULL)); } else { zio_nowait(zio_vdev_child_io(czio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(czio); if (error == 0 || error == ECKSUM) { zio_t *ck_zio = zio_null(NULL, spa, NULL, NULL, NULL, 0); ck_zio->io_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); ck_zio->io_bp = bp; zio_checksum_compute(ck_zio, ck, pabd, lsize); printf( "%12s\t" "cksum=%016llx:%016llx:%016llx:%016llx\n", zio_checksum_table[ck].ci_name, (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], (u_longlong_t)bp->blk_cksum.zc_word[2], (u_longlong_t)bp->blk_cksum.zc_word[3]); zio_wait(ck_zio); } else { printf("error %d reading block\n", error); } spa_config_exit(spa, SCL_STATE, FTAG); } } if (borrowed) abd_return_buf_copy(pabd, buf, lsize); out: abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); done: free(flagstr); free(dup); } static void zdb_embedded_block(char *thing) { blkptr_t bp = {{{{0}}}}; unsigned long long *words = (void *)&bp; char *buf; int err; err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", words + 0, words + 1, words + 2, words + 3, words + 4, words + 5, words + 6, words + 7, words + 8, words + 9, words + 10, words + 11, words + 12, words + 13, words + 14, words + 15); if (err != 16) { (void) fprintf(stderr, "invalid input format\n"); zdb_exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); zdb_exit(1); } err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { (void) fprintf(stderr, "decode failed: %u\n", err); zdb_exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); free(buf); } /* check for valid hex or decimal numeric string */ static boolean_t zdb_numeric(char *str) { int i = 0; if (strlen(str) == 0) return (B_FALSE); if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0) i = 2; for (; i < strlen(str); i++) { if (!isxdigit(str[i])) return (B_FALSE); } return (B_TRUE); } static int dummy_get_file_info(dmu_object_type_t bonustype, const void *data, zfs_file_info_t *zoi) { (void) data, (void) zoi; if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (ENOENT); (void) fprintf(stderr, "dummy_get_file_info: not implemented"); abort(); } int main(int argc, char **argv) { int c; int dump_all = 1; int verbose = 0; int error = 0; char **searchdirs = NULL; int nsearch = 0; char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *policy = NULL; uint64_t max_txg = UINT64_MAX; int64_t objset_id = -1; uint64_t object; int flags = ZFS_IMPORT_MISSING_LOG; int rewind = ZPOOL_NEVER_REWIND; char *spa_config_path_env, *objset_str; boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE; nvlist_t *cfg = NULL; 
struct sigaction action; boolean_t force_import = B_FALSE; boolean_t config_path_console = B_FALSE; char pbuf[MAXPATHLEN]; dprintf_setup(&argc, argv); /* * Set up signal handlers, so if we crash due to bad on-disk data we * can get more info. Unlike ztest, we don't bail out if we can't set * up signal handlers, because zdb is very useful without them. */ action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); action.sa_flags = 0; if (sigaction(SIGSEGV, &action, NULL) < 0) { (void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n", strerror(errno)); } if (sigaction(SIGABRT, &action, NULL) < 0) { (void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n", strerror(errno)); } /* * If there is an environment variable SPA_CONFIG_PATH it overrides * default spa_config_path setting. If -U flag is specified it will * override this environment variable settings once again. */ spa_config_path_env = getenv("SPA_CONFIG_PATH"); if (spa_config_path_env != NULL) spa_config_path = spa_config_path_env; /* * For performance reasons, we set this tunable down. We do so before * the arg parsing section so that the user can override this value if * they choose. */ zfs_btree_verify_intensity = 3; struct option long_options[] = { {"ignore-assertions", no_argument, NULL, 'A'}, {"block-stats", no_argument, NULL, 'b'}, {"backup", no_argument, NULL, 'B'}, {"checksum", no_argument, NULL, 'c'}, {"config", no_argument, NULL, 'C'}, {"datasets", no_argument, NULL, 'd'}, {"dedup-stats", no_argument, NULL, 'D'}, {"exported", no_argument, NULL, 'e'}, {"embedded-block-pointer", no_argument, NULL, 'E'}, {"automatic-rewind", no_argument, NULL, 'F'}, {"dump-debug-msg", no_argument, NULL, 'G'}, {"history", no_argument, NULL, 'h'}, {"intent-logs", no_argument, NULL, 'i'}, {"inflight", required_argument, NULL, 'I'}, {"checkpointed-state", no_argument, NULL, 'k'}, {"key", required_argument, NULL, 'K'}, {"label", no_argument, NULL, 'l'}, {"disable-leak-tracking", no_argument, NULL, 'L'}, {"metaslabs", no_argument, NULL, 'm'}, {"metaslab-groups", no_argument, NULL, 'M'}, {"numeric", no_argument, NULL, 'N'}, {"option", required_argument, NULL, 'o'}, {"object-lookups", no_argument, NULL, 'O'}, {"path", required_argument, NULL, 'p'}, {"parseable", no_argument, NULL, 'P'}, {"skip-label", no_argument, NULL, 'q'}, {"copy-object", no_argument, NULL, 'r'}, {"read-block", no_argument, NULL, 'R'}, {"io-stats", no_argument, NULL, 's'}, {"simulate-dedup", no_argument, NULL, 'S'}, {"txg", required_argument, NULL, 't'}, {"brt-stats", no_argument, NULL, 'T'}, {"uberblock", no_argument, NULL, 'u'}, {"cachefile", required_argument, NULL, 'U'}, {"verbose", no_argument, NULL, 'v'}, {"verbatim", no_argument, NULL, 'V'}, {"dump-blocks", required_argument, NULL, 'x'}, {"extreme-rewind", no_argument, NULL, 'X'}, {"all-reconstruction", no_argument, NULL, 'Y'}, {"livelist", no_argument, NULL, 'y'}, {"zstd-headers", no_argument, NULL, 'Z'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ", long_options, NULL)) != -1) { switch (c) { case 'b': case 'B': case 'c': case 'C': case 'd': case 'D': case 'E': case 'G': case 'h': case 'i': case 'l': case 'm': case 'M': case 'N': case 'O': case 'r': case 'R': case 's': case 'S': case 'T': case 'u': case 'y': case 'Z': dump_opt[c]++; dump_all = 0; break; case 'A': case 'e': case 'F': case 'k': case 'L': case 'P': case 'q': case 'X': dump_opt[c]++; break; case 'Y': zfs_reconstruct_indirect_combinations_max = INT_MAX; zfs_deadman_enabled = 0; break; /* NB: Sort 
single match options below. */ case 'I': max_inflight_bytes = strtoull(optarg, NULL, 0); if (max_inflight_bytes == 0) { (void) fprintf(stderr, "maximum number " "of inflight bytes must be greater " "than 0\n"); usage(); } break; case 'K': dump_opt[c]++; key_material = strdup(optarg); /* redact key material in process table */ while (*optarg != '\0') { *optarg++ = '*'; } break; case 'o': error = set_global_var(optarg); if (error != 0) usage(); break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); } else { char **tmp = umem_alloc((nsearch + 1) * sizeof (char *), UMEM_NOFAIL); memcpy(tmp, searchdirs, nsearch * sizeof (char *)); umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 't': max_txg = strtoull(optarg, NULL, 0); if (max_txg < TXG_INITIAL) { (void) fprintf(stderr, "incorrect txg " "specified: %s\n", optarg); usage(); } break; case 'U': config_path_console = B_TRUE; spa_config_path = optarg; if (spa_config_path[0] != '/') { (void) fprintf(stderr, "cachefile must be an absolute path " "(i.e. start with a slash)\n"); usage(); } break; case 'v': verbose++; break; case 'V': flags = ZFS_IMPORT_VERBATIM; break; case 'x': vn_dumpdir = optarg; break; default: usage(); break; } } if (!dump_opt['e'] && searchdirs != NULL) { (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } #if defined(_LP64) /* * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT; zfs_arc_max = 256 * 1024 * 1024; #endif /* * "zdb -c" uses checksum-verifying scrub i/os which are async reads. * "zdb -b" uses traversal prefetch which uses async reads. * For good performance, let several of them be active at once. */ zfs_vdev_async_read_max_active = 10; /* * Disable reference tracking for better performance. */ reference_tracking_enable = B_FALSE; /* * Do not fail spa_load when spa_load_verify fails. This is needed * to load non-idle pools. */ spa_load_verify_dryrun = B_TRUE; /* * ZDB should have ability to read spacemaps. 
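 * (zdb opens the pool read-only, and this tunable allows the spacemaps
 * to be read in that mode.)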
*/ spa_mode_readable_spacemaps = B_TRUE; if (dump_all) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2)); zfs_recover = (dump_opt['A'] > 1); argc -= optind; argv += optind; if (argc < 2 && dump_opt['R']) usage(); target = argv[0]; /* * Automate cachefile */ if (!spa_config_path_env && !config_path_console && target && libzfs_core_init() == 0) { char *pname = strdup(target); const char *value; nvlist_t *pnvl = NULL; nvlist_t *vnvl = NULL; if (strpbrk(pname, "/@") != NULL) *strpbrk(pname, "/@") = '\0'; if (pname && lzc_get_props(pname, &pnvl) == 0) { if (nvlist_lookup_nvlist(pnvl, "cachefile", &vnvl) == 0) { value = fnvlist_lookup_string(vnvl, ZPROP_VALUE); } else { value = "-"; } strlcpy(pbuf, value, sizeof (pbuf)); if (pbuf[0] != '\0') { if (pbuf[0] == '/') { if (access(pbuf, F_OK) == 0) spa_config_path = pbuf; else force_import = B_TRUE; } else if ((strcmp(pbuf, "-") == 0 && access(ZPOOL_CACHE, F_OK) != 0) || strcmp(pbuf, "none") == 0) { force_import = B_TRUE; } } nvlist_free(vnvl); } free(pname); nvlist_free(pnvl); libzfs_core_fini(); } dmu_objset_register_type(DMU_OST_ZFS, dummy_get_file_info); kernel_init(SPA_MODE_READ); kernel_init_done = B_TRUE; if (dump_opt['E']) { if (argc != 1) usage(); zdb_embedded_block(argv[0]); error = 0; goto fini; } if (argc < 1) { if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); error = 0; goto fini; } usage(); } if (dump_opt['l']) { error = dump_label(argv[0]); goto fini; } if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); /* -N implies -d */ if (dump_opt['N'] && dump_opt['d'] == 0) dump_opt['d'] = dump_opt['N']; if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) fatal("internal error: %s", strerror(ENOMEM)); error = 0; if (strpbrk(target, "/@") != NULL) { size_t targetlen; target_pool = strdup(target); *strpbrk(target_pool, "/@") = '\0'; target_is_spa = B_FALSE; targetlen = strlen(target); if (targetlen && target[targetlen - 1] == '/') target[targetlen - 1] = '\0'; /* * See if an objset ID was supplied (-d /). * To disambiguate tank/100, consider the 100 as objsetID * if -N was given, otherwise 100 is an objsetID iff * tank/100 as a named dataset fails on lookup. 
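 * For example, "zdb -d tank/100" first tries the dataset literally named
 * "tank/100" and falls back to objset ID 100 only if that lookup fails,
 * while "zdb -N -d tank/100" always treats 100 as an objset ID.
 */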
*/ objset_str = strchr(target, '/'); if (objset_str && strlen(objset_str) > 1 && zdb_numeric(objset_str + 1)) { char *endptr; errno = 0; objset_str++; objset_id = strtoull(objset_str, &endptr, 0); /* dataset 0 is the same as opening the pool */ if (errno == 0 && endptr != objset_str && objset_id != 0) { if (dump_opt['N']) dataset_lookup = B_TRUE; } /* normal dataset name not an objset ID */ if (endptr == objset_str) { objset_id = -1; } } else if (objset_str && !zdb_numeric(objset_str + 1) && dump_opt['N']) { printf("Supply a numeric objset ID with -N\n"); error = 1; goto fini; } } else { target_pool = target; } if (dump_opt['e'] || force_import) { importargs_t args = { 0 }; /* * If path is not provided, search in /dev */ if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); searchdirs[nsearch++] = (char *)ZFS_DEVDIR; } args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_TRUE; libpc_handle_t lpch = { .lpc_lib_handle = NULL, .lpc_ops = &libzpool_config_ops, .lpc_printerr = B_TRUE }; error = zpool_find_config(&lpch, target_pool, &cfg, &args); if (error == 0) { if (nvlist_add_nvlist(cfg, ZPOOL_LOAD_POLICY, policy) != 0) { fatal("can't open '%s': %s", target, strerror(ENOMEM)); } if (dump_opt['C'] > 1) { (void) printf("\nConfiguration for import:\n"); dump_nvlist(cfg, 8); } /* * Disable the activity check to allow examination of * active pools. */ error = spa_import(target_pool, cfg, NULL, flags | ZFS_IMPORT_SKIP_MMP); } } if (searchdirs != NULL) { umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = NULL; } /* * We need to make sure to process -O option or call * dump_path after the -e option has been processed, * which imports the pool to the namespace if it's * not in the cachefile. */ if (dump_opt['O']) { if (argc != 2) usage(); dump_opt['v'] = verbose + 3; error = dump_path(argv[0], argv[1], NULL); goto fini; } if (dump_opt['r']) { target_is_spa = B_FALSE; if (argc != 3) usage(); dump_opt['v'] = verbose; error = dump_path(argv[0], argv[1], &object); if (error != 0) fatal("internal error: %s", strerror(error)); } /* * import_checkpointed_state makes the assumption that the * target pool that we pass it is already part of the spa * namespace. Because of that we need to make sure to call * it always after the -e option has been processed, which * imports the pool to the namespace if it's not in the * cachefile. */ char *checkpoint_pool = NULL; char *checkpoint_target = NULL; if (dump_opt['k']) { checkpoint_pool = import_checkpointed_state(target, cfg, &checkpoint_target); if (checkpoint_target != NULL) target = checkpoint_target; } if (cfg != NULL) { nvlist_free(cfg); cfg = NULL; } if (target_pool != target) free(target_pool); if (error == 0) { if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { ASSERT(checkpoint_pool != NULL); ASSERT(checkpoint_target == NULL); error = spa_open(checkpoint_pool, &spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but " "spa_open() failed with error %d\n", checkpoint_pool, error); } } else if (target_is_spa || dump_opt['R'] || dump_opt['B'] || objset_id == 0) { zdb_set_skip_mmp(target); error = spa_open_rewind(target, &spa, FTAG, policy, NULL); if (error) { /* * If we're missing the log device then * try opening the pool after clearing the * log state. 
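 * (The retry below sets spa_log_state to SPA_LOG_CLEAR so the open can
 * proceed without the missing log device.)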
*/ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL && spa->spa_log_state == SPA_LOG_MISSING) { spa->spa_log_state = SPA_LOG_CLEAR; error = 0; } mutex_exit(&spa_namespace_lock); if (!error) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); } } } else if (strpbrk(target, "#") != NULL) { dsl_pool_t *dp; error = dsl_pool_hold(target, FTAG, &dp); if (error != 0) { fatal("can't dump '%s': %s", target, strerror(error)); } error = dump_bookmark(dp, target, B_TRUE, verbose > 1); dsl_pool_rele(dp, FTAG); if (error != 0) { fatal("can't dump '%s': %s", target, strerror(error)); } goto fini; } else { target_pool = strdup(target); if (strpbrk(target, "/@") != NULL) *strpbrk(target_pool, "/@") = '\0'; zdb_set_skip_mmp(target); /* * If -N was supplied, the user has indicated that * zdb -d / is in effect. Otherwise * we first assume that the dataset string is the * dataset name. If dmu_objset_hold fails with the * dataset string, and we have an objset_id, retry the * lookup with the objsetID. */ boolean_t retry = B_TRUE; retry_lookup: if (dataset_lookup == B_TRUE) { /* * Use the supplied id to get the name * for open_objset. */ error = spa_open(target_pool, &spa, FTAG); if (error == 0) { error = name_from_objset_id(spa, objset_id, dsname); spa_close(spa, FTAG); if (error == 0) target = dsname; } } if (error == 0) { if (objset_id > 0 && retry) { int err = dmu_objset_hold(target, FTAG, &os); if (err) { dataset_lookup = B_TRUE; retry = B_FALSE; goto retry_lookup; } else { dmu_objset_rele(os, FTAG); } } error = open_objset(target, FTAG, &os); } if (error == 0) spa = dmu_objset_spa(os); free(target_pool); } } nvlist_free(policy); if (error) fatal("can't open '%s': %s", target, strerror(error)); /* * Set the pool failure mode to panic in order to prevent the pool * from suspending. A suspended I/O will have no way to resume and * can prevent the zdb(8) command from terminating as expected. */ if (spa != NULL) spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; argv++; argc--; if (dump_opt['r']) { error = zdb_copy_object(os, object, argv[1]); } else if (!dump_opt['R']) { flagbits['d'] = ZOR_FLAG_DIRECTORY; flagbits['f'] = ZOR_FLAG_PLAIN_FILE; flagbits['m'] = ZOR_FLAG_SPACE_MAP; flagbits['z'] = ZOR_FLAG_ZAP; flagbits['A'] = ZOR_FLAG_ALL_TYPES; if (argc > 0 && dump_opt['d']) { zopt_object_args = argc; zopt_object_ranges = calloc(zopt_object_args, sizeof (zopt_object_range_t)); for (unsigned i = 0; i < zopt_object_args; i++) { int err; const char *msg = NULL; err = parse_object_range(argv[i], &zopt_object_ranges[i], &msg); if (err != 0) fatal("Bad object or range: '%s': %s\n", argv[i], msg ?: ""); } } else if (argc > 0 && dump_opt['m']) { zopt_metaslab_args = argc; zopt_metaslab = calloc(zopt_metaslab_args, sizeof (uint64_t)); for (unsigned i = 0; i < zopt_metaslab_args; i++) { errno = 0; zopt_metaslab[i] = strtoull(argv[i], NULL, 0); if (zopt_metaslab[i] == 0 && errno != 0) fatal("bad number %s: %s", argv[i], strerror(errno)); } } if (dump_opt['B']) { dump_backup(target, objset_id, argc > 0 ? 
argv[0] : NULL); } else if (os != NULL) { dump_objset(os); } else if (zopt_object_args > 0 && !dump_opt['m']) { dump_objset(spa->spa_meta_objset); } else { dump_zpool(spa); } } else { flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; flagbits['c'] = ZDB_FLAG_CHECKSUM; flagbits['d'] = ZDB_FLAG_DECOMPRESS; flagbits['e'] = ZDB_FLAG_BSWAP; flagbits['g'] = ZDB_FLAG_GBH; flagbits['i'] = ZDB_FLAG_INDIRECT; flagbits['r'] = ZDB_FLAG_RAW; flagbits['v'] = ZDB_FLAG_VERBOSE; for (int i = 0; i < argc; i++) zdb_read_block(argv[i], spa); } if (dump_opt['k']) { free(checkpoint_pool); if (!target_is_spa) free(checkpoint_target); } fini: if (spa != NULL) zdb_ddt_cleanup(spa); if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { spa_close(spa, FTAG); } fuid_table_destroy(); dump_debug_buffer(); if (kernel_init_done) kernel_fini(); return (error); } diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 9cd26a8650ad..ce859226c215 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -1,13563 +1,13652 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. * Copyright 2016 Igor Kozhukhov . * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K * Copyright (c) 2021, Colm Buckley * Copyright (c) 2021, 2023, Klara Inc. 
* Copyright [2021] Hewlett Packard Enterprise Development LP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" #include "statcommon.h" libzfs_handle_t *g_zfs; static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */ static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); static int zpool_do_add(int, char **); static int zpool_do_remove(int, char **); static int zpool_do_labelclear(int, char **); static int zpool_do_checkpoint(int, char **); static int zpool_do_prefetch(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); static int zpool_do_status(int, char **); static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); static int zpool_do_reopen(int, char **); static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); static int zpool_do_initialize(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_resilver(int, char **); static int zpool_do_trim(int, char **); static int zpool_do_import(int, char **); static int zpool_do_export(int, char **); static int zpool_do_upgrade(int, char **); static int zpool_do_history(int, char **); static int zpool_do_events(int, char **); static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); static int zpool_do_sync(int, char **); static int zpool_do_version(int, char **); static int zpool_do_wait(int, char **); +static int zpool_do_ddt_prune(int, char **); + static int zpool_do_help(int argc, char **argv); static zpool_compat_status_t zpool_do_load_compat( const char *, boolean_t *); enum zpool_options { ZPOOL_OPTION_POWER = 1024, ZPOOL_OPTION_ALLOW_INUSE, ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH, ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH, ZPOOL_OPTION_POOL_KEY_GUID, ZPOOL_OPTION_JSON_NUMS_AS_INT, ZPOOL_OPTION_JSON_FLAT_VDEVS }; /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. 
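 * (When built with DEBUG, the strings returned below supply the default
 * $UMEM_DEBUG and $UMEM_LOGGING settings.)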
*/ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_ADD, HELP_ATTACH, HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, + HELP_DDT_PRUNE, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, HELP_HISTORY, HELP_IMPORT, HELP_IOSTAT, HELP_LABELCLEAR, HELP_LIST, HELP_OFFLINE, HELP_ONLINE, HELP_PREFETCH, HELP_REPLACE, HELP_REMOVE, HELP_INITIALIZE, HELP_SCRUB, HELP_RESILVER, HELP_TRIM, HELP_STATUS, HELP_UPGRADE, HELP_EVENTS, HELP_GET, HELP_SET, HELP_SPLIT, HELP_SYNC, HELP_REGUID, HELP_REOPEN, HELP_VERSION, HELP_WAIT } zpool_help_t; /* * Flags for stats to display with "zpool iostats" */ enum iostat_type { IOS_DEFAULT = 0, IOS_LATENCY = 1, IOS_QUEUES = 2, IOS_L_HISTO = 3, IOS_RQ_HISTO = 4, IOS_COUNT, /* always last element */ }; /* iostat_type entries as bitmasks */ #define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) #define IOS_LATENCY_M (1ULL << IOS_LATENCY) #define IOS_QUEUES_M (1ULL << IOS_QUEUES) #define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) #define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) /* Mask of all the histo bits */ #define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) /* * Lookup table for iostat flags to nvlist names. Basically a list * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. */ static const char *vsx_type_to_nvlist[IOS_COUNT][15] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, NULL}, [IOS_LATENCY] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, NULL}, [IOS_QUEUES] = { ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, NULL}, [IOS_RQ_HISTO] = { ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO, ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO, NULL}, }; static const char *pool_scan_func_str[] = { "NONE", "SCRUB", "RESILVER", "ERRORSCRUB" }; static const char *pool_scan_state_str[] = { "NONE", "SCANNING", "FINISHED", "CANCELED", "ERRORSCRUBBING" }; static const char *vdev_rebuild_state_str[] = { "NONE", "ACTIVE", "CANCELED", "COMPLETE" }; static const char *checkpoint_state_str[] = { "NONE", "EXISTS", "DISCARDING" }; static const char *vdev_state_str[] = { "UNKNOWN", "CLOSED", "OFFLINE", "REMOVED", "CANT_OPEN", "FAULTED", "DEGRADED", "ONLINE" }; static const char *vdev_aux_str[] = { "NONE", "OPEN_FAILED", "CORRUPT_DATA", 
"NO_REPLICAS", "BAD_GUID_SUM", "TOO_SMALL", "BAD_LABEL", "VERSION_NEWER", "VERSION_OLDER", "UNSUP_FEAT", "SPARED", "ERR_EXCEEDED", "IO_FAILURE", "BAD_LOG", "EXTERNAL", "SPLIT_POOL", "BAD_ASHIFT", "EXTERNAL_PERSIST", "ACTIVE", "CHILDREN_OFFLINE", "ASHIFT_TOO_BIG" }; static const char *vdev_init_state_str[] = { "NONE", "ACTIVE", "CANCELED", "SUSPENDED", "COMPLETE" }; static const char *vdev_trim_state_str[] = { "NONE", "ACTIVE", "CANCELED", "SUSPENDED", "COMPLETE" }; #define ZFS_NICE_TIMESTAMP 100 /* * Given a cb->cb_flags with a histogram bit set, return the iostat_type. * Right now, only one histo bit is ever set at one time, so we can * just do a highbit64(a) */ #define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) typedef struct zpool_command { const char *name; int (*func)(int, char **); zpool_help_t usage; } zpool_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. */ static zpool_command_t command_table[] = { { "version", zpool_do_version, HELP_VERSION }, { NULL }, { "create", zpool_do_create, HELP_CREATE }, { "destroy", zpool_do_destroy, HELP_DESTROY }, { NULL }, { "add", zpool_do_add, HELP_ADD }, { "remove", zpool_do_remove, HELP_REMOVE }, { NULL }, { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, { NULL }, { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, { "prefetch", zpool_do_prefetch, HELP_PREFETCH }, { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, { "status", zpool_do_status, HELP_STATUS }, { NULL }, { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "initialize", zpool_do_initialize, HELP_INITIALIZE }, { "resilver", zpool_do_resilver, HELP_RESILVER }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { "trim", zpool_do_trim, HELP_TRIM }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "events", zpool_do_events, HELP_EVENTS }, { NULL }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, { "sync", zpool_do_sync, HELP_SYNC }, { NULL }, { "wait", zpool_do_wait, HELP_WAIT }, + { NULL }, + { "ddtprune", zpool_do_ddt_prune, HELP_DDT_PRUNE }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) #define VDEV_ALLOC_CLASS_LOGS "logs" #define MAX_CMD_LEN 256 static zpool_command_t *current_command; static zfs_type_t current_prop_type = (ZFS_TYPE_POOL | ZFS_TYPE_VDEV); static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: return (gettext("\tadd [-afgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: 
return (gettext("\tclear [[--power]|[-nF]] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_CHECKPOINT: return (gettext("\tcheckpoint [-d [-w]] ...\n")); case HELP_DESTROY: return (gettext("\tdestroy [-f] \n")); case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: return (gettext("\texport [-af] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]]\n" "\t [--rewind-to-checkpoint] [newpool]\n")); case HELP_IOSTAT: return (gettext("\tiostat [[[-c [script1,script2,...]" "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n" "\t [[pool ...]|[pool vdev ...]|[vdev ...]]" " [[-n] interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: return (gettext("\tlist [-gHLpPv] [-o property[,...]] [-j " "[--json-int, --json-pool-key-guid]] ...\n" "\t [-T d|u] [pool] [interval [count]]\n")); case HELP_PREFETCH: return (gettext("\tprefetch -t [] \n" "\t -t ddt \n")); case HELP_OFFLINE: return (gettext("\toffline [--power]|[[-f][-t]] " " ...\n")); case HELP_ONLINE: return (gettext("\tonline [--power][-e] " "...\n")); case HELP_REPLACE: return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); case HELP_REMOVE: return (gettext("\tremove [-npsw] ...\n")); case HELP_REOPEN: return (gettext("\treopen [-n] \n")); case HELP_INITIALIZE: return (gettext("\tinitialize [-c | -s | -u] [-w] " "[ ...]\n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] [-w] [-e] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_TRIM: return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [--power] [-j [--json-int, " "--json-flat-vdevs, ...\n" "\t --json-pool-key-guid]] [-c [script1,script2,...]] " "[-DegiLpPstvx] ...\n" "\t [-T d|u] [pool] [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" "\tupgrade [-V version] <-a | pool ...>\n")); case HELP_EVENTS: return (gettext("\tevents [-vHf [pool] | -c]\n")); case HELP_GET: return (gettext("\tget [-Hp] [-j [--json-int, " "--json-pool-key-guid]] ...\n" "\t [-o \"all\" | field[,...]] " "<\"all\" | property[,...]> ...\n")); case HELP_SET: return (gettext("\tset \n" "\tset \n")); case HELP_SPLIT: return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: return (gettext("\treguid [-g guid] \n")); case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); case HELP_VERSION: return (gettext("\tversion [-j]\n")); case HELP_WAIT: return (gettext("\twait [-Hp] [-T d|u] [-t [,...]] " " [interval]\n")); + case HELP_DDT_PRUNE: + return (gettext("\tddtprune -d|-p \n")); default: __builtin_unreachable(); } } static void zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) { uint_t children = 0; nvlist_t **child; uint_t i; (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children); if (children == 0) { char *path = zpool_vdev_name(g_zfs, zhp, nvroot, VDEV_NAME_PATH); if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && strcmp(path, VDEV_TYPE_HOLE) != 
0) fnvlist_add_boolean(res, path); free(path); return; } for (i = 0; i < children; i++) { zpool_collect_leaves(zhp, child[i], res); } } /* * Callback routine that will print out a pool property value. */ static int print_pool_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); return (ZPROP_CONT); } /* * Callback routine that will print out a vdev property value. */ static int print_vdev_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-19s ", vdev_prop_to_name(prop)); if (vdev_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (vdev_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", vdev_prop_values(prop)); return (ZPROP_CONT); } /* * Given a leaf vdev name like 'L5' return its VDEV_CONFIG_PATH like * '/dev/disk/by-vdev/L5'. */ static const char * vdev_name_to_path(zpool_handle_t *zhp, char *vdev) { nvlist_t *vdev_nv = zpool_find_vdev(zhp, vdev, NULL, NULL, NULL); if (vdev_nv == NULL) { return (NULL); } return (fnvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH)); } static int zpool_power_on(zpool_handle_t *zhp, char *vdev) { return (zpool_power(zhp, vdev, B_TRUE)); } static int zpool_power_on_and_disk_wait(zpool_handle_t *zhp, char *vdev) { int rc; rc = zpool_power_on(zhp, vdev); if (rc != 0) return (rc); zpool_disk_wait(vdev_name_to_path(zhp, vdev)); return (0); } static int zpool_power_on_pool_and_wait_for_devices(zpool_handle_t *zhp) { nvlist_t *nv; const char *path = NULL; int rc; /* Power up all the devices first */ FOR_EACH_REAL_LEAF_VDEV(zhp, nv) { path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); if (path != NULL) { rc = zpool_power_on(zhp, (char *)path); if (rc != 0) { return (rc); } } } /* * Wait for their devices to show up. Since we powered them on * at roughly the same time, they should all come online around * the same time. */ FOR_EACH_REAL_LEAF_VDEV(zhp, nv) { path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); zpool_disk_wait(path); } return (0); } static int zpool_power_off(zpool_handle_t *zhp, char *vdev) { return (zpool_power(zhp, vdev, B_FALSE)); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static __attribute__((noreturn)) void usage(boolean_t requested) { FILE *fp = requested ? 
stdout : stderr; if (current_command == NULL) { int i; (void) fprintf(fp, gettext("usage: zpool command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } (void) fprintf(fp, gettext("\nFor further help on a command or topic, " "run: %s\n"), "zpool help []"); } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && current_prop_type != (ZFS_TYPE_POOL | ZFS_TYPE_VDEV) && ((strcmp(current_command->name, "set") == 0) || (strcmp(current_command->name, "get") == 0) || (strcmp(current_command->name, "list") == 0))) { (void) fprintf(fp, "%s", gettext("\nthe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-19s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ if (current_prop_type == ZFS_TYPE_POOL) { (void) zprop_iter(print_pool_prop_cb, fp, B_FALSE, B_TRUE, current_prop_type); (void) fprintf(fp, "\t%-19s ", "feature@..."); (void) fprintf(fp, "YES " "disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties " "must be appended with a feature name.\n" "See zpool-features(7).\n")); } else if (current_prop_type == ZFS_TYPE_VDEV) { (void) zprop_iter(print_vdev_prop_cb, fp, B_FALSE, B_TRUE, current_prop_type); } } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * zpool initialize [-c | -s | -u] [-w] [ ...] * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool * if none specified. * * -c Cancel. Ends active initializing. * -s Suspend. Initializing can then be restarted with no flags. * -u Uninitialize. Clears initialization state. * -w Wait. Blocks until initializing has completed. 
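 * For example, "zpool initialize -w tank sda sdb" (hypothetical pool and
 * device names) starts initializing sda and sdb and blocks until both
 * complete, while "zpool initialize -s tank" suspends any initialization
 * currently in progress.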
*/ int zpool_do_initialize(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; nvlist_t *vdevs; int err = 0; boolean_t wait = B_FALSE; struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"suspend", no_argument, NULL, 's'}, {"uninit", no_argument, NULL, 'u'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; while ((c = getopt_long(argc, argv, "csuw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_CANCEL; break; case 's': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_SUSPEND; break; case 'u': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_UNINIT) { (void) fprintf(stderr, gettext("-u cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_UNINIT; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_INITIALIZE_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c, -s" "or -u\n")); usage(B_FALSE); } poolname = argv[0]; zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); } else { for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } if (wait) err = zpool_initialize_wait(zhp, cmd_type, vdevs); else err = zpool_initialize(zhp, cmd_type, vdevs); fnvlist_free(vdevs); zpool_close(zhp); return (err); } /* * print a pool vdev config for dry runs */ static void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, const char *match, int name_flags) { nvlist_t **child; uint_t c, children; char *vname; boolean_t printed = B_FALSE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (name != NULL) (void) printf("\t%*s%s\n", indent, "", name); return; } for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; const char *class = ""; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole == B_TRUE) { continue; } (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) class = VDEV_ALLOC_BIAS_LOG; (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &class); if (strcmp(match, class) != 0) continue; if (!printed && name != NULL) { (void) printf("\t%*s%s\n", indent, "", name); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); print_vdev_tree(zhp, vname, child[c], indent + 2, "", name_flags); free(vname); } } /* * Print the list of l2cache devices for dry runs. 
*/ static void print_cache_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "cache"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } /* * Print the list of spares for dry runs. */ static void print_spare_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "spares"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } typedef struct spare_cbdata { uint64_t cb_guid; zpool_handle_t *cb_zhp; } spare_cbdata_t; static boolean_t find_vdev(nvlist_t *nv, uint64_t search) { uint64_t guid; nvlist_t **child; uint_t c, children; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && search == guid) return (B_TRUE); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev(child[c], search)) return (B_TRUE); } return (B_FALSE); } static int find_spare(zpool_handle_t *zhp, void *data) { spare_cbdata_t *cbp = data; nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (find_vdev(nvroot, cbp->cb_guid)) { cbp->cb_zhp = zhp; return (1); } zpool_close(zhp); return (0); } static void nice_num_str_nvlist(nvlist_t *item, const char *key, uint64_t value, boolean_t literal, boolean_t as_int, int format) { char buf[256]; if (literal) { if (!as_int) snprintf(buf, 256, "%llu", (u_longlong_t)value); } else { switch (format) { case ZFS_NICENUM_1024: zfs_nicenum_format(value, buf, 256, ZFS_NICENUM_1024); break; case ZFS_NICENUM_BYTES: zfs_nicenum_format(value, buf, 256, ZFS_NICENUM_BYTES); break; case ZFS_NICENUM_TIME: zfs_nicenum_format(value, buf, 256, ZFS_NICENUM_TIME); break; case ZFS_NICE_TIMESTAMP: format_timestamp(value, buf, 256); break; default: fprintf(stderr, "Invalid number format"); exit(1); } } if (as_int) fnvlist_add_uint64(item, key, value); else fnvlist_add_string(item, key, buf); } /* * Generates an nvlist with output version for every command based on params. * Purpose of this is to add a version of JSON output, considering the schema * format might be updated for each command in future. 
* * Schema: * * "output_version": { * "command": string, * "vers_major": integer, * "vers_minor": integer, * } */ static nvlist_t * zpool_json_schema(int maj_v, int min_v) { char cmd[MAX_CMD_LEN]; nvlist_t *sch = fnvlist_alloc(); nvlist_t *ov = fnvlist_alloc(); snprintf(cmd, MAX_CMD_LEN, "zpool %s", current_command->name); fnvlist_add_string(ov, "command", cmd); fnvlist_add_uint32(ov, "vers_major", maj_v); fnvlist_add_uint32(ov, "vers_minor", min_v); fnvlist_add_nvlist(sch, "output_version", ov); fnvlist_free(ov); return (sch); } static void fill_pool_info(nvlist_t *list, zpool_handle_t *zhp, boolean_t addtype, boolean_t as_int) { nvlist_t *config = zpool_get_config(zhp, NULL); uint64_t guid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID); uint64_t txg = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG); fnvlist_add_string(list, "name", zpool_get_name(zhp)); if (addtype) fnvlist_add_string(list, "type", "POOL"); fnvlist_add_string(list, "state", zpool_get_state_str(zhp)); if (as_int) { if (guid) fnvlist_add_uint64(list, ZPOOL_CONFIG_POOL_GUID, guid); if (txg) fnvlist_add_uint64(list, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(list, "spa_version", SPA_VERSION); fnvlist_add_uint64(list, "zpl_version", ZPL_VERSION); } else { char value[ZFS_MAXPROPLEN]; if (guid) { snprintf(value, ZFS_MAXPROPLEN, "%llu", (u_longlong_t)guid); fnvlist_add_string(list, ZPOOL_CONFIG_POOL_GUID, value); } if (txg) { snprintf(value, ZFS_MAXPROPLEN, "%llu", (u_longlong_t)txg); fnvlist_add_string(list, ZPOOL_CONFIG_POOL_TXG, value); } fnvlist_add_string(list, "spa_version", SPA_VERSION_STRING); fnvlist_add_string(list, "zpl_version", ZPL_VERSION_STRING); } } static void used_by_other(zpool_handle_t *zhp, nvlist_t *nvdev, nvlist_t *list) { spare_cbdata_t spare_cb; verify(nvlist_lookup_uint64(nvdev, ZPOOL_CONFIG_GUID, &spare_cb.cb_guid) == 0); if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { if (strcmp(zpool_get_name(spare_cb.cb_zhp), zpool_get_name(zhp)) != 0) { fnvlist_add_string(list, "used_by", zpool_get_name(spare_cb.cb_zhp)); } zpool_close(spare_cb.cb_zhp); } } static void fill_vdev_info(nvlist_t *list, zpool_handle_t *zhp, char *name, boolean_t addtype, boolean_t as_int) { boolean_t l2c = B_FALSE; const char *path, *phys, *devid, *bias = NULL; uint64_t hole = 0, log = 0, spare = 0; vdev_stat_t *vs; uint_t c; nvlist_t *nvdev; nvlist_t *nvdev_parent = NULL; char *_name; if (strcmp(name, zpool_get_name(zhp)) != 0) _name = name; else _name = (char *)"root-0"; nvdev = zpool_find_vdev(zhp, _name, NULL, &l2c, NULL); fnvlist_add_string(list, "name", name); if (addtype) fnvlist_add_string(list, "type", "VDEV"); if (nvdev) { const char *type = fnvlist_lookup_string(nvdev, ZPOOL_CONFIG_TYPE); if (type) fnvlist_add_string(list, "vdev_type", type); uint64_t guid = fnvlist_lookup_uint64(nvdev, ZPOOL_CONFIG_GUID); if (guid) { if (as_int) { fnvlist_add_uint64(list, "guid", guid); } else { char buf[ZFS_MAXPROPLEN]; snprintf(buf, ZFS_MAXPROPLEN, "%llu", (u_longlong_t)guid); fnvlist_add_string(list, "guid", buf); } } if (nvlist_lookup_string(nvdev, ZPOOL_CONFIG_PATH, &path) == 0) fnvlist_add_string(list, "path", path); if (nvlist_lookup_string(nvdev, ZPOOL_CONFIG_PHYS_PATH, &phys) == 0) fnvlist_add_string(list, "phys_path", phys); if (nvlist_lookup_string(nvdev, ZPOOL_CONFIG_DEVID, &devid) == 0) fnvlist_add_string(list, "devid", devid); (void) nvlist_lookup_uint64(nvdev, ZPOOL_CONFIG_IS_LOG, &log); (void) nvlist_lookup_uint64(nvdev, ZPOOL_CONFIG_IS_SPARE, &spare); (void) nvlist_lookup_uint64(nvdev, 
ZPOOL_CONFIG_IS_HOLE, &hole); if (hole) fnvlist_add_string(list, "class", VDEV_TYPE_HOLE); else if (l2c) fnvlist_add_string(list, "class", VDEV_TYPE_L2CACHE); else if (spare) fnvlist_add_string(list, "class", VDEV_TYPE_SPARE); else if (log) fnvlist_add_string(list, "class", VDEV_TYPE_LOG); else { (void) nvlist_lookup_string(nvdev, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); if (bias != NULL) fnvlist_add_string(list, "class", bias); else { nvdev_parent = NULL; nvdev_parent = zpool_find_parent_vdev(zhp, _name, NULL, NULL, NULL); /* * With a mirrored special device, the parent * "mirror" vdev will have * ZPOOL_CONFIG_ALLOCATION_BIAS set to "special" * not the leaf vdevs. If we're a leaf vdev * in that case we need to look at our parent * to see if they're "special" to know if we * are "special" too. */ if (nvdev_parent) { (void) nvlist_lookup_string( nvdev_parent, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); } if (bias != NULL) fnvlist_add_string(list, "class", bias); else fnvlist_add_string(list, "class", "normal"); } } if (nvlist_lookup_uint64_array(nvdev, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0) { fnvlist_add_string(list, "state", vdev_state_str[vs->vs_state]); } } } static boolean_t prop_list_contains_feature(nvlist_t *proplist) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; nvp = nvlist_next_nvpair(proplist, nvp)) { if (zpool_prop_feature(nvpair_name(nvp))) return (B_TRUE); } return (B_FALSE); } /* * Add a property pair (name, string-value) into a property nvlist. */ static int add_prop_list(const char *propname, const char *propval, nvlist_t **props, boolean_t poolprop) { zpool_prop_t prop = ZPOOL_PROP_INVAL; nvlist_t *proplist; const char *normnm; const char *strval; if (*props == NULL && nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); return (1); } proplist = *props; if (poolprop) { const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); const char *cname = zpool_prop_to_name(ZPOOL_PROP_COMPATIBILITY); if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL && (!zpool_prop_feature(propname) && !zpool_prop_vdev(propname))) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool or vdev property\n"), propname); return (2); } /* * feature@ properties and version should not be specified * at the same time. 
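 * For example, "-o version=28 -o feature@async_destroy=enabled" is
 * rejected here.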
*/ if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && prop_list_contains_feature(proplist))) { (void) fprintf(stderr, gettext("'feature@' and " "'version' properties cannot be specified " "together\n")); return (2); } /* * if version is specified, only "legacy" compatibility * may be requested */ if ((prop == ZPOOL_PROP_COMPATIBILITY && strcmp(propval, ZPOOL_COMPAT_LEGACY) != 0 && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && nvlist_exists(proplist, cname) && strcmp(fnvlist_lookup_string(proplist, cname), ZPOOL_COMPAT_LEGACY) != 0)) { (void) fprintf(stderr, gettext("when 'version' is " "specified, the 'compatibility' feature may only " "be set to '" ZPOOL_COMPAT_LEGACY "'\n")); return (2); } if (zpool_prop_feature(propname) || zpool_prop_vdev(propname)) normnm = propname; else normnm = zpool_prop_to_name(prop); } else { zfs_prop_t fsprop = zfs_name_to_prop(propname); if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM, B_FALSE)) { normnm = zfs_prop_to_name(fsprop); } else if (zfs_prop_user(propname) || zfs_prop_userquota(propname)) { normnm = propname; } else { (void) fprintf(stderr, gettext("property '%s' is " "not a valid filesystem property\n"), propname); return (2); } } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && prop != ZPOOL_PROP_CACHEFILE) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (2); } if (nvlist_add_string(proplist, normnm, propval) != 0) { (void) fprintf(stderr, gettext("internal " "error: out of memory\n")); return (1); } return (0); } /* * Set a default property pair (name, string-value) in a property nvlist */ static int add_prop_list_default(const char *propname, const char *propval, nvlist_t **props) { const char *pval; if (nvlist_lookup_string(*props, propname, &pval) == 0) return (0); return (add_prop_list(propname, propval, props, B_TRUE)); } /* * zpool add [-afgLnP] [-o property=value] ... * * -a Disable the ashift validation checks * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not add the devices, but display the resulting layout if * they were to be added. * -o Set property=value. * -P Display full path for vdev name. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by make_root_vdev(), which constructs the nvlist needed to pass to * libzfs. 
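 * For example, "zpool add -n tank mirror sdc sdd" (hypothetical pool and
 * device names) only prints the layout that would result from adding the
 * new mirror, without modifying the pool.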
*/ int zpool_do_add(int argc, char **argv) { boolean_t check_replication = B_TRUE; boolean_t check_inuse = B_TRUE; boolean_t dryrun = B_FALSE; boolean_t check_ashift = B_TRUE; boolean_t force = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; char *poolname; int ret; zpool_handle_t *zhp; nvlist_t *config; nvlist_t *props = NULL; char *propval; struct option long_options[] = { {"allow-in-use", no_argument, NULL, ZPOOL_OPTION_ALLOW_INUSE}, {"allow-replication-mismatch", no_argument, NULL, ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH}, {"allow-ashift-mismatch", no_argument, NULL, ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, "fgLno:P", long_options, NULL)) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'g': name_flags |= VDEV_NAME_GUID; break; case 'L': name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'n': dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 'P': name_flags |= VDEV_NAME_PATH; break; case ZPOOL_OPTION_ALLOW_INUSE: check_inuse = B_FALSE; break; case ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH: check_replication = B_FALSE; break; case ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH: check_ashift = B_FALSE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); usage(B_FALSE); } if (force) { if (!check_inuse || !check_replication || !check_ashift) { (void) fprintf(stderr, gettext("'-f' option is not " "allowed with '--allow-replication-mismatch', " "'--allow-ashift-mismatch', or " "'--allow-in-use'\n")); usage(B_FALSE); } check_inuse = B_FALSE; check_replication = B_FALSE; check_ashift = B_FALSE; } poolname = argv[0]; argc--; argv++; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } /* pass off to make_root_vdev for processing */ nvroot = make_root_vdev(zhp, props, !check_inuse, check_replication, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); } if (dryrun) { nvlist_t *poolnvroot; nvlist_t **l2child, **sparechild; uint_t l2children, sparechildren, c; char *vname; boolean_t hadcache = B_FALSE, hadspare = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " "configuration:\n\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, "", name_flags | VDEV_NAME_TYPE_ID); print_vdev_tree(zhp, NULL, nvroot, 0, "", 
name_flags); /* print other classes: 'dedup', 'special', and 'log' */ if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", poolnvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", poolnvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } if (num_logs(poolnvroot) > 0) { print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } else if (num_logs(nvroot) > 0) { print_vdev_tree(zhp, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } /* Do the same for the caches */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { hadcache = B_TRUE; (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { if (!hadcache) (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } /* And finally the spares */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { hadspare = B_TRUE; (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { if (!hadspare) (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } ret = 0; } else { ret = (zpool_add(zhp, nvroot, check_ashift) != 0); } nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool remove [-npsw] ... * * Removes the given vdev from the pool. 
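 * For example, "zpool remove -np tank mirror-1" (hypothetical vdev name)
 * reports, in parsable form, the memory that would be used after the
 * removal, without actually removing anything.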
*/ int zpool_do_remove(int argc, char **argv) { char *poolname; int i, ret = 0; zpool_handle_t *zhp = NULL; boolean_t stop = B_FALSE; int c; boolean_t noop = B_FALSE; boolean_t parsable = B_FALSE; boolean_t wait = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "npsw")) != -1) { switch (c) { case 'n': noop = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 's': stop = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if (stop && noop) { zpool_close(zhp); (void) fprintf(stderr, gettext("stop request ignored\n")); return (0); } if (stop) { if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (zpool_vdev_remove_cancel(zhp) != 0) ret = 1; if (wait) { (void) fprintf(stderr, gettext("invalid option " "combination: -w cannot be used with -s\n")); usage(B_FALSE); } } else { if (argc < 2) { (void) fprintf(stderr, gettext("missing device\n")); usage(B_FALSE); } for (i = 1; i < argc; i++) { if (noop) { uint64_t size; if (zpool_vdev_indirect_size(zhp, argv[i], &size) != 0) { ret = 1; break; } if (parsable) { (void) printf("%s %llu\n", argv[i], (unsigned long long)size); } else { char valstr[32]; zfs_nicenum(size, valstr, sizeof (valstr)); (void) printf("Memory that will be " "used after removing %s: %s\n", argv[i], valstr); } } else { if (zpool_vdev_remove(zhp, argv[i]) != 0) ret = 1; } } if (ret == 0 && wait) ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE); } zpool_close(zhp); return (ret); } /* * Return 1 if a vdev is active (being used in a pool) * Return 0 if a vdev is inactive (offlined or faulted, or not in active pool) * * This is useful for checking if a disk in an active pool is offlined or * faulted. */ static int vdev_is_active(char *vdev_path) { int fd; fd = open(vdev_path, O_EXCL); if (fd < 0) { return (1); /* cant open O_EXCL - disk is active */ } close(fd); return (0); /* disk is inactive in the pool */ } /* * zpool labelclear [-f] * * -f Force clearing the label for the vdevs which are members of * the exported or foreign pools. * * Verifies that the vdev is not active and zeros out the label information * on the device. */ int zpool_do_labelclear(int argc, char **argv) { char vdev[MAXPATHLEN]; char *name = NULL; int c, fd = -1, ret = 0; nvlist_t *config; pool_state_t state; boolean_t inuse = B_FALSE; boolean_t force = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get vdev name */ if (argc < 1) { (void) fprintf(stderr, gettext("missing vdev name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } (void) strlcpy(vdev, argv[0], sizeof (vdev)); /* * If we cannot open an absolute path, we quit. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. 
*/ if ((fd = open(vdev, O_RDWR)) < 0) { int error; if (vdev[0] == '/') { (void) fprintf(stderr, gettext("failed to open " "%s: %s\n"), vdev, strerror(errno)); return (1); } error = zfs_resolve_shortname(argv[0], vdev, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(vdev)) { if (zfs_append_partition(vdev, MAXPATHLEN) == -1) error = ENOENT; } if (error || ((fd = open(vdev, O_RDWR)) < 0)) { if (errno == ENOENT) { (void) fprintf(stderr, gettext( "failed to find device %s, try " "specifying absolute path instead\n"), argv[0]); return (1); } (void) fprintf(stderr, gettext("failed to open %s:" " %s\n"), vdev, strerror(errno)); return (1); } } /* * Flush all dirty pages for the block device. This should not be * fatal when the device does not support BLKFLSBUF as would be the * case for a file vdev. */ if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY)) (void) fprintf(stderr, gettext("failed to invalidate " "cache for %s: %s\n"), vdev, strerror(errno)); if (zpool_read_label(fd, &config, NULL) != 0) { (void) fprintf(stderr, gettext("failed to read label from %s\n"), vdev); ret = 1; goto errout; } nvlist_free(config); ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); if (ret != 0) { (void) fprintf(stderr, gettext("failed to check state for %s\n"), vdev); ret = 1; goto errout; } if (!inuse) goto wipe_label; switch (state) { default: case POOL_STATE_ACTIVE: case POOL_STATE_SPARE: case POOL_STATE_L2CACHE: /* * We allow the user to call 'zpool offline -f' * on an offlined disk in an active pool. We can check if * the disk is online by calling vdev_is_active(). */ if (force && !vdev_is_active(vdev)) break; (void) fprintf(stderr, gettext( "%s is a member (%s) of pool \"%s\""), vdev, zpool_pool_state_to_name(state), name); if (force) { (void) fprintf(stderr, gettext( ". Offline the disk first to clear its label.")); } printf("\n"); ret = 1; goto errout; case POOL_STATE_EXPORTED: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of exported pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_POTENTIALLY_ACTIVE: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of potentially active pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_DESTROYED: /* inuse should never be set for a destroyed pool */ assert(0); break; } wipe_label: ret = zpool_clear_label(fd); if (ret != 0) { (void) fprintf(stderr, gettext("failed to clear label for %s\n"), vdev); } errout: free(name); (void) close(fd); return (ret); } /* * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] ... * * -f Force creation, even if devices appear in use * -n Do not create the pool, but display the resulting layout if it * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. By default it's * '/' * -o Set property=value. * -o Set feature@feature=enabled|disabled. * -d Don't automatically enable all supported pool features * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c. * Once we get the nvlist back from make_root_vdev(), we either print out the * contents (if '-n' was specified), or pass it to libzfs to do the creation. 
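 *
 * Stripped to its skeleton, the flow below is roughly the following
 * (a sketch for orientation only; the real function also validates the
 * mountpoint, resolves feature@ properties against the 'compatibility'
 * setting, and mounts and shares the new root dataset):
 *
 *	nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE,
 *	    dryrun, argc - 1, argv + 1);
 *	if (dryrun)
 *		print_vdev_tree(NULL, poolname, nvroot, 0, "", 0);
 *	else
 *		ret = (zpool_create(g_zfs, poolname, nvroot, props,
 *		    fsprops) != 0);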
*/ int zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t enable_pool_features = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; char *tname = NULL; int ret = 1; char *altroot = NULL; char *compat = NULL; char *mountpoint = NULL; nvlist_t *fsprops = NULL; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'd': enable_pool_features = B_FALSE; break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto errout; break; case 'm': /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); goto errout; } *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; /* * If the user is creating a pool that doesn't support * feature flags, don't enable any features. */ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { char *end; u_longlong_t ver; ver = strtoull(propval, &end, 0); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_pool_features = B_FALSE; } } if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) altroot = propval; if (zpool_name_to_prop(optarg) == ZPOOL_PROP_COMPATIBILITY) compat = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -O option\n")); goto errout; } *propval = '\0'; propval++; /* * Mountpoints are checked and then added later. * Uniquely among properties, they can be specified * more than once, to avoid conflict with -m. */ if (0 == strcmp(optarg, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { mountpoint = propval; } else if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) { goto errout; } break; case 't': /* * Sanity check temporary pool name. */ if (strchr(optarg, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create " "'%s': invalid character '/' in temporary " "name\n"), optarg); (void) fprintf(stderr, gettext("use 'zfs " "create' to create a dataset\n")); goto errout; } if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto errout; tname = optarg; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); goto badusage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); goto badusage; } poolname = argv[0]; /* * As a special case, check for use of '/' in the name, and direct the * user to use 'zfs create' instead. 
*/ if (strchr(poolname, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create '%s': invalid " "character '/' in pool name\n"), poolname); (void) fprintf(stderr, gettext("use 'zfs create' to " "create a dataset\n")); goto errout; } /* pass off to make_root_vdev for bulk processing */ nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, argc - 1, argv + 1); if (nvroot == NULL) goto errout; /* make_root_vdev() allows 0 toplevel children if there are spares */ if (!zfs_allocatable_devs(nvroot)) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto errout; } if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); goto errout; } /* * Check the validity of the mountpoint and direct the user to use the * '-m' mountpoint option if it looks like its in use. */ if (mountpoint == NULL || (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; DIR *dirp; if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " "'%s': must be an absolute path, 'legacy', or " "'none'\n"), mountpoint); goto errout; } if (mountpoint == NULL) { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s/%s", altroot, poolname); else (void) snprintf(buf, sizeof (buf), "/%s", poolname); } else { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s%s", altroot, mountpoint); else (void) snprintf(buf, sizeof (buf), "%s", mountpoint); } if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); (void) fprintf(stderr, gettext("use '-m' " "option to provide a different default\n")); goto errout; } else if (dirp) { int count = 0; while (count < 3 && readdir(dirp) != NULL) count++; (void) closedir(dirp); if (count > 2) { (void) fprintf(stderr, gettext("mountpoint " "'%s' exists and is not empty\n"), buf); (void) fprintf(stderr, gettext("use '-m' " "option to provide a " "different default\n")); goto errout; } } } /* * Now that the mountpoint's validity has been checked, ensure that * the property is set appropriately prior to creating the pool. */ if (mountpoint != NULL) { ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), mountpoint, &fsprops, B_FALSE); if (ret != 0) goto errout; } ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run * through all the vdevs in the list and print out in an * appropriate hierarchy. */ (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); print_vdev_tree(NULL, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); print_vdev_tree(NULL, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, 0); print_cache_list(nvroot, 0); print_spare_list(nvroot, 0); ret = 0; } else { /* * Load in feature set. * Note: if compatibility property not given, we'll have * NULL, which means 'all features'. */ boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) goto errout; /* * props contains list of features to enable. 
* For each feature: * - remove it if feature@name=disabled * - leave it there if feature@name=enabled * - add it if: * - enable_pool_features (ie: no '-d' or '-o version') * - it's supported by the kernel module * - it's in the requested feature set * - warn if it's enabled but not in compat */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { char propname[MAXPATHLEN]; const char *propval; zfeature_info_t *feat = &spa_feature_table[i]; (void) snprintf(propname, sizeof (propname), "feature@%s", feat->fi_uname); if (!nvlist_lookup_string(props, propname, &propval)) { if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) { (void) nvlist_remove_all(props, propname); } else if (strcmp(propval, ZFS_FEATURE_ENABLED) == 0 && !requested_features[i]) { (void) fprintf(stderr, gettext( "Warning: feature \"%s\" enabled " "but is not in specified " "'compatibility' feature set.\n"), feat->fi_uname); } } else if ( enable_pool_features && feat->fi_zfs_mod_supported && requested_features[i]) { ret = add_prop_list(propname, ZFS_FEATURE_ENABLED, &props, B_TRUE); if (ret != 0) goto errout; } } ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { if (zfs_mount(pool, NULL, 0) == 0) { ret = zfs_share(pool, NULL); zfs_commit_shares(NULL); } zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { (void) fprintf(stderr, gettext("pool name may have " "been omitted\n")); } } errout: nvlist_free(nvroot); nvlist_free(fsprops); nvlist_free(props); return (ret); badusage: nvlist_free(fsprops); nvlist_free(props); usage(B_FALSE); return (2); } /* * zpool destroy * * -f Forcefully unmount any datasets * * Destroy the given pool. Automatically unmounts any datasets in the pool. */ int zpool_do_destroy(int argc, char **argv) { boolean_t force = B_FALSE; int c; char *pool; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { /* * As a special case, check for use of '/' in the name, and * direct the user to use 'zfs destroy' instead. */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("use 'zfs destroy' to " "destroy a dataset\n")); return (1); } if (zpool_disable_datasets(zhp, force) != 0) { (void) fprintf(stderr, gettext("could not destroy '%s': " "could not unmount datasets\n"), zpool_get_name(zhp)); zpool_close(zhp); return (1); } /* The history must be logged as part of the export */ log_history = B_FALSE; ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); return (ret); } typedef struct export_cbdata { tpool_t *tpool; pthread_mutex_t mnttab_lock; boolean_t force; boolean_t hardforce; int retval; } export_cbdata_t; typedef struct { char *aea_poolname; export_cbdata_t *aea_cbdata; } async_export_args_t; /* * Export one pool */ static int zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; /* * zpool_disable_datasets() is not thread-safe for mnttab access. * So we serialize access here for 'zpool export -a' parallel case. 
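 *
 * For 'zpool export -a' the exports themselves run concurrently on a
 * thread pool (see zpool_do_export() below); only the mnttab walk is
 * funnelled through this one mutex.  The overall shape, reduced to a
 * sketch (export_task, pool_entry_t, entries[] and npools are
 * hypothetical stand-ins for the real callback plumbing):
 *
 *	tpool_t *tp = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN),
 *	    0, NULL);
 *	pthread_mutex_init(&mnttab_lock, NULL);
 *	for (uint_t i = 0; i < npools; i++)
 *		(void) tpool_dispatch(tp, export_task, &entries[i]);
 *	tpool_wait(tp);
 *	tpool_destroy(tp);
 *	(void) pthread_mutex_destroy(&mnttab_lock);
 *
 * where each export_task() holds mnttab_lock only around its
 * zpool_disable_datasets() call and performs the export outside of it.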
*/ if (cb->tpool != NULL) pthread_mutex_lock(&cb->mnttab_lock); int retval = zpool_disable_datasets(zhp, cb->force); if (cb->tpool != NULL) pthread_mutex_unlock(&cb->mnttab_lock); if (retval) return (1); if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) return (1); } else if (zpool_export(zhp, cb->force, history_str) != 0) { return (1); } return (0); } /* * Asynchronous export request */ static void zpool_export_task(void *arg) { async_export_args_t *aea = arg; zpool_handle_t *zhp = zpool_open(g_zfs, aea->aea_poolname); if (zhp != NULL) { int ret = zpool_export_one(zhp, aea->aea_cbdata); if (ret != 0) aea->aea_cbdata->retval = ret; zpool_close(zhp); } else { aea->aea_cbdata->retval = 1; } free(aea->aea_poolname); free(aea); } /* * Process an export request in parallel */ static int zpool_export_one_async(zpool_handle_t *zhp, void *data) { tpool_t *tpool = ((export_cbdata_t *)data)->tpool; async_export_args_t *aea = safe_malloc(sizeof (async_export_args_t)); /* save pool name since zhp will go out of scope */ aea->aea_poolname = strdup(zpool_get_name(zhp)); aea->aea_cbdata = data; /* ship off actual export to another thread */ if (tpool_dispatch(tpool, zpool_export_task, (void *)aea) != 0) return (errno); /* unlikely */ else return (0); } /* * zpool export [-f] ... * * -a Export all pools * -f Forcefully unmount datasets * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, * then the datasets will be forcefully unmounted. */ int zpool_do_export(int argc, char **argv) { export_cbdata_t cb; boolean_t do_all = B_FALSE; boolean_t force = B_FALSE; boolean_t hardforce = B_FALSE; int c, ret; /* check options */ while ((c = getopt(argc, argv, "afF")) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'f': force = B_TRUE; break; case 'F': hardforce = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.force = force; cb.hardforce = hardforce; cb.tpool = NULL; cb.retval = 0; argc -= optind; argv += optind; /* The history will be logged as part of the export itself */ log_history = B_FALSE; if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } cb.tpool = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL); pthread_mutex_init(&cb.mnttab_lock, NULL); /* Asynchronously call zpool_export_one using thread pool */ ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_export_one_async, &cb); tpool_wait(cb.tpool); tpool_destroy(cb.tpool); (void) pthread_mutex_destroy(&cb.mnttab_lock); return (ret | cb.retval); } /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_export_one, &cb); return (ret); } /* * Given a vdev configuration, determine the maximum width needed for the device * name column. 
*/ static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, int name_flags) { static const char *const subtypes[] = {ZPOOL_CONFIG_SPARES, ZPOOL_CONFIG_L2CACHE, ZPOOL_CONFIG_CHILDREN}; char *name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); max = MAX(strlen(name) + depth, max); free(name); nvlist_t **child; uint_t children; for (size_t i = 0; i < ARRAY_SIZE(subtypes); ++i) if (nvlist_lookup_nvlist_array(nv, subtypes[i], &child, &children) == 0) for (uint_t c = 0; c < children; ++c) max = MAX(max_width(zhp, child[c], depth + 2, max, name_flags), max); return (max); } typedef struct status_cbdata { int cb_count; int cb_name_flags; int cb_namewidth; boolean_t cb_allpools; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; boolean_t cb_print_unhealthy; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; boolean_t cb_print_power; boolean_t cb_json; boolean_t cb_flat_vdevs; nvlist_t *cb_jsobj; boolean_t cb_json_as_int; boolean_t cb_json_pool_key_guid; } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */ static boolean_t is_blank_str(const char *str) { for (; str != NULL && *str != '\0'; ++str) if (!isblank(*str)) return (B_FALSE); return (B_TRUE); } static void zpool_nvlist_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, const char *path, nvlist_t *item) { vdev_cmd_data_t *data; int i, j, k = 1; char tmp[256]; const char *val; for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) != 0) || (strcmp(vcdl->data[i].pool, pool) != 0)) continue; data = &vcdl->data[i]; for (j = 0; j < vcdl->uniq_cols_cnt; j++) { val = NULL; for (int k = 0; k < data->cols_cnt; k++) { if (strcmp(data->cols[k], vcdl->uniq_cols[j]) == 0) { val = data->lines[k]; break; } } if (val == NULL || is_blank_str(val)) val = "-"; fnvlist_add_string(item, vcdl->uniq_cols[j], val); } for (j = data->cols_cnt; j < data->lines_cnt; j++) { if (data->lines[j]) { snprintf(tmp, 256, "extra_%d", k++); fnvlist_add_string(item, tmp, data->lines[j]); } } break; } } /* Print command output lines for specific vdev in a specific pool */ static void zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, const char *path) { vdev_cmd_data_t *data; int i, j; const char *val; for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) != 0) || (strcmp(vcdl->data[i].pool, pool) != 0)) { /* Not the vdev we're looking for */ continue; } data = &vcdl->data[i]; /* Print out all the output values for this vdev */ for (j = 0; j < vcdl->uniq_cols_cnt; j++) { val = NULL; /* Does this vdev have values for this column? */ for (int k = 0; k < data->cols_cnt; k++) { if (strcmp(data->cols[k], vcdl->uniq_cols[j]) == 0) { /* yes it does, record the value */ val = data->lines[k]; break; } } /* * Mark empty values with dashes to make output * awk-able. */ if (val == NULL || is_blank_str(val)) val = "-"; printf("%*s", vcdl->uniq_cols_width[j], val); if (j < vcdl->uniq_cols_cnt - 1) fputs(" ", stdout); } /* Print out any values that aren't in a column at the end */ for (j = data->cols_cnt; j < data->lines_cnt; j++) { /* Did we have any columns? If so print a spacer. 
*/ if (vcdl->uniq_cols_cnt > 0) fputs(" ", stdout); val = data->lines[j]; fputs(val ?: "", stdout); } break; } } /* * Print vdev initialization status for leaves */ static void print_status_initialize(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; time_t t = vs->vs_initialize_action_time; int initialize_pct = 100; if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) { initialize_pct = (vs->vs_initialize_bytes_done * 100 / (vs->vs_initialize_bytes_est + 1)); } (void) ctime_r(&t, tbuf); tbuf[24] = 0; switch (vs->vs_initialize_state) { case VDEV_INITIALIZE_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_INITIALIZE_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_INITIALIZE_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% initialized%s)"), initialize_pct, zbuf); } else { (void) printf(gettext(" (uninitialized)")); } } else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) printf(gettext(" (initializing)")); } } /* * Print vdev TRIM status for leaves */ static void print_status_trim(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_trim_state == VDEV_TRIM_ACTIVE || vs->vs_trim_state == VDEV_TRIM_SUSPENDED || vs->vs_trim_state == VDEV_TRIM_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; time_t t = vs->vs_trim_action_time; int trim_pct = 100; if (vs->vs_trim_state != VDEV_TRIM_COMPLETE) { trim_pct = (vs->vs_trim_bytes_done * 100 / (vs->vs_trim_bytes_est + 1)); } (void) ctime_r(&t, tbuf); tbuf[24] = 0; switch (vs->vs_trim_state) { case VDEV_TRIM_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_TRIM_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_TRIM_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% trimmed%s)"), trim_pct, zbuf); } else if (vs->vs_trim_notsup) { (void) printf(gettext(" (trim unsupported)")); } else { (void) printf(gettext(" (untrimmed)")); } } else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE) { (void) printf(gettext(" (trimming)")); } } /* * Return the color associated with a health string. This includes returning * NULL for no color change. */ static const char * health_str_to_color(const char *health) { if (strcmp(health, gettext("FAULTED")) == 0 || strcmp(health, gettext("SUSPENDED")) == 0 || strcmp(health, gettext("UNAVAIL")) == 0) { return (ANSI_RED); } if (strcmp(health, gettext("OFFLINE")) == 0 || strcmp(health, gettext("DEGRADED")) == 0 || strcmp(health, gettext("REMOVED")) == 0) { return (ANSI_YELLOW); } return (NULL); } /* * Called for each leaf vdev. Returns 0 if the vdev is healthy. * A vdev is unhealthy if any of the following are true: * 1) there are read, write, or checksum errors, * 2) its state is not ONLINE, or * 3) slow IO reporting was requested (-s) and there are slow IOs. 
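 *
 * print_status_config() uses this with for_each_vdev_in_nvlist(), which
 * returns 0 only when the callback reported every vdev it visited as
 * healthy, roughly:
 *
 *	if (cb->cb_print_unhealthy && depth > 0 &&
 *	    for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0)
 *		return;		(whole subtree healthy, so prune it)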
*/ static int vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data) { status_cbdata_t *cb = data; vdev_stat_t *vs; uint_t vsc; (void) hdl_data; if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) != 0) return (1); if (vs->vs_checksum_errors || vs->vs_read_errors || vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY) return (1); if (cb->cb_print_slow_ios && vs->vs_slow_ios) return (1); return (0); } /* * Print out configuration state as requested by status_callback. */ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) { nvlist_t **child, *root; uint_t c, i, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t spare_cb; const char *state; const char *type; const char *path = NULL; const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL, *scolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) return; state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for * online drives. */ if (vs->vs_aux == VDEV_AUX_SPARED) state = gettext("INUSE"); else if (vs->vs_state == VDEV_STATE_HEALTHY) state = gettext("AVAIL"); } /* * If '-e' is specified then top-level vdevs and their children * can be pruned if all of their leaves are healthy. */ if (cb->cb_print_unhealthy && depth > 0 && for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) { return; } printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); if (!isspare) { if (vs->vs_read_errors) rcolor = ANSI_RED; if (vs->vs_write_errors) wcolor = ANSI_RED; if (vs->vs_checksum_errors) ccolor = ANSI_RED; if (vs->vs_slow_ios) scolor = ANSI_BLUE; if (cb->cb_literal) { fputc(' ', stdout); printf_color(rcolor, "%5llu", (u_longlong_t)vs->vs_read_errors); fputc(' ', stdout); printf_color(wcolor, "%5llu", (u_longlong_t)vs->vs_write_errors); fputc(' ', stdout); printf_color(ccolor, "%5llu", (u_longlong_t)vs->vs_checksum_errors); } else { zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); fputc(' ', stdout); printf_color(rcolor, "%5s", rbuf); fputc(' ', stdout); printf_color(wcolor, "%5s", wbuf); fputc(' ', stdout); printf_color(ccolor, "%5s", cbuf); } if (cb->cb_print_slow_ios) { if (children == 0) { /* Only leafs vdevs have slow IOs */ zfs_nicenum(vs->vs_slow_ios, rbuf, sizeof (rbuf)); } else { snprintf(rbuf, sizeof (rbuf), "-"); } if (cb->cb_literal) printf_color(scolor, " %5llu", (u_longlong_t)vs->vs_slow_ios); else printf_color(scolor, " %5s", rbuf); } if (cb->cb_print_power) { if (children == 0) { /* Only leaf vdevs have physical slots */ switch (zpool_power_current_state(zhp, (char *) fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH))) { case 0: printf_color(ANSI_RED, " %5s", gettext("off")); break; case 1: printf(" %5s", gettext("on")); break; default: printf(" %5s", "-"); } } else { printf(" %5s", "-"); } } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ¬present) == 0) { verify(nvlist_lookup_string(nv, 
ZPOOL_CONFIG_PATH, &path) == 0); (void) printf(" %s %s", gettext("was"), path); } else if (vs->vs_aux != 0) { (void) printf(" "); color_start(ANSI_RED); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ASHIFT_TOO_BIG: (void) printf(gettext("unsupported minimum blocksize")); break; case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &spare_cb.cb_guid) == 0); if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { if (strcmp(zpool_get_name(spare_cb.cb_zhp), zpool_get_name(zhp)) == 0) (void) printf(gettext("currently in " "use")); else (void) printf(gettext("in use by " "pool '%s'"), zpool_get_name(spare_cb.cb_zhp)); zpool_close(spare_cb.cb_zhp); } else { (void) printf(gettext("currently in use")); } break; case VDEV_AUX_ERR_EXCEEDED: if (vs->vs_read_errors + vs->vs_write_errors + vs->vs_checksum_errors == 0 && children == 0 && vs->vs_slow_ios > 0) { (void) printf(gettext("too many slow I/Os")); } else { (void) printf(gettext("too many errors")); } break; case VDEV_AUX_IO_FAILURE: (void) printf(gettext("experienced I/O failures")); break; case VDEV_AUX_BAD_LOG: (void) printf(gettext("bad intent log")); break; case VDEV_AUX_EXTERNAL: (void) printf(gettext("external device fault")); break; case VDEV_AUX_SPLIT_POOL: (void) printf(gettext("split into new pool")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; case VDEV_AUX_BAD_LABEL: (void) printf(gettext("invalid label")); break; default: (void) printf(gettext("corrupted data")); break; } color_end(); } else if (children == 0 && !isspare && getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") == NULL && VDEV_STAT_VALID(vs_physical_ashift, vsc) && vs->vs_configured_ashift < vs->vs_physical_ashift) { (void) printf( gettext(" block size: %dB configured, %dB native"), 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); } if (vs->vs_scan_removing != 0) { (void) printf(gettext(" (removing)")); } else if (VDEV_STAT_VALID(vs_noalloc, vsc) && vs->vs_noalloc != 0) { (void) printf(gettext(" (non-allocating)")); } /* The root vdev has the scrub/resilver stats */ root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); /* * If you force fault a drive that's resilvering, its scan stats can * get frozen in time, giving the false impression that it's * being resilvered. That's why we check the state to see if the vdev * is healthy before reporting "resilvering" or "repairing". */ if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0 && vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_scan_processed != 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? 
"resilvering" : "repairing"); } else if (vs->vs_resilver_deferred) { (void) printf(gettext(" (awaiting resilver)")); } } /* The top-level vdevs have the rebuild stats */ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && children == 0 && vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_rebuild_processed != 0) { (void) printf(gettext(" (resilvering)")); } } if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } /* Display vdev initialization and trim status for leaves. */ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); } (void) printf("\n"); for (c = 0; c < children; c++) { uint64_t islog = B_FALSE, ishole = B_FALSE; /* Don't print logs or holes here */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole); if (islog || ishole) continue; /* Only print normal classes here */ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; /* Provide vdev_rebuild_stats to children if available */ if (vrs == NULL) { (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_status_config(zhp, cb, vname, child[c], depth + 2, isspare, vrs); free(vname); } } /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ static void print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth) { nvlist_t **child; uint_t c, children; vdev_stat_t *vs; const char *type; char *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MISSING) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; case VDEV_AUX_BAD_LABEL: (void) printf(gettext("invalid label")); break; default: (void) printf(gettext("corrupted data")); break; } } (void) printf("\n"); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_import_config(cb, vname, child[c], depth + 2); free(vname); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, 
&children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } } /* * Print specialized class vdevs. * * These are recorded as top level vdevs in the main pool child array * but with "is_log" set to 1 or an "alloc_bias" string. We use either * print_status_config() or print_import_config() to print the top level * class vdevs then any of their children (eg mirrored slogs) are printed * recursively - which works because only the top level vdev is marked. */ static void print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, const char *class) { uint_t c, children; nvlist_t **child; boolean_t printed = B_FALSE; assert(zhp != NULL || !cb->cb_verbose); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; const char *bias = NULL; const char *type = NULL; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) { bias = (char *)VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class) != 0) continue; if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { (void) printf("\t%s\t\n", gettext(class)); printed = B_TRUE; } char *name = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, B_FALSE, NULL); else print_import_config(cb, name, child[c], 2); free(name); } } /* * Display the status for the given pool. */ static int show_import(nvlist_t *config, boolean_t report_error) { uint64_t pool_state; vdev_stat_t *vs; const char *name; uint64_t guid; uint64_t hostid = 0; const char *msgid; const char *hostname = "unknown"; nvlist_t *nvroot, *nvinfo; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t vsc; const char *comment; const char *indent; char buf[2048]; status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); reason = zpool_import_status(config, &msgid, &errata); /* * If we're importing using a cachefile, then we won't report any * errors unless we are in the scan phase of the import. 
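 *
 * Concretely, import_pools() first walks the cachefile configs with
 * errors suppressed and, if that quiet pass reports a problem, returns
 * the status so its caller can retry in the scan phase, where
 * show_import() is called again with errors reported, roughly:
 *
 *	if (import->cachefile && !import->scan)
 *		err = show_import(config, B_FALSE);
 *	else
 *		(void) show_import(config, B_TRUE);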
*/ if (reason != ZPOOL_STATUS_OK && !report_error) return (reason); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) { indent = " "; } else { comment = NULL; indent = ""; } (void) printf(gettext("%s pool: %s\n"), indent, name); (void) printf(gettext("%s id: %llu\n"), indent, (u_longlong_t)guid); (void) printf(gettext("%s state: %s"), indent, health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); if (reason != ZPOOL_STATUS_OK) { (void) printf("%s", indent); printf_color(ANSI_BOLD, gettext("status: ")); } switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: printf_color(ANSI_YELLOW, gettext("One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_YELLOW, gettext("One or more devices " "contains corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: printf_color(ANSI_YELLOW, gettext("The pool data is " "corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_YELLOW, gettext("One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "an incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_YELLOW, gettext("Some supported " "features are not enabled on the pool.\n" "\t%s(Note that they may be intentionally disabled if the\n" "\t%s'compatibility' property is set.)\n"), indent, indent); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: printf_color(ANSI_YELLOW, gettext("Error reading or parsing " "the file(s) indicated by the 'compatibility'\n" "\t%sproperty.\n"), indent); break; case ZPOOL_STATUS_INCOMPATIBLE_FEAT: printf_color(ANSI_YELLOW, gettext("One or more features " "are enabled on the pool despite not being\n" "\t%srequested by the 'compatibility' property.\n"), indent); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_YELLOW, gettext("The pool uses the following " "feature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_collect_unsup_feat(config, buf, 2048); (void) printf("%s", buf); color_end(); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n" "\t%scannot be accessed in read-write mode because it uses " "the following\n" "\t%sfeature(s) not supported on this system:\n"), indent, indent); color_start(ANSI_YELLOW); zpool_collect_unsup_feat(config, buf, 2048); (void) printf("%s", buf); color_end(); break; case ZPOOL_STATUS_HOSTID_ACTIVE: printf_color(ANSI_YELLOW, gettext("The pool is currently " "imported by another system.\n")); break; case ZPOOL_STATUS_HOSTID_REQUIRED: printf_color(ANSI_YELLOW, gettext("The pool has the " "multihost property on. 
It cannot\n" "\t%sbe safely imported when the system hostid is not " "set.\n"), indent); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_YELLOW, gettext("The pool was last accessed " "by another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_YELLOW, gettext("An intent log record cannot " "be read.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_YELLOW, gettext("One or more devices were " "being resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: printf_color(ANSI_YELLOW, gettext("One or more devices are " "configured to use a non-native block size.\n" "\t%sExpect reduced performance.\n"), indent); break; default: /* * No other status can be seen when importing pools. */ assert(reason == ZPOOL_STATUS_OK); } /* * Print out an action according to the overall state of the pool. */ if (vs->vs_state != VDEV_STATE_HEALTHY || reason != ZPOOL_STATUS_ERRATA || errata != ZPOOL_ERRATA_NONE) { (void) printf("%s", indent); (void) printf(gettext("action: ")); } if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { (void) printf(gettext("The pool can be imported using " "its name or numeric identifier, though\n" "\t%ssome features will not be available without " "an explicit 'zpool upgrade'.\n"), indent); } else if (reason == ZPOOL_STATUS_COMPATIBILITY_ERR) { (void) printf(gettext("The pool can be imported using " "its name or numeric\n" "\t%sidentifier, though the file(s) indicated by " "its 'compatibility'\n" "\t%sproperty cannot be parsed at this time.\n"), indent, indent); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext("The pool can be imported using " "its name or numeric identifier and\n" "\t%sthe '-f' flag.\n"), indent); } else if (reason == ZPOOL_STATUS_ERRATA) { switch (errata) { case ZPOOL_ERRATA_ZOL_2094_SCRUB: (void) printf(gettext("The pool can be " "imported using its name or numeric " "identifier,\n" "\t%showever there is a compatibility " "issue which should be corrected\n" "\t%sby running 'zpool scrub'\n"), indent, indent); break; case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: (void) printf(gettext("The pool cannot be " "imported with this version of ZFS due to\n" "\t%san active asynchronous destroy. " "Revert to an earlier version\n" "\t%sand allow the destroy to complete " "before updating.\n"), indent, indent); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext("Existing encrypted " "datasets contain an on-disk " "incompatibility, which\n" "\t%sneeds to be corrected. Backup these " "datasets to new encrypted datasets\n" "\t%sand destroy the old ones.\n"), indent, indent); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext("Existing encrypted " "snapshots and bookmarks contain an " "on-disk\n" "\t%sincompatibility. This may cause " "on-disk corruption if they are used\n" "\t%swith 'zfs recv'. To correct the " "issue, enable the bookmark_v2 feature.\n" "\t%sNo additional action is needed if " "there are no encrypted snapshots or\n" "\t%sbookmarks. If preserving the " "encrypted snapshots and bookmarks is\n" "\t%srequired, use a non-raw send to " "backup and restore them. 
Alternately,\n" "\t%sthey may be removed to resolve the " "incompatibility.\n"), indent, indent, indent, indent, indent, indent); break; default: /* * All errata must contain an action message. */ assert(errata == ZPOOL_ERRATA_NONE); } } else { (void) printf(gettext("The pool can be imported using " "its name or numeric identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { (void) printf(gettext("The pool can be imported despite " "missing or damaged devices. The\n" "\t%sfault tolerance of the pool may be compromised if " "imported.\n"), indent); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext("The pool cannot be imported. " "Access the pool on a system running newer\n" "\t%ssoftware, or recreate the pool from " "backup.\n"), indent); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("The pool cannot be imported. " "Access the pool on a system that supports\n" "\t%sthe required feature(s), or recreate the pool " "from backup.\n"), indent); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: (void) printf(gettext("The pool cannot be imported in " "read-write mode. Import the pool with\n" "\t%s'-o readonly=on', access the pool on a system " "that supports the\n" "\t%srequired feature(s), or recreate the pool " "from backup.\n"), indent, indent); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext("The pool cannot be imported. " "Attach the missing\n" "\t%sdevices and try again.\n"), indent); break; case ZPOOL_STATUS_HOSTID_ACTIVE: VERIFY0(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo)); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) printf(gettext("The pool must be exported from " "%s (hostid=%"PRIx64")\n" "\t%sbefore it can be safely imported.\n"), hostname, hostid, indent); break; case ZPOOL_STATUS_HOSTID_REQUIRED: (void) printf(gettext("Set a unique system hostid with " "the zgenhostid(8) command.\n")); break; default: (void) printf(gettext("The pool cannot be imported due " "to damaged devices or data.\n")); } } /* Print the comment attached to the pool. 
*/ if (comment != NULL) (void) printf(gettext("comment: %s\n"), comment); /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": */ if ((vs->vs_state == VDEV_STATE_CLOSED || vs->vs_state == VDEV_STATE_CANT_OPEN) && vs->vs_aux == VDEV_AUX_CORRUPT_DATA) { if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext("\t%sThe pool was destroyed, " "but can be imported using the '-Df' flags.\n"), indent); else if (pool_state != POOL_STATE_EXPORTED) (void) printf(gettext("\t%sThe pool may be active on " "another system, but can be imported using\n" "\t%sthe '-f' flag.\n"), indent, indent); } if (msgid != NULL) { (void) printf(gettext("%s see: " "https://openzfs.github.io/openzfs-docs/msg/%s\n"), indent, msgid); } (void) printf(gettext("%sconfig:\n\n"), indent); cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), VDEV_NAME_TYPE_ID); if (cb.cb_namewidth < 10) cb.cb_namewidth = 10; print_import_config(&cb, name, nvroot, 0); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\t%sAdditional devices are known to " "be part of this pool, though their\n" "\t%sexact configuration cannot be determined.\n"), indent, indent); } return (0); } static boolean_t zfs_force_import_required(nvlist_t *config) { uint64_t state; uint64_t hostid = 0; nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); /* * The hostid on LOAD_INFO comes from the MOS label via * spa_tryimport(). If its not there then we're likely talking to an * older kernel, so use the top one, which will be from the label * discovered in zpool_find_import(), or if a cachefile is in use, the * local hostid. */ if (nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID, &hostid) != 0) (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state != MMP_STATE_INACTIVE) return (B_TRUE); } return (B_FALSE); } /* * Perform the import for the given configuration. This passes the heavy * lifting off to zpool_import_props(), and then mounts the datasets contained * within the pool. 
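 *
 * Reduced to its happy path, the flow is roughly the following (a sketch
 * only; the version below also refuses the import when
 * zfs_force_import_required() indicates another host may still be using
 * the pool, and reports mount and share failures):
 *
 *	if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
 *		return (1);
 *	zhp = zpool_open_canfail(g_zfs, newname ? newname : name);
 *	if (flags & ZFS_IMPORT_LOAD_KEYS)
 *		(void) zfs_crypto_attempt_load_keys(g_zfs, name);
 *	(void) zpool_enable_datasets(zhp, mntopts, 0, mntthreads);
 *	zpool_close(zhp);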
*/ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, nvlist_t *props, int flags, uint_t mntthreads) { int ret = 0; int ms_status = 0; zpool_handle_t *zhp; const char *name; uint64_t version; name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " "is formatted using an unsupported ZFS version\n"), name); return (1); } else if (zfs_force_import_required(config) && !(flags & ZFS_IMPORT_ANY_HOST)) { mmp_state_t mmp_state = MMP_STATE_INACTIVE; nvlist_t *nvinfo; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state == MMP_STATE_ACTIVE) { const char *hostname = ""; uint64_t hostid = 0; if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool is imported on %s (hostid: " "0x%"PRIx64")\nExport the pool on the other " "system, then run 'zpool import'.\n"), name, hostname, hostid); } else if (mmp_state == MMP_STATE_NO_HOSTID) { (void) fprintf(stderr, gettext("Cannot import '%s': " "pool has the multihost property on and the\n" "system's hostid is not set. Set a unique hostid " "with the zgenhostid(8) command.\n"), name); } else { const char *hostname = ""; time_t timestamp = 0; uint64_t hostid = 0; if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_HOSTNAME); else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP)) timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID); else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool was previously in use from another system.\n" "Last accessed by %s (hostid=%"PRIx64") at %s" "The pool can be imported, use 'zpool import -f' " "to import the pool.\n"), name, hostname, hostid, ctime(×tamp)); } return (1); } if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) return (1); if (newname != NULL) name = newname; if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) return (1); /* * Loading keys is best effort. We don't want to return immediately * if it fails but we do want to give the error to the caller. 
*/ if (flags & ZFS_IMPORT_LOAD_KEYS && zfs_crypto_attempt_load_keys(g_zfs, name) != 0) ret = 1; if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY)) { ms_status = zpool_enable_datasets(zhp, mntopts, 0, mntthreads); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Import was " "successful, but unable to share some datasets\n")); } else if (ms_status == EZFS_MOUNTFAILED) { (void) fprintf(stderr, gettext("Import was " "successful, but unable to mount some datasets\n")); } } zpool_close(zhp); return (ret); } typedef struct import_parameters { nvlist_t *ip_config; const char *ip_mntopts; nvlist_t *ip_props; int ip_flags; uint_t ip_mntthreads; int *ip_err; } import_parameters_t; static void do_import_task(void *arg) { import_parameters_t *ip = arg; *ip->ip_err |= do_import(ip->ip_config, NULL, ip->ip_mntopts, ip->ip_props, ip->ip_flags, ip->ip_mntthreads); free(ip); } static int import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, char *orig_name, char *new_name, importargs_t *import) { nvlist_t *config = NULL; nvlist_t *found_config = NULL; uint64_t pool_state; boolean_t pool_specified = (import->poolname != NULL || import->guid != 0); uint_t npools = 0; tpool_t *tp = NULL; if (import->do_all) { tp = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL); } /* * At this point we have a list of import candidate configs. Even if * we were searching by pool name or guid, we still need to * post-process the list to deal with pool state and possible * duplicate names. */ int err = 0; nvpair_t *elem = NULL; boolean_t first = B_TRUE; if (!pool_specified && import->do_all) { while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) npools++; } while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, &config) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); if (!import->do_destroyed && pool_state == POOL_STATE_DESTROYED) continue; if (import->do_destroyed && pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, import->policy) == 0); if (!pool_specified) { if (first) first = B_FALSE; else if (!import->do_all) (void) fputc('\n', stdout); if (import->do_all) { import_parameters_t *ip = safe_malloc( sizeof (import_parameters_t)); ip->ip_config = config; ip->ip_mntopts = mntopts; ip->ip_props = props; ip->ip_flags = flags; ip->ip_mntthreads = mount_tp_nthr / npools; ip->ip_err = &err; (void) tpool_dispatch(tp, do_import_task, (void *)ip); } else { /* * If we're importing from cachefile, then * we don't want to report errors until we * are in the scan phase of the import. If * we get an error, then we return that error * to invoke the scan phase. */ if (import->cachefile && !import->scan) err = show_import(config, B_FALSE); else (void) show_import(config, B_TRUE); } } else if (import->poolname != NULL) { const char *name; /* * We are searching for a pool based on name. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); if (strcmp(name, import->poolname) == 0) { if (found_config != NULL) { (void) fprintf(stderr, gettext( "cannot import '%s': more than " "one matching pool\n"), import->poolname); (void) fprintf(stderr, gettext( "import by numeric ID instead\n")); err = B_TRUE; } found_config = config; } } else { uint64_t guid; /* * Search for a pool by guid. 
*/ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); if (guid == import->guid) found_config = config; } } if (import->do_all) { tpool_wait(tp); tpool_destroy(tp); } /* * If we were searching for a specific pool, verify that we found a * pool, and then do the import. */ if (pool_specified && err == 0) { if (found_config == NULL) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), orig_name); err = B_TRUE; } else { err |= do_import(found_config, new_name, mntopts, props, flags, mount_tp_nthr); } } /* * If we were just looking for pools, report an error if none were * found. */ if (!pool_specified && first) (void) fprintf(stderr, gettext("no pools available to import\n")); return (err); } typedef struct target_exists_args { const char *poolname; uint64_t poolguid; } target_exists_args_t; static int name_or_guid_exists(zpool_handle_t *zhp, void *data) { target_exists_args_t *args = data; nvlist_t *config = zpool_get_config(zhp, NULL); int found = 0; if (config == NULL) return (0); if (args->poolname != NULL) { const char *pool_name; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); if (strcmp(pool_name, args->poolname) == 0) found = 1; } else { uint64_t pool_guid; verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); if (pool_guid == args->poolguid) found = 1; } zpool_close(zhp); return (found); } /* * zpool checkpoint * checkpoint --discard * * -d Discard the checkpoint from a checkpointed * --discard pool. * * -w Wait for discarding a checkpoint to complete. * --wait * * Checkpoints the specified pool, by taking a "snapshot" of its * current state. A pool can only have one checkpoint at a time. */ int zpool_do_checkpoint(int argc, char **argv) { boolean_t discard, wait; char *pool; zpool_handle_t *zhp; int c, err; struct option long_options[] = { {"discard", no_argument, NULL, 'd'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; discard = B_FALSE; wait = B_FALSE; while ((c = getopt_long(argc, argv, ":dw", long_options, NULL)) != -1) { switch (c) { case 'd': discard = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (wait && !discard) { (void) fprintf(stderr, gettext("--wait only valid when " "--discard also specified\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open(g_zfs, pool)) == NULL) { /* As a special case, check for use of '/' in the name */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("'zpool checkpoint' " "doesn't work on datasets. To save the state " "of a dataset from a specific point in time " "please use 'zfs snapshot'\n")); return (1); } if (discard) { err = (zpool_discard_checkpoint(zhp) != 0); if (err == 0 && wait) err = zpool_wait(zhp, ZPOOL_WAIT_CKPT_DISCARD); } else { err = (zpool_checkpoint(zhp) != 0); } zpool_close(zhp); return (err); } #define CHECKPOINT_OPT 1024 /* * zpool prefetch [] * * Prefetchs a particular type of data in the specified pool. 
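 *
 * Only the DDT prefetch type is currently accepted, e.g.:
 *
 *   # zpool prefetch -t ddt tank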
*/ int zpool_do_prefetch(int argc, char **argv) { int c; char *poolname; char *typestr = NULL; zpool_prefetch_type_t type; zpool_handle_t *zhp; int err = 0; while ((c = getopt(argc, argv, "t:")) != -1) { switch (c) { case 't': typestr = optarg; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } poolname = argv[0]; argc--; argv++; if (strcmp(typestr, "ddt") == 0) { type = ZPOOL_PREFETCH_DDT; } else { (void) fprintf(stderr, gettext("unsupported prefetch type\n")); usage(B_FALSE); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); err = zpool_prefetch(zhp, type); zpool_close(zhp); return (err); } /* * zpool import [-d dir] [-D] * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile | -s] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile | -s] [-f] [-n] [-F] * [newpool] * * -c Read pool information from a cachefile instead of searching * devices. If importing from a cachefile config fails, then * fallback to searching for devices only in the directories that * exist in the cachefile. * * -d Scan in a specific directory, other than /dev/. More than * one directory can be specified using multiple '-d' options. * * -D Scan for previously destroyed pools or import all or only * specified destroyed pools. * * -R Temporarily import the pool, with all mountpoints relative to * the given root. The pool will remain exported when the machine * is rebooted. * * -V Import even in the presence of faulted vdevs. This is an * intentionally undocumented option for testing purposes, and * treats the pool configuration as complete, leaving any bad * vdevs in the FAULTED state. In other words, it does verbatim * import. * * -f Force import, even if it appears that the pool is active. * * -F Attempt rewind if necessary. * * -n See if rewind would work, but don't actually rewind. * * -N Import the pool but don't mount datasets. * * -T Specify a starting txg to use for import. This option is * intentionally undocumented option for testing purposes. * * -a Import all pools found. * * -l Load encryption keys while importing. * * -o Set property=value and/or temporary mount options (without '='). * * -s Scan using the default search path, the libblkid cache will * not be consulted. * * --rewind-to-checkpoint * Import the pool and revert back to the checkpoint. * * The import command scans for pools to import, and import pools based on pool * name and GUID. The pool can also be renamed as part of the import process. 
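 *
 * When no -d option is given, the ZPOOL_IMPORT_PATH environment variable
 * (a colon-separated list of directories) is consulted for the device
 * search path, e.g.:
 *
 *   # ZPOOL_IMPORT_PATH=/dev/disk/by-id:/dev/disk/by-path zpool import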
*/ int zpool_do_import(int argc, char **argv) { char **searchdirs = NULL; char *env, *envdup = NULL; int nsearch = 0; int c; int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; uint64_t searchguid = 0; char *searchname = NULL; char *propval; nvlist_t *policy = NULL; nvlist_t *props = NULL; int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; uint64_t txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; char *endptr; struct option long_options[] = { {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX", long_options, NULL)) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'c': cachefile = optarg; break; case 'd': searchdirs = safe_realloc(searchdirs, (nsearch + 1) * sizeof (char *)); searchdirs[nsearch++] = optarg; break; case 'D': do_destroyed = B_TRUE; break; case 'f': flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; case 'l': flags |= ZFS_IMPORT_LOAD_KEYS; break; case 'm': flags |= ZFS_IMPORT_MISSING_LOG; break; case 'n': dryrun = B_TRUE; break; case 'N': flags |= ZFS_IMPORT_ONLY; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto error; } else { mntopts = optarg; } break; case 'R': if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto error; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto error; break; case 's': do_scan = B_TRUE; break; case 't': flags |= ZFS_IMPORT_TEMP_NAME; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto error; break; case 'T': errno = 0; txg = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("invalid txg value\n")); usage(B_FALSE); } rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; break; case 'V': flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind = B_TRUE; break; case CHECKPOINT_OPT: flags |= ZFS_IMPORT_CHECKPOINT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (cachefile && nsearch != 0) { (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); usage(B_FALSE); } if (cachefile && do_scan) { (void) fprintf(stderr, gettext("-c is incompatible with -s\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) { (void) fprintf(stderr, gettext("-l is incompatible with -N\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) { (void) fprintf(stderr, gettext("-l is only meaningful during " "an import\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In the future, we can capture further policy and include it here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 
0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) goto error; /* check argument count */ if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } /* * Check for the effective uid. We do this explicitly here because * otherwise any attempt to discover pools will silently fail. */ if (argc == 0 && geteuid() != 0) { (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); free(searchdirs); nvlist_free(props); nvlist_free(policy); return (1); } /* * Depending on the arguments given, we do one of the following: * * Iterate through all pools and display information about * each one. * * -a Iterate through all pools and try to import each one. * * Find the pool that corresponds to the given GUID/pool * name and import that one. * * -D Above options applies only to destroyed pools. */ if (argc != 0) { char *endptr; errno = 0; searchguid = strtoull(argv[0], &endptr, 10); if (errno != 0 || *endptr != '\0') { searchname = argv[0]; searchguid = 0; } /* * User specified a name or guid. Ensure it's unique. */ target_exists_args_t search = {searchname, searchguid}; pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search); } /* * Check the environment for the preferred search path. */ if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) { char *dir, *tmp = NULL; envdup = strdup(env); for (dir = strtok_r(envdup, ":", &tmp); dir != NULL; dir = strtok_r(NULL, ":", &tmp)) { searchdirs = safe_realloc(searchdirs, (nsearch + 1) * sizeof (char *)); searchdirs[nsearch++] = dir; } } idata.path = searchdirs; idata.paths = nsearch; idata.poolname = searchname; idata.guid = searchguid; idata.cachefile = cachefile; idata.scan = do_scan; idata.policy = policy; idata.do_destroyed = do_destroyed; idata.do_all = do_all; libpc_handle_t lpch = { .lpc_lib_handle = g_zfs, .lpc_ops = &libzfs_config_ops, .lpc_printerr = B_TRUE }; pools = zpool_search_import(&lpch, &idata); if (pools != NULL && pool_exists && (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name already exists\n"), argv[0]); (void) fprintf(stderr, gettext("use the form '%s " " ' to give it a new name\n"), "zpool import"); err = 1; } else if (pools == NULL && pool_exists) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name is already created/imported,\n"), argv[0]); (void) fprintf(stderr, gettext("and no additional pools " "with that name were found\n")); err = 1; } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } err = 1; } if (err == 1) { free(searchdirs); free(envdup); nvlist_free(policy); nvlist_free(pools); nvlist_free(props); return (1); } err = import_pools(pools, props, mntopts, flags, argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, &idata); /* * If we're using the cachefile and we failed to import, then * fallback to scanning the directory for pools that match * those in the cachefile. */ if (err != 0 && cachefile != NULL) { (void) printf(gettext("cachefile import failed, retrying\n")); /* * We use the scan flag to gather the directories that exist * in the cachefile. If we need to fallback to searching for * the pool config, we will only search devices in these * directories. 
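		 *
		 * Concretely, the retry below re-runs zpool_search_import()
		 * with idata.scan set and attempts import_pools() once more.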
*/ idata.scan = B_TRUE; nvlist_free(pools); pools = zpool_search_import(&lpch, &idata); err = import_pools(pools, props, mntopts, flags, argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, &idata); } error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); free(searchdirs); free(envdup); return (err ? 1 : 0); } /* * zpool sync [-f] [pool] ... * * -f (undocumented) force uberblock (and config including zpool cache file) * update. * * Sync the specified pool(s). * Without arguments "zpool sync" will sync all pools. * This command initiates TXG sync(s) and will return after the TXG(s) commit. * */ static int zpool_do_sync(int argc, char **argv) { int ret; boolean_t force = B_FALSE; /* check options */ while ((ret = getopt(argc, argv, "f")) != -1) { switch (ret) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_sync_one on all pools */ ret = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_sync_one, &force); return (ret); } typedef struct iostat_cbdata { uint64_t cb_flags; int cb_namewidth; int cb_iteration; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_scripted; zpool_list_t *cb_list; vdev_cmd_data_list_t *vcdl; vdev_cbdata_t cb_vdevs; } iostat_cbdata_t; /* iostat labels */ typedef struct name_and_columns { const char *name; /* Column name */ unsigned int columns; /* Center name to this number of columns */ } name_and_columns_t; #define IOSTAT_MAX_LABELS 15 /* Max number of labels on one line */ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}, [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {"rebuild", 1}, {NULL}}, [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, {"trimq_write", 2}, {"rebuildq_write", 2}, {NULL}}, [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, {"trim", 2}, {"rebuild", 2}, {NULL}}, }; /* Shorthand - if "columns" field not set, default to 1 column */ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, {"write"}, {NULL}}, [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {"wait"}, {NULL}}, [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {"rebuild"}, {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, }; static const char *histo_to_title[] = { [IOS_L_HISTO] = "latency", [IOS_RQ_HISTO] = "req_size", }; /* * Return the number of labels in a null-terminated name_and_columns_t * array. 
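 *
 * For example, with the IOS_DEFAULT entry of iostat_top_labels above:
 *
 *   {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}
 *
 * this returns 3.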
* */ static unsigned int label_array_len(const name_and_columns_t *labels) { int i = 0; while (labels[i].name) i++; return (i); } /* * Return the number of strings in a null-terminated string array. * For example: * * const char foo[] = {"bar", "baz", NULL} * * returns 2 */ static uint64_t str_array_len(const char *array[]) { uint64_t i = 0; while (array[i]) i++; return (i); } /* * Return a default column width for default/latency/queue columns. This does * not include histograms, which have their columns autosized. */ static unsigned int default_column_width(iostat_cbdata_t *cb, enum iostat_type type) { unsigned long column_width = 5; /* Normal niceprint */ static unsigned long widths[] = { /* * Choose some sane default column sizes for printing the * raw numbers. */ [IOS_DEFAULT] = 15, /* 1PB capacity */ [IOS_LATENCY] = 10, /* 1B ns = 10sec */ [IOS_QUEUES] = 6, /* 1M queue entries */ [IOS_L_HISTO] = 10, /* 1B ns = 10sec */ [IOS_RQ_HISTO] = 6, /* 1M queue entries */ }; if (cb->cb_literal) column_width = widths[type]; return (column_width); } /* * Print the column labels, i.e: * * capacity operations bandwidth * alloc free read write read write ... * * If force_column_width is set, use it for the column width. If not set, use * the default column width. */ static void print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) { int i, idx, s; int text_start, rw_column_width, spaces_to_end; uint64_t flags = cb->cb_flags; uint64_t f; unsigned int column_width = force_column_width; /* For each bit set in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; if (!force_column_width) column_width = default_column_width(cb, idx); /* Print our top labels centered over "read write" label. */ for (i = 0; i < label_array_len(labels[idx]); i++) { const char *name = labels[idx][i].name; /* * We treat labels[][].columns == 0 as shorthand * for one column. It makes writing out the label * tables more concise. */ unsigned int columns = MAX(1, labels[idx][i].columns); unsigned int slen = strlen(name); rw_column_width = (column_width * columns) + (2 * (columns - 1)); text_start = (int)((rw_column_width) / columns - slen / columns); if (text_start < 0) text_start = 0; printf(" "); /* Two spaces between columns */ /* Space from beginning of column to label */ for (s = 0; s < text_start; s++) printf(" "); printf("%s", name); /* Print space after label to end of column */ spaces_to_end = rw_column_width - text_start - slen; if (spaces_to_end < 0) spaces_to_end = 0; for (s = 0; s < spaces_to_end; s++) printf(" "); } } } /* * print_cmd_columns - Print custom column titles from -c * * If the user specified the "zpool status|iostat -c" then print their custom * column titles in the header. For example, print_cmd_columns() would print * the " col1 col2" part of this: * * $ zpool iostat -vc 'echo col1=val1; echo col2=val2' * ... * capacity operations bandwidth * pool alloc free read write read write col1 col2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- * mypool 269K 1008M 0 0 107 946 * mirror 269K 1008M 0 0 107 946 * sdb - - 0 0 102 473 val1 val2 * sdc - - 0 0 5 473 val1 val2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- */ static void print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) { int i, j; vdev_cmd_data_t *data = &vcdl->data[0]; if (vcdl->count == 0 || data == NULL) return; /* * Each vdev cmd should have the same column names unless the user did * something weird with their cmd. 
Just take the column names from the * first vdev and assume it works for all of them. */ for (i = 0; i < vcdl->uniq_cols_cnt; i++) { printf(" "); if (use_dashes) { for (j = 0; j < vcdl->uniq_cols_width[i]; j++) printf("-"); } else { printf_color(ANSI_BOLD, "%*s", vcdl->uniq_cols_width[i], vcdl->uniq_cols[i]); } } } /* * Utility function to print out a line of dashes like: * * -------------------------------- ----- ----- ----- ----- ----- * * ...or a dashed named-row line like: * * logs - - - - - * * @cb: iostat data * * @force_column_width If non-zero, use the value as the column width. * Otherwise use the default column widths. * * @name: Print a dashed named-row line starting * with @name. Otherwise, print a regular * dashed line. */ static void print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, const char *name) { int i; unsigned int namewidth; uint64_t flags = cb->cb_flags; uint64_t f; int idx; const name_and_columns_t *labels; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdevs.cb_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), name ? strlen(name) : 0); if (name) { printf("%-*s", namewidth, name); } else { for (i = 0; i < namewidth; i++) (void) printf("-"); } /* For each bit in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { unsigned int column_width; idx = lowbit64(f) - 1; if (force_column_width) column_width = force_column_width; else column_width = default_column_width(cb, idx); labels = iostat_bottom_labels[idx]; for (i = 0; i < label_array_len(labels); i++) { if (name) printf(" %*s-", column_width - 1, " "); else printf(" %.*s", column_width, "--------------------"); } } } static void print_iostat_separator_impl(iostat_cbdata_t *cb, unsigned int force_column_width) { print_iostat_dashes(cb, force_column_width, NULL); } static void print_iostat_separator(iostat_cbdata_t *cb) { print_iostat_separator_impl(cb, 0); } static void print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, const char *histo_vdev_name) { unsigned int namewidth; const char *title; color_start(ANSI_BOLD); if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdevs.cb_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), histo_vdev_name ? strlen(histo_vdev_name) : 0); if (histo_vdev_name) printf("%-*s", namewidth, histo_vdev_name); else printf("%*s", namewidth, ""); print_iostat_labels(cb, force_column_width, iostat_top_labels); printf("\n"); printf("%-*s", namewidth, title); print_iostat_labels(cb, force_column_width, iostat_bottom_labels); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 0); printf("\n"); print_iostat_separator_impl(cb, force_column_width); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 1); color_end(); printf("\n"); } static void print_iostat_header(iostat_cbdata_t *cb) { print_iostat_header_impl(cb, 0, NULL); } /* * Prints a size string (i.e. 120M) with the suffix ("M") colored * by order of magnitude. Uses column_size to add padding. 
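 *
 * The suffix-to-color mapping used below is: K green, M yellow, G red,
 * T bold blue, P magenta, E cyan; a zero value is printed in gray.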
*/ static void print_stat_color(const char *statbuf, unsigned int column_size) { fputs(" ", stdout); size_t len = strlen(statbuf); while (len < column_size) { fputc(' ', stdout); column_size--; } if (*statbuf == '0') { color_start(ANSI_GRAY); fputc('0', stdout); } else { for (; *statbuf; statbuf++) { if (*statbuf == 'K') color_start(ANSI_GREEN); else if (*statbuf == 'M') color_start(ANSI_YELLOW); else if (*statbuf == 'G') color_start(ANSI_RED); else if (*statbuf == 'T') color_start(ANSI_BOLD_BLUE); else if (*statbuf == 'P') color_start(ANSI_MAGENTA); else if (*statbuf == 'E') color_start(ANSI_CYAN); fputc(*statbuf, stdout); if (--column_size <= 0) break; } } color_end(); } /* * Display a single statistic. */ static void print_one_stat(uint64_t value, enum zfs_nicenum_format format, unsigned int column_size, boolean_t scripted) { char buf[64]; zfs_nicenum_format(value, buf, sizeof (buf), format); if (scripted) printf("\t%s", buf); else print_stat_color(buf, column_size); } /* * Calculate the default vdev stats * * Subtract oldvs from newvs, apply a scaling factor, and save the resulting * stats into calcvs. */ static void calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, vdev_stat_t *calcvs) { int i; memcpy(calcvs, newvs, sizeof (*calcvs)); for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); } /* * Internal representation of the extended iostats data. * * The extended iostat stats are exported in nvlists as either uint64_t arrays * or single uint64_t's. We make both look like arrays to make them easier * to process. In order to make single uint64_t's look like arrays, we set * __data to the stat data, and then set *data = &__data with count = 1. Then, * we can just use *data and count. */ struct stat_array { uint64_t *data; uint_t count; /* Number of entries in data[] */ uint64_t __data; /* Only used when data is a single uint64_t */ }; static uint64_t stat_histo_max(struct stat_array *nva, unsigned int len) { uint64_t max = 0; int i; for (i = 0; i < len; i++) max = MAX(max, array64_max(nva[i].data, nva[i].count)); return (max); } /* * Helper function to lookup a uint64_t array or uint64_t value and store its * data as a stat_array. If the nvpair is a single uint64_t value, then we make * it look like a one element array to make it easier to process. */ static int nvpair64_to_stat_array(nvlist_t *nvl, const char *name, struct stat_array *nva) { nvpair_t *tmp; int ret; verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); switch (nvpair_type(tmp)) { case DATA_TYPE_UINT64_ARRAY: ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); break; case DATA_TYPE_UINT64: ret = nvpair_value_uint64(tmp, &nva->__data); nva->data = &nva->__data; nva->count = 1; break; default: /* Not a uint64_t */ ret = EINVAL; break; } return (ret); } /* * Given a list of nvlist names, look up the extended stats in newnv and oldnv, * subtract them, and return the results in a newly allocated stat_array. * You must free the returned array after you are done with it with * free_calc_stats(). * * Additionally, you can set "oldnv" to NULL if you simply want the newnv * values. 
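 *
 * Typical usage (mirroring print_iostat_queues() below):
 *
 *   nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv);
 *   ...
 *   free_calc_stats(nva, ARRAY_SIZE(names));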
*/ static struct stat_array * calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, nvlist_t *newnv) { nvlist_t *oldnvx = NULL, *newnvx; struct stat_array *oldnva, *newnva, *calcnva; int i, j; unsigned int alloc_size = (sizeof (struct stat_array)) * len; /* Extract our extended stats nvlist from the main list */ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, &newnvx) == 0); if (oldnv) { verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, &oldnvx) == 0); } newnva = safe_malloc(alloc_size); oldnva = safe_malloc(alloc_size); calcnva = safe_malloc(alloc_size); for (j = 0; j < len; j++) { verify(nvpair64_to_stat_array(newnvx, names[j], &newnva[j]) == 0); calcnva[j].count = newnva[j].count; alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); calcnva[j].data = safe_malloc(alloc_size); memcpy(calcnva[j].data, newnva[j].data, alloc_size); if (oldnvx) { verify(nvpair64_to_stat_array(oldnvx, names[j], &oldnva[j]) == 0); for (i = 0; i < oldnva[j].count; i++) calcnva[j].data[i] -= oldnva[j].data[i]; } } free(newnva); free(oldnva); return (calcnva); } static void free_calc_stats(struct stat_array *nva, unsigned int len) { int i; for (i = 0; i < len; i++) free(nva[i].data); free(nva); } static void print_iostat_histo(struct stat_array *nva, unsigned int len, iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, double scale) { int i, j; char buf[6]; uint64_t val; enum zfs_nicenum_format format; unsigned int buckets; unsigned int start_bucket; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; /* All these histos are the same size, so just use nva[0].count */ buckets = nva[0].count; if (cb->cb_flags & IOS_RQ_HISTO_M) { /* Start at 512 - req size should never be lower than this */ start_bucket = 9; } else { start_bucket = 0; } for (j = start_bucket; j < buckets; j++) { /* Print histogram bucket label */ if (cb->cb_flags & IOS_L_HISTO_M) { /* Ending range of this bucket */ val = (1UL << (j + 1)) - 1; zfs_nicetime(val, buf, sizeof (buf)); } else { /* Request size (starting range of bucket) */ val = (1UL << j); zfs_nicenum(val, buf, sizeof (buf)); } if (cb->cb_scripted) printf("%llu", (u_longlong_t)val); else printf("%-*s", namewidth, buf); /* Print the values on the line */ for (i = 0; i < len; i++) { print_one_stat(nva[i].data[j] * scale, format, column_width, cb->cb_scripted); } printf("\n"); } } static void print_solid_separator(unsigned int length) { while (length--) printf("-"); printf("\n"); } static void print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv, double scale, const char *name) { unsigned int column_width; unsigned int namewidth; unsigned int entire_width; enum iostat_type type; struct stat_array *nva; const char **names; unsigned int names_len; /* What type of histo are we? */ type = IOS_HISTO_IDX(cb->cb_flags); /* Get NULL-terminated array of nvlist names for our histo */ names = vsx_type_to_nvlist[type]; names_len = str_array_len(names); /* num of names */ nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); if (cb->cb_literal) { column_width = MAX(5, (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); } else { column_width = 5; } namewidth = MAX(cb->cb_namewidth, strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); /* * Calculate the entire line width of what we're printing. 
The * +2 is for the two spaces between columns: */ /* read write */ /* ----- ----- */ /* |___| <---------- column_width */ /* */ /* |__________| <--- entire_width */ /* */ entire_width = namewidth + (column_width + 2) * label_array_len(iostat_bottom_labels[type]); if (cb->cb_scripted) printf("%s\n", name); else print_iostat_header_impl(cb, column_width, name); print_iostat_histo(nva, names_len, cb, column_width, namewidth, scale); free_calc_stats(nva, names_len); if (!cb->cb_scripted) print_solid_separator(entire_width); } /* * Calculate the average latency of a power-of-two latency histogram */ static uint64_t single_histo_average(uint64_t *histo, unsigned int buckets) { int i; uint64_t count = 0, total = 0; for (i = 0; i < buckets; i++) { /* * Our buckets are power-of-two latency ranges. Use the * midpoint latency of each bucket to calculate the average. * For example: * * Bucket Midpoint * 8ns-15ns: 12ns * 16ns-31ns: 24ns * ... */ if (histo[i] != 0) { total += histo[i] * (((1UL << i) + ((1UL << i)/2))); count += histo[i]; } } /* Prevent divide by zero */ return (count == 0 ? 0 : total / count); } static void print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *newnv) { const char *names[] = { ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_QUEUES); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; for (int i = 0; i < ARRAY_SIZE(names); i++) { uint64_t val = nva[i].data[0]; print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } static void print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_LATENCY); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAWTIME; else format = ZFS_NICENUM_TIME; /* Print our avg latencies on the line */ for (i = 0; i < ARRAY_SIZE(names); i++) { /* Compute average latency for a latency histo */ val = single_histo_average(nva[i].data, nva[i].count); print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } /* * Print default statistics (capacity/operations/bandwidth) */ static void print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) { unsigned int column_width = default_column_width(cb, IOS_DEFAULT); enum zfs_nicenum_format format; 
char na; /* char to print for "not applicable" values */ if (cb->cb_literal) { format = ZFS_NICENUM_RAW; na = '0'; } else { format = ZFS_NICENUM_1024; na = '-'; } /* only toplevel vdevs have capacity stats */ if (vs->vs_space == 0) { if (cb->cb_scripted) printf("\t%c\t%c", na, na); else printf(" %*c %*c", column_width, na, column_width, na); } else { print_one_stat(vs->vs_alloc, format, column_width, cb->cb_scripted); print_one_stat(vs->vs_space - vs->vs_alloc, format, column_width, cb->cb_scripted); } print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); } static const char *const class_name[] = { VDEV_ALLOC_BIAS_DEDUP, VDEV_ALLOC_BIAS_SPECIAL, VDEV_ALLOC_CLASS_LOGS }; /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. * * Returns the number of stat lines printed. */ static unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; uint_t c, children, oldchildren; vdev_stat_t *oldvs, *newvs, *calcvs; vdev_stat_t zerovs = { 0 }; char *vname; int i; int ret = 0; uint64_t tdelta; double scale; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return (ret); calcvs = safe_malloc(sizeof (*calcvs)); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } /* Do we only want to see a specific vdev? */ for (i = 0; i < cb->cb_vdevs.cb_names_count; i++) { /* Yes we do. Is this the vdev? */ if (strcmp(name, cb->cb_vdevs.cb_names[i]) == 0) { /* * This is our vdev. Since it is the only vdev we * will be displaying, make depth = 0 so that it * doesn't get indented. */ depth = 0; break; } } if (cb->cb_vdevs.cb_names_count && (i == cb->cb_vdevs.cb_names_count)) { /* Couldn't match the name */ goto children; } verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); /* * Print the vdev name unless it's is a histogram. Histograms * display the vdev name in the header itself. */ if (!(cb->cb_flags & IOS_ANYHISTO_M)) { if (cb->cb_scripted) { printf("%s", name); } else { if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); } } /* Calculate our scaling factor */ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { /* * If we specify printing histograms with no time interval, then * print the histogram numbers over the entire lifetime of the * vdev. 
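		 *
		 * A scale of 1 leaves the raw counters untouched; otherwise
		 * they are converted to per-second rates below by scaling
		 * with NANOSEC / tdelta, the time between the two snapshots.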
*/ scale = 1; } else { if (tdelta == 0) scale = 1.0; else scale = (double)NANOSEC / tdelta; } if (cb->cb_flags & IOS_DEFAULT_M) { calc_default_iostats(oldvs, newvs, calcvs); print_iostat_default(calcvs, cb, scale); } if (cb->cb_flags & IOS_LATENCY_M) print_iostat_latency(cb, oldnv, newnv); if (cb->cb_flags & IOS_QUEUES_M) print_iostat_queues(cb, newnv); if (cb->cb_flags & IOS_ANYHISTO_M) { printf("\n"); print_iostat_histos(cb, oldnv, newnv, scale, name); } if (cb->vcdl != NULL) { const char *path; if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } if (!(cb->cb_flags & IOS_ANYHISTO_M)) printf("\n"); ret++; children: free(calcvs); if (!cb->cb_verbose) return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } /* * print normal top-level devices */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, &ishole); (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (ishole || islog) continue; if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_vdevs.cb_name_flags | VDEV_NAME_TYPE_ID); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } /* * print all other top-level devices */ for (uint_t n = 0; n < ARRAY_SIZE(class_name); n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; const char *bias = NULL; const char *type = NULL; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdevs.cb_names) { print_iostat_dashes(cb, 0, class_name[n]); } printf("\n"); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_vdevs.cb_name_flags | VDEV_NAME_TYPE_ID); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } if (children > 0) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdevs.cb_names) { print_iostat_dashes(cb, 0, "cache"); } printf("\n"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_vdevs.cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } return (ret); } static int refresh_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; boolean_t missing; /* * If the pool has disappeared, remove it from the list and continue. 
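	 *
	 * zpool_refresh_stats() reports a vanished pool via 'missing';
	 * a hard failure (non-zero return) aborts the iteration instead.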
*/ if (zpool_refresh_stats(zhp, &missing) != 0) return (-1); if (missing) pool_list_remove(cb->cb_list, zhp); return (0); } /* * Callback to print out the iostats for the given pool. */ static int print_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; int ret; newconfig = zpool_get_config(zhp, &oldconfig); if (cb->cb_iteration == 1) oldconfig = NULL; verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, &newnvroot) == 0); if (oldconfig == NULL) oldnvroot = NULL; else verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdevs.cb_names_count) { print_iostat_separator(cb); if (cb->vcdl != NULL) { print_cmd_columns(cb->vcdl, 1); } printf("\n"); } return (ret); } static int get_columns(void) { struct winsize ws; int columns = 80; int error; if (isatty(STDOUT_FILENO)) { error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); if (error == 0) columns = ws.ws_col; } else { columns = 999; } return (columns); } /* * Return the required length of the pool/vdev name column. The minimum * allowed width and output formatting flags must be provided. */ static int get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose) { nvlist_t *config, *nvroot; int width = min_width; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); size_t poolname_len = strlen(zpool_get_name(zhp)); if (verbose == B_FALSE) { width = MAX(poolname_len, min_width); } else { width = MAX(poolname_len, max_width(zhp, nvroot, 0, min_width, flags)); } } return (width); } /* * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { float interval = 0; unsigned long count = 0; int argc = *argcp; /* * Determine if the last argument is an integer or a pool name */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext( "interval cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { /* * If this is not a valid number, just plow on. The * user will get a more informative error message later * on. */ interval = 0; } } /* * If the last argument is also an integer, then we have both a count * and an interval. */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext( "interval cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { interval = 0; } } *iv = interval; *cnt = count; *argcp = argc; } static void get_timestamp_arg(char c) { if (c == 'u') timestamp_fmt = UDATE; else if (c == 'd') timestamp_fmt = DDATE; else usage(B_FALSE); } /* * Return stat flags that are supported by all pools by both the module and * zpool iostat. "*data" should be initialized to all 0xFFs before running. * It will get ANDed down until only the flags that are supported on all pools * remain. 
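 *
 * For example, if one pool supports IOS_DEFAULT_M | IOS_LATENCY_M and
 * another only IOS_DEFAULT_M, the mask ends up as IOS_DEFAULT_M once
 * both callbacks have run.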
*/ static int get_stat_flags_cb(zpool_handle_t *zhp, void *data) { uint64_t *mask = data; nvlist_t *config, *nvroot, *nvx; uint64_t flags = 0; int i, j; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); /* Default stats are always supported, but for completeness.. */ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) flags |= IOS_DEFAULT_M; /* Get our extended stats nvlist from the main list */ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) != 0) { /* * No extended stats; they're probably running an older * module. No big deal, we support that too. */ goto end; } /* For each extended stat, make sure all its nvpairs are supported */ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { if (!vsx_type_to_nvlist[j][0]) continue; /* Start off by assuming the flag is supported, then check */ flags |= (1ULL << j); for (i = 0; vsx_type_to_nvlist[j][i]; i++) { if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { /* flag isn't supported */ flags = flags & ~(1ULL << j); break; } } } end: *mask = *mask & flags; return (0); } /* * Return a bitmask of stats that are supported on all pools by both the module * and zpool iostat. */ static uint64_t get_stat_flags(zpool_list_t *list) { uint64_t mask = -1; /* * get_stat_flags_cb() will lop off bits from "mask" until only the * flags that are supported on all pools remain. */ pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); return (mask); } /* * Return 1 if cb_data->cb_names[0] is this vdev's name, 0 otherwise. */ static int is_vdev_cb(void *zhp_data, nvlist_t *nv, void *cb_data) { uint64_t guid; vdev_cbdata_t *cb = cb_data; zpool_handle_t *zhp = zhp_data; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (0); return (guid == zpool_vdev_path_to_guid(zhp, cb->cb_names[0])); } /* * Returns 1 if cb_data->cb_names[0] is a vdev name, 0 otherwise. */ static int is_vdev(zpool_handle_t *zhp, void *cb_data) { return (for_each_vdev(zhp, is_vdev_cb, cb_data)); } /* * Check if vdevs are in a pool * * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise * return 0. If pool_name is NULL, then search all pools. */ static int are_vdevs_in_pool(int argc, char **argv, char *pool_name, vdev_cbdata_t *cb) { char **tmp_name; int ret = 0; int i; int pool_count = 0; if ((argc == 0) || !*argv) return (0); if (pool_name) pool_count = 1; /* Temporarily hijack cb_names for a second... */ tmp_name = cb->cb_names; /* Go though our list of prospective vdev names */ for (i = 0; i < argc; i++) { cb->cb_names = argv + i; /* Is this name a vdev in our pools? */ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, is_vdev, cb); if (!ret) { /* No match */ break; } } cb->cb_names = tmp_name; return (ret); } static int is_pool_cb(zpool_handle_t *zhp, void *data) { char *name = data; if (strcmp(name, zpool_get_name(zhp)) == 0) return (1); return (0); } /* * Do we have a pool named *name? If so, return 1, otherwise 0. */ static int is_pool(char *name) { return (for_each_pool(0, NULL, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, is_pool_cb, name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ static int are_all_pools(int argc, char **argv) { if ((argc == 0) || !*argv) return (0); while (--argc >= 0) if (!is_pool(argv[argc])) return (0); return (1); } /* * Helper function to print out vdev/pool names we can't resolve. Used for an * error message. 
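 *
 * Each name is printed with a short classification, e.g. (made-up names):
 *
 *   tank (pool)
 *   sdb (vdev in this pool)
 *   sdq (unknown)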
*/ static void error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, vdev_cbdata_t *cb) { int i; char *name; char *str; for (i = 0; i < argc; i++) { name = argv[i]; if (is_pool(name)) str = gettext("pool"); else if (are_vdevs_in_pool(1, &name, pool_name, cb)) str = gettext("vdev in this pool"); else if (are_vdevs_in_pool(1, &name, NULL, cb)) str = gettext("vdev in another pool"); else str = gettext("unknown"); fprintf(stderr, "\t%s (%s)\n", name, str); } } /* * Same as get_interval_count(), but with additional checks to not misinterpret * guids as interval/count values. Assumes VDEV_NAME_GUID is set in * cb.cb_vdevs.cb_name_flags. */ static void get_interval_count_filter_guids(int *argc, char **argv, float *interval, unsigned long *count, iostat_cbdata_t *cb) { char **tmpargv = argv; int argc_for_interval = 0; /* Is the last arg an interval value? Or a guid? */ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, &cb->cb_vdevs)) { /* * The last arg is not a guid, so it's probably an * interval value. */ argc_for_interval++; if (*argc >= 2 && !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, &cb->cb_vdevs)) { /* * The 2nd to last arg is not a guid, so it's probably * an interval value. */ argc_for_interval++; } } /* Point to our list of possible intervals */ tmpargv = &argv[*argc - argc_for_interval]; *argc = *argc - argc_for_interval; get_interval_count(&argc_for_interval, tmpargv, interval, count); } /* * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or * if we were unable to determine its size. */ static int terminal_height(void) { struct winsize win; if (isatty(STDOUT_FILENO) == 0) return (-1); if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) != -1 && win.ws_row > 0) return (win.ws_row); return (-1); } /* * Run one of the zpool status/iostat -c scripts with the help (-h) option and * print the result. * * name: Short name of the script ('iostat'). * path: Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat'); */ static void print_zpool_script_help(char *name, char *path) { char *argv[] = {path, (char *)"-h", NULL}; char **lines = NULL; int lines_cnt = 0; int rc; rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &lines, &lines_cnt); if (rc != 0 || lines == NULL || lines_cnt <= 0) { if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); return; } for (int i = 0; i < lines_cnt; i++) if (!is_blank_str(lines[i])) printf(" %-14s %s\n", name, lines[i]); libzfs_free_str_array(lines, lines_cnt); } /* * Go though the zpool status/iostat -c scripts in the user's path, run their * help option (-h), and print out the results. */ static void print_zpool_dir_scripts(char *dirpath) { DIR *dir; struct dirent *ent; char fullpath[MAXPATHLEN]; struct stat dir_stat; if ((dir = opendir(dirpath)) != NULL) { /* print all the files and directories within directory */ while ((ent = readdir(dir)) != NULL) { if (snprintf(fullpath, sizeof (fullpath), "%s/%s", dirpath, ent->d_name) >= sizeof (fullpath)) { (void) fprintf(stderr, gettext("internal error: " "ZPOOL_SCRIPTS_PATH too large.\n")); exit(1); } /* Print the scripts */ if (stat(fullpath, &dir_stat) == 0) if (dir_stat.st_mode & S_IXUSR && S_ISREG(dir_stat.st_mode)) print_zpool_script_help(ent->d_name, fullpath); } closedir(dir); } } /* * Print out help text for all zpool status/iostat -c scripts. 
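 *
 * The directories searched come from zpool_get_cmd_search_path(), a
 * colon-separated list walked in order by print_zpool_dir_scripts().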
*/ static void print_zpool_script_list(const char *subcommand) { char *dir, *sp, *tmp; printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand); sp = zpool_get_cmd_search_path(); if (sp == NULL) return; for (dir = strtok_r(sp, ":", &tmp); dir != NULL; dir = strtok_r(NULL, ":", &tmp)) print_zpool_dir_scripts(dir); free(sp); } /* * Set the minimum pool/vdev name column width. The width must be at least 10, * but may be as large as the column width - 42 so it still fits on one line. * NOTE: 42 is the width of the default capacity/operations/bandwidth output */ static int get_namewidth_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; int width, available_width; /* * get_namewidth() returns the maximum width of any name in that column * for any pool/vdev/device line that will be output. */ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_vdevs.cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); /* * The width we are calculating is the width of the header and also the * padding width for names that are less than maximum width. The stats * take up 42 characters, so the width available for names is: */ available_width = get_columns() - 42; /* * If the maximum width fits on a screen, then great! Make everything * line up by justifying all lines to the same width. If that max * width is larger than what's available, the name plus stats won't fit * on one line, and justifying to that width would cause every line to * wrap on the screen. We only want lines with long names to wrap. * Limit the padding to what won't wrap. */ if (width > available_width) width = available_width; /* * And regardless of whatever the screen width is (get_columns can * return 0 if the width is not known or less than 42 for a narrow * terminal) have the width be a minimum of 10. */ if (width < 10) width = 10; /* Save the calculated width */ cb->cb_namewidth = width; return (0); } /* * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLpPvy] [-n name] * [-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]] * [interval [count]] * * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display statistics for individual vdevs * -h Display help * -p Display values in parsable (exact) format. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -l Display average latency * -q Display queue depths * -w Display latency histograms * -r Display request size histogram * -T Display a timestamp in date(1) or Unix format * -n Only print headers once * * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely * on pool_list_update() to detect the addition of new pools. Configuration * changes are all handled within libzfs. 
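 *
 * For example, a verbose report for one pool refreshed every 5 seconds:
 *
 *   # zpool iostat -v tank 5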
*/ int zpool_do_iostat(int argc, char **argv) { int c; int ret; int npools; float interval = 0; unsigned long count = 0; int winheight = 24; zpool_list_t *list; boolean_t verbose = B_FALSE; boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE; boolean_t omit_since_boot = B_FALSE; boolean_t guid = B_FALSE; boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; boolean_t headers_once = B_FALSE; iostat_cbdata_t cb = { 0 }; char *cmd = NULL; /* Used for printing error message */ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; uint64_t unsupported_flags; /* check options */ while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwnH")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; verbose = B_TRUE; break; case 'g': guid = B_TRUE; break; case 'L': follow_links = B_TRUE; break; case 'P': full_name = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': verbose = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'l': latency = B_TRUE; break; case 'q': queues = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'w': l_histo = B_TRUE; break; case 'r': rq_histo = B_TRUE; break; case 'y': omit_since_boot = B_TRUE; break; case 'n': headers_once = B_TRUE; break; case 'h': usage(B_FALSE); break; case '?': if (optopt == 'c') { print_zpool_script_list("iostat"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; cb.cb_literal = parsable; cb.cb_scripted = scripted; if (guid) cb.cb_vdevs.cb_name_flags |= VDEV_NAME_GUID; if (follow_links) cb.cb_vdevs.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; if (full_name) cb.cb_vdevs.cb_name_flags |= VDEV_NAME_PATH; cb.cb_iteration = 0; cb.cb_namewidth = 0; cb.cb_verbose = verbose; /* Get our interval and count values (if any) */ if (guid) { get_interval_count_filter_guids(&argc, argv, &interval, &count, &cb); } else { get_interval_count(&argc, argv, &interval, &count); } if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb.cb_vdevs)) { /* All the args are vdevs */ cb.cb_vdevs.cb_names = argv; cb.cb_vdevs.cb_names_count = argc; argc = 0; /* No pools to process */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb.cb_vdevs)) { /* ...and the rest are vdev names */ cb.cb_vdevs.cb_names = argv + 1; cb.cb_vdevs.cb_names_count = argc - 1; argc = 1; /* One pool to process */ } else { fprintf(stderr, gettext("Expected either a list of ")); fprintf(stderr, gettext("pools, or list of vdevs in")); fprintf(stderr, " \"%s\", ", argv[0]); fprintf(stderr, gettext("but got:\n")); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb.cb_vdevs); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { /* * The args don't make sense. 
The first arg isn't a pool name, * nor are all the args vdevs. */ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); fprintf(stderr, "\n"); return (1); } if (cb.cb_vdevs.cb_names_count != 0) { /* * If user specified vdevs, it implies verbose. */ cb.cb_verbose = B_TRUE; } /* * Construct the list of all interesting pools. */ ret = 0; if ((list = pool_list_get(argc, argv, NULL, ZFS_TYPE_POOL, parsable, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { pool_list_free(list); return (1); } if (pool_list_count(list) == 0 && interval == 0) { pool_list_free(list); (void) fprintf(stderr, gettext("no pools available\n")); return (1); } if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) { pool_list_free(list); (void) fprintf(stderr, gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n")); usage(B_FALSE); return (1); } if (l_histo && rq_histo) { pool_list_free(list); (void) fprintf(stderr, gettext("Only one of [-r|-w] can be passed at a time\n")); usage(B_FALSE); return (1); } /* * Enter the main iostat loop. */ cb.cb_list = list; if (l_histo) { /* * Histograms tables look out of place when you try to display * them with the other stats, so make a rule that you can only * print histograms by themselves. */ cb.cb_flags = IOS_L_HISTO_M; } else if (rq_histo) { cb.cb_flags = IOS_RQ_HISTO_M; } else { cb.cb_flags = IOS_DEFAULT_M; if (latency) cb.cb_flags |= IOS_LATENCY_M; if (queues) cb.cb_flags |= IOS_QUEUES_M; } /* * See if the module supports all the stats we want to display. */ unsupported_flags = cb.cb_flags & ~get_stat_flags(list); if (unsupported_flags) { uint64_t f; int idx; fprintf(stderr, gettext("The loaded zfs module doesn't support:")); /* for each bit set in unsupported_flags */ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; fprintf(stderr, " -%c", flag_to_arg[idx]); } fprintf(stderr, ". Try running a newer module.\n"); pool_list_free(list); return (1); } for (;;) { if ((npools = pool_list_count(list)) == 0) (void) fprintf(stderr, gettext("no pools available\n")); else { /* * If this is the first iteration and -y was supplied * we skip any printing. */ boolean_t skip = (omit_since_boot && cb.cb_iteration == 0); /* * Refresh all statistics. This is done as an * explicit step before calculating the maximum name * width, so that any * configuration changes are * properly accounted for. */ (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_iostat, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL && cb.cb_verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) { cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, g_zfs, cb.cb_vdevs.cb_names, cb.cb_vdevs.cb_names_count, cb.cb_vdevs.cb_name_flags); } else { cb.vcdl = NULL; } /* * Check terminal size so we can print headers * even when terminal window has its height * changed. */ winheight = terminal_height(); /* * Are we connected to TTY? If not, headers_once * should be true, to avoid breaking scripts. */ if (winheight < 0) headers_once = B_TRUE; /* * If it's the first time and we're not skipping it, * or either skip or verbose mode, print the header. * * The histogram code explicitly prints its header on * every vdev, so skip this for histograms. 
*/ if (((++cb.cb_iteration == 1 && !skip) || (skip != verbose) || (!headers_once && (cb.cb_iteration % winheight) == 0)) && (!(cb.cb_flags & IOS_ANYHISTO_M)) && !cb.cb_scripted) print_iostat_header(&cb); if (skip) { (void) fflush(stdout); (void) fsleep(interval); continue; } pool_list_iter(list, B_FALSE, print_iostat, &cb); /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), * then print a separator. * * In addition, if we're printing specific vdevs then * we also want an ending separator. */ if (((npools > 1 && !verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) || (!(cb.cb_flags & IOS_ANYHISTO_M) && cb.cb_vdevs.cb_names_count)) && !cb.cb_scripted) { print_iostat_separator(&cb); if (cb.vcdl != NULL) print_cmd_columns(cb.vcdl, 1); printf("\n"); } if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); } if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fflush(stdout); (void) fsleep(interval); } pool_list_free(list); return (ret); } typedef struct list_cbdata { boolean_t cb_verbose; int cb_name_flags; int cb_namewidth; boolean_t cb_json; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_literal; nvlist_t *cb_jsobj; boolean_t cb_json_as_int; boolean_t cb_json_pool_key_guid; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) (void) fputs(" ", stdout); else first = B_FALSE; right_justify = B_FALSE; if (pl->pl_prop != ZPROP_USERPROP) { header = zpool_prop_column_name(pl->pl_prop); right_justify = zpool_prop_align_right(pl->pl_prop); } else { int i; for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) fputs(header, stdout); else if (right_justify) (void) printf("%*s", (int)width, header); else (void) printf("%-*s", (int)width, header); } (void) fputc('\n', stdout); } /* * Given a pool and a list of properties, print out all the properties according * to the described layout. Used by zpool_do_list(). */ static void collect_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; const char *propstr; boolean_t right_justify; size_t width; zprop_source_t sourcetype = ZPROP_SRC_NONE; nvlist_t *item, *d, *props; item = d = props = NULL; if (cb->cb_json) { item = fnvlist_alloc(); props = fnvlist_alloc(); d = fnvlist_lookup_nvlist(cb->cb_jsobj, "pools"); if (d == NULL) { fprintf(stderr, "pools obj not found.\n"); exit(1); } fill_pool_info(item, zhp, B_TRUE, cb->cb_json_as_int); } for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. 
*/ width = cb->cb_namewidth; } if (!cb->cb_json && !first) { if (cb->cb_scripted) (void) fputc('\t', stdout); else (void) fputs(" ", stdout); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_USERPROP) { if (zpool_get_prop(zhp, pl->pl_prop, property, sizeof (property), &sourcetype, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); } else if ((zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop)) && zpool_prop_get_feature(zhp, pl->pl_user_prop, property, sizeof (property)) == 0) { propstr = property; sourcetype = ZPROP_SRC_LOCAL; } else if (zfs_prop_user(pl->pl_user_prop) && zpool_get_userprop(zhp, pl->pl_user_prop, property, sizeof (property), &sourcetype) == 0) { propstr = property; } else { propstr = "-"; } if (cb->cb_json) { if (pl->pl_prop == ZPOOL_PROP_NAME) continue; (void) zprop_nvlist_one_property( zpool_prop_to_name(pl->pl_prop), propstr, sourcetype, NULL, NULL, props, cb->cb_json_as_int); } else { /* * If this is being called in scripted mode, or if this * is the last column and it is left-justified, don't * include a width format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) fputs(propstr, stdout); else if (right_justify) (void) printf("%*s", (int)width, propstr); else (void) printf("%-*s", (int)width, propstr); } } if (cb->cb_json) { fnvlist_add_nvlist(item, "properties", props); if (cb->cb_json_pool_key_guid) { char pool_guid[256]; uint64_t guid = fnvlist_lookup_uint64( zpool_get_config(zhp, NULL), ZPOOL_CONFIG_POOL_GUID); snprintf(pool_guid, 256, "%llu", (u_longlong_t)guid); fnvlist_add_nvlist(d, pool_guid, item); } else { fnvlist_add_nvlist(d, zpool_get_name(zhp), item); } fnvlist_free(props); fnvlist_free(item); } else (void) fputc('\n', stdout); } static void collect_vdev_prop(zpool_prop_t prop, uint64_t value, const char *str, boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format, boolean_t json, nvlist_t *nvl, boolean_t as_int) { char propval[64]; boolean_t fixed; size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { case ZPOOL_PROP_SIZE: case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: case ZPOOL_PROP_DEDUPCACHED: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else zfs_nicenum_format(value, propval, sizeof (propval), format); break; case ZPOOL_PROP_FRAGMENTATION: if (value == ZFS_FRAG_INVALID) { (void) strlcpy(propval, "-", sizeof (propval)); } else if (format == ZFS_NICENUM_RAW) { (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value); } else { (void) snprintf(propval, sizeof (propval), "%llu%%", (unsigned long long)value); } break; case ZPOOL_PROP_CAPACITY: /* capacity value is in parts-per-10,000 (aka permyriad) */ if (format == ZFS_NICENUM_RAW) (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value / 100); else (void) snprintf(propval, sizeof (propval), value < 1000 ? "%1.2f%%" : value < 10000 ? 
"%2.1f%%" : "%3.0f%%", value / 100.0); break; case ZPOOL_PROP_HEALTH: width = 8; (void) strlcpy(propval, str, sizeof (propval)); break; default: zfs_nicenum_format(value, propval, sizeof (propval), format); } if (!valid) (void) strlcpy(propval, "-", sizeof (propval)); if (json) { zprop_nvlist_one_property(zpool_prop_to_name(prop), propval, ZPROP_SRC_NONE, NULL, NULL, nvl, as_int); } else { if (scripted) (void) printf("\t%s", propval); else (void) printf(" %*s", (int)width, propval); } } /* * print static default line per vdev * not compatible with '-o' option */ static void collect_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth, boolean_t isspare, nvlist_t *item) { nvlist_t **child; vdev_stat_t *vs; uint_t c, children = 0; char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; nvlist_t *props, *ent, *ch, *obj, *l2c, *sp; props = ent = ch = obj = sp = l2c = NULL; const char *dashes = "%-*s - - - - " "- - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (name != NULL) { boolean_t toplevel = (vs->vs_space != 0); uint64_t cap; enum zfs_nicenum_format format; const char *state; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return; if (cb->cb_json) { props = fnvlist_alloc(); ent = fnvlist_alloc(); fill_vdev_info(ent, zhp, (char *)name, B_FALSE, cb->cb_json_as_int); } else { if (scripted) (void) printf("\t%s", name); else if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); } /* * Print the properties for the individual vdevs. Some * properties are only applicable to toplevel vdevs. The * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. */ if (VDEV_STAT_VALID(vs_pspace, c) && vs->vs_pspace) { collect_vdev_prop(ZPOOL_PROP_SIZE, vs->vs_pspace, NULL, scripted, B_TRUE, format, cb->cb_json, props, cb->cb_json_as_int); } else { collect_vdev_prop(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, toplevel, format, cb->cb_json, props, cb->cb_json_as_int); } collect_vdev_prop(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, scripted, toplevel, format, cb->cb_json, props, cb->cb_json_as_int); collect_vdev_prop(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, NULL, scripted, toplevel, format, cb->cb_json, props, cb->cb_json_as_int); collect_vdev_prop(ZPOOL_PROP_CHECKPOINT, vs->vs_checkpoint_space, NULL, scripted, toplevel, format, cb->cb_json, props, cb->cb_json_as_int); collect_vdev_prop(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL, scripted, B_TRUE, format, cb->cb_json, props, cb->cb_json_as_int); collect_vdev_prop(ZPOOL_PROP_FRAGMENTATION, vs->vs_fragmentation, NULL, scripted, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), format, cb->cb_json, props, cb->cb_json_as_int); cap = (vs->vs_space == 0) ? 
0 : (vs->vs_alloc * 10000 / vs->vs_space); collect_vdev_prop(ZPOOL_PROP_CAPACITY, cap, NULL, scripted, toplevel, format, cb->cb_json, props, cb->cb_json_as_int); collect_vdev_prop(ZPOOL_PROP_DEDUPRATIO, 0, NULL, scripted, toplevel, format, cb->cb_json, props, cb->cb_json_as_int); state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { if (vs->vs_aux == VDEV_AUX_SPARED) state = "INUSE"; else if (vs->vs_state == VDEV_STATE_HEALTHY) state = "AVAIL"; } collect_vdev_prop(ZPOOL_PROP_HEALTH, 0, state, scripted, B_TRUE, format, cb->cb_json, props, cb->cb_json_as_int); if (cb->cb_json) { fnvlist_add_nvlist(ent, "properties", props); fnvlist_free(props); } else (void) fputc('\n', stdout); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (cb->cb_json) { fnvlist_add_nvlist(item, name, ent); fnvlist_free(ent); } return; } if (cb->cb_json) { ch = fnvlist_alloc(); } /* list the normal vdevs first */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) continue; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (name == NULL || cb->cb_json != B_TRUE) collect_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE, item); else if (cb->cb_json) { collect_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE, ch); } free(vname); } if (cb->cb_json) { if (!nvlist_empty(ch)) fnvlist_add_nvlist(ent, "vdevs", ch); fnvlist_free(ch); } /* list the classes: 'logs', 'dedup', and 'special' */ for (uint_t n = 0; n < ARRAY_SIZE(class_name); n++) { boolean_t printed = B_FALSE; if (cb->cb_json) obj = fnvlist_alloc(); for (c = 0; c < children; c++) { const char *bias = NULL; const char *type = NULL; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed && !cb->cb_json) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, class_name[n]); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); collect_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE, obj); free(vname); } if (cb->cb_json) { if (!nvlist_empty(obj)) fnvlist_add_nvlist(item, class_name[n], obj); fnvlist_free(obj); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { if (cb->cb_json) { l2c = fnvlist_alloc(); } else { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "cache"); } for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); collect_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE, l2c); free(vname); } if (cb->cb_json) { if (!nvlist_empty(l2c)) fnvlist_add_nvlist(item, "l2cache", l2c); fnvlist_free(l2c); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { if (cb->cb_json) { sp = fnvlist_alloc(); } else { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "spare"); } for (c = 0; c < children; c++) { 
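/* Each hot spare gets its own row (or JSON entry); isspare is B_TRUE here so the health column reports INUSE or AVAIL instead of the raw vdev state. */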
vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); collect_list_stats(zhp, vname, child[c], cb, depth + 2, B_TRUE, sp); free(vname); } if (cb->cb_json) { if (!nvlist_empty(sp)) fnvlist_add_nvlist(item, "spares", sp); fnvlist_free(sp); } } if (name != NULL && cb->cb_json) { fnvlist_add_nvlist(item, name, ent); fnvlist_free(ent); } } /* * Generic callback function to list a pool. */ static int list_callback(zpool_handle_t *zhp, void *data) { nvlist_t *p, *d, *nvdevs; uint64_t guid; char pool_guid[256]; const char *pool_name = zpool_get_name(zhp); list_cbdata_t *cbp = data; p = d = nvdevs = NULL; collect_pool(zhp, cbp); if (cbp->cb_verbose) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (cbp->cb_json) { d = fnvlist_lookup_nvlist(cbp->cb_jsobj, "pools"); if (cbp->cb_json_pool_key_guid) { guid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID); snprintf(pool_guid, 256, "%llu", (u_longlong_t)guid); p = fnvlist_lookup_nvlist(d, pool_guid); } else { p = fnvlist_lookup_nvlist(d, pool_name); } nvdevs = fnvlist_alloc(); } collect_list_stats(zhp, NULL, nvroot, cbp, 0, B_FALSE, nvdevs); if (cbp->cb_json) { fnvlist_add_nvlist(p, "vdevs", nvdevs); if (cbp->cb_json_pool_key_guid) fnvlist_add_nvlist(d, pool_guid, p); else fnvlist_add_nvlist(d, pool_name, p); fnvlist_add_nvlist(cbp->cb_jsobj, "pools", d); fnvlist_free(nvdevs); } } return (0); } /* * Set the minimum pool/vdev name column width. The width must be at least 9, * but may be as large as needed. */ static int get_namewidth_list(zpool_handle_t *zhp, void *data) { list_cbdata_t *cb = data; int width; width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); if (width < 9) width = 9; cb->cb_namewidth = width; return (0); } /* * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -g Display guid for individual vdev name. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -L Follow links when resolving vdev path name. * -o List of properties to display. Defaults to * "name,size,allocated,free,expandsize,fragmentation,capacity," * "dedupratio,health,altroot" * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -T Display a timestamp in date(1) or Unix format * -j Display the output in JSON format * --json-int Display the numbers as integer instead of strings. * --json-pool-key-guid Set pool GUID as key for pool objects. * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. 
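* For example (illustrative), 'zpool list -o name,size,capacity' prints only those three columns for every pool, while 'zpool list -j --json-int tank' emits a JSON document for pool 'tank' with numeric values rather than strings.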
*/ int zpool_do_list(int argc, char **argv) { int c; int ret = 0; list_cbdata_t cb = { 0 }; static char default_props[] = "name,size,allocated,free,checkpoint,expandsize,fragmentation," "capacity,dedupratio,health,altroot"; char *props = default_props; float interval = 0; unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; nvlist_t *data = NULL; current_prop_type = ZFS_TYPE_POOL; struct option long_options[] = { {"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT}, {"json-pool-key-guid", no_argument, NULL, ZPOOL_OPTION_POOL_KEY_GUID}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":gjHLo:pPT:v", long_options, NULL)) != -1) { switch (c) { case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'o': props = optarg; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 'p': cb.cb_literal = B_TRUE; break; case 'j': cb.cb_json = B_TRUE; break; case ZPOOL_OPTION_JSON_NUMS_AS_INT: cb.cb_json_as_int = B_TRUE; cb.cb_literal = B_TRUE; break; case ZPOOL_OPTION_POOL_KEY_GUID: cb.cb_json_pool_key_guid = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': cb.cb_verbose = B_TRUE; cb.cb_namewidth = 8; /* 8 until precalc is avail */ break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (!cb.cb_json && cb.cb_json_as_int) { (void) fprintf(stderr, gettext("'--json-int' only works with" " '-j' option\n")); usage(B_FALSE); } if (!cb.cb_json && cb.cb_json_pool_key_guid) { (void) fprintf(stderr, gettext("'json-pool-key-guid' only" " works with '-j' option\n")); usage(B_FALSE); } get_interval_count(&argc, argv, &interval, &count); if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, ZFS_TYPE_POOL, cb.cb_literal, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) break; if (cb.cb_json) { cb.cb_jsobj = zpool_json_schema(0, 1); data = fnvlist_alloc(); fnvlist_add_nvlist(cb.cb_jsobj, "pools", data); fnvlist_free(data); } cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb); if (timestamp_fmt != NODATE) { if (cb.cb_json) { if (cb.cb_json_as_int) { fnvlist_add_uint64(cb.cb_jsobj, "time", time(NULL)); } else { char ts[128]; get_timestamp(timestamp_fmt, ts, 128); fnvlist_add_string(cb.cb_jsobj, "time", ts); } } else print_timestamp(timestamp_fmt); } if (!cb.cb_scripted && (first || cb.cb_verbose) && !cb.cb_json) { print_header(&cb); first = B_FALSE; } ret = pool_list_iter(list, B_TRUE, list_callback, &cb); if (ret == 0 && cb.cb_json) zcmd_print_json(cb.cb_jsobj); else if (ret != 0 && cb.cb_json) nvlist_free(cb.cb_jsobj); if (interval == 0) break; if (count != 0 && --count == 0) break; pool_list_free(list); (void) fflush(stdout); (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && !cb.cb_json && pool_list_count(list) == 0) { (void) printf(gettext("no pools available\n")); ret = 0; } pool_list_free(list); zprop_free_list(cb.cb_proplist); return (ret); } static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; boolean_t rebuild = B_FALSE; boolean_t wait = B_FALSE; int c; nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; 
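/* props accumulates any -o property=value settings (only ashift is accepted below) and is later handed to make_root_vdev() when constructing the replacement/attach vdev. */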
nvlist_t *props = NULL; char *propval; int ret; /* check options */ while ((c = getopt(argc, argv, "fo:sw")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 's': rebuild = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } old_disk = argv[1]; if (argc < 3) { if (!replacing) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } new_disk = old_disk; argc -= 1; argv += 1; } else { new_disk = argv[2]; argc -= 2; argv += 2; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { nvlist_free(props); return (1); } if (zpool_get_config(zhp, NULL) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); nvlist_free(props); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp); nvlist_free(props); return (1); } ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); if (ret == 0 && wait) { zpool_wait_activity_t activity = ZPOOL_WAIT_RESILVER; char raidz_prefix[] = "raidz"; if (replacing) { activity = ZPOOL_WAIT_REPLACE; } else if (strncmp(old_disk, raidz_prefix, strlen(raidz_prefix)) == 0) { activity = ZPOOL_WAIT_RAIDZ_EXPAND; } ret = zpool_wait(zhp, activity); } nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool replace [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for replacing to complete before returning * * Replace with . */ int zpool_do_replace(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); } /* * zpool attach [-fsw] [-o property=value] | * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for resilvering (mirror) or expansion (raidz) to complete * before returning. * * Attach to a or , where the vdev can be of type * mirror or raidz. If is not part of a mirror, then will * be transformed into a mirror of and . When a mirror * is involved, will begin life with a DTL of [0, now], and will * immediately begin to resilver itself. For the raidz case, a expansion will * commence and reflow the raidz data across all the disks including the * . 
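* For example (illustrative names), 'zpool attach -w tank sda sdb' turns standalone disk sda into a two-way mirror with sdb and waits for the resilver to finish, while attaching a new disk to an existing raidz vdev starts an expansion instead.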
*/ int zpool_do_attach(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); } /* * zpool detach [-f] <pool> <device> * * -f Force detach of <device>, even if DTLs argue against it * (not supported yet) * * Detach a device from a mirror. The operation will be refused if <device> * is the last device in the mirror, or if the DTLs indicate that this device * has the only valid copy of some data. */ int zpool_do_detach(int argc, char **argv) { int c; char *poolname, *path; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing <device> specification\n")); usage(B_FALSE); } poolname = argv[0]; path = argv[1]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_vdev_detach(zhp, path); zpool_close(zhp); return (ret); } /* * zpool split [-gLnP] [-o prop=val] ... * [-o mntopt] ... * [-R altroot] <pool> <newpool> [<device> ...] * * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not split the pool, but display the resulting layout if * it were to be split. * -o Set property=value, or set mount options. * -P Display full path for vdev name. * -R Mount the split-off pool under an alternate root. * -l Load encryption keys while importing. * * Splits the named pool and gives it the new pool name. Devices to be split * off may be listed, provided that no more than one device is specified * per top-level vdev mirror. The newly split pool is left in an exported * state unless -R is specified. * * Restrictions: the top-level of the pool must only be made up of * mirrors; all devices in the pool must be healthy; no device may be * undergoing a resilvering operation.
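* For example (illustrative names), 'zpool split -R /mnt tank tank2' detaches one side of each mirror in 'tank' into a new pool 'tank2' and imports it under the /mnt altroot; without -R the new pool is left exported.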
*/ int zpool_do_split(int argc, char **argv) { char *srcpool, *newpool, *propval; char *mntopts = NULL; splitflags_t flags; int c, ret = 0; int ms_status = 0; boolean_t loadkeys = B_FALSE; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; flags.dryrun = B_FALSE; flags.import = B_FALSE; flags.name_flags = 0; /* check options */ while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) { switch (c) { case 'g': flags.name_flags |= VDEV_NAME_GUID; break; case 'L': flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'R': flags.import = B_TRUE; if (add_prop_list( zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } break; case 'l': loadkeys = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } } else { mntopts = optarg; } break; case 'P': flags.name_flags |= VDEV_NAME_PATH; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); break; } } if (!flags.import && mntopts != NULL) { (void) fprintf(stderr, gettext("setting mntopts is only " "valid when importing the pool\n")); usage(B_FALSE); } if (!flags.import && loadkeys) { (void) fprintf(stderr, gettext("loading keys is only " "valid when importing the pool\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("Missing new pool name\n")); usage(B_FALSE); } srcpool = argv[0]; newpool = argv[1]; argc -= 2; argv += 2; if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) { nvlist_free(props); return (1); } config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); if (config == NULL) { ret = 1; } else { if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); print_vdev_tree(NULL, "dedup", config, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", config, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); } } zpool_close(zhp); if (ret != 0 || flags.dryrun || !flags.import) { nvlist_free(config); nvlist_free(props); return (ret); } /* * The split was successful. Now we need to open the new * pool and import it. */ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) { nvlist_free(config); nvlist_free(props); return (1); } if (loadkeys) { ret = zfs_crypto_attempt_load_keys(g_zfs, newpool); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { ms_status = zpool_enable_datasets(zhp, mntopts, 0, mount_tp_nthr); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Split was successful, " "datasets are mounted but sharing of some datasets " "has failed\n")); } else if (ms_status == EZFS_MOUNTFAILED) { (void) fprintf(stderr, gettext("Split was successful" ", but some datasets could not be mounted\n")); (void) fprintf(stderr, gettext("Try doing '%s' with a " "different altroot\n"), "zpool import"); } } zpool_close(zhp); nvlist_free(config); nvlist_free(props); return (ret); } /* * zpool online [--power] ... 
* * --power: Power on the enclosure slot to the drive (if possible) */ int zpool_do_online(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; vdev_state_t newstate; int flags = 0; boolean_t is_power_on = B_FALSE; struct option long_options[] = { {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, "e", long_options, NULL)) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; case ZPOOL_OPTION_POWER: is_power_on = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (libzfs_envvar_is_set("ZPOOL_AUTO_POWER_ON_SLOT")) is_power_on = B_TRUE; argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { vdev_state_t oldstate; boolean_t avail_spare, l2cache; int rc; if (is_power_on) { rc = zpool_power_on_and_disk_wait(zhp, argv[i]); if (rc == ENOTSUP) { (void) fprintf(stderr, gettext("Power control not supported\n")); } if (rc != 0) return (rc); } nvlist_t *tgt = zpool_find_vdev(zhp, argv[i], &avail_spare, &l2cache, NULL); if (tgt == NULL) { ret = 1; continue; } uint_t vsc; oldstate = ((vdev_stat_t *)fnvlist_lookup_uint64_array(tgt, ZPOOL_CONFIG_VDEV_STATS, &vsc))->vs_state; if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), argv[i]); if (newstate == VDEV_STATE_FAULTED) (void) printf(gettext("use 'zpool " "clear' to restore a faulted " "device\n")); else (void) printf(gettext("use 'zpool " "replace' to replace devices " "that are no longer present\n")); if ((flags & ZFS_ONLINE_EXPAND)) { (void) printf(gettext("%s: failed " "to expand usable space on " "unhealthy device '%s'\n"), (oldstate >= VDEV_STATE_DEGRADED ? "error" : "warning"), argv[i]); if (oldstate >= VDEV_STATE_DEGRADED) { ret = 1; break; } } } } else { ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool offline [-ft]|[--power] ... * * * -f Force the device into a faulted state. * * -t Only take the device off-line temporarily. The offline/faulted * state will not be persistent across reboots. 
* * --power Power off the enclosure slot to the drive (if possible) */ int zpool_do_offline(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; boolean_t istmp = B_FALSE; boolean_t fault = B_FALSE; boolean_t is_power_off = B_FALSE; struct option long_options[] = { {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, "ft", long_options, NULL)) != -1) { switch (c) { case 'f': fault = B_TRUE; break; case 't': istmp = B_TRUE; break; case ZPOOL_OPTION_POWER: is_power_off = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (is_power_off && fault) { (void) fprintf(stderr, gettext("-0 and -f cannot be used together\n")); usage(B_FALSE); return (1); } if (is_power_off && istmp) { (void) fprintf(stderr, gettext("-0 and -t cannot be used together\n")); usage(B_FALSE); return (1); } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); if (is_power_off) { /* * Note: we have to power off first, then set REMOVED, * or else zpool_vdev_set_removed_state() returns * EAGAIN. */ ret = zpool_power_off(zhp, argv[i]); if (ret != 0) { (void) fprintf(stderr, "%s %s %d\n", gettext("unable to power off slot for"), argv[i], ret); } zpool_vdev_set_removed_state(zhp, guid, VDEV_AUX_NONE); } else if (fault) { vdev_aux_t aux; if (istmp == B_FALSE) { /* Force the fault to persist across imports */ aux = VDEV_AUX_EXTERNAL_PERSIST; } else { aux = VDEV_AUX_EXTERNAL; } if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0) ret = 1; } else { if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool clear [-nF]|[--power] [device] * * Clear all errors associated with a pool or a particular device. 
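* For example (illustrative names), 'zpool clear tank' clears the error counters on every device in the pool, while 'zpool clear tank sda' clears only the one named device.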
*/ int zpool_do_clear(int argc, char **argv) { int c; int ret = 0; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t is_power_on = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; struct option long_options[] = { {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, "FnX", long_options, NULL)) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'X': xtreme_rewind = B_TRUE; break; case ZPOOL_OPTION_POWER: is_power_on = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (libzfs_envvar_is_set("ZPOOL_AUTO_POWER_ON_SLOT")) is_power_on = B_TRUE; argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In future, further rewind policy choices can be passed along here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) { return (1); } pool = argv[0]; device = argc == 2 ? argv[1] : NULL; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { nvlist_free(policy); return (1); } if (is_power_on) { if (device == NULL) { zpool_power_on_pool_and_wait_for_devices(zhp); } else { zpool_power_on_and_disk_wait(zhp, device); } } if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); nvlist_free(policy); return (ret); } /* * zpool reguid [-g ] */ int zpool_do_reguid(int argc, char **argv) { uint64_t guid; uint64_t *guidp = NULL; int c; char *endptr; char *poolname; zpool_handle_t *zhp; int ret = 0; /* check options */ while ((c = getopt(argc, argv, "g:")) != -1) { switch (c) { case 'g': errno = 0; guid = strtoull(optarg, &endptr, 10); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("invalid GUID: %s\n"), optarg); usage(B_FALSE); } guidp = &guid; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_set_guid(zhp, guidp); zpool_close(zhp); return (ret); } /* * zpool reopen * * Reopen the pool so that the kernel can update the sizes of all vdevs. 
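* For example (illustrative name), 'zpool reopen tank' makes the kernel re-probe device sizes after the backing devices have grown; '-n' (handled below) additionally prevents an in-progress scrub from being restarted.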
*/ int zpool_do_reopen(int argc, char **argv) { int c; int ret = 0; boolean_t scrub_restart = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "n")) != -1) { switch (c) { case 'n': scrub_restart = B_FALSE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_reopen_one on all pools */ ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_reopen_one, &scrub_restart); return (ret); } typedef struct scrub_cbdata { int cb_type; pool_scrub_cmd_t cb_scrub_cmd; } scrub_cbdata_t; static boolean_t zpool_has_checkpoint(zpool_handle_t *zhp) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); if (config != NULL) { pool_checkpoint_stat_t *pcs = NULL; uint_t c; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs == NULL || pcs->pcs_state == CS_NONE) return (B_FALSE); assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS || pcs->pcs_state == CS_CHECKPOINT_DISCARDING); return (B_TRUE); } return (B_FALSE); } static int scrub_callback(zpool_handle_t *zhp, void *data) { scrub_cbdata_t *cb = data; int err; /* * Ignore faulted pools. */ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { (void) fprintf(stderr, gettext("cannot scan '%s': pool is " "currently unavailable\n"), zpool_get_name(zhp)); return (1); } err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); if (err == 0 && zpool_has_checkpoint(zhp) && cb->cb_type == POOL_SCAN_SCRUB) { (void) printf(gettext("warning: will not scrub state that " "belongs to the checkpoint of pool '%s'\n"), zpool_get_name(zhp)); } return (err != 0); } static int wait_callback(zpool_handle_t *zhp, void *data) { zpool_wait_activity_t *act = data; return (zpool_wait(zhp, *act)); } /* * zpool scrub [-s | -p] [-w] [-e] ... * * -e Only scrub blocks in the error log. * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. * -w Wait. Blocks until scrub has completed. 
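* For example (illustrative name), 'zpool scrub -w tank' starts a scrub and blocks until it completes, 'zpool scrub -p tank' pauses a running scrub, and 'zpool scrub -s tank' stops it.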
*/ int zpool_do_scrub(int argc, char **argv) { int c; scrub_cbdata_t cb; boolean_t wait = B_FALSE; int error; cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; boolean_t is_error_scrub = B_FALSE; boolean_t is_pause = B_FALSE; boolean_t is_stop = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "spwe")) != -1) { switch (c) { case 'e': is_error_scrub = B_TRUE; break; case 's': is_stop = B_TRUE; break; case 'p': is_pause = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (is_pause && is_stop) { (void) fprintf(stderr, gettext("invalid option " "combination :-s and -p are mutually exclusive\n")); usage(B_FALSE); } else { if (is_error_scrub) cb.cb_type = POOL_SCAN_ERRORSCRUB; if (is_pause) { cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; } else if (is_stop) { cb.cb_type = POOL_SCAN_NONE; } else { cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; } } if (wait && (cb.cb_type == POOL_SCAN_NONE || cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) { (void) fprintf(stderr, gettext("invalid option combination: " "-w cannot be used with -p or -s\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } error = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, scrub_callback, &cb); if (wait && !error) { zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB; error = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, wait_callback, &act); } return (error); } /* * zpool resilver ... * * Restarts any in-progress resilver */ int zpool_do_resilver(int argc, char **argv) { int c; scrub_cbdata_t cb; cb.cb_type = POOL_SCAN_RESILVER; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, scrub_callback, &cb)); } /* * zpool trim [-d] [-r ] [-c | -s] [ ...] * * -c Cancel. Ends any in-progress trim. * -d Secure trim. Requires kernel and device support. * -r Sets the TRIM rate in bytes (per second). Supports * adding a multiplier suffix such as 'k' or 'm'. * -s Suspend. TRIM can then be restarted with no flags. * -w Wait. Blocks until trimming has completed. 
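* For example (illustrative names), 'zpool trim tank' begins trimming every leaf vdev in the pool, 'zpool trim -r 500M tank sda' trims only sda at roughly 500MB/s, and 'zpool trim -s tank' suspends the operation so it can be restarted later.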
*/ int zpool_do_trim(int argc, char **argv) { struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"secure", no_argument, NULL, 'd'}, {"rate", required_argument, NULL, 'r'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_trim_func_t cmd_type = POOL_TRIM_START; uint64_t rate = 0; boolean_t secure = B_FALSE; boolean_t wait = B_FALSE; int c; while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_CANCEL; break; case 'd': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-d cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } secure = B_TRUE; break; case 'r': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-r cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } if (zfs_nicestrtonum(g_zfs, optarg, &rate) == -1) { (void) fprintf(stderr, "%s: %s\n", gettext("invalid value for rate"), libzfs_error_description(g_zfs)); usage(B_FALSE); } break; case 's': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_SUSPEND; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_TRIM_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " "-s\n")); usage(B_FALSE); } char *poolname = argv[0]; zpool_handle_t *zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); trimflags_t trim_flags = { .secure = secure, .rate = rate, .wait = wait, }; nvlist_t *vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); trim_flags.fullpool = B_TRUE; } else { trim_flags.fullpool = B_FALSE; for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags); fnvlist_free(vdevs); zpool_close(zhp); return (error); } /* * Converts a total number of seconds to a human readable string broken * down in to days/hours/minutes/seconds. */ static void secs_to_dhms(uint64_t total, char *buf) { uint64_t days = total / 60 / 60 / 24; uint64_t hours = (total / 60 / 60) % 24; uint64_t mins = (total / 60) % 60; uint64_t secs = (total % 60); if (days > 0) { (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", (u_longlong_t)days, (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } else { (void) sprintf(buf, "%02llu:%02llu:%02llu", (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } } /* * Print out detailed error scrub status. 
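* A finished run is reported as, e.g., "scrubbed 130 error blocks in 0 days 00:02:12 on <date>" (illustrative values; see the format strings below).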
*/ static void print_err_scrub_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t total_secs_left; uint64_t secs_left, mins_left, hours_left, days_left; uint64_t examined, to_be_examined; if (ps == NULL || ps->pss_error_scrub_func != POOL_SCAN_ERRORSCRUB) { return; } (void) printf(gettext(" scrub: ")); start = ps->pss_error_scrub_start; end = ps->pss_error_scrub_end; pause = ps->pss_pass_error_scrub_pause; examined = ps->pss_error_scrub_examined; to_be_examined = ps->pss_error_scrub_to_be_examined; assert(ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB); if (ps->pss_error_scrub_state == DSS_FINISHED) { total_secs_left = end - start; days_left = total_secs_left / 60 / 60 / 24; hours_left = (total_secs_left / 60 / 60) % 24; mins_left = (total_secs_left / 60) % 60; secs_left = (total_secs_left % 60); (void) printf(gettext("scrubbed %llu error blocks in %llu days " "%02llu:%02llu:%02llu on %s"), (u_longlong_t)examined, (u_longlong_t)days_left, (u_longlong_t)hours_left, (u_longlong_t)mins_left, (u_longlong_t)secs_left, ctime(&end)); return; } else if (ps->pss_error_scrub_state == DSS_CANCELED) { (void) printf(gettext("error scrub canceled on %s"), ctime(&end)); return; } assert(ps->pss_error_scrub_state == DSS_ERRORSCRUBBING); /* Error scrub is in progress. */ if (pause == 0) { (void) printf(gettext("error scrub in progress since %s"), ctime(&start)); } else { (void) printf(gettext("error scrub paused since %s"), ctime(&pause)); (void) printf(gettext("\terror scrub started on %s"), ctime(&start)); } double fraction_done = (double)examined / (to_be_examined + examined); (void) printf(gettext("\t%.2f%% done, issued I/O for %llu error" " blocks"), 100 * fraction_done, (u_longlong_t)examined); (void) printf("\n"); } /* * Print out detailed scrub status. */ static void print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t pass_scanned, scanned, pass_issued, issued, total_s, total_i; uint64_t elapsed, scan_rate, issue_rate; double fraction_done; char processed_buf[7], scanned_buf[7], issued_buf[7], total_s_buf[7]; char total_i_buf[7], srate_buf[7], irate_buf[7], time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } start = ps->pss_start_time; end = ps->pss_end_time; pause = ps->pss_pass_scrub_pause; zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); int is_resilver = ps->pss_func == POOL_SCAN_RESILVER; int is_scrub = ps->pss_func == POOL_SCAN_SCRUB; assert(is_resilver || is_scrub); /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { secs_to_dhms(end - start, time_buf); if (is_scrub) { (void) printf(gettext("scrub repaired %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } else if (is_resilver) { (void) printf(gettext("resilvered %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { if (is_scrub) { (void) printf(gettext("scrub canceled on %s"), ctime(&end)); } else if (is_resilver) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); } return; } assert(ps->pss_state == DSS_SCANNING); /* Scan is in progress. Resilvers can't be paused. 
*/ if (is_scrub) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else { (void) printf(gettext("scrub paused since %s"), ctime(&pause)); (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } } else if (is_resilver) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } scanned = ps->pss_examined; pass_scanned = ps->pss_pass_exam; issued = ps->pss_issued; pass_issued = ps->pss_pass_issued; total_s = ps->pss_to_examine; total_i = ps->pss_to_examine - ps->pss_skipped; /* we are only done with a block once we have issued the IO for it */ fraction_done = (double)issued / total_i; /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; elapsed -= ps->pss_pass_scrub_spent_paused; elapsed = (elapsed != 0) ? elapsed : 1; scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); zfs_nicebytes(total_s, total_s_buf, sizeof (total_s_buf)); zfs_nicebytes(total_i, total_i_buf, sizeof (total_i_buf)); /* do not print estimated time if we have a paused scrub */ (void) printf(gettext("\t%s / %s scanned"), scanned_buf, total_s_buf); if (pause == 0 && scan_rate > 0) { zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); (void) printf(gettext(" at %s/s"), srate_buf); } (void) printf(gettext(", %s / %s issued"), issued_buf, total_i_buf); if (pause == 0 && issue_rate > 0) { zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); (void) printf(gettext(" at %s/s"), irate_buf); } (void) printf(gettext("\n")); if (is_resilver) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); } else if (is_scrub) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } if (pause == 0) { /* * Only provide an estimate iff: * 1) we haven't yet issued all we expected, and * 2) the issue rate exceeds 10 MB/s, and * 3) it's either: * a) a resilver which has started repairs, or * b) a scrub which has entered the issue phase. 
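* The estimate itself is just (total_i - issued) / issue_rate seconds; as an illustration, 800 GiB left to issue at 200 MiB/s is 4096 seconds, which secs_to_dhms() renders as 01:08:16.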
*/ if (total_i >= issued && issue_rate >= 10 * 1024 * 1024 && ((is_resilver && ps->pss_processed > 0) || (is_scrub && issued > 0))) { secs_to_dhms((total_i - issued) / issue_rate, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } static void print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, uint_t c, char *vdev_name) { if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) return; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); uint64_t bytes_scanned = vrs->vrs_bytes_scanned; uint64_t bytes_issued = vrs->vrs_bytes_issued; uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; uint64_t bytes_est_s = vrs->vrs_bytes_est; uint64_t bytes_est_i = vrs->vrs_bytes_est; if (c > offsetof(vdev_rebuild_stat_t, vrs_pass_bytes_skipped) / 8) bytes_est_i -= vrs->vrs_pass_bytes_skipped; uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / (vrs->vrs_pass_time_ms + 1)) * 1000; uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / (vrs->vrs_pass_time_ms + 1)) * 1000; double scan_pct = MIN((double)bytes_scanned * 100 / (bytes_est_s + 1), 100); /* Format all of the numbers we will be reporting */ char bytes_scanned_buf[7], bytes_issued_buf[7]; char bytes_rebuilt_buf[7], bytes_est_s_buf[7], bytes_est_i_buf[7]; char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; zfs_nicebytes(bytes_scanned, bytes_scanned_buf, sizeof (bytes_scanned_buf)); zfs_nicebytes(bytes_issued, bytes_issued_buf, sizeof (bytes_issued_buf)); zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, sizeof (bytes_rebuilt_buf)); zfs_nicebytes(bytes_est_s, bytes_est_s_buf, sizeof (bytes_est_s_buf)); zfs_nicebytes(bytes_est_i, bytes_est_i_buf, sizeof (bytes_est_i_buf)); time_t start = vrs->vrs_start_time; time_t end = vrs->vrs_end_time; /* Rebuild is finished or canceled. */ if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); (void) printf(gettext("resilvered (%s) %s in %s " "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { (void) printf(gettext("resilver (%s) canceled on %s"), vdev_name, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { (void) printf(gettext("resilver (%s) in progress since %s"), vdev_name, ctime(&start)); } assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); (void) printf(gettext("\t%s / %s scanned"), bytes_scanned_buf, bytes_est_s_buf); if (scan_rate > 0) { zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); (void) printf(gettext(" at %s/s"), scan_rate_buf); } (void) printf(gettext(", %s / %s issued"), bytes_issued_buf, bytes_est_i_buf); if (issue_rate > 0) { zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); (void) printf(gettext(" at %s/s"), issue_rate_buf); } (void) printf(gettext("\n")); (void) printf(gettext("\t%s resilvered, %.2f%% done"), bytes_rebuilt_buf, scan_pct); if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { if (bytes_est_s >= bytes_scanned && scan_rate >= 10 * 1024 * 1024) { secs_to_dhms((bytes_est_s - bytes_scanned) / scan_rate, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } /* * Print rebuild status for top-level vdevs. 
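* Rebuild (sequential resilver) progress is tracked per top-level vdev in ZPOOL_CONFIG_REBUILD_STATS, so a separate status block is printed for every child that carries those stats.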
*/ static void print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { char *name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); print_rebuild_status_impl(vrs, i, name); free(name); } } } /* * As we don't scrub checkpointed blocks, we want to warn the user that we * skipped scanning some blocks if a checkpoint exists or existed at any * time during the scan. If a sequential instead of healing reconstruction * was performed then the blocks were reconstructed. However, their checksums * have not been verified so we still print the warning. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) { if (ps == NULL || pcs == NULL) return; if (pcs->pcs_state == CS_NONE || pcs->pcs_state == CS_CHECKPOINT_DISCARDING) return; assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); if (ps->pss_state == DSS_NONE) return; if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && ps->pss_end_time < pcs->pcs_start_time) return; if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { (void) printf(gettext(" scan warning: skipped blocks " "that are only referenced by the checkpoint.\n")); } else { assert(ps->pss_state == DSS_SCANNING); (void) printf(gettext(" scan warning: skipping blocks " "that are only referenced by the checkpoint.\n")); } } /* * Returns B_TRUE if there is an active rebuild in progress. Otherwise, * B_FALSE is returned and 'rebuild_end_time' is set to the end time for * the last completed (or cancelled) rebuild. 
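* A returned end time of 0 means that no completed rebuild was found or that a rebuild is still active.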
*/ static boolean_t check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) { nvlist_t **child; uint_t children; boolean_t rebuilding = B_FALSE; uint64_t end_time = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { if (vrs->vrs_end_time > end_time) end_time = vrs->vrs_end_time; if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { rebuilding = B_TRUE; end_time = 0; break; } } } if (rebuild_end_time != NULL) *rebuild_end_time = end_time; return (rebuilding); } static void vdev_stats_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, int depth, boolean_t isspare, char *parent, nvlist_t *item) { nvlist_t *vds, **child, *ch = NULL; uint_t vsc, children; vdev_stat_t *vs; char *vname; uint64_t notpresent; const char *type, *path; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) return; if (cb->cb_print_unhealthy && depth > 0 && for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) { return; } vname = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags | VDEV_NAME_TYPE_ID); vds = fnvlist_alloc(); fill_vdev_info(vds, zhp, vname, B_FALSE, cb->cb_json_as_int); if (cb->cb_flat_vdevs && parent != NULL) { fnvlist_add_string(vds, "parent", parent); } if (isspare) { if (vs->vs_aux == VDEV_AUX_SPARED) { fnvlist_add_string(vds, "state", "INUSE"); used_by_other(zhp, nv, vds); } else if (vs->vs_state == VDEV_STATE_HEALTHY) fnvlist_add_string(vds, "state", "AVAIL"); } else { if (vs->vs_alloc) { nice_num_str_nvlist(vds, "alloc_space", vs->vs_alloc, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); } if (vs->vs_space) { nice_num_str_nvlist(vds, "total_space", vs->vs_space, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); } if (vs->vs_dspace) { nice_num_str_nvlist(vds, "def_space", vs->vs_dspace, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); } if (vs->vs_rsize) { nice_num_str_nvlist(vds, "rep_dev_size", vs->vs_rsize, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); } if (vs->vs_esize) { nice_num_str_nvlist(vds, "ex_dev_size", vs->vs_esize, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); } if (vs->vs_self_healed) { nice_num_str_nvlist(vds, "self_healed", vs->vs_self_healed, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); } if (vs->vs_pspace) { nice_num_str_nvlist(vds, "phys_space", vs->vs_pspace, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); } nice_num_str_nvlist(vds, "read_errors", vs->vs_read_errors, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(vds, "write_errors", vs->vs_write_errors, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(vds, "checksum_errors", vs->vs_checksum_errors, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); if (vs->vs_scan_processed) { nice_num_str_nvlist(vds, "scan_processed", vs->vs_scan_processed, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); } if (vs->vs_checkpoint_space) { nice_num_str_nvlist(vds, "checkpoint_space", vs->vs_checkpoint_space, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); } if (vs->vs_resilver_deferred) { nice_num_str_nvlist(vds, "resilver_deferred", 
vs->vs_resilver_deferred, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); } if (children == 0) { nice_num_str_nvlist(vds, "slow_ios", vs->vs_slow_ios, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); } if (cb->cb_print_power) { if (children == 0) { /* Only leaf vdevs have physical slots */ switch (zpool_power_current_state(zhp, (char *) fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH))) { case 0: fnvlist_add_string(vds, "power_state", "off"); break; case 1: fnvlist_add_string(vds, "power_state", "on"); break; default: fnvlist_add_string(vds, "power_state", "-"); } } else { fnvlist_add_string(vds, "power_state", "-"); } } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &notpresent) == 0) { nice_num_str_nvlist(vds, ZPOOL_CONFIG_NOT_PRESENT, 1, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_BYTES); fnvlist_add_string(vds, "was", fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH)); } else if (vs->vs_aux != VDEV_AUX_NONE) { fnvlist_add_string(vds, "aux", vdev_aux_str[vs->vs_aux]); } else if (children == 0 && !isspare && getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") == NULL && VDEV_STAT_VALID(vs_physical_ashift, vsc) && vs->vs_configured_ashift < vs->vs_physical_ashift) { nice_num_str_nvlist(vds, "configured_ashift", vs->vs_configured_ashift, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(vds, "physical_ashift", vs->vs_physical_ashift, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); } if (vs->vs_scan_removing != 0) { nice_num_str_nvlist(vds, "removing", vs->vs_scan_removing, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); } else if (VDEV_STAT_VALID(vs_noalloc, vsc) && vs->vs_noalloc != 0) { nice_num_str_nvlist(vds, "noalloc", vs->vs_noalloc, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); } if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { zpool_nvlist_cmd(cb->vcdl, zpool_get_name(zhp), path, vds); } } if (children == 0) { if (cb->cb_print_vdev_init) { if (vs->vs_initialize_state != 0) { uint64_t st = vs->vs_initialize_state; fnvlist_add_string(vds, "init_state", vdev_init_state_str[st]); nice_num_str_nvlist(vds, "initialized", vs->vs_initialize_bytes_done, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(vds, "to_initialize", vs->vs_initialize_bytes_est, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(vds, "init_time", vs->vs_initialize_action_time, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(vds, "init_errors", vs->vs_initialize_errors, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); } else { fnvlist_add_string(vds, "init_state", "UNINITIALIZED"); } } if (cb->cb_print_vdev_trim) { if (vs->vs_trim_notsup == 0) { if (vs->vs_trim_state != 0) { uint64_t st = vs->vs_trim_state; fnvlist_add_string(vds, "trim_state", vdev_trim_state_str[st]); nice_num_str_nvlist(vds, "trimmed", vs->vs_trim_bytes_done, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(vds, "to_trim", vs->vs_trim_bytes_est, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(vds, "trim_time", vs->vs_trim_action_time, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(vds, "trim_errors", vs->vs_trim_errors, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); } else fnvlist_add_string(vds, "trim_state", "UNTRIMMED"); } nice_num_str_nvlist(vds, "trim_notsup", vs->vs_trim_notsup, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); } } else { ch = fnvlist_alloc(); } if (cb->cb_flat_vdevs && children == 0) {
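		/* In flat mode, leaf vdevs are added directly to the caller's list. */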
fnvlist_add_nvlist(item, vname, vds); } for (int c = 0; c < children; c++) { uint64_t islog = B_FALSE, ishole = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole); if (islog || ishole) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; if (cb->cb_flat_vdevs) { vdev_stats_nvlist(zhp, cb, child[c], depth + 2, isspare, vname, item); } vdev_stats_nvlist(zhp, cb, child[c], depth + 2, isspare, vname, ch); } if (ch != NULL) { if (!nvlist_empty(ch)) fnvlist_add_nvlist(vds, "vdevs", ch); fnvlist_free(ch); } fnvlist_add_nvlist(item, vname, vds); fnvlist_free(vds); free(vname); } static void class_vdevs_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, const char *class, nvlist_t *item) { uint_t c, children; nvlist_t **child; nvlist_t *class_obj = NULL; if (!cb->cb_flat_vdevs) class_obj = fnvlist_alloc(); assert(zhp != NULL || !cb->cb_verbose); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; const char *bias = NULL; const char *type = NULL; char *name = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) { bias = (char *)VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class) != 0) continue; if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (cb->cb_flat_vdevs) { vdev_stats_nvlist(zhp, cb, child[c], 2, B_FALSE, NULL, item); } else { vdev_stats_nvlist(zhp, cb, child[c], 2, B_FALSE, NULL, class_obj); } free(name); } if (!cb->cb_flat_vdevs) { if (!nvlist_empty(class_obj)) fnvlist_add_nvlist(item, class, class_obj); fnvlist_free(class_obj); } } static void l2cache_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, nvlist_t *item) { nvlist_t *l2c = NULL, **l2cache; uint_t nl2cache; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (nl2cache == 0) return; if (!cb->cb_flat_vdevs) l2c = fnvlist_alloc(); for (int i = 0; i < nl2cache; i++) { if (cb->cb_flat_vdevs) { vdev_stats_nvlist(zhp, cb, l2cache[i], 2, B_FALSE, NULL, item); } else { vdev_stats_nvlist(zhp, cb, l2cache[i], 2, B_FALSE, NULL, l2c); } } } if (!cb->cb_flat_vdevs) { if (!nvlist_empty(l2c)) fnvlist_add_nvlist(item, "l2cache", l2c); fnvlist_free(l2c); } } static void spares_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, nvlist_t *item) { nvlist_t *sp = NULL, **spares; uint_t nspares; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (nspares == 0) return; if (!cb->cb_flat_vdevs) sp = fnvlist_alloc(); for (int i = 0; i < nspares; i++) { if (cb->cb_flat_vdevs) { vdev_stats_nvlist(zhp, cb, spares[i], 2, B_TRUE, NULL, item); } else { vdev_stats_nvlist(zhp, cb, spares[i], 2, B_TRUE, NULL, sp); } } } if (!cb->cb_flat_vdevs) { if (!nvlist_empty(sp)) fnvlist_add_nvlist(item, "spares", sp); fnvlist_free(sp); } } static void errors_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *item) { uint64_t nerr; nvlist_t *config = zpool_get_config(zhp, NULL); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { nice_num_str_nvlist(item, ZPOOL_CONFIG_ERRCOUNT, nerr, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); if (nerr 
!= 0 && cb->cb_verbose) { nvlist_t *nverrlist = NULL; if (zpool_get_errlog(zhp, &nverrlist) == 0) { int i = 0; int count = 0; size_t len = MAXPATHLEN * 2; nvpair_t *elem = NULL; for (nvpair_t *pair = nvlist_next_nvpair(nverrlist, NULL); pair != NULL; pair = nvlist_next_nvpair(nverrlist, pair)) count++; char **errl = (char **)malloc( count * sizeof (char *)); while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nvlist_t *nv; uint64_t dsobj, obj; verify(nvpair_value_nvlist(elem, &nv) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, &dsobj) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); errl[i] = safe_malloc(len); zpool_obj_to_path(zhp, dsobj, obj, errl[i++], len); } nvlist_free(nverrlist); fnvlist_add_string_array(item, "errlist", (const char **)errl, count); for (int i = 0; i < count; ++i) free(errl[i]); free(errl); } else fnvlist_add_string(item, "errlist", strerror(errno)); } } } static void ddt_stats_nvlist(ddt_stat_t *dds, status_cbdata_t *cb, nvlist_t *item) { nice_num_str_nvlist(item, "blocks", dds->dds_blocks, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(item, "logical_size", dds->dds_lsize, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(item, "physical_size", dds->dds_psize, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(item, "deflated_size", dds->dds_dsize, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(item, "ref_blocks", dds->dds_ref_blocks, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(item, "ref_lsize", dds->dds_ref_lsize, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(item, "ref_psize", dds->dds_ref_psize, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(item, "ref_dsize", dds->dds_ref_dsize, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); } static void dedup_stats_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *item) { nvlist_t *config; if (cb->cb_dedup_stats) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; nvlist_t *ddt_stat, *ddt_obj, *dedup; uint_t c; uint64_t cspace_prop; config = zpool_get_config(zhp, NULL); if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t **)&ddo, &c) != 0) return; dedup = fnvlist_alloc(); ddt_obj = fnvlist_alloc(); nice_num_str_nvlist(dedup, "obj_count", ddo->ddo_count, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); if (ddo->ddo_count == 0) { fnvlist_add_nvlist(dedup, ZPOOL_CONFIG_DDT_OBJ_STATS, ddt_obj); fnvlist_add_nvlist(item, "dedup_stats", dedup); fnvlist_free(ddt_obj); fnvlist_free(dedup); return; } else { nice_num_str_nvlist(dedup, "dspace", ddo->ddo_dspace, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(dedup, "mspace", ddo->ddo_mspace, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); /* * Squash cached size into in-core size to handle race. * Only include cached size if it is available. 
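		 * The cached size comes from the ZPOOL_PROP_DEDUPCACHED
		 * property, read separately from the object stats, so clamp
		 * it to the in-core size.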
*/ cspace_prop = zpool_get_prop_int(zhp, ZPOOL_PROP_DEDUPCACHED, NULL); cspace_prop = MIN(cspace_prop, ddo->ddo_mspace); nice_num_str_nvlist(dedup, "cspace", cspace_prop, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); } ddt_stat = fnvlist_alloc(); if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0) { nvlist_t *total = fnvlist_alloc(); if (dds->dds_blocks == 0) fnvlist_add_string(total, "blocks", "0"); else ddt_stats_nvlist(dds, cb, total); fnvlist_add_nvlist(ddt_stat, "total", total); fnvlist_free(total); } if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t **)&ddh, &c) == 0) { nvlist_t *hist = fnvlist_alloc(); nvlist_t *entry = NULL; char buf[16]; for (int h = 0; h < 64; h++) { if (ddh->ddh_stat[h].dds_blocks != 0) { entry = fnvlist_alloc(); ddt_stats_nvlist(&ddh->ddh_stat[h], cb, entry); snprintf(buf, 16, "%d", h); fnvlist_add_nvlist(hist, buf, entry); fnvlist_free(entry); } } if (!nvlist_empty(hist)) fnvlist_add_nvlist(ddt_stat, "histogram", hist); fnvlist_free(hist); } if (!nvlist_empty(ddt_obj)) { fnvlist_add_nvlist(dedup, ZPOOL_CONFIG_DDT_OBJ_STATS, ddt_obj); } fnvlist_free(ddt_obj); if (!nvlist_empty(ddt_stat)) { fnvlist_add_nvlist(dedup, ZPOOL_CONFIG_DDT_STATS, ddt_stat); } fnvlist_free(ddt_stat); if (!nvlist_empty(dedup)) fnvlist_add_nvlist(item, "dedup_stats", dedup); fnvlist_free(dedup); } } static void raidz_expand_status_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nvroot, nvlist_t *item) { uint_t c; pool_raidz_expand_stat_t *pres = NULL; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c) == 0) { nvlist_t **child; uint_t children; nvlist_t *nv = fnvlist_alloc(); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(pres->pres_expanding_vdev < children); char *name = zpool_vdev_name(g_zfs, zhp, child[pres->pres_expanding_vdev], 0); fill_vdev_info(nv, zhp, name, B_FALSE, cb->cb_json_as_int); fnvlist_add_string(nv, "state", pool_scan_state_str[pres->pres_state]); nice_num_str_nvlist(nv, "expanding_vdev", pres->pres_expanding_vdev, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(nv, "start_time", pres->pres_start_time, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(nv, "end_time", pres->pres_end_time, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(nv, "to_reflow", pres->pres_to_reflow, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(nv, "reflowed", pres->pres_reflowed, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(nv, "waiting_for_resilver", pres->pres_waiting_for_resilver, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); fnvlist_add_nvlist(item, ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, nv); fnvlist_free(nv); free(name); } } static void checkpoint_status_nvlist(nvlist_t *nvroot, status_cbdata_t *cb, nvlist_t *item) { uint_t c; pool_checkpoint_stat_t *pcs = NULL; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c) == 0) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_string(nv, "state", checkpoint_state_str[pcs->pcs_state]); nice_num_str_nvlist(nv, "start_time", pcs->pcs_start_time, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(nv, "space", pcs->pcs_space, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); fnvlist_add_nvlist(item, ZPOOL_CONFIG_CHECKPOINT_STATS, nv); fnvlist_free(nv); } } static void 
removal_status_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nvroot, nvlist_t *item) { uint_t c; pool_removal_stat_t *prs = NULL; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c) == 0) { if (prs->prs_state != DSS_NONE) { nvlist_t **child; uint_t children; verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(prs->prs_removing_vdev < children); char *vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); nvlist_t *nv = fnvlist_alloc(); fill_vdev_info(nv, zhp, vdev_name, B_FALSE, cb->cb_json_as_int); fnvlist_add_string(nv, "state", pool_scan_state_str[prs->prs_state]); nice_num_str_nvlist(nv, "removing_vdev", prs->prs_removing_vdev, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(nv, "start_time", prs->prs_start_time, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(nv, "end_time", prs->prs_end_time, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(nv, "to_copy", prs->prs_to_copy, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(nv, "copied", prs->prs_copied, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(nv, "mapping_memory", prs->prs_mapping_memory, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); fnvlist_add_nvlist(item, ZPOOL_CONFIG_REMOVAL_STATS, nv); fnvlist_free(nv); free(vdev_name); } } } static void scan_status_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nvroot, nvlist_t *item) { pool_scan_stat_t *ps = NULL; uint_t c; nvlist_t *scan = fnvlist_alloc(); nvlist_t **child; uint_t children; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c) == 0) { fnvlist_add_string(scan, "function", pool_scan_func_str[ps->pss_func]); fnvlist_add_string(scan, "state", pool_scan_state_str[ps->pss_state]); nice_num_str_nvlist(scan, "start_time", ps->pss_start_time, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(scan, "end_time", ps->pss_end_time, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(scan, "to_examine", ps->pss_to_examine, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(scan, "examined", ps->pss_examined, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(scan, "skipped", ps->pss_skipped, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(scan, "processed", ps->pss_processed, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(scan, "errors", ps->pss_errors, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(scan, "bytes_per_scan", ps->pss_pass_exam, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(scan, "pass_start", ps->pss_pass_start, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(scan, "scrub_pause", ps->pss_pass_scrub_pause, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(scan, "scrub_spent_paused", ps->pss_pass_scrub_spent_paused, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(scan, "issued_bytes_per_scan", ps->pss_pass_issued, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(scan, "issued", ps->pss_issued, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); if (ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB && ps->pss_error_scrub_start > ps->pss_start_time) { 
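			/*
			 * An error scrub started after the last regular scan;
			 * report its statistics as well.
			 */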
fnvlist_add_string(scan, "err_scrub_func", pool_scan_func_str[ps->pss_error_scrub_func]); fnvlist_add_string(scan, "err_scrub_state", pool_scan_state_str[ps->pss_error_scrub_state]); nice_num_str_nvlist(scan, "err_scrub_start_time", ps->pss_error_scrub_start, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(scan, "err_scrub_end_time", ps->pss_error_scrub_end, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(scan, "err_scrub_examined", ps->pss_error_scrub_examined, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(scan, "err_scrub_to_examine", ps->pss_error_scrub_to_be_examined, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(scan, "err_scrub_pause", ps->pss_pass_error_scrub_pause, B_TRUE, cb->cb_json_as_int, ZFS_NICENUM_1024); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { vdev_rebuild_stat_t *vrs; uint_t i; char *name; nvlist_t *nv; nvlist_t *rebuild = fnvlist_alloc(); uint64_t st; for (uint_t c = 0; c < children; c++) { if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { if (vrs->vrs_state != VDEV_REBUILD_NONE) { nv = fnvlist_alloc(); name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); fill_vdev_info(nv, zhp, name, B_FALSE, cb->cb_json_as_int); st = vrs->vrs_state; fnvlist_add_string(nv, "state", vdev_rebuild_state_str[st]); nice_num_str_nvlist(nv, "start_time", vrs->vrs_start_time, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(nv, "end_time", vrs->vrs_end_time, cb->cb_literal, cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); nice_num_str_nvlist(nv, "scan_time", vrs->vrs_scan_time_ms * 1000000, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_TIME); nice_num_str_nvlist(nv, "scanned", vrs->vrs_bytes_scanned, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(nv, "issued", vrs->vrs_bytes_issued, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(nv, "rebuilt", vrs->vrs_bytes_rebuilt, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(nv, "to_scan", vrs->vrs_bytes_est, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(nv, "errors", vrs->vrs_errors, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); nice_num_str_nvlist(nv, "pass_time", vrs->vrs_pass_time_ms * 1000000, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_TIME); nice_num_str_nvlist(nv, "pass_scanned", vrs->vrs_pass_bytes_scanned, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(nv, "pass_issued", vrs->vrs_pass_bytes_issued, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); nice_num_str_nvlist(nv, "pass_skipped", vrs->vrs_pass_bytes_skipped, cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_BYTES); fnvlist_add_nvlist(rebuild, name, nv); free(name); } } } if (!nvlist_empty(rebuild)) fnvlist_add_nvlist(scan, "rebuild_stats", rebuild); fnvlist_free(rebuild); } if (!nvlist_empty(scan)) fnvlist_add_nvlist(item, ZPOOL_CONFIG_SCAN_STATS, scan); fnvlist_free(scan); } /* * Print the scan status. 
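 * Selects between the scrub / error-scrub output and the resilver / rebuild
 * output, preferring whichever operation is active or finished most recently.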
*/ static void print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) { uint64_t rebuild_end_time = 0, resilver_end_time = 0; boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; boolean_t have_errorscrub = B_FALSE; boolean_t active_resilver = B_FALSE; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; uint_t c; time_t scrub_start = 0, errorscrub_start = 0; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c) == 0) { if (ps->pss_func == POOL_SCAN_RESILVER) { resilver_end_time = ps->pss_end_time; active_resilver = (ps->pss_state == DSS_SCANNING); } have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); scrub_start = ps->pss_start_time; if (c > offsetof(pool_scan_stat_t, pss_pass_error_scrub_pause) / 8) { have_errorscrub = (ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB); errorscrub_start = ps->pss_error_scrub_start; } } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); /* Always print the scrub status when available. */ if (have_scrub && scrub_start > errorscrub_start) print_scan_scrub_resilver_status(ps); else if (have_errorscrub && errorscrub_start >= scrub_start) print_err_scrub_status(ps); /* * When there is an active resilver or rebuild print its status. * Otherwise print the status of the last resilver or rebuild. */ if (active_resilver || (!active_rebuild && have_resilver && resilver_end_time && resilver_end_time > rebuild_end_time)) { print_scan_scrub_resilver_status(ps); } else if (active_rebuild || (!active_resilver && have_rebuild && rebuild_end_time && rebuild_end_time > resilver_end_time)) { print_rebuild_status(zhp, nvroot); } (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_scan_warning(ps, pcs); } /* * Print out detailed removal status. */ static void print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) { char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; time_t start, end; nvlist_t *config, *nvroot; nvlist_t **child; uint_t children; char *vdev_name; if (prs == NULL || prs->prs_state == DSS_NONE) return; /* * Determine name of vdev. */ config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(prs->prs_removing_vdev < children); vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); printf_color(ANSI_BOLD, gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); /* * Removal is finished or canceled. */ if (prs->prs_state == DSS_FINISHED) { uint64_t minutes_taken = (end - start) / 60; (void) printf(gettext("Removal of vdev %llu copied %s " "in %lluh%um, completed on %s"), (longlong_t)prs->prs_removing_vdev, copied_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), ctime((time_t *)&end)); } else if (prs->prs_state == DSS_CANCELED) { (void) printf(gettext("Removal of %s canceled on %s"), vdev_name, ctime(&end)); } else { uint64_t copied, total, elapsed, mins_left, hours_left; double fraction_done; uint_t rate; assert(prs->prs_state == DSS_SCANNING); /* * Removal is in progress. */ (void) printf(gettext( "Evacuation of %s in progress since %s"), vdev_name, ctime(&start)); copied = prs->prs_copied > 0 ? 
prs->prs_copied : 1; total = prs->prs_to_copy; fraction_done = (double)copied / total; /* elapsed time for this pass */ elapsed = time(NULL) - prs->prs_start_time; elapsed = elapsed > 0 ? elapsed : 1; rate = copied / elapsed; rate = rate > 0 ? rate : 1; mins_left = ((total - copied) / rate) / 60; hours_left = mins_left / 60; zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than * 30 days */ (void) printf(gettext( "\t%s copied out of %s at %s/s, %.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); } else { (void) printf(gettext( ", (copy is slow, no estimated time)\n")); } } free(vdev_name); if (prs->prs_mapping_memory > 0) { char mem_buf[7]; zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); (void) printf(gettext( "\t%s memory used for removed device mappings\n"), mem_buf); } } /* * Print out detailed raidz expansion status. */ static void print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres) { char copied_buf[7]; if (pres == NULL || pres->pres_state == DSS_NONE) return; /* * Determine name of vdev. */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); nvlist_t **child; uint_t children; verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(pres->pres_expanding_vdev < children); printf_color(ANSI_BOLD, gettext("expand: ")); time_t start = pres->pres_start_time; time_t end = pres->pres_end_time; char *vname = zpool_vdev_name(g_zfs, zhp, child[pres->pres_expanding_vdev], 0); zfs_nicenum(pres->pres_reflowed, copied_buf, sizeof (copied_buf)); /* * Expansion is finished or canceled. */ if (pres->pres_state == DSS_FINISHED) { char time_buf[32]; secs_to_dhms(end - start, time_buf); (void) printf(gettext("expanded %s-%u copied %s in %s, " "on %s"), vname, (int)pres->pres_expanding_vdev, copied_buf, time_buf, ctime((time_t *)&end)); } else { char examined_buf[7], total_buf[7], rate_buf[7]; uint64_t copied, total, elapsed, secs_left; double fraction_done; uint_t rate; assert(pres->pres_state == DSS_SCANNING); /* * Expansion is in progress. */ (void) printf(gettext( "expansion of %s-%u in progress since %s"), vname, (int)pres->pres_expanding_vdev, ctime(&start)); copied = pres->pres_reflowed > 0 ? pres->pres_reflowed : 1; total = pres->pres_to_reflow; fraction_done = (double)copied / total; /* elapsed time for this pass */ elapsed = time(NULL) - pres->pres_start_time; elapsed = elapsed > 0 ? elapsed : 1; rate = copied / elapsed; rate = rate > 0 ? 
rate : 1; secs_left = (total - copied) / rate; zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than * 30 days */ (void) printf(gettext("\t%s / %s copied at %s/s, %.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (pres->pres_waiting_for_resilver) { (void) printf(gettext(", paused for resilver or " "clear\n")); } else if (secs_left < (30 * 24 * 3600)) { char time_buf[32]; secs_to_dhms(secs_left, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext( ", (copy is slow, no estimated time)\n")); } } free(vname); } static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { time_t start; char space_buf[7]; if (pcs == NULL || pcs->pcs_state == CS_NONE) return; (void) printf(gettext("checkpoint: ")); start = pcs->pcs_start_time; zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { char *date = ctime(&start); /* * ctime() adds a newline at the end of the generated * string, thus the weird format specifier and the * strlen() call used to chop it off from the output. */ (void) printf(gettext("created %.*s, consumes %s\n"), (int)(strlen(date) - 1), date, space_buf); return; } assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); (void) printf(gettext("discarding, %s remaining.\n"), space_buf); } static void print_error_log(zpool_handle_t *zhp) { nvlist_t *nverrlist = NULL; nvpair_t *elem; char *pathname; size_t len = MAXPATHLEN * 2; if (zpool_get_errlog(zhp, &nverrlist) != 0) return; (void) printf("errors: Permanent errors have been " "detected in the following files:\n\n"); pathname = safe_malloc(len); elem = NULL; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nvlist_t *nv; uint64_t dsobj, obj; verify(nvpair_value_nvlist(elem, &nv) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, &dsobj) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); zpool_obj_to_path(zhp, dsobj, obj, pathname, len); (void) printf("%7s %s\n", "", pathname); } free(pathname); nvlist_free(nverrlist); } static void print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, uint_t nspares) { uint_t i; char *name; if (nspares == 0) return; (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); free(name); } } static void print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, uint_t nl2cache) { uint_t i; char *name; if (nl2cache == 0) return; (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE, NULL); free(name); } } static void print_dedup_stats(zpool_handle_t *zhp, nvlist_t *config, boolean_t literal) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; /* Extra space provided for literal display */ char dspace[32], mspace[32], cspace[32]; uint64_t cspace_prop; enum zfs_nicenum_format format; zprop_source_t src; /* * If the pool was faulted then we may not have been able to * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. 
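	 * The DDT object statistics are the minimum we need; without them
	 * there is nothing to report.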
*/ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); (void) printf(gettext(" dedup: ")); if (ddo->ddo_count == 0) { (void) printf(gettext("no DDT entries\n")); return; } /* * Squash cached size into in-core size to handle race. * Only include cached size if it is available. */ cspace_prop = zpool_get_prop_int(zhp, ZPOOL_PROP_DEDUPCACHED, &src); cspace_prop = MIN(cspace_prop, ddo->ddo_mspace); format = literal ? ZFS_NICENUM_RAW : ZFS_NICENUM_1024; zfs_nicenum_format(cspace_prop, cspace, sizeof (cspace), format); zfs_nicenum_format(ddo->ddo_dspace, dspace, sizeof (dspace), format); zfs_nicenum_format(ddo->ddo_mspace, mspace, sizeof (mspace), format); (void) printf("DDT entries %llu, size %s on disk, %s in core", (u_longlong_t)ddo->ddo_count, dspace, mspace); if (src != ZPROP_SRC_DEFAULT) { (void) printf(", %s cached (%.02f%%)", cspace, (double)cspace_prop / (double)ddo->ddo_mspace * 100.0); } (void) printf("\n"); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t **)&ddh, &c) == 0); zpool_dump_ddt(dds, ddh); } #define ST_SIZE 4096 #define AC_SIZE 2048 static void print_status_reason(zpool_handle_t *zhp, status_cbdata_t *cbp, zpool_status_t reason, zpool_errata_t errata, nvlist_t *item) { char status[ST_SIZE]; char action[AC_SIZE]; memset(status, 0, ST_SIZE); memset(action, 0, AC_SIZE); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: snprintf(status, ST_SIZE, gettext("One or more devices could " "not be opened. Sufficient replicas exist for\n\tthe pool " "to continue functioning in a degraded state.\n")); snprintf(action, AC_SIZE, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_MISSING_DEV_NR: snprintf(status, ST_SIZE, gettext("One or more devices could " "not be opened. There are insufficient\n\treplicas for the" " pool to continue functioning.\n")); snprintf(action, AC_SIZE, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: snprintf(status, ST_SIZE, gettext("One or more devices could " "not be used because the label is missing or\n\tinvalid. " "Sufficient replicas exist for the pool to continue\n\t" "functioning in a degraded state.\n")); snprintf(action, AC_SIZE, gettext("Replace the device using " "'zpool replace'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_NR: snprintf(status, ST_SIZE, gettext("One or more devices could " "not be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, zpool_get_config(zhp, NULL), action, AC_SIZE); break; case ZPOOL_STATUS_FAILING_DEV: snprintf(status, ST_SIZE, gettext("One or more devices has " "experienced an unrecoverable error. An\n\tattempt was " "made to correct the error. 
Applications are " "unaffected.\n")); snprintf(action, AC_SIZE, gettext("Determine if the " "device needs to be replaced, and clear the errors\n\tusing" " 'zpool clear' or replace the device with 'zpool " "replace'.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: snprintf(status, ST_SIZE, gettext("One or more devices has " "been taken offline by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); snprintf(action, AC_SIZE, gettext("Online the device " "using 'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_REMOVED_DEV: snprintf(status, ST_SIZE, gettext("One or more devices has " "been removed by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); snprintf(action, AC_SIZE, gettext("Online the device " "using zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: snprintf(status, ST_SIZE, gettext("One or more devices is " "currently being resilvered. The pool will\n\tcontinue " "to function, possibly in a degraded state.\n")); snprintf(action, AC_SIZE, gettext("Wait for the resilver to " "complete.\n")); break; case ZPOOL_STATUS_REBUILD_SCRUB: snprintf(status, ST_SIZE, gettext("One or more devices have " "been sequentially resilvered, scrubbing\n\tthe pool " "is recommended.\n")); snprintf(action, AC_SIZE, gettext("Use 'zpool scrub' to " "verify all data checksums.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: snprintf(status, ST_SIZE, gettext("One or more devices has " "experienced an error resulting in data\n\tcorruption. " "Applications may be affected.\n")); snprintf(action, AC_SIZE, gettext("Restore the file in question" " if possible. Otherwise restore the\n\tentire pool from " "backup.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: snprintf(status, ST_SIZE, gettext("The pool metadata is " "corrupted and the pool cannot be opened.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, zpool_get_config(zhp, NULL), action, AC_SIZE); break; case ZPOOL_STATUS_VERSION_OLDER: snprintf(status, ST_SIZE, gettext("The pool is formatted using " "a legacy on-disk format. The pool can\n\tstill be used, " "but some features are unavailable.\n")); snprintf(action, AC_SIZE, gettext("Upgrade the pool using " "'zpool upgrade'. Once this is done, the\n\tpool will no " "longer be accessible on software that does not support\n\t" "feature flags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: snprintf(status, ST_SIZE, gettext("The pool has been upgraded " "to a newer, incompatible on-disk version.\n\tThe pool " "cannot be accessed on this system.\n")); snprintf(action, AC_SIZE, gettext("Access the pool from a " "system running more recent software, or\n\trestore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: snprintf(status, ST_SIZE, gettext("Some supported and " "requested features are not enabled on the pool.\n\t" "The pool can still be used, but some features are " "unavailable.\n")); snprintf(action, AC_SIZE, gettext("Enable all features using " "'zpool upgrade'. Once this is done,\n\tthe pool may no " "longer be accessible by software that does not support\n\t" "the features. See zpool-features(7) for details.\n")); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: snprintf(status, ST_SIZE, gettext("This pool has a " "compatibility list specified, but it could not be\n\t" "read/parsed at this time. 
The pool can still be used, " "but this\n\tshould be investigated.\n")); snprintf(action, AC_SIZE, gettext("Check the value of the " "'compatibility' property against the\n\t" "appropriate file in " ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n")); break; case ZPOOL_STATUS_INCOMPATIBLE_FEAT: snprintf(status, ST_SIZE, gettext("One or more features " "are enabled on the pool despite not being\n\t" "requested by the 'compatibility' property.\n")); snprintf(action, AC_SIZE, gettext("Consider setting " "'compatibility' to an appropriate value, or\n\t" "adding needed features to the relevant file in\n\t" ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: snprintf(status, ST_SIZE, gettext("The pool cannot be accessed " "on this system because it uses the\n\tfollowing feature(s)" " not supported on this system:\n")); zpool_collect_unsup_feat(zpool_get_config(zhp, NULL), status, 1024); snprintf(action, AC_SIZE, gettext("Access the pool from a " "system that supports the required feature(s),\n\tor " "restore the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: snprintf(status, ST_SIZE, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_collect_unsup_feat(zpool_get_config(zhp, NULL), status, 1024); snprintf(action, AC_SIZE, gettext("The pool cannot be accessed " "in read-write mode. Import the pool with\n" "\t\"-o readonly=on\", access the pool from a system that " "supports the\n\trequired feature(s), or restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: snprintf(status, ST_SIZE, gettext("One or more devices are " "faulted in response to persistent errors.\n\tSufficient " "replicas exist for the pool to continue functioning " "in a\n\tdegraded state.\n")); snprintf(action, AC_SIZE, gettext("Replace the faulted device, " "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_NR: snprintf(status, ST_SIZE, gettext("One or more devices are " "faulted in response to persistent errors. There are " "insufficient replicas for the pool to\n\tcontinue " "functioning.\n")); snprintf(action, AC_SIZE, gettext("Destroy and re-create the " "pool from a backup source. 
Manually marking the device\n" "\trepaired using 'zpool clear' may allow some data " "to be recovered.\n")); break; case ZPOOL_STATUS_IO_FAILURE_MMP: snprintf(status, ST_SIZE, gettext("The pool is suspended " "because multihost writes failed or were delayed;\n\t" "another system could import the pool undetected.\n")); snprintf(action, AC_SIZE, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " "pool or run 'zpool clear' to resume the pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: case ZPOOL_STATUS_IO_FAILURE_CONTINUE: snprintf(status, ST_SIZE, gettext("One or more devices are " "faulted in response to IO failures.\n")); snprintf(action, AC_SIZE, gettext("Make sure the affected " "devices are connected, then run 'zpool clear'.\n")); break; case ZPOOL_STATUS_BAD_LOG: snprintf(status, ST_SIZE, gettext("An intent log record " "could not be read.\n" "\tWaiting for administrator intervention to fix the " "faulted pool.\n")); snprintf(action, AC_SIZE, gettext("Either restore the affected " "device(s) and run 'zpool online',\n" "\tor ignore the intent log records by running " "'zpool clear'.\n")); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: snprintf(status, ST_SIZE, gettext("One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); snprintf(action, AC_SIZE, gettext("Replace affected devices " "with devices that support the\n\tconfigured block size, " "or migrate data to a properly configured\n\tpool.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: snprintf(status, ST_SIZE, gettext("Mismatch between pool hostid" " and system hostid on imported pool.\n\tThis pool was " "previously imported into a system with a different " "hostid,\n\tand then was verbatim imported into this " "system.\n")); snprintf(action, AC_SIZE, gettext("Export this pool on all " "systems on which it is imported.\n" "\tThen import it to correct the mismatch.\n")); break; case ZPOOL_STATUS_ERRATA: snprintf(status, ST_SIZE, gettext("Errata #%d detected.\n"), errata); switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: snprintf(action, AC_SIZE, gettext("To correct the issue" " run 'zpool scrub'.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) strlcat(status, gettext("\tExisting encrypted " "datasets contain an on-disk incompatibility\n\t " "which needs to be corrected.\n"), ST_SIZE); snprintf(action, AC_SIZE, gettext("To correct the issue" " backup existing encrypted datasets to new\n\t" "encrypted datasets and destroy the old ones. " "'zfs mount -o ro' can\n\tbe used to temporarily " "mount existing encrypted datasets readonly.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) strlcat(status, gettext("\tExisting encrypted " "snapshots and bookmarks contain an on-disk\n\t" "incompatibility. This may cause on-disk " "corruption if they are used\n\twith " "'zfs recv'.\n"), ST_SIZE); snprintf(action, AC_SIZE, gettext("To correct the" "issue, enable the bookmark_v2 feature. No " "additional\n\taction is needed if there are no " "encrypted snapshots or bookmarks.\n\tIf preserving" "the encrypted snapshots and bookmarks is required," " use\n\ta non-raw send to backup and restore them." " Alternately, they may be\n\tremoved to resolve " "the incompatibility.\n")); break; default: /* * All errata which allow the pool to be imported * must contain an action message. */ assert(0); } break; default: /* * The remaining errors can't actually be generated, yet. 
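		 * Assert below so that any newly added status code is not
		 * silently ignored here.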
*/ assert(reason == ZPOOL_STATUS_OK); } if (status[0] != 0) { if (cbp->cb_json) fnvlist_add_string(item, "status", status); else { printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, status); } } if (action[0] != 0) { if (cbp->cb_json) fnvlist_add_string(item, "action", action); else { printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, action); } } } static int status_callback_json(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; nvlist_t *config, *nvroot; const char *msgid; char pool_guid[256]; char msgbuf[256]; uint64_t guid; zpool_status_t reason; zpool_errata_t errata; uint_t c; vdev_stat_t *vs; nvlist_t *item, *d, *load_info, *vds; item = d = NULL; /* If dedup stats were requested, also fetch dedupcached. */ if (cbp->cb_dedup_stats > 1) zpool_add_propname(zhp, ZPOOL_DEDUPCACHED_PROP_NAME); reason = zpool_get_status(zhp, &msgid, &errata); /* * If we were given 'zpool status -x', only report those pools with * problems. */ if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED || reason == ZPOOL_STATUS_COMPATIBILITY_ERR || reason == ZPOOL_STATUS_INCOMPATIBLE_FEAT)) { return (0); } d = fnvlist_lookup_nvlist(cbp->cb_jsobj, "pools"); item = fnvlist_alloc(); vds = fnvlist_alloc(); fill_pool_info(item, zhp, B_FALSE, cbp->cb_json_as_int); config = zpool_get_config(zhp, NULL); if (config != NULL) { nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (cbp->cb_json_pool_key_guid) { guid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID); snprintf(pool_guid, 256, "%llu", (u_longlong_t)guid); } cbp->cb_count++; print_status_reason(zhp, cbp, reason, errata, item); if (msgid != NULL) { snprintf(msgbuf, 256, "https://openzfs.github.io/openzfs-docs/msg/%s", msgid); fnvlist_add_string(item, "msgid", msgid); fnvlist_add_string(item, "moreinfo", msgbuf); } if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &load_info) == 0) { fnvlist_add_nvlist(item, ZPOOL_CONFIG_LOAD_INFO, load_info); } scan_status_nvlist(zhp, cbp, nvroot, item); removal_status_nvlist(zhp, cbp, nvroot, item); checkpoint_status_nvlist(nvroot, cbp, item); raidz_expand_status_nvlist(zhp, cbp, nvroot, item); vdev_stats_nvlist(zhp, cbp, nvroot, 0, B_FALSE, NULL, vds); if (cbp->cb_flat_vdevs) { class_vdevs_nvlist(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP, vds); class_vdevs_nvlist(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL, vds); class_vdevs_nvlist(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS, vds); l2cache_nvlist(zhp, cbp, nvroot, vds); spares_nvlist(zhp, cbp, nvroot, vds); fnvlist_add_nvlist(item, "vdevs", vds); fnvlist_free(vds); } else { fnvlist_add_nvlist(item, "vdevs", vds); fnvlist_free(vds); class_vdevs_nvlist(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP, item); class_vdevs_nvlist(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL, item); class_vdevs_nvlist(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS, item); l2cache_nvlist(zhp, cbp, nvroot, item); spares_nvlist(zhp, cbp, nvroot, item); } dedup_stats_nvlist(zhp, cbp, item); errors_nvlist(zhp, cbp, item); } if (cbp->cb_json_pool_key_guid) { fnvlist_add_nvlist(d, pool_guid, item); } else { fnvlist_add_nvlist(d, zpool_get_name(zhp), item); } fnvlist_free(item); return (0); } /* * Display a summary of pool status. Displays a summary such as: * * pool: tank * status: DEGRADED * reason: One or more devices ... 
* see: https://openzfs.github.io/openzfs-docs/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK * c2t0d0 UNAVAIL * * When given the '-v' option, we print out the complete config. If the '-e' * option is specified, then we print out error rate information as well. */ static int status_callback(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; nvlist_t *config, *nvroot; const char *msgid; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t c; vdev_stat_t *vs; /* If dedup stats were requested, also fetch dedupcached. */ if (cbp->cb_dedup_stats > 1) zpool_add_propname(zhp, ZPOOL_DEDUPCACHED_PROP_NAME); config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid, &errata); cbp->cb_count++; /* * If we were given 'zpool status -x', only report those pools with * problems. */ if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED || reason == ZPOOL_STATUS_COMPATIBILITY_ERR || reason == ZPOOL_STATUS_INCOMPATIBLE_FEAT)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); if (cbp->cb_first) cbp->cb_first = B_FALSE; } return (0); } if (cbp->cb_first) cbp->cb_first = B_FALSE; else (void) printf("\n"); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_get_state_str(zhp); printf(" "); printf_color(ANSI_BOLD, gettext("pool:")); printf(" %s\n", zpool_get_name(zhp)); fputc(' ', stdout); printf_color(ANSI_BOLD, gettext("state: ")); printf_color(health_str_to_color(health), "%s", health); fputc('\n', stdout); print_status_reason(zhp, cbp, reason, errata, NULL); if (msgid != NULL) { printf(" "); printf_color(ANSI_BOLD, gettext("see:")); printf(gettext( " https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); } if (config != NULL) { uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; print_scan_status(zhp, nvroot); pool_removal_stat_t *prs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); pool_checkpoint_stat_t *pcs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); pool_raidz_expand_stat_t *pres = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); print_raidz_expand_status(zhp, pres); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; color_start(ANSI_BOLD); (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); color_end(); if (cbp->cb_print_slow_ios) { printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); } if (cbp->cb_print_power) { printf_color(ANSI_BOLD, " %5s", gettext("POWER")); } if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); printf("\n"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, B_FALSE, NULL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, cbp, l2cache, nl2cache); if (nvlist_lookup_nvlist_array(nvroot, 
ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) print_spares(zhp, cbp, spares, nspares); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { (void) printf("\n"); if (nerr == 0) { (void) printf(gettext( "errors: No known data errors\n")); } else if (!cbp->cb_verbose) { color_start(ANSI_RED); (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); color_end(); } else { print_error_log(zhp); } } if (cbp->cb_dedup_stats) print_dedup_stats(zhp, config, cbp->cb_literal); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); } return (0); } /* * zpool status [-c [script1,script2,...]] [-DegiLpPstvx] [--power] [-T d|u] ... * [pool] [interval [count]] * * -c CMD For each vdev, run command CMD * -D Display dedup status (undocumented) * -e Display only unhealthy vdevs * -g Display guid for individual vdev name. * -i Display vdev initialization status. * -L Follow links when resolving vdev path name. * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -s Display slow IOs column. * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format * -v Display complete error logs * -x Display only pools with potential problems * -j Display output in JSON format * --power Display vdev enclosure slot power status * --json-int Display numbers in inteeger format instead of string * --json-flat-vdevs Display vdevs in flat hierarchy * --json-pool-key-guid Use pool GUID as key for pool objects * * Describes the health status of all pools or some subset. */ int zpool_do_status(int argc, char **argv) { int c; int ret; float interval = 0; unsigned long count = 0; status_cbdata_t cb = { 0 }; nvlist_t *data; char *cmd = NULL; struct option long_options[] = { {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT}, {"json-flat-vdevs", no_argument, NULL, ZPOOL_OPTION_JSON_FLAT_VDEVS}, {"json-pool-key-guid", no_argument, NULL, ZPOOL_OPTION_POOL_KEY_GUID}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, "c:jDegiLpPstT:vx", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; break; case 'D': if (++cb.cb_dedup_stats > 2) cb.cb_dedup_stats = 2; break; case 'e': cb.cb_print_unhealthy = B_TRUE; break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'p': cb.cb_literal = B_TRUE; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 's': cb.cb_print_slow_ios = B_TRUE; break; case 't': cb.cb_print_vdev_trim = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': cb.cb_verbose = B_TRUE; break; case 'j': cb.cb_json = B_TRUE; break; case 'x': cb.cb_explain = B_TRUE; break; case ZPOOL_OPTION_POWER: cb.cb_print_power = B_TRUE; break; case ZPOOL_OPTION_JSON_FLAT_VDEVS: cb.cb_flat_vdevs = B_TRUE; break; case ZPOOL_OPTION_JSON_NUMS_AS_INT: cb.cb_json_as_int = B_TRUE; cb.cb_literal 
= B_TRUE; break; case ZPOOL_OPTION_POOL_KEY_GUID: cb.cb_json_pool_key_guid = B_TRUE; break; case '?': if (optopt == 'c') { print_zpool_script_list("status"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; cb.cb_first = B_TRUE; cb.cb_print_status = B_TRUE; if (cb.cb_flat_vdevs && !cb.cb_json) { fprintf(stderr, gettext("'--json-flat-vdevs' only works with" " '-j' option\n")); usage(B_FALSE); } if (cb.cb_json_as_int && !cb.cb_json) { (void) fprintf(stderr, gettext("'--json-int' only works with" " '-j' option\n")); usage(B_FALSE); } if (!cb.cb_json && cb.cb_json_pool_key_guid) { (void) fprintf(stderr, gettext("'json-pool-key-guid' only" " works with '-j' option\n")); usage(B_FALSE); } for (;;) { if (cb.cb_json) { cb.cb_jsobj = zpool_json_schema(0, 1); data = fnvlist_alloc(); fnvlist_add_nvlist(cb.cb_jsobj, "pools", data); fnvlist_free(data); } if (timestamp_fmt != NODATE) { if (cb.cb_json) { if (cb.cb_json_as_int) { fnvlist_add_uint64(cb.cb_jsobj, "time", time(NULL)); } else { char ts[128]; get_timestamp(timestamp_fmt, ts, 128); fnvlist_add_string(cb.cb_jsobj, "time", ts); } } else print_timestamp(timestamp_fmt); } if (cmd != NULL) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); if (cb.cb_json) { ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, cb.cb_literal, status_callback_json, &cb); } else { ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, cb.cb_literal, status_callback, &cb); } if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); if (cb.cb_json) { if (ret == 0) zcmd_print_json(cb.cb_jsobj); else nvlist_free(cb.cb_jsobj); } else { if (argc == 0 && cb.cb_count == 0) { (void) fprintf(stderr, "%s", gettext("no pools available\n")); } else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) { (void) printf("%s", gettext("all pools are healthy\n")); } } if (ret != 0) return (ret); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fflush(stdout); (void) fsleep(interval); } return (0); } typedef struct upgrade_cbdata { int cb_first; int cb_argc; uint64_t cb_version; char **cb_argv; } upgrade_cbdata_t; static int check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) { int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int *count = (int *)unsupp_fs; if (zfs_version > ZPL_VERSION) { (void) printf(gettext("%s (v%d) is not supported by this " "implementation of ZFS.\n"), zfs_get_name(zhp), zfs_version); (*count)++; } zfs_iter_filesystems_v2(zhp, 0, check_unsupp_fs, unsupp_fs); zfs_close(zhp); return (0); } static int upgrade_version(zpool_handle_t *zhp, uint64_t version) { int ret; nvlist_t *config; uint64_t oldversion; int unsupp_fs = 0; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &oldversion) == 0); char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; assert(SPA_VERSION_IS_SUPPORTED(oldversion)); assert(oldversion < version); ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs); if (ret != 0) return (ret); if (unsupp_fs) { (void) fprintf(stderr, gettext("Upgrade not performed due " "to %d unsupported filesystems (max v%d).\n"), unsupp_fs, (int)ZPL_VERSION); return (1); } if (strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) { (void) fprintf(stderr, gettext("Upgrade not performed because " 
"'compatibility' property set to '" ZPOOL_COMPAT_LEGACY "'.\n")); return (1); } ret = zpool_upgrade(zhp, version); if (ret != 0) return (ret); if (version >= SPA_VERSION_FEATURES) { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to feature flags.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion); } else { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to version %llu.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion, (u_longlong_t)version); } return (0); } static int upgrade_enable_all(zpool_handle_t *zhp, int *countp) { int i, ret, count; boolean_t firstff = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) return (-1); count = 0; for (i = 0; i < SPA_FEATURES; i++) { const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid) && requested_features[i]) { char *propname; verify(-1 != asprintf(&propname, "feature@%s", fname)); ret = zpool_set_prop(zhp, propname, ZFS_FEATURE_ENABLED); if (ret != 0) { free(propname); return (ret); } count++; if (firstff) { (void) printf(gettext("Enabled the " "following features on '%s':\n"), zpool_get_name(zhp)); firstff = B_FALSE; } (void) printf(gettext(" %s\n"), fname); free(propname); } } if (countp != NULL) *countp = count; return (0); } static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; boolean_t modified_pool = B_FALSE; int ret; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < cbp->cb_version) { cbp->cb_first = B_FALSE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); modified_pool = B_TRUE; /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count > 0) { cbp->cb_first = B_FALSE; modified_pool = B_TRUE; } } if (modified_pool) { (void) printf("\n"); (void) after_zpool_upgrade(zhp); } return (0); } static int upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " "formatted with legacy version numbers and can\n" "be upgraded to use feature flags. 
After " "being upgraded, these pools\nwill no " "longer be accessible by software that does not " "support feature\nflags.\n\n" "Note that setting a pool's 'compatibility' " "feature to '" ZPOOL_COMPAT_LEGACY "' will\n" "inhibit upgrades.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; } (void) printf("%2llu %s\n", (u_longlong_t)version, zpool_get_name(zhp)); } return (0); } static int upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if (version >= SPA_VERSION_FEATURES) { int i; boolean_t poolfirst = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; const char *fname = spa_feature_table[i].fi_uname; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid)) { if (cbp->cb_first) { (void) printf(gettext("\nSome " "supported features are not " "enabled on the following pools. " "Once a\nfeature is enabled the " "pool may become incompatible with " "software\nthat does not support " "the feature. See " "zpool-features(7) for " "details.\n\n" "Note that the pool " "'compatibility' feature can be " "used to inhibit\nfeature " "upgrades.\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" "---------\n")); cbp->cb_first = B_FALSE; } if (poolfirst) { (void) printf(gettext("%s\n"), zpool_get_name(zhp)); poolfirst = B_FALSE; } (void) printf(gettext(" %s\n"), fname); } /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } } return (0); } static int upgrade_one(zpool_handle_t *zhp, void *data) { boolean_t modified_pool = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; if (strcmp("log", zpool_get_name(zhp)) == 0) { (void) fprintf(stderr, gettext("'log' is now a reserved word\n" "Pool 'log' must be renamed using export and import" " to upgrade.\n")); return (1); } cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using more current version '%llu'.\n\n"), zpool_get_name(zhp), (u_longlong_t)cur_version); return (0); } if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using version %llu.\n\n"), zpool_get_name(zhp), (u_longlong_t)cbp->cb_version); return (0); } if (cur_version != cbp->cb_version) { modified_pool = B_TRUE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count = 0; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count != 0) { modified_pool = B_TRUE; } else if (cur_version == SPA_VERSION) { (void) printf(gettext("Pool '%s' already has all " "supported and requested features enabled.\n"), zpool_get_name(zhp)); } } if (modified_pool) { (void) printf("\n"); (void) after_zpool_upgrade(zhp); } return (0); } /* * zpool upgrade * zpool upgrade -v * zpool upgrade [-V version] <-a | pool ...> * * With no arguments, display downrev'd ZFS pool available for upgrade. 
* Individual pools can be upgraded by specifying the pool, and '-a' will * upgrade all pools. */ int zpool_do_upgrade(int argc, char **argv) { int c; upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; boolean_t upgradeall = B_FALSE; char *end; /* check options */ while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); if (*end != '\0' || !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; } else if (!upgradeall && argc == 0) { (void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); usage(B_FALSE); } } (void) printf("%s", gettext("This system supports ZFS pool feature " "flags.\n\n")); if (showversions) { int i; (void) printf(gettext("The following features are " "supported:\n\n")); (void) printf(gettext("FEAT DESCRIPTION\n")); (void) printf("----------------------------------------------" "---------------\n"); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; if (!fi->fi_zfs_mod_supported) continue; const char *ro = (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
" (read-only compatible)" : ""; (void) printf("%-37s%s\n", fi->fi_uname, ro); (void) printf(" %s\n", fi->fi_desc); } (void) printf("\n"); (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS version\n")); (void) printf(gettext(" 2 Ditto blocks " "(replicated metadata)\n")); (void) printf(gettext(" 3 Hot spares and double parity " "RAID-Z\n")); (void) printf(gettext(" 4 zpool history\n")); (void) printf(gettext(" 5 Compression using the gzip " "algorithm\n")); (void) printf(gettext(" 6 bootfs pool property\n")); (void) printf(gettext(" 7 Separate intent log devices\n")); (void) printf(gettext(" 8 Delegated administration\n")); (void) printf(gettext(" 9 refquota and refreservation " "properties\n")); (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); (void) printf(gettext(" 14 passthrough-x aclinherit\n")); (void) printf(gettext(" 15 user/group space accounting\n")); (void) printf(gettext(" 16 stmf property support\n")); (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); (void) printf(gettext(" 18 Snapshot user holds\n")); (void) printf(gettext(" 19 Log device removal\n")); (void) printf(gettext(" 20 Compression using zle " "(zero-length encoding)\n")); (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext(" 25 Improved scrub stats\n")); (void) printf(gettext(" 26 Improved snapshot deletion " "performance\n")); (void) printf(gettext(" 27 Improved snapshot creation " "performance\n")); (void) printf(gettext(" 28 Multiple vdev replacements\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); } else if (argc == 0 && upgradeall) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); if (ret == 0 && cb.cb_first) { if (cb.cb_version == SPA_VERSION) { (void) printf(gettext("All pools are already " "formatted using feature flags.\n\n")); (void) printf(gettext("Every feature flags " "pool already has all supported and " "requested features enabled.\n")); } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), (u_longlong_t)cb.cb_version); } } } else if (argc == 0) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("All pools are formatted " "using feature flags.\n\n")); } else { (void) printf(gettext("\nUse 'zpool upgrade -v' " "for a list of available legacy versions.\n")); } cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("Every feature flags pool has " "all supported and requested features enabled.\n")); } else { (void) printf(gettext("\n")); } } else { ret = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, B_FALSE, upgrade_one, &cb); } return (ret); } typedef struct hist_cbdata { boolean_t first; boolean_t longfmt; boolean_t internal; } hist_cbdata_t; static void print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) { 
nvlist_t **records; uint_t numrecords; int i; verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { nvlist_t *rec = records[i]; char tbuf[64] = ""; if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } if (nvlist_exists(rec, ZPOOL_HIST_ELAPSED_NS)) { uint64_t elapsed_ns = fnvlist_lookup_int64(records[i], ZPOOL_HIST_ELAPSED_NS); (void) snprintf(tbuf + strlen(tbuf), sizeof (tbuf) - strlen(tbuf), " (%lldms)", (long long)elapsed_ns / 1000 / 1000); } if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { (void) printf("%s %s", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { int ievent = fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); if (!cb->internal) continue; if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); continue; } (void) printf("%s [internal %s txg:%lld] %s", tbuf, zfs_history_event_names[ievent], (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { if (!cb->internal) continue; (void) printf("%s [txg:%lld] %s", tbuf, (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(rec, ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64(rec, ZPOOL_HIST_DSID)); } (void) printf(" %s", fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { if (!cb->internal) continue; (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_SIZE)) { (void) printf(" output nvlist omitted; " "original size: %lldKB\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_OUTPUT_SIZE) / 1024); } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_ERRNO)); } } else { if (!cb->internal) continue; (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); } if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); struct passwd *pwd = getpwuid(who); (void) printf("user %d ", (int)who); if (pwd != NULL) (void) printf("(%s) ", pwd->pw_name); } if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { (void) printf("on %s", fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); } if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { (void) printf(":%s", fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } (void) printf("]"); (void) printf("\n"); } } /* * Print out the command history for a specific pool. 
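 *
 * The kernel returns history in chunks, so zpool_get_history() is called
 * in a loop, with 'off' as the resume offset, until it reports eof.
 * Illustrative usage (pool name is an example): 'zpool history -il tank'
 * prints the long-format history including internal events.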
*/ static int get_history_one(zpool_handle_t *zhp, void *data) { nvlist_t *nvhis; int ret; hist_cbdata_t *cb = (hist_cbdata_t *)data; uint64_t off = 0; boolean_t eof = B_FALSE; cb->first = B_FALSE; (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); while (!eof) { if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0) return (ret); print_history_records(nvhis, cb); nvlist_free(nvhis); } (void) printf("\n"); return (ret); } /* * zpool history * * Displays the history of commands that modified pools. */ int zpool_do_history(int argc, char **argv) { hist_cbdata_t cbdata = { 0 }; int ret; int c; cbdata.first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': cbdata.longfmt = B_TRUE; break; case 'i': cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; ret = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, B_FALSE, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { (void) fprintf(stderr, gettext("no pools available\n")); return (0); } return (ret); } typedef struct ev_opts { int verbose; int scripted; int follow; int clear; char poolname[ZFS_MAX_DATASET_NAME_LEN]; } ev_opts_t; static void zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts) { char ctime_str[26], str[32]; const char *ptr; int64_t *tv; uint_t n; verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); memset(str, ' ', 32); (void) ctime_r((const time_t *)&tv[0], ctime_str); (void) memcpy(str, ctime_str+4, 6); /* 'Jun 30' */ (void) memcpy(str+7, ctime_str+20, 4); /* '1993' */ (void) memcpy(str+12, ctime_str+11, 8); /* '21:49:08' */ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ if (opts->scripted) (void) printf(gettext("%s\t"), str); else (void) printf(gettext("%s "), str); verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); (void) printf(gettext("%s\n"), ptr); } static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { data_type_t type = nvpair_type(nvp); const char *name = nvpair_name(nvp); boolean_t b; uint8_t i8; uint16_t i16; uint32_t i32; uint64_t i64; const char *str; nvlist_t *cnv; printf(gettext("%*s%s = "), depth, "", name); switch (type) { case DATA_TYPE_BOOLEAN: printf(gettext("%s"), "1"); break; case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(nvp, &b); printf(gettext("%s"), b ? 
"1" : "0"); break; case DATA_TYPE_BYTE: (void) nvpair_value_byte(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT8: (void) nvpair_value_int8(nvp, (void *)&i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_UINT8: (void) nvpair_value_uint8(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT16: (void) nvpair_value_int16(nvp, (void *)&i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_UINT16: (void) nvpair_value_uint16(nvp, &i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_INT32: (void) nvpair_value_int32(nvp, (void *)&i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_INT64: (void) nvpair_value_int64(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); /* * translate vdev state values to readable * strings to aide zpool events consumers */ if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } break; case DATA_TYPE_HRTIME: (void) nvpair_value_hrtime(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &str); printf(gettext("\"%s\""), str ? str : ""); break; case DATA_TYPE_NVLIST: printf(gettext("(embedded nvlist)\n")); (void) nvpair_value_nvlist(nvp, &cnv); zpool_do_events_nvprint(cnv, depth + 8); printf(gettext("%*s(end %s)"), depth, "", name); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **val; uint_t i, nelem; (void) nvpair_value_nvlist_array(nvp, &val, &nelem); printf(gettext("(%d embedded nvlists)\n"), nelem); for (i = 0; i < nelem; i++) { printf(gettext("%*s%s[%d] = %s\n"), depth, "", name, i, "(embedded nvlist)"); zpool_do_events_nvprint(val[i], depth + 8); printf(gettext("%*s(end %s[%i])\n"), depth, "", name, i); } printf(gettext("%*s(end %s)\n"), depth, "", name); } break; case DATA_TYPE_INT8_ARRAY: { int8_t *val; uint_t i, nelem; (void) nvpair_value_int8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val; uint_t i, nelem; (void) nvpair_value_uint8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val; uint_t i, nelem; (void) nvpair_value_int16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val; uint_t i, nelem; (void) nvpair_value_uint16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val; uint_t i, nelem; (void) nvpair_value_int32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val; uint_t i, nelem; (void) nvpair_value_uint32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val; uint_t i, nelem; (void) nvpair_value_int64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val; uint_t i, nelem; (void) nvpair_value_uint64_array(nvp, 
&val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_STRING_ARRAY: { const char **str; uint_t i, nelem; (void) nvpair_value_string_array(nvp, &str, &nelem); for (i = 0; i < nelem; i++) printf(gettext("\"%s\" "), str[i] ? str[i] : ""); break; } case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_DOUBLE: case DATA_TYPE_DONTCARE: case DATA_TYPE_UNKNOWN: printf(gettext("")); break; } printf(gettext("\n")); } } static int zpool_do_events_next(ev_opts_t *opts) { nvlist_t *nvl; int zevent_fd, ret, dropped; const char *pool; zevent_fd = open(ZFS_DEV, O_RDWR); VERIFY(zevent_fd >= 0); if (!opts->scripted) (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS"); while (1) { ret = zpool_events_next(g_zfs, &nvl, &dropped, (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); if (ret || nvl == NULL) break; if (dropped > 0) (void) printf(gettext("dropped %d events\n"), dropped); if (strlen(opts->poolname) > 0 && nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 && strcmp(opts->poolname, pool) != 0) continue; zpool_do_events_short(nvl, opts); if (opts->verbose) { zpool_do_events_nvprint(nvl, 8); printf(gettext("\n")); } (void) fflush(stdout); nvlist_free(nvl); } VERIFY(0 == close(zevent_fd)); return (ret); } static int zpool_do_events_clear(void) { int count, ret; ret = zpool_events_clear(g_zfs, &count); if (!ret) (void) printf(gettext("cleared %d events\n"), count); return (ret); } /* * zpool events [-vHf [pool] | -c] * * Displays events logs by ZFS. */ int zpool_do_events(int argc, char **argv) { ev_opts_t opts = { 0 }; int ret; int c; /* check options */ while ((c = getopt(argc, argv, "vHfc")) != -1) { switch (c) { case 'v': opts.verbose = 1; break; case 'H': opts.scripted = 1; break; case 'f': opts.follow = 1; break; case 'c': opts.clear = 1; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } else if (argc == 1) { (void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname)); if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) { (void) fprintf(stderr, gettext("invalid pool name '%s'\n"), opts.poolname); usage(B_FALSE); } } if ((argc == 1 || opts.verbose || opts.scripted || opts.follow) && opts.clear) { (void) fprintf(stderr, gettext("invalid options combined with -c\n")); usage(B_FALSE); } if (opts.clear) ret = zpool_do_events_clear(); else ret = zpool_do_events_next(&opts); return (ret); } static int get_callback_vdev(zpool_handle_t *zhp, char *vdevname, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[ZFS_MAXPROPLEN]; zprop_source_t srctype; nvlist_t *props, *item, *d; props = item = d = NULL; if (cbp->cb_json) { d = fnvlist_lookup_nvlist(cbp->cb_jsobj, "vdevs"); if (d == NULL) { fprintf(stderr, "vdevs obj not found.\n"); exit(1); } props = fnvlist_alloc(); } for (zprop_list_t *pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { char *prop_name; /* * If the first property is pool name, it is a special * placeholder that we can skip. This will also skip * over the name property when 'all' is specified. 
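 *
 * (The leading ZPOOL_PROP_NAME entry is the fake_name placeholder that
 * zpool_do_get() prepends to cb_proplist for NAME-column sizing; it is
 * not a real property to fetch.)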
*/ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL) { prop_name = pl->pl_user_prop; } else { prop_name = (char *)vdev_prop_to_name(pl->pl_prop); } if (zpool_get_vdev_prop(zhp, vdevname, pl->pl_prop, prop_name, value, sizeof (value), &srctype, cbp->cb_literal) == 0) { zprop_collect_property(vdevname, cbp, prop_name, value, srctype, NULL, NULL, props); } } if (cbp->cb_json) { if (!nvlist_empty(props)) { item = fnvlist_alloc(); fill_vdev_info(item, zhp, vdevname, B_TRUE, cbp->cb_json_as_int); fnvlist_add_nvlist(item, "properties", props); fnvlist_add_nvlist(d, vdevname, item); fnvlist_add_nvlist(cbp->cb_jsobj, "vdevs", d); fnvlist_free(item); } fnvlist_free(props); } return (0); } static int get_callback_vdev_cb(void *zhp_data, nvlist_t *nv, void *data) { zpool_handle_t *zhp = zhp_data; zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char *vdevname; const char *type; int ret; /* * zpool_vdev_name() transforms the root vdev name (i.e., root-0) to the * pool name for display purposes, which is not desired. Fallback to * zpool_vdev_name() when not dealing with the root vdev. */ type = fnvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE); if (zhp != NULL && strcmp(type, "root") == 0) vdevname = strdup("root-0"); else vdevname = zpool_vdev_name(g_zfs, zhp, nv, cbp->cb_vdevs.cb_name_flags); (void) vdev_expand_proplist(zhp, vdevname, &cbp->cb_proplist); ret = get_callback_vdev(zhp, vdevname, data); free(vdevname); return (ret); } static int get_callback(zpool_handle_t *zhp, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[ZFS_MAXPROPLEN]; zprop_source_t srctype; zprop_list_t *pl; int vid; int err = 0; nvlist_t *props, *item, *d; props = item = d = NULL; if (cbp->cb_type == ZFS_TYPE_VDEV) { if (cbp->cb_json) { nvlist_t *pool = fnvlist_alloc(); fill_pool_info(pool, zhp, B_FALSE, cbp->cb_json_as_int); fnvlist_add_nvlist(cbp->cb_jsobj, "pool", pool); fnvlist_free(pool); } if (strcmp(cbp->cb_vdevs.cb_names[0], "all-vdevs") == 0) { for_each_vdev(zhp, get_callback_vdev_cb, data); } else { /* Adjust column widths for vdev properties */ for (vid = 0; vid < cbp->cb_vdevs.cb_names_count; vid++) { vdev_expand_proplist(zhp, cbp->cb_vdevs.cb_names[vid], &cbp->cb_proplist); } /* Display the properties */ for (vid = 0; vid < cbp->cb_vdevs.cb_names_count; vid++) { get_callback_vdev(zhp, cbp->cb_vdevs.cb_names[vid], data); } } } else { assert(cbp->cb_type == ZFS_TYPE_POOL); if (cbp->cb_json) { d = fnvlist_lookup_nvlist(cbp->cb_jsobj, "pools"); if (d == NULL) { fprintf(stderr, "pools obj not found.\n"); exit(1); } props = fnvlist_alloc(); } for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * Skip the special fake placeholder. This will also * skip over the name property when 'all' is specified. 
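 *
 * Each remaining entry is fetched below according to its kind:
 * user properties via zpool_get_userprop(), feature@/unsupported@
 * properties via zpool_prop_get_feature(), and standard pool
 * properties via zpool_get_prop().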
*/ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL && zfs_prop_user(pl->pl_user_prop)) { srctype = ZPROP_SRC_LOCAL; if (zpool_get_userprop(zhp, pl->pl_user_prop, value, sizeof (value), &srctype) != 0) continue; err = zprop_collect_property( zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL, props); } else if (pl->pl_prop == ZPROP_INVAL && (zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop))) { srctype = ZPROP_SRC_LOCAL; if (zpool_prop_get_feature(zhp, pl->pl_user_prop, value, sizeof (value)) == 0) { err = zprop_collect_property( zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL, props); } } else { if (zpool_get_prop(zhp, pl->pl_prop, value, sizeof (value), &srctype, cbp->cb_literal) != 0) continue; err = zprop_collect_property( zpool_get_name(zhp), cbp, zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, NULL, props); } if (err != 0) return (err); } if (cbp->cb_json) { if (!nvlist_empty(props)) { item = fnvlist_alloc(); fill_pool_info(item, zhp, B_TRUE, cbp->cb_json_as_int); fnvlist_add_nvlist(item, "properties", props); if (cbp->cb_json_pool_key_guid) { char buf[256]; uint64_t guid = fnvlist_lookup_uint64( zpool_get_config(zhp, NULL), ZPOOL_CONFIG_POOL_GUID); snprintf(buf, 256, "%llu", (u_longlong_t)guid); fnvlist_add_nvlist(d, buf, item); } else { const char *name = zpool_get_name(zhp); fnvlist_add_nvlist(d, name, item); } fnvlist_add_nvlist(cbp->cb_jsobj, "pools", d); fnvlist_free(item); } fnvlist_free(props); } } return (0); } /* * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of columns to display. Defaults to * "name,property,value,source". * -p Display values in parsable (exact) format. * -j Display output in JSON format. * --json-int Display numbers as integers instead of strings. * --json-pool-key-guid Set pool GUID as key for pool objects. * * Get properties of pools in the system. Output space statistics * for each one as well as other attributes. */ int zpool_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; zprop_list_t fake_name = { 0 }; int ret; int c, i; char *propstr = NULL; char *vdev = NULL; nvlist_t *data = NULL; cb.cb_first = B_TRUE; /* * Set up default columns and sources. 
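 *
 * The defaults mirror plain 'zpool get': columns NAME, PROPERTY, VALUE,
 * SOURCE, with every property source accepted (ZPROP_SRC_ALL).  The -o
 * option below overrides the column list, e.g. (illustrative)
 * 'zpool get -o name,value size tank'.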
*/ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; cb.cb_vdevs.cb_name_flags |= VDEV_NAME_TYPE_ID; current_prop_type = cb.cb_type; struct option long_options[] = { {"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT}, {"json-pool-key-guid", no_argument, NULL, ZPOOL_OPTION_POOL_KEY_GUID}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":jHpo:", long_options, NULL)) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'j': cb.cb_json = B_TRUE; cb.cb_jsobj = zpool_json_schema(0, 1); data = fnvlist_alloc(); break; case ZPOOL_OPTION_POOL_KEY_GUID: cb.cb_json_pool_key_guid = B_TRUE; break; case ZPOOL_OPTION_JSON_NUMS_AS_INT: cb.cb_json_as_int = B_TRUE; cb.cb_literal = B_TRUE; break; case 'o': memset(&cb.cb_columns, 0, sizeof (cb.cb_columns)); i = 0; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_opts[] = { "name", "property", "value", "source", "all" }; static const zfs_get_column_t col_cols[] = { GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_SOURCE }; if (i == ZFS_GET_NCOLS - 1) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } for (c = 0; c < ARRAY_SIZE(col_opts); ++c) if (strcmp(tok, col_opts[c]) == 0) goto found; (void) fprintf(stderr, gettext("invalid column name '%s'\n"), tok); usage(B_FALSE); found: if (c >= 4) { if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } memcpy(cb.cb_columns, col_cols, sizeof (col_cols)); i = ZFS_GET_NCOLS - 1; } else cb.cb_columns[i++] = col_cols[c]; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (!cb.cb_json && cb.cb_json_as_int) { (void) fprintf(stderr, gettext("'--json-int' only works with" " '-j' option\n")); usage(B_FALSE); } if (!cb.cb_json && cb.cb_json_pool_key_guid) { (void) fprintf(stderr, gettext("'json-pool-key-guid' only" " works with '-j' option\n")); usage(B_FALSE); } if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } /* Properties list is needed later by zprop_get_list() */ propstr = argv[0]; argc--; argv++; if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if ((argc == 2 && strcmp(argv[1], "all-vdevs") == 0) || (argc == 2 && strcmp(argv[1], "root") == 0) || are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb.cb_vdevs)) { if (strcmp(argv[1], "root") == 0) vdev = strdup("root-0"); else vdev = strdup(argv[1]); /* ... and the rest are vdev names */ cb.cb_vdevs.cb_names = &vdev; cb.cb_vdevs.cb_names_count = argc - 1; cb.cb_type = ZFS_TYPE_VDEV; argc = 1; /* One pool to process */ } else { if (cb.cb_json) { nvlist_free(cb.cb_jsobj); nvlist_free(data); } fprintf(stderr, gettext("Expected a list of vdevs in" " \"%s\", but got:\n"), argv[0]); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb.cb_vdevs); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { if (cb.cb_json) { nvlist_free(cb.cb_jsobj); nvlist_free(data); } /* * The first arg isn't the name of a valid pool. 
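 * Vdev names are only accepted after a valid pool argument (handled
 * above), so anything else here is reported as a missing pool.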
*/ fprintf(stderr, gettext("Cannot get properties of %s: " "no such pool available.\n"), argv[0]); return (1); } if (zprop_get_list(g_zfs, propstr, &cb.cb_proplist, cb.cb_type) != 0) { /* Use correct list of valid properties (pool or vdev) */ current_prop_type = cb.cb_type; usage(B_FALSE); } if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZPOOL_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } if (cb.cb_json) { if (cb.cb_type == ZFS_TYPE_VDEV) fnvlist_add_nvlist(cb.cb_jsobj, "vdevs", data); else fnvlist_add_nvlist(cb.cb_jsobj, "pools", data); fnvlist_free(data); } ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, cb.cb_type, cb.cb_literal, get_callback, &cb); if (ret == 0 && cb.cb_json) zcmd_print_json(cb.cb_jsobj); else if (ret != 0 && cb.cb_json) nvlist_free(cb.cb_jsobj); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); if (vdev != NULL) free(vdev); return (ret); } typedef struct set_cbdata { char *cb_propname; char *cb_value; zfs_type_t cb_type; vdev_cbdata_t cb_vdevs; boolean_t cb_any_successful; } set_cbdata_t; static int set_pool_callback(zpool_handle_t *zhp, set_cbdata_t *cb) { int error; /* Check if we have out-of-bounds features */ if (strcmp(cb->cb_propname, ZPOOL_CONFIG_COMPATIBILITY) == 0) { boolean_t features[SPA_FEATURES]; if (zpool_do_load_compat(cb->cb_value, features) != ZPOOL_COMPATIBILITY_OK) return (-1); nvlist_t *enabled = zpool_get_features(zhp); spa_feature_t i; for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; if (nvlist_exists(enabled, fguid) && !features[i]) break; } if (i < SPA_FEATURES) (void) fprintf(stderr, gettext("Warning: one or " "more features already enabled on pool '%s'\n" "are not present in this compatibility set.\n"), zpool_get_name(zhp)); } /* if we're setting a feature, check it's in compatibility set */ if (zpool_prop_feature(cb->cb_propname) && strcmp(cb->cb_value, ZFS_FEATURE_ENABLED) == 0) { char *fname = strchr(cb->cb_propname, '@') + 1; spa_feature_t f; if (zfeature_lookup_name(fname, &f) == 0) { char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; boolean_t features[SPA_FEATURES]; if (zpool_do_load_compat(compat, features) != ZPOOL_COMPATIBILITY_OK) { (void) fprintf(stderr, gettext("Error: " "cannot enable feature '%s' on pool '%s'\n" "because the pool's 'compatibility' " "property cannot be parsed.\n"), fname, zpool_get_name(zhp)); return (-1); } if (!features[f]) { (void) fprintf(stderr, gettext("Error: " "cannot enable feature '%s' on pool '%s'\n" "as it is not specified in this pool's " "current compatibility set.\n" "Consider setting 'compatibility' to a " "less restrictive set, or to 'off'.\n"), fname, zpool_get_name(zhp)); return (-1); } } } error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); return (error); } static int set_callback(zpool_handle_t *zhp, void *data) { int error; set_cbdata_t *cb = (set_cbdata_t *)data; if (cb->cb_type == ZFS_TYPE_VDEV) { error = zpool_set_vdev_prop(zhp, *cb->cb_vdevs.cb_names, cb->cb_propname, cb->cb_value); } else { assert(cb->cb_type == ZFS_TYPE_POOL); error = set_pool_callback(zhp, cb); } cb->cb_any_successful = !error; return (error); } int zpool_do_set(int argc, char **argv) { set_cbdata_t cb = { 0 }; int error; char *vdev = NULL; current_prop_type = ZFS_TYPE_POOL; if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, 
gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing property=value " "argument\n")); usage(B_FALSE); } if (argc < 3) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 4) { (void) fprintf(stderr, gettext("too many pool names\n")); usage(B_FALSE); } cb.cb_propname = argv[1]; cb.cb_type = ZFS_TYPE_POOL; cb.cb_vdevs.cb_name_flags |= VDEV_NAME_TYPE_ID; cb.cb_value = strchr(cb.cb_propname, '='); if (cb.cb_value == NULL) { (void) fprintf(stderr, gettext("missing value in " "property=value argument\n")); usage(B_FALSE); } *(cb.cb_value) = '\0'; cb.cb_value++; argc -= 2; argv += 2; /* argv[0] is pool name */ if (!is_pool(argv[0])) { (void) fprintf(stderr, gettext("cannot open '%s': is not a pool\n"), argv[0]); return (EINVAL); } /* argv[1], when supplied, is vdev name */ if (argc == 2) { if (strcmp(argv[1], "root") == 0) vdev = strdup("root-0"); else vdev = strdup(argv[1]); if (!are_vdevs_in_pool(1, &vdev, argv[0], &cb.cb_vdevs)) { (void) fprintf(stderr, gettext( "cannot find '%s' in '%s': device not in pool\n"), vdev, argv[0]); free(vdev); return (EINVAL); } cb.cb_vdevs.cb_names = &vdev; cb.cb_vdevs.cb_names_count = 1; cb.cb_type = ZFS_TYPE_VDEV; } error = for_each_pool(1, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, set_callback, &cb); if (vdev != NULL) free(vdev); return (error); } /* Add up the total number of bytes left to initialize/trim across all vdevs */ static uint64_t vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) { uint64_t bytes_remaining; nvlist_t **child; uint_t c, children; vdev_stat_t *vs; assert(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (activity == ZPOOL_WAIT_INITIALIZE && vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) bytes_remaining = vs->vs_initialize_bytes_est - vs->vs_initialize_bytes_done; else if (activity == ZPOOL_WAIT_TRIM && vs->vs_trim_state == VDEV_TRIM_ACTIVE) bytes_remaining = vs->vs_trim_bytes_est - vs->vs_trim_bytes_done; else bytes_remaining = 0; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) bytes_remaining += vdev_activity_remaining(child[c], activity); return (bytes_remaining); } /* Add up the total number of bytes left to rebuild across top-level vdevs */ static uint64_t vdev_activity_top_remaining(nvlist_t *nv) { uint64_t bytes_remaining = 0; nvlist_t **child; uint_t children; int error; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; error = nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); if (error == 0) { if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { bytes_remaining += (vrs->vrs_bytes_est - vrs->vrs_bytes_rebuilt); } } } return (bytes_remaining); } /* Whether any vdevs are 'spare' or 'replacing' vdevs */ static boolean_t vdev_any_spare_replacing(nvlist_t *nv) { nvlist_t **child; uint_t c, children; const char *vdev_type; (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 || strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) { return (B_TRUE); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) 
{ if (vdev_any_spare_replacing(child[c])) return (B_TRUE); } return (B_FALSE); } typedef struct wait_data { char *wd_poolname; boolean_t wd_scripted; boolean_t wd_exact; boolean_t wd_headers_once; boolean_t wd_should_exit; /* Which activities to wait for */ boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES]; float wd_interval; pthread_cond_t wd_cv; pthread_mutex_t wd_mutex; } wait_data_t; /* * Print to stdout a single line, containing one column for each activity that * we are waiting for specifying how many bytes of work are left for that * activity. */ static void print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) { nvlist_t *config, *nvroot; uint_t c; int i; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; pool_raidz_expand_stat_t *pres = NULL; const char *const headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { /* * Make sure we have enough space in the col for pretty-printed * numbers and for the column header, and then leave a couple * spaces between cols for readability. */ col_widths[i] = MAX(strlen(headers[i]), 6) + 2; } if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); /* Print header if appropriate */ int term_height = terminal_height(); boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 && row % (term_height-1) == 0); if (!wd->wd_scripted && (row == 0 || reprint_header)) { for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { if (wd->wd_enabled[i]) (void) printf("%*s", col_widths[i], headers[i]); } (void) fputc('\n', stdout); } /* Bytes of work remaining in each activity */ int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0}; bytes_rem[ZPOOL_WAIT_FREE] = zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL); config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING) bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); if (prs != NULL && prs->prs_state == DSS_SCANNING) bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy - prs->prs_copied; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c); if (pss != NULL && pss->pss_state == DSS_SCANNING && pss->pss_pass_scrub_pause == 0) { int64_t rem = pss->pss_to_examine - pss->pss_issued; if (pss->pss_func == POOL_SCAN_SCRUB) bytes_rem[ZPOOL_WAIT_SCRUB] = rem; else bytes_rem[ZPOOL_WAIT_RESILVER] = rem; } else if (check_rebuilding(nvroot, NULL)) { bytes_rem[ZPOOL_WAIT_RESILVER] = vdev_activity_top_remaining(nvroot); } (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); if (pres != NULL && pres->pres_state == DSS_SCANNING) { int64_t rem = pres->pres_to_reflow - pres->pres_reflowed; bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem; } bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM); /* * A replace finishes after resilvering finishes, so the amount of work * left for a replace is the same as for resilvering. 
* * It isn't quite correct to say that if we have any 'spare' or * 'replacing' vdevs and a resilver is happening, then a replace is in * progress, like we do here. When a hot spare is used, the faulted vdev * is not removed after the hot spare is resilvered, so parent 'spare' * vdev is not removed either. So we could have a 'spare' vdev, but be * resilvering for a different reason. However, we use it as a heuristic * because we don't have access to the DTLs, which could tell us whether * or not we have really finished resilvering a hot spare. */ if (vdev_any_spare_replacing(nvroot)) bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER]; for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { char buf[64]; if (!wd->wd_enabled[i]) continue; if (wd->wd_exact) { (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); } else { zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); } if (wd->wd_scripted) (void) printf(i == 0 ? "%s" : "\t%s", buf); else (void) printf(" %*s", col_widths[i] - 1, buf); } (void) printf("\n"); (void) fflush(stdout); } static void * wait_status_thread(void *arg) { wait_data_t *wd = (wait_data_t *)arg; zpool_handle_t *zhp; if ((zhp = zpool_open(g_zfs, wd->wd_poolname)) == NULL) return (void *)(1); for (int row = 0; ; row++) { boolean_t missing; struct timespec timeout; int ret = 0; (void) clock_gettime(CLOCK_REALTIME, &timeout); if (zpool_refresh_stats(zhp, &missing) != 0 || missing || zpool_props_refresh(zhp) != 0) { zpool_close(zhp); return (void *)(uintptr_t)(missing ? 0 : 1); } print_wait_status_row(wd, zhp, row); timeout.tv_sec += floor(wd->wd_interval); long nanos = timeout.tv_nsec + (wd->wd_interval - floor(wd->wd_interval)) * NANOSEC; if (nanos >= NANOSEC) { timeout.tv_sec++; timeout.tv_nsec = nanos - NANOSEC; } else { timeout.tv_nsec = nanos; } pthread_mutex_lock(&wd->wd_mutex); if (!wd->wd_should_exit) ret = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex, &timeout); pthread_mutex_unlock(&wd->wd_mutex); if (ret == 0) { break; /* signaled by main thread */ } else if (ret != ETIMEDOUT) { (void) fprintf(stderr, gettext("pthread_cond_timedwait " "failed: %s\n"), strerror(ret)); zpool_close(zhp); return (void *)(uintptr_t)(1); } } zpool_close(zhp); return (void *)(0); } int zpool_do_wait(int argc, char **argv) { boolean_t verbose = B_FALSE; int c, i; unsigned long count; pthread_t status_thr; int error = 0; zpool_handle_t *zhp; wait_data_t wd; wd.wd_scripted = B_FALSE; wd.wd_exact = B_FALSE; wd.wd_headers_once = B_FALSE; wd.wd_should_exit = B_FALSE; pthread_mutex_init(&wd.wd_mutex, NULL); pthread_cond_init(&wd.wd_cv, NULL); /* By default, wait for all types of activity. 
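 * Every wd_enabled[] slot is turned on below; the -t option resets the
 * array and re-enables only the activities named in its comma-separated
 * list, e.g. (illustrative) 'zpool wait -t scrub,resilver tank'.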
*/ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) wd.wd_enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "HpT:t:")) != -1) { switch (c) { case 'H': wd.wd_scripted = B_TRUE; break; case 'n': wd.wd_headers_once = B_TRUE; break; case 'p': wd.wd_exact = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 't': /* Reset activities array */ memset(&wd.wd_enabled, 0, sizeof (wd.wd_enabled)); for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_opts[] = { "discard", "free", "initialize", "replace", "remove", "resilver", "scrub", "trim", "raidz_expand" }; for (i = 0; i < ARRAY_SIZE(col_opts); ++i) if (strcmp(tok, col_opts[i]) == 0) { wd.wd_enabled[i] = B_TRUE; goto found; } (void) fprintf(stderr, gettext("invalid activity '%s'\n"), tok); usage(B_FALSE); found:; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &wd.wd_interval, &count); if (count != 0) { /* This subcmd only accepts an interval, not a count */ (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (wd.wd_interval != 0) verbose = B_TRUE; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'pool' argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } wd.wd_poolname = argv[0]; if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL) return (1); if (verbose) { /* * We use a separate thread for printing status updates because * the main thread will call lzc_wait(), which blocks as long * as an activity is in progress, which can be a long time. */ if (pthread_create(&status_thr, NULL, wait_status_thread, &wd) != 0) { (void) fprintf(stderr, gettext("failed to create status" "thread: %s\n"), strerror(errno)); zpool_close(zhp); return (1); } } /* * Loop over all activities that we are supposed to wait for until none * of them are in progress. Note that this means we can end up waiting * for more activities to complete than just those that were in progress * when we began waiting; if an activity we are interested in begins * while we are waiting for another activity, we will wait for both to * complete before exiting. */ for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!wd.wd_enabled[i]) continue; error = zpool_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zpool_close(zhp); if (verbose) { uintptr_t status; pthread_mutex_lock(&wd.wd_mutex); wd.wd_should_exit = B_TRUE; pthread_cond_signal(&wd.wd_cv); pthread_mutex_unlock(&wd.wd_mutex); (void) pthread_join(status_thr, (void *)&status); if (status != 0) error = status; } pthread_mutex_destroy(&wd.wd_mutex); pthread_cond_destroy(&wd.wd_cv); return (error); } +/* + * zpool ddtprune -d|-p + * + * -d Prune entries old and older + * -p Prune amount of entries + * + * Prune single reference entries from DDT to satisfy the amount specified. 
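+ *
+ * Illustrative invocations (pool name and values are examples only):
+ *   'zpool ddtprune -d 30 tank'  prunes unique entries 30 days old or
+ *                                older (-d is converted to seconds below)
+ *   'zpool ddtprune -p 25 tank'  prunes 25% of the unique entries
+ * The -d and -p options are mutually exclusive, and exactly one pool
+ * name is required.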
+ */ +int +zpool_do_ddt_prune(int argc, char **argv) +{ + zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE; + uint64_t amount = 0; + zpool_handle_t *zhp; + char *endptr; + int c; + + while ((c = getopt(argc, argv, "d:p:")) != -1) { + switch (c) { + case 'd': + if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) { + (void) fprintf(stderr, gettext("-d cannot be " + "combined with -p option\n")); + usage(B_FALSE); + } + errno = 0; + amount = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0' || amount == 0) { + (void) fprintf(stderr, + gettext("invalid days value\n")); + usage(B_FALSE); + } + amount *= 86400; /* convert days to seconds */ + unit = ZPOOL_DDT_PRUNE_AGE; + break; + case 'p': + if (unit == ZPOOL_DDT_PRUNE_AGE) { + (void) fprintf(stderr, gettext("-p cannot be " + "combined with -d option\n")); + usage(B_FALSE); + } + errno = 0; + amount = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0' || + amount == 0 || amount > 100) { + (void) fprintf(stderr, + gettext("invalid percentage value\n")); + usage(B_FALSE); + } + unit = ZPOOL_DDT_PRUNE_PERCENTAGE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + if (unit == ZPOOL_DDT_PRUNE_NONE) { + (void) fprintf(stderr, + gettext("missing amount option (-d|-p )\n")); + usage(B_FALSE); + } else if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool argument\n")); + usage(B_FALSE); + } else if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + zhp = zpool_open(g_zfs, argv[0]); + if (zhp == NULL) + return (-1); + + int error = zpool_ddt_prune(zhp, unit, amount); + + zpool_close(zhp); + + return (error); +} + static int find_command_idx(const char *command, int *idx) { for (int i = 0; i < NCOMMAND; ++i) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } /* * Display version message */ static int zpool_do_version(int argc, char **argv) { int c; nvlist_t *jsobj = NULL, *zfs_ver = NULL; boolean_t json = B_FALSE; while ((c = getopt(argc, argv, "j")) != -1) { switch (c) { case 'j': json = B_TRUE; jsobj = zpool_json_schema(0, 1); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; if (argc != 0) { (void) fprintf(stderr, "too many arguments\n"); usage(B_FALSE); } if (json) { zfs_ver = zfs_version_nvlist(); if (zfs_ver) { fnvlist_add_nvlist(jsobj, "zfs_version", zfs_ver); zcmd_print_json(jsobj); fnvlist_free(zfs_ver); return (0); } else return (-1); } else return (zfs_version_print() != 0); } /* Display documentation */ static int zpool_do_help(int argc, char **argv) { char page[MAXNAMELEN]; if (argc < 3 || strcmp(argv[2], "zpool") == 0) strcpy(page, "zpool"); else if (strcmp(argv[2], "concepts") == 0 || strcmp(argv[2], "props") == 0) snprintf(page, sizeof (page), "zpool%s", argv[2]); else snprintf(page, sizeof (page), "zpool-%s", argv[2]); execlp("man", "man", page, NULL); fprintf(stderr, "couldn't run man program: %s", strerror(errno)); return (-1); } /* * Do zpool_load_compat() and print error message on failure */ static zpool_compat_status_t zpool_do_load_compat(const char *compat, boolean_t *list) { char report[1024]; zpool_compat_status_t ret; ret = zpool_load_compat(compat, list, report, 1024); switch (ret) { case ZPOOL_COMPATIBILITY_OK: break; case ZPOOL_COMPATIBILITY_NOFILES: case ZPOOL_COMPATIBILITY_BADFILE: case 
ZPOOL_COMPATIBILITY_BADTOKEN: (void) fprintf(stderr, "Error: %s\n", report); break; case ZPOOL_COMPATIBILITY_WARNTOKEN: (void) fprintf(stderr, "Warning: %s\n", report); ret = ZPOOL_COMPATIBILITY_OK; break; } return (ret); } int main(int argc, char **argv) { int ret = 0; int i = 0; char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); srand(time(NULL)); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zpool_do_version(argc, argv)); /* * Special case 'help' */ if (strcmp(cmdname, "help") == 0) return (zpool_do_help(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = safe_malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. */ if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=')) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { /* * 'freeze' is a vile debugging abomination, so we treat * it as such. */ zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc); if (ret != 0) { (void) fprintf(stderr, gettext("failed to freeze pool: %d\n"), errno); ret = 1; } log_history = 0; } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } diff --git a/cmd/ztest.c b/cmd/ztest.c index 7c9db84d4ea4..a7843d338834 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -1,9059 +1,9087 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2023, Klara, Inc. */ /* * The objective of this program is to provide a DMU/ZAP/SPA stress test * that runs entirely in userland, is easy to use, and easy to extend. * * The overall design of the ztest program is as follows: * * (1) For each major functional area (e.g. adding vdevs to a pool, * creating and destroying datasets, reading and writing objects, etc) * we have a simple routine to test that functionality. These * individual routines do not have to do anything "stressful". * * (2) We turn these simple functionality tests into a stress test by * running them all in parallel, with as many threads as desired, * and spread across as many datasets, objects, and vdevs as desired. * * (3) While all this is happening, we inject faults into the pool to * verify that self-healing data really works. * * (4) Every time we open a dataset, we change its checksum and compression * functions. Thus even individual objects vary from block to block * in which checksum they use and whether they're compressed. * * (5) To verify that we never lose on-disk consistency after a crash, * we run the entire test in a child of the main process. * At random times, the child self-immolates with a SIGKILL. * This is the software equivalent of pulling the power cord. * The parent then runs the test again, using the existing * storage pool, as many times as desired. If backwards compatibility * testing is enabled ztest will sometimes run the "older" version * of ztest after a SIGKILL. * * (6) To verify that we don't have future leaks or temporal incursions, * many of the functional tests record the transaction group number * as part of their data. When reading old data, they verify that * the transaction group number is less than the current, open txg. * If you add a new test, please do this if applicable. * * (7) Threads are created with a reduced stack size, for sanity checking. * Therefore, it's important not to allocate huge buffers on the stack. * * When run with no arguments, ztest runs for about five minutes and * produces no output if successful. To get a little bit of information, * specify -V. To get more information, specify -VV, and so on. * * To turn this into an overnight stress test, use -T to specify run time. * * You can ask more vdevs [-v], datasets [-d], or threads [-t] * to increase the pool capacity, fanout, and overall stress level. * * Use the -k option to set the desired frequency of kills. * * When ztest invokes itself it passes all relevant information through a * temporary file which is mmap-ed in the child process. This allows shared * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always * stored at offset 0 of this file and contains information on the size and * number of shared structures in the file. 
The information stored in this file * must remain backwards compatible with older versions of ztest so that * ztest can invoke them during backwards compatibility testing (-B). */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ztest_fd_data = -1; static int ztest_fd_rand = -1; typedef struct ztest_shared_hdr { uint64_t zh_hdr_size; uint64_t zh_opts_size; uint64_t zh_size; uint64_t zh_stats_size; uint64_t zh_stats_count; uint64_t zh_ds_size; uint64_t zh_ds_count; uint64_t zh_scratch_state_size; } ztest_shared_hdr_t; static ztest_shared_hdr_t *ztest_shared_hdr; enum ztest_class_state { ZTEST_VDEV_CLASS_OFF, ZTEST_VDEV_CLASS_ON, ZTEST_VDEV_CLASS_RND }; /* Dedicated RAIDZ Expansion test states */ typedef enum { RAIDZ_EXPAND_NONE, /* Default is none, must opt-in */ RAIDZ_EXPAND_REQUESTED, /* The '-X' option was used */ RAIDZ_EXPAND_STARTED, /* Testing has commenced */ RAIDZ_EXPAND_KILLED, /* Reached the proccess kill */ RAIDZ_EXPAND_CHECKED, /* Pool scrub verification done */ } raidz_expand_test_state_t; #define ZO_GVARS_MAX_ARGLEN ((size_t)64) #define ZO_GVARS_MAX_COUNT ((size_t)10) typedef struct ztest_shared_opts { char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; char zo_alt_ztest[MAXNAMELEN]; char zo_alt_libpath[MAXNAMELEN]; uint64_t zo_vdevs; uint64_t zo_vdevtime; size_t zo_vdev_size; int zo_ashift; int zo_mirrors; int zo_raid_do_expand; int zo_raid_children; int zo_raid_parity; char zo_raid_type[8]; int zo_draid_data; int zo_draid_spares; int zo_datasets; int zo_threads; uint64_t zo_passtime; uint64_t zo_killrate; int zo_verbose; int zo_init; uint64_t zo_time; uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; raidz_expand_test_state_t zo_raidz_expand_test; int zo_mmp_test; int zo_special_vdevs; int zo_dump_dbgmsg; int zo_gvars_count; char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN]; } ztest_shared_opts_t; /* Default values for command line options. */ #define DEFAULT_POOL "ztest" #define DEFAULT_VDEV_DIR "/tmp" #define DEFAULT_VDEV_COUNT 5 #define DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4) /* 256m default size */ #define DEFAULT_VDEV_SIZE_STR "256M" #define DEFAULT_ASHIFT SPA_MINBLOCKSHIFT #define DEFAULT_MIRRORS 2 #define DEFAULT_RAID_CHILDREN 4 #define DEFAULT_RAID_PARITY 1 #define DEFAULT_DRAID_DATA 4 #define DEFAULT_DRAID_SPARES 1 #define DEFAULT_DATASETS_COUNT 7 #define DEFAULT_THREADS 23 #define DEFAULT_RUN_TIME 300 /* 300 seconds */ #define DEFAULT_RUN_TIME_STR "300 sec" #define DEFAULT_PASS_TIME 60 /* 60 seconds */ #define DEFAULT_PASS_TIME_STR "60 sec" #define DEFAULT_KILL_RATE 70 /* 70% kill rate */ #define DEFAULT_KILLRATE_STR "70%" #define DEFAULT_INITS 1 #define DEFAULT_MAX_LOOPS 50 /* 5 minutes */ #define DEFAULT_FORCE_GANGING (64 << 10) #define DEFAULT_FORCE_GANGING_STR "64K" /* Simplifying assumption: -1 is not a valid default. 
*/ #define NO_DEFAULT -1 static const ztest_shared_opts_t ztest_opts_defaults = { .zo_pool = DEFAULT_POOL, .zo_dir = DEFAULT_VDEV_DIR, .zo_alt_ztest = { '\0' }, .zo_alt_libpath = { '\0' }, .zo_vdevs = DEFAULT_VDEV_COUNT, .zo_ashift = DEFAULT_ASHIFT, .zo_mirrors = DEFAULT_MIRRORS, .zo_raid_children = DEFAULT_RAID_CHILDREN, .zo_raid_parity = DEFAULT_RAID_PARITY, .zo_raid_type = VDEV_TYPE_RAIDZ, .zo_vdev_size = DEFAULT_VDEV_SIZE, .zo_draid_data = DEFAULT_DRAID_DATA, /* data drives */ .zo_draid_spares = DEFAULT_DRAID_SPARES, /* distributed spares */ .zo_datasets = DEFAULT_DATASETS_COUNT, .zo_threads = DEFAULT_THREADS, .zo_passtime = DEFAULT_PASS_TIME, .zo_killrate = DEFAULT_KILL_RATE, .zo_verbose = 0, .zo_mmp_test = 0, .zo_init = DEFAULT_INITS, .zo_time = DEFAULT_RUN_TIME, .zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */ .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, .zo_gvars_count = 0, .zo_raidz_expand_test = RAIDZ_EXPAND_NONE, }; extern uint64_t metaslab_force_ganging; extern uint64_t metaslab_df_alloc_threshold; extern uint64_t zfs_deadman_synctime_ms; extern uint_t metaslab_preload_limit; extern int zfs_compressed_arc_enabled; extern int zfs_abd_scatter_enabled; extern uint_t dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; extern uint64_t raidz_expand_max_reflow_bytes; extern uint_t raidz_expand_pause_point; +extern boolean_t ddt_prune_artificial_age; +extern boolean_t ddt_dump_prune_histogram; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345"; typedef struct ztest_shared_ds { uint64_t zd_seq; } ztest_shared_ds_t; static ztest_shared_ds_t *ztest_shared_ds; #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) typedef struct ztest_scratch_state { uint64_t zs_raidz_scratch_verify_pause; } ztest_shared_scratch_state_t; static ztest_shared_scratch_state_t *ztest_scratch_state; #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, ZTEST_IO_WRITE_PATTERN, ZTEST_IO_WRITE_ZEROES, ZTEST_IO_TRUNCATE, ZTEST_IO_SETATTR, ZTEST_IO_REWRITE, ZTEST_IO_TYPES }; typedef struct ztest_block_tag { uint64_t bt_magic; uint64_t bt_objset; uint64_t bt_object; uint64_t bt_dnodesize; uint64_t bt_offset; uint64_t bt_gen; uint64_t bt_txg; uint64_t bt_crtxg; } ztest_block_tag_t; typedef struct bufwad { uint64_t bw_index; uint64_t bw_txg; uint64_t bw_data; } bufwad_t; /* * It would be better to use a rangelock_t per object. Unfortunately * the rangelock_t is not a drop-in replacement for rl_t, because we * still need to map from object ID to rangelock_t. */ typedef enum { ZTRL_READER, ZTRL_WRITER, ZTRL_APPEND } rl_type_t; typedef struct rll { void *rll_writer; int rll_readers; kmutex_t rll_lock; kcondvar_t rll_cv; } rll_t; typedef struct rl { uint64_t rl_object; uint64_t rl_offset; uint64_t rl_size; rll_t *rl_lock; } rl_t; #define ZTEST_RANGE_LOCKS 64 #define ZTEST_OBJECT_LOCKS 64 /* * Object descriptor. Used as a template for object lookup/create/remove. 
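 *
 * (Illustrative usage, assuming the ztest_lookup()/ztest_create() helpers
 * defined later in this file: a caller fills in od_dir, od_name and the
 * od_cr* "requested at creation" fields; ztest_create() then allocates the
 * object and writes back od_object, od_type, od_blocksize and od_gen,
 * while ztest_lookup() fills the same fields for an existing object.)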
*/ typedef struct ztest_od { uint64_t od_dir; uint64_t od_object; dmu_object_type_t od_type; dmu_object_type_t od_crtype; uint64_t od_blocksize; uint64_t od_crblocksize; uint64_t od_crdnodesize; uint64_t od_gen; uint64_t od_crgen; char od_name[ZFS_MAX_DATASET_NAME_LEN]; } ztest_od_t; /* * Per-dataset state. */ typedef struct ztest_ds { ztest_shared_ds_t *zd_shared; objset_t *zd_os; pthread_rwlock_t zd_zilog_lock; zilog_t *zd_zilog; ztest_od_t *zd_od; /* debugging aid */ char zd_name[ZFS_MAX_DATASET_NAME_LEN]; kmutex_t zd_dirobj_lock; rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; } ztest_ds_t; /* * Per-iteration state. */ typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); typedef struct ztest_info { ztest_func_t *zi_func; /* test function */ uint64_t zi_iters; /* iterations per execution */ uint64_t *zi_interval; /* execute every seconds */ const char *zi_funcname; /* name of test function */ } ztest_info_t; typedef struct ztest_shared_callstate { uint64_t zc_count; /* per-pass count */ uint64_t zc_time; /* per-pass time */ uint64_t zc_next; /* next time to call this function */ } ztest_shared_callstate_t; static ztest_shared_callstate_t *ztest_shared_callstate; #define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) ztest_func_t ztest_dmu_read_write; ztest_func_t ztest_dmu_write_parallel; ztest_func_t ztest_dmu_object_alloc_free; ztest_func_t ztest_dmu_object_next_chunk; ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; ztest_func_t ztest_zil_commit; ztest_func_t ztest_zil_remount; ztest_func_t ztest_dmu_read_write_zcopy; ztest_func_t ztest_dmu_objset_create_destroy; ztest_func_t ztest_dmu_prealloc; ztest_func_t ztest_fzap; ztest_func_t ztest_dmu_snapshot_create_destroy; ztest_func_t ztest_dsl_prop_get_set; ztest_func_t ztest_spa_prop_get_set; ztest_func_t ztest_spa_create_destroy; ztest_func_t ztest_fault_inject; ztest_func_t ztest_dmu_snapshot_hold; ztest_func_t ztest_mmp_enable_disable; ztest_func_t ztest_scrub; ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; ztest_func_t ztest_vdev_raidz_attach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_class_add; ztest_func_t ztest_vdev_aux_add_remove; ztest_func_t ztest_split_pool; ztest_func_t ztest_reguid; ztest_func_t ztest_spa_upgrade; ztest_func_t ztest_device_removal; ztest_func_t ztest_spa_checkpoint_create_discard; ztest_func_t ztest_initialize; ztest_func_t ztest_trim; ztest_func_t ztest_blake3; ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; ztest_func_t ztest_pool_prefetch_ddt; +ztest_func_t ztest_ddt_prune; static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ static uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ static uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ static uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ #define ZTI_INIT(func, iters, interval) \ { .zi_func = (func), \ .zi_iters = (iters), \ .zi_interval = (interval), \ .zi_funcname = # func } static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), 
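	/*
	 * (Illustrative, not part of the original table: each ZTI_INIT()
	 * entry expands to a designated initializer, so the ztest_zap entry
	 * that follows is equivalent to
	 *
	 *	{ .zi_func = ztest_zap, .zi_iters = 30,
	 *	    .zi_interval = &zopt_always, .zi_funcname = "ztest_zap" },
	 */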
ZTI_INIT(ztest_zap, 30, &zopt_always), ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), ZTI_INIT(ztest_split_pool, 1, &zopt_sometimes), ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), #if 0 ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), #endif ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), ZTI_INIT(ztest_reguid, 1, &zopt_rarely), ZTI_INIT(ztest_scrub, 1, &zopt_rarely), ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes), ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), ZTI_INIT(ztest_trim, 1, &zopt_sometimes), ZTI_INIT(ztest_blake3, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), + ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) /* * The following struct is used to hold a list of uncalled commit callbacks. * The callbacks are ordered by txg number. */ typedef struct ztest_cb_list { kmutex_t zcl_callbacks_lock; list_t zcl_callbacks; } ztest_cb_list_t; /* * Stuff we need to share writably between parent and child. */ typedef struct ztest_shared { boolean_t zs_do_init; hrtime_t zs_proc_start; hrtime_t zs_proc_stop; hrtime_t zs_thread_start; hrtime_t zs_thread_stop; hrtime_t zs_thread_kill; uint64_t zs_enospc_count; uint64_t zs_vdev_next_leaf; uint64_t zs_vdev_aux; uint64_t zs_alloc; uint64_t zs_space; uint64_t zs_splits; uint64_t zs_mirrors; uint64_t zs_metaslab_sz; uint64_t zs_metaslab_df_alloc_threshold; uint64_t zs_guid; } ztest_shared_t; #define ID_PARALLEL -1ULL static char ztest_dev_template[] = "%s/%s.%llua"; static char ztest_aux_template[] = "%s/%s.%s.%llu"; static ztest_shared_t *ztest_shared; static spa_t *ztest_spa = NULL; static ztest_ds_t *ztest_ds; static kmutex_t ztest_vdev_lock; static boolean_t ztest_device_removal_active = B_FALSE; static boolean_t ztest_pool_scrubbed = B_FALSE; static kmutex_t ztest_checkpoint_lock; /* * The ztest_name_lock protects the pool and dataset namespace used by * the individual tests. To modify the namespace, consumers must grab * this lock as writer. Grabbing the lock as reader will ensure that the * namespace does not change while the lock is held. 
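 *
 * A minimal sketch of the intended pattern (illustrative, using the same
 * pthread_rwlock_*() calls that the tests in this file use):
 *
 *	(void) pthread_rwlock_rdlock(&ztest_name_lock);
 *	... resolve or open datasets by name ...
 *	(void) pthread_rwlock_unlock(&ztest_name_lock);
 *
 * Operations that rename or destroy datasets take the same lock with
 * pthread_rwlock_wrlock() instead.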
*/ static pthread_rwlock_t ztest_name_lock; static boolean_t ztest_dump_core = B_TRUE; static boolean_t ztest_exiting; /* Global commit callback list */ static ztest_cb_list_t zcl; /* Commit cb delay */ static uint64_t zc_min_txg_delay = UINT64_MAX; static int zc_cb_counter = 0; /* * Minimum number of commit callbacks that need to be registered for us to check * whether the minimum txg delay is acceptable. */ #define ZTEST_COMMIT_CB_MIN_REG 100 /* * If a number of txgs equal to this threshold have been created after a commit * callback has been registered but not called, then we assume there is an * implementation bug. */ #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) enum ztest_object { ZTEST_META_DNODE = 0, ZTEST_DIROBJ, ZTEST_OBJECTS }; static __attribute__((noreturn)) void usage(boolean_t requested); static int ztest_scrub_impl(spa_t *spa); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void dump_debug_buffer(void) { ssize_t ret __attribute__((unused)); if (!ztest_opts.zo_dump_dbgmsg) return; /* * We use write() instead of printf() so that this function * is safe to call from a signal handler. */ ret = write(STDERR_FILENO, "\n", 1); zfs_dbgmsg_print(STDERR_FILENO, "ztest"); } static void sig_handler(int signo) { struct sigaction action; libspl_backtrace(STDERR_FILENO); dump_debug_buffer(); /* * Restore default action and re-raise signal so SIGSEGV and * SIGABRT can trigger a core dump. */ action.sa_handler = SIG_DFL; sigemptyset(&action.sa_mask); action.sa_flags = 0; (void) sigaction(signo, &action, NULL); raise(signo); } #define FATAL_MSG_SZ 1024 static const char *fatal_msg; static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void fatal(int do_perror, const char *message, ...) { va_list args; int save_errno = errno; char *buf; (void) fflush(stdout); buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); if (buf == NULL) goto out; va_start(args, message); (void) sprintf(buf, "ztest: "); /* LINTED */ (void) vsprintf(buf + strlen(buf), message, args); va_end(args); if (do_perror) { (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), ": %s", strerror(save_errno)); } (void) fprintf(stderr, "%s\n", buf); fatal_msg = buf; /* to ease debugging */ out: if (ztest_dump_core) abort(); else dump_debug_buffer(); exit(3); } static int str2shift(const char *buf) { const char *ends = "BKMGTPEZ"; int i; if (buf[0] == '\0') return (0); for (i = 0; i < strlen(ends); i++) { if (toupper(buf[0]) == ends[i]) break; } if (i == strlen(ends)) { (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); usage(B_FALSE); } if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { return (10*i); } (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); usage(B_FALSE); } static uint64_t nicenumtoull(const char *buf) { char *end; uint64_t val; val = strtoull(buf, &end, 0); if (end == buf) { (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); usage(B_FALSE); } else if (end[0] == '.') { double fval = strtod(buf, &end); fval *= pow(2, str2shift(end)); /* * UINT64_MAX is not exactly representable as a double. * The closest representation is UINT64_MAX + 1, so we * use a >= comparison instead of > for the bounds check. 
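		 *
		 * Worked example (illustrative): for an argument of "1.5K",
		 * strtoull() above returns 1 and leaves end at ".5K", so this
		 * branch is taken; strtod() yields 1.5, str2shift("K")
		 * returns 10, and the final value is
		 * (uint64_t)(1.5 * 1024) = 1536.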
*/ if (fval >= (double)UINT64_MAX) { (void) fprintf(stderr, "ztest: value too large: %s\n", buf); usage(B_FALSE); } val = (uint64_t)fval; } else { int shift = str2shift(end); if (shift >= 64 || (val << shift) >> shift != val) { (void) fprintf(stderr, "ztest: value too large: %s\n", buf); usage(B_FALSE); } val <<= shift; } return (val); } typedef struct ztest_option { const char short_opt; const char *long_opt; const char *long_opt_param; const char *comment; unsigned int default_int; const char *default_str; } ztest_option_t; /* * The following option_table is used for generating the usage info as well as * the long and short option information for calling getopt_long(). */ static ztest_option_t option_table[] = { { 'v', "vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT, NULL}, { 's', "vdev-size", "INTEGER", "Size of each vdev", NO_DEFAULT, DEFAULT_VDEV_SIZE_STR}, { 'a', "alignment-shift", "INTEGER", "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL}, { 'm', "mirror-copies", "INTEGER", "Number of mirror copies", DEFAULT_MIRRORS, NULL}, { 'r', "raid-disks", "INTEGER", "Number of raidz/draid disks", DEFAULT_RAID_CHILDREN, NULL}, { 'R', "raid-parity", "INTEGER", "Raid parity", DEFAULT_RAID_PARITY, NULL}, { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", NO_DEFAULT, "random"}, { 'D', "draid-data", "INTEGER", "Number of draid data drives", DEFAULT_DRAID_DATA, NULL}, { 'S', "draid-spares", "INTEGER", "Number of draid spares", DEFAULT_DRAID_SPARES, NULL}, { 'd', "datasets", "INTEGER", "Number of datasets", DEFAULT_DATASETS_COUNT, NULL}, { 't', "threads", "INTEGER", "Number of ztest threads", DEFAULT_THREADS, NULL}, { 'g', "gang-block-threshold", "INTEGER", "Metaslab gang block threshold", NO_DEFAULT, DEFAULT_FORCE_GANGING_STR}, { 'i', "init-count", "INTEGER", "Number of times to initialize pool", DEFAULT_INITS, NULL}, { 'k', "kill-percentage", "INTEGER", "Kill percentage", NO_DEFAULT, DEFAULT_KILLRATE_STR}, { 'p', "pool-name", "STRING", "Pool name", NO_DEFAULT, DEFAULT_POOL}, { 'f', "vdev-file-directory", "PATH", "File directory for vdev files", NO_DEFAULT, DEFAULT_VDEV_DIR}, { 'M', "multi-host", NULL, "Multi-host; simulate pool imported on remote host", NO_DEFAULT, NULL}, { 'E', "use-existing-pool", NULL, "Use existing pool instead of creating new one", NO_DEFAULT, NULL}, { 'T', "run-time", "INTEGER", "Total run time", NO_DEFAULT, DEFAULT_RUN_TIME_STR}, { 'P', "pass-time", "INTEGER", "Time per pass", NO_DEFAULT, DEFAULT_PASS_TIME_STR}, { 'F', "freeze-loops", "INTEGER", "Max loops in spa_freeze()", DEFAULT_MAX_LOOPS, NULL}, { 'B', "alt-ztest", "PATH", "Alternate ztest path", NO_DEFAULT, NULL}, { 'C', "vdev-class-state", "on|off|random", "vdev class state", NO_DEFAULT, "random"}, { 'X', "raidz-expansion", NULL, "Perform a dedicated raidz expansion test", NO_DEFAULT, NULL}, { 'o', "option", "\"OPTION=INTEGER\"", "Set global variable to an unsigned 32-bit integer value", NO_DEFAULT, NULL}, { 'G', "dump-debug-msg", NULL, "Dump zfs_dbgmsg buffer before exiting due to an error", NO_DEFAULT, NULL}, { 'V', "verbose", NULL, "Verbose (use multiple times for ever more verbosity)", NO_DEFAULT, NULL}, { 'h', "help", NULL, "Show this help", NO_DEFAULT, NULL}, {0, 0, 0, 0, 0, 0} }; static struct option *long_opts = NULL; static char *short_opts = NULL; static void init_options(void) { ASSERT3P(long_opts, ==, NULL); ASSERT3P(short_opts, ==, NULL); int count = sizeof (option_table) / sizeof (option_table[0]); long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); 
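	/*
	 * (Illustrative note: the loop below emits one character per option
	 * plus a ':' for options that take an argument, so for the table
	 * above the generated short-option string begins "v:s:a:m:r:" and
	 * the 2 * count allocation below is always sufficient.)
	 */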
short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL); int short_opt_index = 0; for (int i = 0; i < count; i++) { long_opts[i].val = option_table[i].short_opt; long_opts[i].name = option_table[i].long_opt; long_opts[i].has_arg = option_table[i].long_opt_param != NULL ? required_argument : no_argument; long_opts[i].flag = NULL; short_opts[short_opt_index++] = option_table[i].short_opt; if (option_table[i].long_opt_param != NULL) { short_opts[short_opt_index++] = ':'; } } } static void fini_options(void) { int count = sizeof (option_table) / sizeof (option_table[0]); umem_free(long_opts, sizeof (struct option) * count); umem_free(short_opts, sizeof (char) * 2 * count); long_opts = NULL; short_opts = NULL; } static __attribute__((noreturn)) void usage(boolean_t requested) { char option[80]; FILE *fp = requested ? stdout : stderr; (void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL); for (int i = 0; option_table[i].short_opt != 0; i++) { if (option_table[i].long_opt_param != NULL) { (void) sprintf(option, " -%c --%s=%s", option_table[i].short_opt, option_table[i].long_opt, option_table[i].long_opt_param); } else { (void) sprintf(option, " -%c --%s", option_table[i].short_opt, option_table[i].long_opt); } (void) fprintf(fp, " %-43s%s", option, option_table[i].comment); if (option_table[i].long_opt_param != NULL) { if (option_table[i].default_str != NULL) { (void) fprintf(fp, " (default: %s)", option_table[i].default_str); } else if (option_table[i].default_int != NO_DEFAULT) { (void) fprintf(fp, " (default: %u)", option_table[i].default_int); } } (void) fprintf(fp, "\n"); } exit(requested ? 0 : 1); } static uint64_t ztest_random(uint64_t range) { uint64_t r; ASSERT3S(ztest_fd_rand, >=, 0); if (range == 0) return (0); if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) fatal(B_TRUE, "short read from /dev/urandom"); return (r % range); } static void ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) { char name[32]; char *value; int state = ZTEST_VDEV_CLASS_RND; (void) strlcpy(name, input, sizeof (name)); value = strchr(name, '='); if (value == NULL) { (void) fprintf(stderr, "missing value in property=value " "'-C' argument (%s)\n", input); usage(B_FALSE); } *(value) = '\0'; value++; if (strcmp(value, "on") == 0) { state = ZTEST_VDEV_CLASS_ON; } else if (strcmp(value, "off") == 0) { state = ZTEST_VDEV_CLASS_OFF; } else if (strcmp(value, "random") == 0) { state = ZTEST_VDEV_CLASS_RND; } else { (void) fprintf(stderr, "invalid property value '%s'\n", value); usage(B_FALSE); } if (strcmp(name, "special") == 0) { zo->zo_special_vdevs = state; } else { (void) fprintf(stderr, "invalid property name '%s'\n", name); usage(B_FALSE); } if (zo->zo_verbose >= 3) (void) printf("%s vdev state is '%s'\n", name, value); } static void process_options(int argc, char **argv) { char *path; ztest_shared_opts_t *zo = &ztest_opts; int opt; uint64_t value; const char *raid_kind = "random"; memcpy(zo, &ztest_opts_defaults, sizeof (*zo)); init_options(); while ((opt = getopt_long(argc, argv, short_opts, long_opts, NULL)) != EOF) { value = 0; switch (opt) { case 'v': case 's': case 'a': case 'm': case 'r': case 'R': case 'D': case 'S': case 'd': case 't': case 'g': case 'i': case 'k': case 'T': case 'P': case 'F': value = nicenumtoull(optarg); } switch (opt) { case 'v': zo->zo_vdevs = value; break; case 's': zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); break; case 'a': zo->zo_ashift = value; break; case 'm': zo->zo_mirrors = value; break; case 'r': zo->zo_raid_children = MAX(1, value); 
break; case 'R': zo->zo_raid_parity = MIN(MAX(value, 1), 3); break; case 'K': raid_kind = optarg; break; case 'D': zo->zo_draid_data = MAX(1, value); break; case 'S': zo->zo_draid_spares = MAX(1, value); break; case 'd': zo->zo_datasets = MAX(1, value); break; case 't': zo->zo_threads = MAX(1, value); break; case 'g': zo->zo_metaslab_force_ganging = MAX(SPA_MINBLOCKSIZE << 1, value); break; case 'i': zo->zo_init = value; break; case 'k': zo->zo_killrate = value; break; case 'p': (void) strlcpy(zo->zo_pool, optarg, sizeof (zo->zo_pool)); break; case 'f': path = realpath(optarg, NULL); if (path == NULL) { (void) fprintf(stderr, "error: %s: %s\n", optarg, strerror(errno)); usage(B_FALSE); } else { (void) strlcpy(zo->zo_dir, path, sizeof (zo->zo_dir)); free(path); } break; case 'M': zo->zo_mmp_test = 1; break; case 'V': zo->zo_verbose++; break; case 'X': zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED; break; case 'E': zo->zo_init = 0; break; case 'T': zo->zo_time = value; break; case 'P': zo->zo_passtime = MAX(1, value); break; case 'F': zo->zo_maxloops = MAX(1, value); break; case 'B': (void) strlcpy(zo->zo_alt_ztest, optarg, sizeof (zo->zo_alt_ztest)); break; case 'C': ztest_parse_name_value(optarg, zo); break; case 'o': if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { (void) fprintf(stderr, "max global var count (%zu) exceeded\n", ZO_GVARS_MAX_COUNT); usage(B_FALSE); } char *v = zo->zo_gvars[zo->zo_gvars_count]; if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= ZO_GVARS_MAX_ARGLEN) { (void) fprintf(stderr, "global var option '%s' is too long\n", optarg); usage(B_FALSE); } zo->zo_gvars_count++; break; case 'G': zo->zo_dump_dbgmsg = 1; break; case 'h': usage(B_TRUE); break; case '?': default: usage(B_FALSE); break; } } fini_options(); /* Force compatible options for raidz expansion run */ if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) { zo->zo_mmp_test = 0; zo->zo_mirrors = 0; zo->zo_vdevs = 1; zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2; zo->zo_raid_do_expand = B_FALSE; raid_kind = "raidz"; } if (strcmp(raid_kind, "random") == 0) { switch (ztest_random(3)) { case 0: raid_kind = "raidz"; break; case 1: raid_kind = "eraidz"; break; case 2: raid_kind = "draid"; break; } if (ztest_opts.zo_verbose >= 3) (void) printf("choosing RAID type '%s'\n", raid_kind); } if (strcmp(raid_kind, "draid") == 0) { uint64_t min_devsize; /* With fewer disk use 256M, otherwise 128M is OK */ min_devsize = (ztest_opts.zo_raid_children < 16) ? 
(256ULL << 20) : (128ULL << 20); /* No top-level mirrors with dRAID for now */ zo->zo_mirrors = 0; /* Use more appropriate defaults for dRAID */ if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) zo->zo_vdevs = 1; if (zo->zo_raid_children == ztest_opts_defaults.zo_raid_children) zo->zo_raid_children = 16; if (zo->zo_ashift < 12) zo->zo_ashift = 12; if (zo->zo_vdev_size < min_devsize) zo->zo_vdev_size = min_devsize; if (zo->zo_draid_data + zo->zo_raid_parity > zo->zo_raid_children - zo->zo_draid_spares) { (void) fprintf(stderr, "error: too few draid " "children (%d) for stripe width (%d)\n", zo->zo_raid_children, zo->zo_draid_data + zo->zo_raid_parity); usage(B_FALSE); } (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, sizeof (zo->zo_raid_type)); } else if (strcmp(raid_kind, "eraidz") == 0) { /* using eraidz (expandable raidz) */ zo->zo_raid_do_expand = B_TRUE; /* tests expect top-level to be raidz */ zo->zo_mirrors = 0; zo->zo_vdevs = 1; /* Make sure parity is less than data columns */ zo->zo_raid_parity = MIN(zo->zo_raid_parity, zo->zo_raid_children - 1); } else /* using raidz */ { ASSERT0(strcmp(raid_kind, "raidz")); zo->zo_raid_parity = MIN(zo->zo_raid_parity, zo->zo_raid_children - 1); } zo->zo_vdevtime = (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : UINT64_MAX >> 2); if (*zo->zo_alt_ztest) { const char *invalid_what = "ztest"; char *val = zo->zo_alt_ztest; if (0 != access(val, X_OK) || (strrchr(val, '/') == NULL && (errno == EINVAL))) goto invalid; int dirlen = strrchr(val, '/') - val; strlcpy(zo->zo_alt_libpath, val, MIN(sizeof (zo->zo_alt_libpath), dirlen + 1)); invalid_what = "library path", val = zo->zo_alt_libpath; if (strrchr(val, '/') == NULL && (errno == EINVAL)) goto invalid; *strrchr(val, '/') = '\0'; strlcat(val, "/lib", sizeof (zo->zo_alt_libpath)); if (0 != access(zo->zo_alt_libpath, X_OK)) goto invalid; return; invalid: ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val); } } static void ztest_kill(ztest_shared_t *zs) { zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); /* * Before we kill ourselves, make sure that the config is updated. * See comment above spa_write_cachefile(). */ if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) { if (mutex_tryenter(&spa_namespace_lock)) { spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); mutex_exit(&spa_namespace_lock); ztest_scratch_state->zs_raidz_scratch_verify_pause = raidz_expand_pause_point; } else { /* * Do not verify scratch object in case if * spa_namespace_lock cannot be acquired, * it can cause deadlock in spa_config_update(). 
*/ raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; return; } } else { mutex_enter(&spa_namespace_lock); spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); mutex_exit(&spa_namespace_lock); } (void) raise(SIGKILL); } static void ztest_record_enospc(const char *s) { (void) s; ztest_shared->zs_enospc_count++; } static uint64_t ztest_get_ashift(void) { if (ztest_opts.zo_ashift == 0) return (SPA_MINBLOCKSHIFT + ztest_random(5)); return (ztest_opts.zo_ashift); } static boolean_t ztest_is_draid_spare(const char *name) { uint64_t spare_id = 0, parity = 0, vdev_id = 0; if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"", &parity, &vdev_id, &spare_id) == 3) { return (B_TRUE); } return (B_FALSE); } static nvlist_t * make_vdev_file(const char *path, const char *aux, const char *pool, size_t size, uint64_t ashift) { char *pathbuf = NULL; uint64_t vdev; nvlist_t *file; boolean_t draid_spare = B_FALSE; if (ashift == 0) ashift = ztest_get_ashift(); if (path == NULL) { pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); path = pathbuf; if (aux != NULL) { vdev = ztest_shared->zs_vdev_aux; (void) snprintf(pathbuf, MAXPATHLEN, ztest_aux_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, aux, vdev); } else { vdev = ztest_shared->zs_vdev_next_leaf++; (void) snprintf(pathbuf, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, vdev); } } else { draid_spare = ztest_is_draid_spare(path); } if (size != 0 && !draid_spare) { int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd == -1) fatal(B_TRUE, "can't open %s", path); if (ftruncate(fd, size) != 0) fatal(B_TRUE, "can't ftruncate %s", path); (void) close(fd); } file = fnvlist_alloc(); fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); umem_free(pathbuf, MAXPATHLEN); return (file); } static nvlist_t * make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size, uint64_t ashift, int r) { nvlist_t *raid, **child; int c; if (r < 2) return (make_vdev_file(path, aux, pool, size, ashift)); child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < r; c++) child[c] = make_vdev_file(path, aux, pool, size, ashift); raid = fnvlist_alloc(); fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, ztest_opts.zo_raid_type); fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, ztest_opts.zo_raid_parity); fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)child, r); if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { uint64_t ndata = ztest_opts.zo_draid_data; uint64_t nparity = ztest_opts.zo_raid_parity; uint64_t nspares = ztest_opts.zo_draid_spares; uint64_t children = ztest_opts.zo_raid_children; uint64_t ngroups = 1; /* * Calculate the minimum number of groups required to fill a * slice. This is the LCM of the stripe width (data + parity) * and the number of data drives (children - spares). */ while (ngroups * (ndata + nparity) % (children - nspares) != 0) ngroups++; /* Store the basic dRAID configuration. 
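		 *
		 * (Worked example for the ngroups loop above, assuming a
		 * typical dRAID run with ndata = 4, nparity = 1,
		 * children = 16 and nspares = 1: the stripe width is 5 and
		 * there are 15 data drives, so ngroups advances 1 -> 2 -> 3
		 * and stops at 3, the smallest value for which
		 * ngroups * 5 is divisible by 15.)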
*/ fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); } for (c = 0; c < r; c++) fnvlist_free(child[c]); umem_free(child, r * sizeof (nvlist_t *)); return (raid); } static nvlist_t * make_vdev_mirror(const char *path, const char *aux, const char *pool, size_t size, uint64_t ashift, int r, int m) { nvlist_t *mirror, **child; int c; if (m < 1) return (make_vdev_raid(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); mirror = fnvlist_alloc(); fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)child, m); for (c = 0; c < m; c++) fnvlist_free(child[c]); umem_free(child, m * sizeof (nvlist_t *)); return (mirror); } static nvlist_t * make_vdev_root(const char *path, const char *aux, const char *pool, size_t size, uint64_t ashift, const char *class, int r, int m, int t) { nvlist_t *root, **child; int c; boolean_t log; ASSERT3S(t, >, 0); log = (class != NULL && strcmp(class, "log") == 0); child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < t; c++) { child[c] = make_vdev_mirror(path, aux, pool, size, ashift, r, m); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); if (class != NULL && class[0] != '\0') { ASSERT(m > 1 || log); /* expecting a mirror */ fnvlist_add_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, class); } } root = fnvlist_alloc(); fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)child, t); for (c = 0; c < t; c++) fnvlist_free(child[c]); umem_free(child, t * sizeof (nvlist_t *)); return (root); } /* * Find a random spa version. Returns back a random spa version in the * range [initial_version, SPA_VERSION_FEATURES]. */ static uint64_t ztest_random_spa_version(uint64_t initial_version) { uint64_t version = initial_version; if (version <= SPA_VERSION_BEFORE_FEATURES) { version = version + ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); } if (version > SPA_VERSION_BEFORE_FEATURES) version = SPA_VERSION_FEATURES; ASSERT(SPA_VERSION_IS_SUPPORTED(version)); return (version); } static int ztest_random_blocksize(void) { ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); /* * Choose a block size >= the ashift. * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. */ int maxbs = SPA_OLD_MAXBLOCKSHIFT; if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) maxbs = 20; uint64_t block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } static int ztest_random_dnodesize(void) { int slots; int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; if (max_slots == DNODE_MIN_SLOTS) return (DNODE_MIN_SIZE); /* * Weight the random distribution more heavily toward smaller * dnode sizes since that is more likely to reflect real-world * usage. */ ASSERT3U(max_slots, >, 4); switch (ztest_random(10)) { case 0: slots = 5 + ztest_random(max_slots - 4); break; case 1 ... 
4: slots = 2 + ztest_random(3); break; default: slots = 1; break; } return (slots << DNODE_SHIFT); } static int ztest_random_ibshift(void) { return (DN_MIN_INDBLKSHIFT + ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); } static uint64_t ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) { uint64_t top; vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); do { top = ztest_random(rvd->vdev_children); tvd = rvd->vdev_child[top]; } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); return (top); } static uint64_t ztest_random_dsl_prop(zfs_prop_t prop) { uint64_t value; do { value = zfs_prop_random_value(prop, ztest_random(-1ULL)); } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); return (value); } static int ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, boolean_t inherit) { const char *propname = zfs_prop_to_name(prop); const char *valname; char *setpoint; uint64_t curval; int error; error = dsl_prop_set_int(osname, propname, (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (error); } ASSERT0(error); setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); if (ztest_opts.zo_verbose >= 6) { int err; err = zfs_prop_index_to_string(prop, curval, &valname); if (err) (void) printf("%s %s = %llu at '%s'\n", osname, propname, (unsigned long long)curval, setpoint); else (void) printf("%s %s = %s at '%s'\n", osname, propname, valname, setpoint); } umem_free(setpoint, MAXPATHLEN); return (error); } static int ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) { spa_t *spa = ztest_spa; nvlist_t *props = NULL; int error; props = fnvlist_alloc(); fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); error = spa_prop_set(spa, props); fnvlist_free(props); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (error); } ASSERT0(error); return (error); } static int ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) { int err; char *cp = NULL; char ddname[ZFS_MAX_DATASET_NAME_LEN]; strlcpy(ddname, name, sizeof (ddname)); cp = strchr(ddname, '@'); if (cp != NULL) *cp = '\0'; err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); while (decrypt && err == EACCES) { dsl_crypto_params_t *dcp; nvlist_t *crypto_args = fnvlist_alloc(); fnvlist_add_uint8_array(crypto_args, "wkeydata", (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, crypto_args, &dcp)); err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); /* * Note: if there was an error loading, the wkey was not * consumed, and needs to be freed. */ dsl_crypto_params_free(dcp, (err != 0)); fnvlist_free(crypto_args); if (err == EINVAL) { /* * We couldn't load a key for this dataset so try * the parent. This loop will eventually hit the * encryption root since ztest only makes clones * as children of their origin datasets. 
*/ cp = strrchr(ddname, '/'); if (cp == NULL) return (err); *cp = '\0'; err = EACCES; continue; } else if (err != 0) { break; } err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); break; } return (err); } static void ztest_rll_init(rll_t *rll) { rll->rll_writer = NULL; rll->rll_readers = 0; mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); } static void ztest_rll_destroy(rll_t *rll) { ASSERT3P(rll->rll_writer, ==, NULL); ASSERT0(rll->rll_readers); mutex_destroy(&rll->rll_lock); cv_destroy(&rll->rll_cv); } static void ztest_rll_lock(rll_t *rll, rl_type_t type) { mutex_enter(&rll->rll_lock); if (type == ZTRL_READER) { while (rll->rll_writer != NULL) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_readers++; } else { while (rll->rll_writer != NULL || rll->rll_readers) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_writer = curthread; } mutex_exit(&rll->rll_lock); } static void ztest_rll_unlock(rll_t *rll) { mutex_enter(&rll->rll_lock); if (rll->rll_writer) { ASSERT0(rll->rll_readers); rll->rll_writer = NULL; } else { ASSERT3S(rll->rll_readers, >, 0); ASSERT3P(rll->rll_writer, ==, NULL); rll->rll_readers--; } if (rll->rll_writer == NULL && rll->rll_readers == 0) cv_broadcast(&rll->rll_cv); mutex_exit(&rll->rll_lock); } static void ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) { rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; ztest_rll_lock(rll, type); } static void ztest_object_unlock(ztest_ds_t *zd, uint64_t object) { rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; ztest_rll_unlock(rll); } static rl_t * ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, rl_type_t type) { uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; rl_t *rl; rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); rl->rl_object = object; rl->rl_offset = offset; rl->rl_size = size; rl->rl_lock = rll; ztest_rll_lock(rll, type); return (rl); } static void ztest_range_unlock(rl_t *rl) { rll_t *rll = rl->rl_lock; ztest_rll_unlock(rll); umem_free(rl, sizeof (*rl)); } static void ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) { zd->zd_os = os; zd->zd_zilog = dmu_objset_zil(os); zd->zd_shared = szd; dmu_objset_name(os, zd->zd_name); int l; if (zd->zd_shared != NULL) zd->zd_shared->zd_seq = 0; VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) ztest_rll_init(&zd->zd_object_lock[l]); for (l = 0; l < ZTEST_RANGE_LOCKS; l++) ztest_rll_init(&zd->zd_range_lock[l]); } static void ztest_zd_fini(ztest_ds_t *zd) { int l; mutex_destroy(&zd->zd_dirobj_lock); (void) pthread_rwlock_destroy(&zd->zd_zilog_lock); for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) ztest_rll_destroy(&zd->zd_object_lock[l]); for (l = 0; l < ZTEST_RANGE_LOCKS; l++) ztest_rll_destroy(&zd->zd_range_lock[l]); } #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) static uint64_t ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) { uint64_t txg; int error; /* * Attempt to assign tx to some transaction group. 
*/ error = dmu_tx_assign(tx, txg_how); if (error) { if (error == ERESTART) { ASSERT3U(txg_how, ==, TXG_NOWAIT); dmu_tx_wait(tx); } else { ASSERT3U(error, ==, ENOSPC); ztest_record_enospc(tag); } dmu_tx_abort(tx); return (0); } txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); return (txg); } static void ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { bt->bt_magic = BT_MAGIC; bt->bt_objset = dmu_objset_id(os); bt->bt_object = object; bt->bt_dnodesize = dnodesize; bt->bt_offset = offset; bt->bt_gen = gen; bt->bt_txg = txg; bt->bt_crtxg = crtxg; } static void ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { ASSERT3U(bt->bt_magic, ==, BT_MAGIC); ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); ASSERT3U(bt->bt_object, ==, object); ASSERT3U(bt->bt_dnodesize, ==, dnodesize); ASSERT3U(bt->bt_offset, ==, offset); ASSERT3U(bt->bt_gen, <=, gen); ASSERT3U(bt->bt_txg, <=, txg); ASSERT3U(bt->bt_crtxg, ==, crtxg); } static ztest_block_tag_t * ztest_bt_bonus(dmu_buf_t *db) { dmu_object_info_t doi; ztest_block_tag_t *bt; dmu_object_info_from_db(db, &doi); ASSERT3U(doi.doi_bonus_size, <=, db->db_size); ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); return (bt); } /* * Generate a token to fill up unused bonus buffer space. Try to make * it unique to the object, generation, and offset to verify that data * is not getting overwritten by data from other dnodes. */ #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) /* * Fill up the unused bonus buffer region before the block tag with a * verifiable pattern. Filling the whole bonus area with non-zero data * helps ensure that all dnode traversal code properly skips the * interior regions of large dnodes. */ static void ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { uint64_t *bonusp; ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), gen, bonusp - (uint64_t *)db->db_data); *bonusp = token; } } /* * Verify that the unused area of a bonus buffer is filled with the * expected tokens. 
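 *
 * (Illustrative token value, using hypothetical inputs:
 * ZTEST_BONUS_FILL_TOKEN(7, 2, 5, 3) evaluates to
 * (2 << 48) | (5 << 32) | (7 << 8) | 3 == 0x0002000500000703, so the
 * dataset id, generation, object number and word offset can all be read
 * back out of a mismatched buffer when the VERIFY below trips.)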
*/ static void ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { uint64_t *bonusp; for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), gen, bonusp - (uint64_t *)db->db_data); VERIFY3U(*bonusp, ==, token); } } /* * ZIL logging ops */ #define lrz_type lr_mode #define lrz_blocksize lr_uid #define lrz_ibshift lr_gid #define lrz_bonustype lr_rdev #define lrz_dnodesize lr_crtime[1] static void ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) { char *name = (void *)(lr + 1); /* name follows lr */ size_t namesize = strlen(name) + 1; itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, sizeof (*lr) + namesize - sizeof (lr_t)); zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) { char *name = (void *)(lr + 1); /* name follows lr */ size_t namesize = strlen(name) + 1; itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, sizeof (*lr) + namesize - sizeof (lr_t)); itx->itx_oid = object; zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) { itx_t *itx; itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); if (zil_replaying(zd->zd_zilog, tx)) return; if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t))) write_state = WR_INDIRECT; itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); if (write_state == WR_COPIED && dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); write_state = WR_NEED_COPY; } itx->itx_private = zd; itx->itx_wr_state = write_state; itx->itx_sync = (ztest_random(8) == 0); memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, sizeof (*lr) - sizeof (lr_t)); zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, sizeof (*lr) - sizeof (lr_t)); itx->itx_sync = B_FALSE; zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, sizeof (*lr) - sizeof (lr_t)); itx->itx_sync = B_FALSE; zil_itx_assign(zd->zd_zilog, itx, tx); } /* * ZIL replay ops */ static int ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_create_t *lr = arg2; char *name = (void *)(lr + 1); /* name follows lr */ objset_t *os = zd->zd_os; ztest_block_tag_t *bbt; dmu_buf_t *db; dmu_tx_t *tx; uint64_t txg; int error = 0; int bonuslen; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); ASSERT3S(name[0], !=, '\0'); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } else { dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); } txg = 
ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) return (ENOSPC); ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { if (lr->lr_foid == 0) { lr->lr_foid = zap_create_dnsize(os, lr->lrz_type, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } else { error = zap_create_claim_dnsize(os, lr->lr_foid, lr->lrz_type, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } } else { if (lr->lr_foid == 0) { lr->lr_foid = dmu_object_alloc_dnsize(os, lr->lrz_type, 0, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } else { error = dmu_object_claim_dnsize(os, lr->lr_foid, lr->lrz_type, 0, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } } if (error) { ASSERT3U(error, ==, EEXIST); ASSERT(zd->zd_zilog->zl_replay); dmu_tx_commit(tx); return (error); } ASSERT3U(lr->lr_foid, !=, 0); if (lr->lrz_type != DMU_OT_ZAP_OTHER) VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, lr->lrz_blocksize, lr->lrz_ibshift, tx)); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); bbt = ztest_bt_bonus(db); dmu_buf_will_dirty(db, tx); ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, lr->lr_gen, txg, txg); ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); dmu_buf_rele(db, FTAG); VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, &lr->lr_foid, tx)); (void) ztest_log_create(zd, tx, lr); dmu_tx_commit(tx); return (0); } static int ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_remove_t *lr = arg2; char *name = (void *)(lr + 1); /* name follows lr */ objset_t *os = zd->zd_os; dmu_object_info_t doi; dmu_tx_t *tx; uint64_t object, txg; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); ASSERT3S(name[0], !=, '\0'); VERIFY0( zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); ASSERT3U(object, !=, 0); ztest_object_lock(zd, object, ZTRL_WRITER); VERIFY0(dmu_object_info(os, object, &doi)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { ztest_object_unlock(zd, object); return (ENOSPC); } if (doi.doi_type == DMU_OT_ZAP_OTHER) { VERIFY0(zap_destroy(os, object, tx)); } else { VERIFY0(dmu_object_free(os, object, tx)); } VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); (void) ztest_log_remove(zd, tx, lr, object); dmu_tx_commit(tx); ztest_object_unlock(zd, object); return (0); } static int ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_write_t *lr = arg2; objset_t *os = zd->zd_os; void *data = lr + 1; /* data follows lr */ uint64_t offset, length; ztest_block_tag_t *bt = data; ztest_block_tag_t *bbt; uint64_t gen, txg, lrtxg, crtxg; dmu_object_info_t doi; dmu_tx_t *tx; dmu_buf_t *db; arc_buf_t *abuf = NULL; rl_t *rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } if (bt->bt_magic == BSWAP_64(BT_MAGIC)) byteswap_uint64_array(bt, sizeof (*bt)); if (bt->bt_magic != BT_MAGIC) bt = NULL; ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, 
lr->lr_foid, FTAG, &db)); dmu_object_info_from_db(db, &doi); bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); gen = bbt->bt_gen; crtxg = bbt->bt_crtxg; lrtxg = lr->lr_common.lrc_txg; tx = dmu_tx_create(os); dmu_tx_hold_write(tx, lr->lr_foid, offset, length); if (ztest_random(8) == 0 && length == doi.doi_data_block_size && P2PHASE(offset, length) == 0) abuf = dmu_request_arcbuf(db, length); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { if (abuf != NULL) dmu_return_arcbuf(abuf); dmu_buf_rele(db, FTAG); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } if (bt != NULL) { /* * Usually, verify the old data before writing new data -- * but not always, because we also want to verify correct * behavior when the data was not recently read into cache. */ ASSERT(doi.doi_data_block_size); ASSERT0(offset % doi.doi_data_block_size); if (ztest_random(4) != 0) { int prefetch = ztest_random(2) ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; ztest_block_tag_t rbt; VERIFY(dmu_read(os, lr->lr_foid, offset, sizeof (rbt), &rbt, prefetch) == 0); if (rbt.bt_magic == BT_MAGIC) { ztest_bt_verify(&rbt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); } } /* * Writes can appear to be newer than the bonus buffer because * the ztest_get_data() callback does a dmu_read() of the * open-context data, which may be different than the data * as it was when the write was generated. */ if (zd->zd_zilog->zl_replay) { ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, MAX(gen, bt->bt_gen), MAX(txg, lrtxg), bt->bt_crtxg); } /* * Set the bt's gen/txg to the bonus buffer's gen/txg * so that all of the usual ASSERTs will work. */ ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); } if (abuf == NULL) { dmu_write(os, lr->lr_foid, offset, length, data, tx); } else { memcpy(abuf->b_data, data, length); VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx)); } (void) ztest_log_write(zd, tx, lr); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (0); } static int ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_truncate_t *lr = arg2; objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; rl_t *rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, ZTRL_WRITER); tx = dmu_tx_create(os); dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, lr->lr_length, tx)); (void) ztest_log_truncate(zd, tx, lr); dmu_tx_commit(tx); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (0); } static int ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_setattr_t *lr = arg2; objset_t *os = zd->zd_os; dmu_tx_t *tx; dmu_buf_t *db; ztest_block_tag_t *bbt; uint64_t txg, lrtxg, crtxg, dnodesize; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, lr->lr_foid); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 
crtxg = bbt->bt_crtxg; lrtxg = lr->lr_common.lrc_txg; dnodesize = bbt->bt_dnodesize; if (zd->zd_zilog->zl_replay) { ASSERT3U(lr->lr_size, !=, 0); ASSERT3U(lr->lr_mode, !=, 0); ASSERT3U(lrtxg, !=, 0); } else { /* * Randomly change the size and increment the generation. */ lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * sizeof (*bbt); lr->lr_mode = bbt->bt_gen + 1; ASSERT0(lrtxg); } /* * Verify that the current bonus buffer is not newer than our txg. */ ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, MAX(txg, lrtxg), crtxg); dmu_buf_will_dirty(db, tx); ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); ASSERT3U(lr->lr_size, <=, db->db_size); VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); bbt = ztest_bt_bonus(db); ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, txg, crtxg); ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); dmu_buf_rele(db, FTAG); (void) ztest_log_setattr(zd, tx, lr); dmu_tx_commit(tx); ztest_object_unlock(zd, lr->lr_foid); return (0); } static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { NULL, /* 0 no such transaction type */ ztest_replay_create, /* TX_CREATE */ NULL, /* TX_MKDIR */ NULL, /* TX_MKXATTR */ NULL, /* TX_SYMLINK */ ztest_replay_remove, /* TX_REMOVE */ NULL, /* TX_RMDIR */ NULL, /* TX_LINK */ NULL, /* TX_RENAME */ ztest_replay_write, /* TX_WRITE */ ztest_replay_truncate, /* TX_TRUNCATE */ ztest_replay_setattr, /* TX_SETATTR */ NULL, /* TX_ACL */ NULL, /* TX_CREATE_ACL */ NULL, /* TX_CREATE_ATTR */ NULL, /* TX_CREATE_ACL_ATTR */ NULL, /* TX_MKDIR_ACL */ NULL, /* TX_MKDIR_ATTR */ NULL, /* TX_MKDIR_ACL_ATTR */ NULL, /* TX_WRITE2 */ NULL, /* TX_SETSAXATTR */ NULL, /* TX_RENAME_EXCHANGE */ NULL, /* TX_RENAME_WHITEOUT */ }; /* * ZIL get_data callbacks */ static void ztest_get_done(zgd_t *zgd, int error) { (void) error; ztest_ds_t *zd = zgd->zgd_private; uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); ztest_range_unlock((rl_t *)zgd->zgd_lr); ztest_object_unlock(zd, object); umem_free(zgd, sizeof (*zgd)); } static int ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { (void) arg2; ztest_ds_t *zd = arg; objset_t *os = zd->zd_os; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; uint64_t txg = lr->lr_common.lrc_txg; uint64_t crtxg; dmu_object_info_t doi; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3U(size, !=, 0); ztest_object_lock(zd, object, ZTRL_READER); error = dmu_bonus_hold(os, object, FTAG, &db); if (error) { ztest_object_unlock(zd, object); return (error); } crtxg = ztest_bt_bonus(db)->bt_crtxg; if (crtxg == 0 || crtxg > txg) { dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, object); return (ENOENT); } dmu_object_info_from_db(db, &doi); dmu_buf_rele(db, FTAG); db = NULL; zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); zgd->zgd_lwb = lwb; zgd->zgd_private = zd; if (buf != NULL) { /* immediate write */ zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, object, offset, size, ZTRL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); ASSERT0(error); } else { ASSERT3P(zio, !=, NULL); size = doi.doi_data_block_size; if (ISP2(size)) { offset = P2ALIGN_TYPED(offset, size, uint64_t); } else { ASSERT3U(offset, <, size); offset = 0; } zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, object, offset, size, ZTRL_READER); error = dmu_buf_hold_noread(os, object, offset, zgd, &db); if (error == 
0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT3U(db->db_offset, ==, offset); ASSERT3U(db->db_size, ==, size); error = dmu_sync(zio, lr->lr_common.lrc_txg, ztest_get_done, zgd); if (error == 0) return (0); } } ztest_get_done(zgd, error); return (error); } static void * ztest_lr_alloc(size_t lrsize, char *name) { char *lr; size_t namesize = name ? strlen(name) + 1 : 0; lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); if (name) memcpy(lr + lrsize, name, namesize); return (lr); } static void ztest_lr_free(void *lr, size_t lrsize, char *name) { size_t namesize = name ? strlen(name) + 1 : 0; umem_free(lr, lrsize + namesize); } /* * Lookup a bunch of objects. Returns the number of objects not found. */ static int ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int error; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); for (i = 0; i < count; i++, od++) { od->od_object = 0; error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, sizeof (uint64_t), 1, &od->od_object); if (error) { ASSERT3S(error, ==, ENOENT); ASSERT0(od->od_object); missing++; } else { dmu_buf_t *db; ztest_block_tag_t *bbt; dmu_object_info_t doi; ASSERT3U(od->od_object, !=, 0); ASSERT0(missing); /* there should be no gaps */ ztest_object_lock(zd, od->od_object, ZTRL_READER); VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, FTAG, &db)); dmu_object_info_from_db(db, &doi); bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); od->od_type = doi.doi_type; od->od_blocksize = doi.doi_data_block_size; od->od_gen = bbt->bt_gen; dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, od->od_object); } } return (missing); } static int ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); for (i = 0; i < count; i++, od++) { if (missing) { od->od_object = 0; missing++; continue; } lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); lr->lr_doid = od->od_dir; lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ lr->lrz_type = od->od_crtype; lr->lrz_blocksize = od->od_crblocksize; lr->lrz_ibshift = ztest_random_ibshift(); lr->lrz_bonustype = DMU_OT_UINT64_OTHER; lr->lrz_dnodesize = od->od_crdnodesize; lr->lr_gen = od->od_crgen; lr->lr_crtime[0] = time(NULL); if (ztest_replay_create(zd, lr, B_FALSE) != 0) { ASSERT0(missing); od->od_object = 0; missing++; } else { od->od_object = lr->lr_foid; od->od_type = od->od_crtype; od->od_blocksize = od->od_crblocksize; od->od_gen = od->od_crgen; ASSERT3U(od->od_object, !=, 0); } ztest_lr_free(lr, sizeof (*lr), od->od_name); } return (missing); } static int ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int error; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); od += count - 1; for (i = count - 1; i >= 0; i--, od--) { if (missing) { missing++; continue; } /* * No object was found. 
*/ if (od->od_object == 0) continue; lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); lr->lr_doid = od->od_dir; if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { ASSERT3U(error, ==, ENOSPC); missing++; } else { od->od_object = 0; } ztest_lr_free(lr, sizeof (*lr), od->od_name); } return (missing); } static int ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, const void *data) { lr_write_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); lr->lr_foid = object; lr->lr_offset = offset; lr->lr_length = size; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); memcpy(lr + 1, data, size); error = ztest_replay_write(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr) + size, NULL); return (error); } static int ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) { lr_truncate_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr), NULL); lr->lr_foid = object; lr->lr_offset = offset; lr->lr_length = size; error = ztest_replay_truncate(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr), NULL); return (error); } static int ztest_setattr(ztest_ds_t *zd, uint64_t object) { lr_setattr_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr), NULL); lr->lr_foid = object; lr->lr_size = 0; lr->lr_mode = 0; error = ztest_replay_setattr(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr), NULL); return (error); } static void ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) { objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; rl_t *rl; txg_wait_synced(dmu_objset_pool(os), 0); ztest_object_lock(zd, object, ZTRL_READER); rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, object, offset, size); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg != 0) { dmu_prealloc(os, object, offset, size, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); } else { (void) dmu_free_long_range(os, object, offset, size); } ztest_range_unlock(rl); ztest_object_unlock(zd, object); } static void ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) { int err; ztest_block_tag_t wbt; dmu_object_info_t doi; enum ztest_io_type io_type; uint64_t blocksize; void *data; VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); blocksize = doi.doi_data_block_size; data = umem_alloc(blocksize, UMEM_NOFAIL); /* * Pick an i/o type at random, biased toward writing block tags. */ io_type = ztest_random(ZTEST_IO_TYPES); if (ztest_random(2) == 0) io_type = ZTEST_IO_WRITE_TAG; (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); switch (io_type) { case ZTEST_IO_WRITE_TAG: ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, offset, 0, 0, 0); (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); break; case ZTEST_IO_WRITE_PATTERN: (void) memset(data, 'a' + (object + offset) % 5, blocksize); if (ztest_random(2) == 0) { /* * Induce fletcher2 collisions to ensure that * zio_ddt_collision() detects and resolves them * when using fletcher2-verify for deduplication. 
*/ ((uint64_t *)data)[0] ^= 1ULL << 63; ((uint64_t *)data)[4] ^= 1ULL << 63; } (void) ztest_write(zd, object, offset, blocksize, data); break; case ZTEST_IO_WRITE_ZEROES: memset(data, 0, blocksize); (void) ztest_write(zd, object, offset, blocksize, data); break; case ZTEST_IO_TRUNCATE: (void) ztest_truncate(zd, object, offset, blocksize); break; case ZTEST_IO_SETATTR: (void) ztest_setattr(zd, object); break; default: break; case ZTEST_IO_REWRITE: (void) pthread_rwlock_rdlock(&ztest_name_lock); err = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), B_FALSE); ASSERT(err == 0 || err == ENOSPC); err = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COMPRESSION, ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), B_FALSE); ASSERT(err == 0 || err == ENOSPC); (void) pthread_rwlock_unlock(&ztest_name_lock); VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, DMU_READ_NO_PREFETCH)); (void) ztest_write(zd, object, offset, blocksize, data); break; } (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); umem_free(data, blocksize); } /* * Initialize an object description template. */ static void ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, uint64_t gen) { od->od_dir = ZTEST_DIROBJ; od->od_object = 0; od->od_crtype = type; od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); od->od_crgen = gen; od->od_type = DMU_OT_NONE; od->od_blocksize = 0; od->od_gen = 0; (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%"PRId64")[%"PRIu64"]", tag, id, index); } /* * Lookup or create the objects for a test using the od template. * If the objects do not all exist, or if 'remove' is specified, * remove any existing objects and create new ones. Otherwise, * use the existing objects. */ static int ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) { int count = size / sizeof (*od); int rv = 0; mutex_enter(&zd->zd_dirobj_lock); if ((ztest_lookup(zd, od, count) != 0 || remove) && (ztest_remove(zd, od, count) != 0 || ztest_create(zd, od, count) != 0)) rv = -1; zd->zd_od = od; mutex_exit(&zd->zd_dirobj_lock); return (rv); } void ztest_zil_commit(ztest_ds_t *zd, uint64_t id) { (void) id; zilog_t *zilog = zd->zd_zilog; (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); /* * Remember the committed values in zd, which is in parent/child * shared memory. If we die, the next iteration of ztest_run() * will verify that the log really does contain this record. */ mutex_enter(&zilog->zl_lock); ASSERT3P(zd->zd_shared, !=, NULL); ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; mutex_exit(&zilog->zl_lock); (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); } /* * This function is designed to simulate the operations that occur during a * mount/unmount operation. We hold the dataset across these operations in an * attempt to expose any implicit assumptions about ZIL management. */ void ztest_zil_remount(ztest_ds_t *zd, uint64_t id) { (void) id; objset_t *os = zd->zd_os; /* * We hold the ztest_vdev_lock so we don't cause problems with * other threads that wish to remove a log device, such as * ztest_device_removal(). */ mutex_enter(&ztest_vdev_lock); /* * We grab the zd_dirobj_lock to ensure that no other thread is * updating the zil (i.e. 
adding in-memory log records) and the * zd_zilog_lock to block any I/O. */ mutex_enter(&zd->zd_dirobj_lock); (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); /* zfsvfs_teardown() */ zil_close(zd->zd_zilog); /* zfsvfs_setup() */ VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); zil_replay(os, zd, ztest_replay_vector); (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); mutex_exit(&zd->zd_dirobj_lock); mutex_exit(&ztest_vdev_lock); } /* * Verify that we can't destroy an active pool, create an existing pool, * or create a pool with a bad vdev spec. */ void ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa; nvlist_t *nvroot; if (zo->zo_mmp_test) return; /* * Attempt to create using a bad file. */ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * Attempt to create using a bad mirror. */ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * Attempt to create an existing pool. It shouldn't matter * what's in the nvroot; we should fail with EEXIST. */ (void) pthread_rwlock_rdlock(&ztest_name_lock); nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * We open a reference to the spa and then we try to export it * expecting one of the following errors: * * EBUSY * Because of the reference we just opened. * * ZFS_ERR_EXPORT_IN_PROGRESS * For the case that there is another ztest thread doing * an export concurrently. */ VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); int error = spa_destroy(zo->zo_pool); if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", spa->spa_name, error); } spa_close(spa, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Start and then stop the MMP threads to ensure the startup and shutdown code * works properly. Actual protection and property-related code is tested via ZTS. */ void ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa = ztest_spa; if (zo->zo_mmp_test) return; /* * Since enabling MMP involves setting a property, it cannot be done * while the pool is suspended.
*/ if (spa_suspended(spa)) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&spa->spa_props_lock); zfs_multihost_fail_intervals = 0; if (!spa_multihost(spa)) { spa->spa_multihost = B_TRUE; mmp_thread_start(spa); } mutex_exit(&spa->spa_props_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); mmp_signal_all_threads(); txg_wait_synced(spa_get_dsl(spa), 0); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&spa->spa_props_lock); if (spa_multihost(spa)) { mmp_thread_stop(spa); spa->spa_multihost = B_FALSE; } mutex_exit(&spa->spa_props_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); } static int ztest_get_raidz_children(spa_t *spa) { (void) spa; vdev_t *raidvd; ASSERT(MUTEX_HELD(&ztest_vdev_lock)); if (ztest_opts.zo_raid_do_expand) { raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); return (raidvd->vdev_children); } return (ztest_opts.zo_raid_children); } void ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa; uint64_t initial_version = SPA_VERSION_INITIAL; uint64_t raidz_children, version, newversion; nvlist_t *nvroot, *props; char *name; if (ztest_opts.zo_mmp_test) return; /* dRAID added after feature flags, skip upgrade test. */ if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) return; mutex_enter(&ztest_vdev_lock); name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); /* * Clean up from previous runs. */ (void) spa_destroy(name); raidz_children = ztest_get_raidz_children(ztest_spa); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, NULL, raidz_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the * initial version is capable of supporting that feature. */ switch (ztest_opts.zo_raid_parity) { case 0: case 1: initial_version = SPA_VERSION_INITIAL; break; case 2: initial_version = SPA_VERSION_RAIDZ2; break; case 3: initial_version = SPA_VERSION_RAIDZ3; break; } /* * Create a pool with a spa version that can be upgraded. Pick * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 
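* The do/while loop below simply redraws until the randomly chosen
* version falls at or below SPA_VERSION_BEFORE_FEATURES.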
*/ do { version = ztest_random_spa_version(initial_version); } while (version > SPA_VERSION_BEFORE_FEATURES); props = fnvlist_alloc(); fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), version); VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); fnvlist_free(nvroot); fnvlist_free(props); VERIFY0(spa_open(name, &spa, FTAG)); VERIFY3U(spa_version(spa), ==, version); newversion = ztest_random_spa_version(version + 1); if (ztest_opts.zo_verbose >= 4) { (void) printf("upgrading spa version from " "%"PRIu64" to %"PRIu64"\n", version, newversion); } spa_upgrade(spa, newversion); VERIFY3U(spa_version(spa), >, version); VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, zpool_prop_to_name(ZPOOL_PROP_VERSION))); spa_close(spa, FTAG); kmem_strfree(name); mutex_exit(&ztest_vdev_lock); } static void ztest_spa_checkpoint(spa_t *spa) { ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); int error = spa_checkpoint(spa->spa_name); switch (error) { case 0: case ZFS_ERR_DEVRM_IN_PROGRESS: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_CHECKPOINT_EXISTS: case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: break; case ENOSPC: ztest_record_enospc(FTAG); break; default: fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); } } static void ztest_spa_discard_checkpoint(spa_t *spa) { ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); int error = spa_checkpoint_discard(spa->spa_name); switch (error) { case 0: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_NO_CHECKPOINT: break; default: fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", spa->spa_name, error); } } void ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; mutex_enter(&ztest_checkpoint_lock); if (ztest_random(2) == 0) { ztest_spa_checkpoint(spa); } else { ztest_spa_discard_checkpoint(spa); } mutex_exit(&ztest_checkpoint_lock); } static vdev_t * vdev_lookup_by_path(vdev_t *vd, const char *path) { vdev_t *mvd; int c; if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) return (vd); for (c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != NULL) return (mvd); return (NULL); } static int spa_num_top_vdevs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); return (rvd->vdev_children); } /* * Verify that vdev_add() works as expected. */ void ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; uint64_t raidz_children; nvlist_t *nvroot; int error; if (ztest_opts.zo_mmp_test) return; mutex_enter(&ztest_vdev_lock); raidz_children = ztest_get_raidz_children(spa); leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; /* * If we have slogs then remove them 1/4 of the time. */ if (spa_has_slogs(spa) && ztest_random(4) == 0) { metaslab_group_t *mg; /* * find the first real slog in log allocation class */ mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; while (!mg->mg_vd->vdev_islog) mg = mg->mg_next; guid = mg->mg_vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); /* * We have to grab the zs_name_lock as writer to * prevent a race between removing a slog (dmu_objset_find) * and destroying a dataset. 
Removing the slog will * grab a reference on the dataset which may cause * dsl_destroy_head() to fail with EBUSY thus * leaving the dataset in an inconsistent state. */ pthread_rwlock_wrlock(&ztest_name_lock); error = spa_vdev_remove(spa, guid, B_FALSE); pthread_rwlock_unlock(&ztest_name_lock); switch (error) { case 0: case EEXIST: /* Generic zil_reset() error */ case EBUSY: /* Replay required */ case EACCES: /* Crypto key not loaded */ case ZFS_ERR_CHECKPOINT_EXISTS: case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: fatal(B_FALSE, "spa_vdev_remove() = %d", error); } } else { spa_config_exit(spa, SCL_VDEV, FTAG); /* * Make 1/4 of the devices be log devices */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? "log" : NULL, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot, B_FALSE); fnvlist_free(nvroot); switch (error) { case 0: break; case ENOSPC: ztest_record_enospc("spa_vdev_add"); break; default: fatal(B_FALSE, "spa_vdev_add() = %d", error); } } mutex_exit(&ztest_vdev_lock); } void ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; uint64_t leaves; nvlist_t *nvroot; uint64_t raidz_children; const char *class = (ztest_random(2) == 0) ? VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; int error; /* * By default add a special vdev 50% of the time */ if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && ztest_random(2) == 0)) { return; } mutex_enter(&ztest_vdev_lock); /* Only test with mirrors */ if (zs->zs_mirrors < 2) { mutex_exit(&ztest_vdev_lock); return; } /* requires feature@allocation_classes */ if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { mutex_exit(&ztest_vdev_lock); return; } raidz_children = ztest_get_raidz_children(spa); leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, class, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot, B_FALSE); fnvlist_free(nvroot); if (error == ENOSPC) ztest_record_enospc("spa_vdev_add"); else if (error != 0) fatal(B_FALSE, "spa_vdev_add() = %d", error); /* * 50% of the time allow small blocks in the special class */ if (error == 0 && spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { if (ztest_opts.zo_verbose >= 3) (void) printf("Enabling special VDEV small blocks\n"); error = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); ASSERT(error == 0 || error == ENOSPC); } mutex_exit(&ztest_vdev_lock); if (ztest_opts.zo_verbose >= 3) { metaslab_class_t *mc; if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) mc = spa_special_class(spa); else mc = spa_dedup_class(spa); (void) printf("Added a %s mirrored vdev (of %d)\n", class, (int)mc->mc_groups); } } /* * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 
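* The test flips a coin between the spares list and the L2ARC list,
* then either removes a randomly chosen existing aux device or
* constructs an unused path and adds a brand-new one.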
*/ void ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; spa_aux_vdev_t *sav; const char *aux; char *path; uint64_t guid = 0; int error, ignore_err = 0; if (ztest_opts.zo_mmp_test) return; path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); if (ztest_random(2) == 0) { sav = &spa->spa_spares; aux = ZPOOL_CONFIG_SPARES; } else { sav = &spa->spa_l2cache; aux = ZPOOL_CONFIG_L2CACHE; } mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); if (sav->sav_count != 0 && ztest_random(4) == 0) { /* * Pick a random device to remove. */ vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) ignore_err = ENOTSUP; guid = svd->vdev_guid; } else { /* * Find an unused device we can add. */ zs->zs_vdev_aux = 0; for (;;) { int c; (void) snprintf(path, MAXPATHLEN, ztest_aux_template, ztest_opts.zo_dir, ztest_opts.zo_pool, aux, zs->zs_vdev_aux); for (c = 0; c < sav->sav_count; c++) if (strcmp(sav->sav_vdevs[c]->vdev_path, path) == 0) break; if (c == sav->sav_count && vdev_lookup_by_path(rvd, path) == NULL) break; zs->zs_vdev_aux++; } } spa_config_exit(spa, SCL_VDEV, FTAG); if (guid == 0) { /* * Add a new device. */ nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); error = spa_vdev_add(spa, nvroot, B_FALSE); switch (error) { case 0: break; default: fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); } fnvlist_free(nvroot); } else { /* * Remove an existing device. Sometimes, dirty its * vdev state first to make sure we handle removal * of devices that have pending state changes. 
*/ if (ztest_random(2) == 0) (void) vdev_online(spa, guid, 0, NULL); error = spa_vdev_remove(spa, guid, B_FALSE); switch (error) { case 0: case EBUSY: case ZFS_ERR_CHECKPOINT_EXISTS: case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: if (error != ignore_err) fatal(B_FALSE, "spa_vdev_remove(%"PRIu64") = %d", guid, error); } } mutex_exit(&ztest_vdev_lock); umem_free(path, MAXPATHLEN); } /* * split a pool if it has mirror tlvdevs */ void ztest_split_pool(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; nvlist_t *tree, **child, *config, *split, **schild; uint_t c, children, schildren = 0, lastlogid = 0; int error = 0; if (ztest_opts.zo_mmp_test) return; mutex_enter(&ztest_vdev_lock); /* ensure we have a usable config; mirrors of raidz aren't supported */ if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { mutex_exit(&ztest_vdev_lock); return; } /* clean up the old pool, if any */ (void) spa_destroy("splitp"); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* generate a config from the existing config */ mutex_enter(&spa->spa_props_lock); tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); mutex_exit(&spa->spa_props_lock); VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, &children)); schild = umem_alloc(rvd->vdev_children * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < children; c++) { vdev_t *tvd = rvd->vdev_child[c]; nvlist_t **mchild; uint_t mchildren; if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { schild[schildren] = fnvlist_alloc(); fnvlist_add_string(schild[schildren], ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); fnvlist_add_uint64(schild[schildren], ZPOOL_CONFIG_IS_HOLE, 1); if (lastlogid == 0) lastlogid = schildren; ++schildren; continue; } lastlogid = 0; VERIFY0(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); schild[schildren++] = fnvlist_dup(mchild[0]); } /* OK, create a config that can be used to split */ split = fnvlist_alloc(); fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); config = fnvlist_alloc(); fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); for (c = 0; c < schildren; c++) fnvlist_free(schild[c]); umem_free(schild, rvd->vdev_children * sizeof (nvlist_t *)); fnvlist_free(split); spa_config_exit(spa, SCL_VDEV, FTAG); (void) pthread_rwlock_wrlock(&ztest_name_lock); error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); (void) pthread_rwlock_unlock(&ztest_name_lock); fnvlist_free(config); if (error == 0) { (void) printf("successful split - results:\n"); mutex_enter(&spa_namespace_lock); show_pool_stats(spa); show_pool_stats(spa_lookup("splitp")); mutex_exit(&spa_namespace_lock); ++zs->zs_splits; --zs->zs_mirrors; } mutex_exit(&ztest_vdev_lock); } /* * Verify that we can attach and detach devices. 
*/ void ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; spa_aux_vdev_t *sav = &spa->spa_spares; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *pvd; nvlist_t *root; uint64_t leaves; uint64_t leaf, top; uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; uint64_t oldsize, newsize; uint64_t raidz_children; char *oldpath, *newpath; int replacing; int oldvd_has_siblings = B_FALSE; int newvd_is_spare = B_FALSE; int newvd_is_dspare = B_FALSE; int oldvd_is_log; int oldvd_is_special; int error, expected_error; if (ztest_opts.zo_mmp_test) return; oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); raidz_children = ztest_get_raidz_children(spa); leaves = MAX(zs->zs_mirrors, 1) * raidz_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * If a vdev is in the process of being removed, its removal may * finish while we are in progress, leading to an unexpected error * value. Don't bother trying to attach while we are in the middle * of removal. */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_ALL, FTAG); goto out; } /* * RAIDZ leaf VDEV mirrors are not currently supported while a * RAIDZ expansion is in progress. */ if (ztest_opts.zo_raid_do_expand) { spa_config_exit(spa, SCL_ALL, FTAG); goto out; } /* * Decide whether to do an attach or a replace. */ replacing = ztest_random(2); /* * Pick a random top-level vdev. */ top = ztest_random_vdev_top(spa, B_TRUE); /* * Pick a random leaf within it. */ leaf = ztest_random(leaves); /* * Locate this vdev. */ oldvd = rvd->vdev_child[top]; /* pick a child from the mirror */ if (zs->zs_mirrors >= 1) { ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); oldvd = oldvd->vdev_child[leaf / raidz_children]; } /* pick a child out of the raidz group */ if (ztest_opts.zo_raid_children > 1) { if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); else ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); oldvd = oldvd->vdev_child[leaf % raidz_children]; } /* * If we're already doing an attach or replace, oldvd may be a * mirror vdev -- in which case, pick a random child. */ while (oldvd->vdev_children != 0) { oldvd_has_siblings = B_TRUE; ASSERT3U(oldvd->vdev_children, >=, 2); oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; } oldguid = oldvd->vdev_guid; oldsize = vdev_get_min_asize(oldvd); oldvd_is_log = oldvd->vdev_top->vdev_islog; oldvd_is_special = oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); pvd = oldvd->vdev_parent; pguid = pvd->vdev_guid; /* * If oldvd has siblings, then half of the time, detach it. Prior * to the detach the pool is scrubbed in order to prevent creating * unrepairable blocks as a result of the data corruption injection. 
*/ if (oldvd_has_siblings && ztest_random(2) == 0) { spa_config_exit(spa, SCL_ALL, FTAG); error = ztest_scrub_impl(spa); if (error) goto out; error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); if (error != 0 && error != ENODEV && error != EBUSY && error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && error != ZFS_ERR_DISCARDING_CHECKPOINT) fatal(B_FALSE, "detach (%s) returned %d", oldpath, error); goto out; } /* * For the new vdev, choose with equal probability between the two * standard paths (ending in either 'a' or 'b') or a random hot spare. */ if (sav->sav_count != 0 && ztest_random(3) == 0) { newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; newvd_is_spare = B_TRUE; if (newvd->vdev_ops == &vdev_draid_spare_ops) newvd_is_dspare = B_TRUE; (void) strlcpy(newpath, newvd->vdev_path, MAXPATHLEN); } else { (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + leaf); if (ztest_random(2) == 0) newpath[strlen(newpath) - 1] = 'b'; newvd = vdev_lookup_by_path(rvd, newpath); } if (newvd) { /* * Reopen to ensure the vdev's asize field isn't stale. */ vdev_reopen(newvd); newsize = vdev_get_min_asize(newvd); } else { /* * Make newsize a little bigger or smaller than oldsize. * If it's smaller, the attach should fail. * If it's larger, and we're doing a replace, * we should get dynamic LUN growth when we're done. */ newsize = 10 * oldsize / (9 + ztest_random(3)); } /* * If pvd is not a mirror or root, the attach should fail with ENOTSUP, * unless it's a replace; in that case any non-replacing parent is OK. * * If newvd is already part of the pool, it should fail with EBUSY. * * If newvd is too small, it should fail with EOVERFLOW. * * If newvd is a distributed spare and it's being attached to a * dRAID which is not its parent it should fail with EINVAL. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || pvd->vdev_ops == &vdev_replacing_ops || pvd->vdev_ops == &vdev_spare_ops)) expected_error = ENOTSUP; else if (newvd_is_spare && (!replacing || oldvd_is_log || oldvd_is_special)) expected_error = ENOTSUP; else if (newvd == oldvd) expected_error = replacing ? 0 : EBUSY; else if (vdev_lookup_by_path(rvd, newpath) != NULL) expected_error = EBUSY; else if (!newvd_is_dspare && newsize < oldsize) expected_error = EOVERFLOW; else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) expected_error = EINVAL; else expected_error = 0; spa_config_exit(spa, SCL_ALL, FTAG); /* * Build the nvlist describing newpath. */ root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, ashift, NULL, 0, 0, 1); /* * When supported select either a healing or sequential resilver. */ boolean_t rebuilding = B_FALSE; if (pvd->vdev_ops == &vdev_mirror_ops || pvd->vdev_ops == &vdev_root_ops) { rebuilding = !!ztest_random(2); } error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); fnvlist_free(root); /* * If our parent was the replacing vdev, but the replace completed, * then instead of failing with ENOTSUP we may either succeed, * fail with ENODEV, or fail with EOVERFLOW. */ if (expected_error == ENOTSUP && (error == 0 || error == ENODEV || error == EOVERFLOW)) expected_error = error; /* * If someone grew the LUN, the replacement may be too small. 
*/ if (error == EOVERFLOW || error == EBUSY) expected_error = error; if (error == ZFS_ERR_CHECKPOINT_EXISTS || error == ZFS_ERR_DISCARDING_CHECKPOINT || error == ZFS_ERR_RESILVER_IN_PROGRESS || error == ZFS_ERR_REBUILD_IN_PROGRESS) expected_error = error; if (error != expected_error && expected_error != EBUSY) { fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " "returned %d, expected %d", oldpath, oldsize, newpath, newsize, replacing, error, expected_error); } out: mutex_exit(&ztest_vdev_lock); umem_free(oldpath, MAXPATHLEN); umem_free(newpath, MAXPATHLEN); } static void raidz_scratch_verify(void) { spa_t *spa; uint64_t write_size, logical_size, offset; raidz_reflow_scratch_state_t state; vdev_raidz_expand_t *vre; vdev_t *raidvd; ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) return; kernel_init(SPA_MODE_READ); mutex_enter(&spa_namespace_lock); spa = spa_lookup(ztest_opts.zo_pool); ASSERT(spa); spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; mutex_exit(&spa_namespace_lock); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); vre = spa->spa_raidz_expand; if (vre == NULL) goto out; raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); offset = RRSS_GET_OFFSET(&spa->spa_uberblock); state = RRSS_GET_STATE(&spa->spa_uberblock); write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift, uint64_t); logical_size = write_size * raidvd->vdev_children; switch (state) { /* * Initial state of reflow process. RAIDZ expansion was * requested by user, but scratch object was not created. */ case RRSS_SCRATCH_NOT_IN_USE: ASSERT3U(offset, ==, 0); break; /* * Scratch object was synced and stored in boot area. */ case RRSS_SCRATCH_VALID: /* * Scratch object was synced back to raidz start offset, * raidz is ready for sector by sector reflow process. */ case RRSS_SCRATCH_INVALID_SYNCED: /* * Scratch object was synced back to raidz start offset * on zpool importing, raidz is ready for sector by sector * reflow process. */ case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: ASSERT3U(offset, ==, logical_size); break; /* * Sector by sector reflow process started. */ case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: ASSERT3U(offset, >=, logical_size); break; } out: spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&ztest_vdev_lock); ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; spa_close(spa, FTAG); kernel_fini(); } static void ztest_scratch_thread(void *arg) { (void) arg; /* wait up to 10 seconds */ for (int t = 100; t > 0; t -= 1) { if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) thread_exit(); (void) poll(NULL, 0, 100); } /* killed when the scratch area progress reached a certain point */ ztest_kill(ztest_shared); } /* * Verify that we can attach raidz device. 
*/ void ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); kthread_t *scratch_thread = NULL; vdev_t *newvd, *pvd; nvlist_t *root; char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); int error, expected_error = 0; mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); /* Only allow attach when raid-kind = 'eraidz' */ if (!ztest_opts.zo_raid_do_expand) { spa_config_exit(spa, SCL_ALL, FTAG); goto out; } if (ztest_opts.zo_mmp_test) { spa_config_exit(spa, SCL_ALL, FTAG); goto out; } if (ztest_device_removal_active) { spa_config_exit(spa, SCL_ALL, FTAG); goto out; } pvd = vdev_lookup_top(spa, 0); ASSERT(pvd->vdev_ops == &vdev_raidz_ops); /* * Get size of a child of the raidz group, * make sure device is a bit bigger */ newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); /* * Get next attached leaf id */ raidz_children = ztest_get_raidz_children(spa); leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; if (spa->spa_raidz_expand) expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; spa_config_exit(spa, SCL_ALL, FTAG); /* * Path to vdev to be attached */ (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); /* * Build the nvlist describing newpath. */ root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, 0, 0, 1); /* * 50% of the time, set raidz_expand_pause_point to cause * raidz_reflow_scratch_sync() to pause at a certain point and * then kill the test after 10 seconds so raidz_scratch_verify() * can confirm consistency when the pool is imported. */ if (ztest_random(2) == 0 && expected_error == 0) { raidz_expand_pause_point = ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); } error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); nvlist_free(root); if (error == EOVERFLOW || error == ENXIO || error == ZFS_ERR_CHECKPOINT_EXISTS || error == ZFS_ERR_DISCARDING_CHECKPOINT) expected_error = error; if (error != 0 && error != expected_error) { fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", newpath, newsize, error, expected_error); } if (raidz_expand_pause_point) { if (error != 0) { /* * Do not verify scratch object in case of error * returned by vdev attaching. */ raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; } VERIFY0(thread_join(scratch_thread)); } out: mutex_exit(&ztest_vdev_lock); umem_free(newpath, MAXPATHLEN); } void ztest_device_removal(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; vdev_t *vd; uint64_t guid; int error; mutex_enter(&ztest_vdev_lock); if (ztest_device_removal_active) { mutex_exit(&ztest_vdev_lock); return; } /* * Remove a random top-level vdev and wait for removal to finish. */ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); guid = vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); error = spa_vdev_remove(spa, guid, B_FALSE); if (error == 0) { ztest_device_removal_active = B_TRUE; mutex_exit(&ztest_vdev_lock); /* * spa->spa_vdev_removal is created in a sync task that * is initiated via dsl_sync_task_nowait(). 
Since the * task may not run before spa_vdev_remove() returns, we * must wait at least 1 txg to ensure that the removal * struct has been created. */ txg_wait_synced(spa_get_dsl(spa), 0); while (spa->spa_removing_phys.sr_state == DSS_SCANNING) txg_wait_synced(spa_get_dsl(spa), 0); } else { mutex_exit(&ztest_vdev_lock); return; } /* * The pool needs to be scrubbed after completing device removal. * Failure to do so may result in checksum errors due to the * strategy employed by ztest_fault_inject() when selecting which * offset are redundant and can be damaged. */ error = spa_scan(spa, POOL_SCAN_SCRUB); if (error == 0) { while (dsl_scan_scrubbing(spa_get_dsl(spa))) txg_wait_synced(spa_get_dsl(spa), 0); } mutex_enter(&ztest_vdev_lock); ztest_device_removal_active = B_FALSE; mutex_exit(&ztest_vdev_lock); } /* * Callback function which expands the physical size of the vdev. */ static vdev_t * grow_vdev(vdev_t *vd, void *arg) { spa_t *spa __maybe_unused = vd->vdev_spa; size_t *newsize = arg; size_t fsize; int fd; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); if ((fd = open(vd->vdev_path, O_RDWR)) == -1) return (vd); fsize = lseek(fd, 0, SEEK_END); VERIFY0(ftruncate(fd, *newsize)); if (ztest_opts.zo_verbose >= 6) { (void) printf("%s grew from %lu to %lu bytes\n", vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); } (void) close(fd); return (NULL); } /* * Callback function which expands a given vdev by calling vdev_online(). */ static vdev_t * online_vdev(vdev_t *vd, void *arg) { (void) arg; spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint64_t guid = vd->vdev_guid; uint64_t generation = spa->spa_config_generation + 1; vdev_state_t newstate = VDEV_STATE_UNKNOWN; int error; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); /* Calling vdev_online will initialize the new metaslabs */ spa_config_exit(spa, SCL_STATE, spa); error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* * If vdev_online returned an error or the underlying vdev_open * failed then we abort the expand. The only way to know that * vdev_open fails is by checking the returned newstate. */ if (error || newstate != VDEV_STATE_HEALTHY) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Unable to expand vdev, state %u, " "error %d\n", newstate, error); } return (vd); } ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); /* * Since we dropped the lock we need to ensure that we're * still talking to the original vdev. It's possible this * vdev may have been detached/replaced while we were * trying to online it. */ if (generation != spa->spa_config_generation) { if (ztest_opts.zo_verbose >= 5) { (void) printf("vdev configuration has changed, " "guid %"PRIu64", state %"PRIu64", " "expected gen %"PRIu64", got gen %"PRIu64"\n", guid, tvd->vdev_state, generation, spa->spa_config_generation); } return (vd); } return (NULL); } /* * Traverse the vdev tree calling the supplied function. * We continue to walk the tree until we either have walked all * children or we receive a non-NULL return from the callback. * If a NULL callback is passed, then we just return back the first * leaf vdev we encounter. 
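* For example, vdev_walk_tree(tvd, NULL, NULL) yields the first leaf
* under tvd, while vdev_walk_tree(tvd, grow_vdev, &newsize) returns the
* first leaf that could not be grown, or NULL if every leaf was grown.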
*/ static vdev_t * vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) { uint_t c; if (vd->vdev_ops->vdev_op_leaf) { if (func == NULL) return (vd); else return (func(vd, arg)); } for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) return (cvd); } return (NULL); } /* * Verify that dynamic LUN growth works as expected. */ void ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; vdev_t *vd, *tvd; metaslab_class_t *mc; metaslab_group_t *mg; size_t psize, newsize; uint64_t top; uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; mutex_enter(&ztest_checkpoint_lock); mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* * If there is a vdev removal in progress, it could complete while * we are running, in which case we would not be able to verify * that the metaslab_class space increased (because it decreases * when the device removal completes). */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } /* * If a raidz expansion is in progress, the test can fail because the * metaslab count will not increase immediately after the vdev is * expanded. It will increase only after the raidz expansion completes. */ if (spa->spa_raidz_expand) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } top = ztest_random_vdev_top(spa, B_TRUE); tvd = spa->spa_root_vdev->vdev_child[top]; mg = tvd->vdev_mg; mc = mg->mg_class; old_ms_count = tvd->vdev_ms_count; old_class_space = metaslab_class_get_space(mc); /* * Determine the size of the first leaf vdev associated with * our top-level device. */ vd = vdev_walk_tree(tvd, NULL, NULL); ASSERT3P(vd, !=, NULL); ASSERT(vd->vdev_ops->vdev_op_leaf); psize = vd->vdev_psize; /* * We only try to expand the vdev if it's healthy, less than 4x its * original size, and it has a valid psize. */ if (tvd->vdev_state != VDEV_STATE_HEALTHY || psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } ASSERT3U(psize, >, 0); newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); ASSERT3U(newsize, >, psize); if (ztest_opts.zo_verbose >= 6) { (void) printf("Expanding LUN %s from %lu to %lu\n", vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); } /* * Growing the vdev is a two-step process: * 1). expand the physical size (i.e. relabel) * 2). online the vdev to create the new metaslabs */ if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || vdev_walk_tree(tvd, online_vdev, NULL) != NULL || tvd->vdev_state != VDEV_STATE_HEALTHY) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not expand LUN because " "the vdev configuration changed.\n"); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } spa_config_exit(spa, SCL_STATE, spa); /* * Expanding the LUN will update the config asynchronously, * thus we must wait for the async thread to complete any * pending tasks before proceeding.
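* The loop below polls spa_async_thread and spa_async_tasks, waiting a
* txg between checks, until the async work has drained.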
*/ for (;;) { boolean_t done; mutex_enter(&spa->spa_async_lock); done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); mutex_exit(&spa->spa_async_lock); if (done) break; txg_wait_synced(spa_get_dsl(spa), 0); (void) poll(NULL, 0, 100); } spa_config_enter(spa, SCL_STATE, spa, RW_READER); tvd = spa->spa_root_vdev->vdev_child[top]; new_ms_count = tvd->vdev_ms_count; new_class_space = metaslab_class_get_space(mc); if (tvd->vdev_mg != mg || mg->mg_class != mc) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not verify LUN expansion due to " "intervening vdev offline or remove.\n"); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } /* * Make sure we were able to grow the vdev. */ if (new_ms_count <= old_ms_count) { fatal(B_FALSE, "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", old_ms_count, new_ms_count); } /* * Make sure we were able to grow the pool. */ if (new_class_space <= old_class_space) { fatal(B_FALSE, "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", old_class_space, new_class_space); } if (ztest_opts.zo_verbose >= 5) { char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); (void) printf("%s grew from %s to %s\n", spa->spa_name, oldnumbuf, newnumbuf); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); } /* * Verify that dmu_objset_{create,destroy,open,close} work as expected. */ static void ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { (void) arg, (void) cr; /* * Create the objects common to all ztest datasets. */ VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); } static int ztest_dataset_create(char *dsname) { int err; uint64_t rand; dsl_crypto_params_t *dcp = NULL; /* * 50% of the time, we create encrypted datasets * using a random cipher suite and a hard-coded * wrapping key. */ rand = ztest_random(2); if (rand != 0) { nvlist_t *crypto_args = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); /* slight bias towards the default cipher suite */ rand = ztest_random(ZIO_CRYPT_FUNCTIONS); if (rand < ZIO_CRYPT_AES_128_CCM) rand = ZIO_CRYPT_ON; fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); fnvlist_add_uint8_array(crypto_args, "wkeydata", (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); /* * These parameters aren't really used by the kernel. They * are simply stored so that userspace knows how to load * the wrapping key. */ fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); fnvlist_add_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, crypto_args, &dcp)); /* * Cycle through all available encryption implementations * to verify interoperability. 
*/ VERIFY0(gcm_impl_set("cycle")); VERIFY0(aes_impl_set("cycle")); fnvlist_free(crypto_args); fnvlist_free(props); } err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, ztest_objset_create_cb, NULL); dsl_crypto_params_free(dcp, !!err); rand = ztest_random(100); if (err || rand < 80) return (err); if (ztest_opts.zo_verbose >= 5) (void) printf("Setting dataset %s to sync always\n", dsname); return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, ZFS_SYNC_ALWAYS, B_FALSE)); } static int ztest_objset_destroy_cb(const char *name, void *arg) { (void) arg; objset_t *os; dmu_object_info_t doi; int error; /* * Verify that the dataset contains a directory object. */ VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, FTAG, &os)); error = dmu_object_info(os, ZTEST_DIROBJ, &doi); if (error != ENOENT) { /* We could have crashed in the middle of destroying it */ ASSERT0(error); ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); ASSERT3S(doi.doi_physical_blocks_512, >=, 0); } dmu_objset_disown(os, B_TRUE, FTAG); /* * Destroy the dataset. */ if (strchr(name, '@') != NULL) { error = dsl_destroy_snapshot(name, B_TRUE); if (error != ECHRNG) { /* * The program was executed, but encountered a runtime * error, such as insufficient slop, or a hold on the * dataset. */ ASSERT0(error); } } else { error = dsl_destroy_head(name); if (error == ENOSPC) { /* There could be checkpoint or insufficient slop */ ztest_record_enospc(FTAG); } else if (error != EBUSY) { /* There could be a hold on this dataset */ ASSERT0(error); } } return (0); } static boolean_t ztest_snapshot_create(char *osname, uint64_t id) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); error = dmu_objset_snapshot_one(osname, snapname); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (B_FALSE); } if (error != 0 && error != EEXIST && error != ECHRNG) { fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, snapname, error); } return (B_TRUE); } static boolean_t ztest_snapshot_destroy(char *osname, uint64_t id) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", osname, id); error = dsl_destroy_snapshot(snapname, B_FALSE); if (error != 0 && error != ENOENT && error != ECHRNG) fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", snapname, error); return (B_TRUE); } void ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) { (void) zd; ztest_ds_t *zdtmp; int iters; int error; objset_t *os, *os2; char name[ZFS_MAX_DATASET_NAME_LEN]; zilog_t *zilog; int i; zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", ztest_opts.zo_pool, id); /* * If this dataset exists from a previous run, process its replay log * half of the time. If we don't replay it, then dsl_destroy_head() * (invoked from ztest_objset_destroy_cb()) should just throw it away. */ if (ztest_random(2) == 0 && ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os) == 0) { ztest_zd_init(zdtmp, NULL, os); zil_replay(os, zdtmp, ztest_replay_vector); ztest_zd_fini(zdtmp); dmu_objset_disown(os, B_TRUE, FTAG); } /* * There may be an old instance of the dataset we're about to * create lying around from a previous run. If so, destroy it * and all of its snapshots. 
*/ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); /* * Verify that the destroyed dataset is no longer in the namespace. * It may still be present if the destroy above fails with ENOSPC. */ error = ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, FTAG, &os); if (error == 0) { dmu_objset_disown(os, B_TRUE, FTAG); ztest_record_enospc(FTAG); goto out; } VERIFY3U(ENOENT, ==, error); /* * Verify that we can create a new dataset. */ error = ztest_dataset_create(name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); } VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os)); ztest_zd_init(zdtmp, NULL, os); /* * Open the intent log for it. */ zilog = zil_open(os, ztest_get_data, NULL); /* * Put some objects in there, do a little I/O to them, * and randomly take a couple of snapshots along the way. */ iters = ztest_random(5); for (i = 0; i < iters; i++) { ztest_dmu_object_alloc_free(zdtmp, id); if (ztest_random(iters) == 0) (void) ztest_snapshot_create(name, i); } /* * Verify that we cannot create an existing dataset. */ VERIFY3U(EEXIST, ==, dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); /* * Verify that we can hold an objset that is also owned. */ VERIFY0(dmu_objset_hold(name, FTAG, &os2)); dmu_objset_rele(os2, FTAG); /* * Verify that we cannot own an objset that is already owned. */ VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os2)); zil_close(zilog); dmu_objset_disown(os, B_TRUE, FTAG); ztest_zd_fini(zdtmp); out: (void) pthread_rwlock_unlock(&ztest_name_lock); umem_free(zdtmp, sizeof (ztest_ds_t)); } /* * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. */ void ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) { (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) ztest_snapshot_destroy(zd->zd_name, id); (void) ztest_snapshot_create(zd->zd_name, id); (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Cleanup non-standard snapshots and clones. 
*/ static void ztest_dsl_dataset_cleanup(char *osname, uint64_t id) { char *snap1name; char *clone1name; char *snap2name; char *clone2name; char *snap3name; int error; snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", osname, id); (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", osname, id); (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", clone1name, id); (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", osname, id); (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", clone1name, id); error = dsl_destroy_head(clone2name); if (error && error != ENOENT) fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); error = dsl_destroy_snapshot(snap3name, B_FALSE); if (error && error != ENOENT) fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", snap3name, error); error = dsl_destroy_snapshot(snap2name, B_FALSE); if (error && error != ENOENT) fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", snap2name, error); error = dsl_destroy_head(clone1name); if (error && error != ENOENT) fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); error = dsl_destroy_snapshot(snap1name, B_FALSE); if (error && error != ENOENT) fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", snap1name, error); umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); } /* * Verify dsl_dataset_promote handles EBUSY */ void ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) { objset_t *os; char *snap1name; char *clone1name; char *snap2name; char *clone2name; char *snap3name; char *osname = zd->zd_name; int error; snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); (void) pthread_rwlock_rdlock(&ztest_name_lock); ztest_dsl_dataset_cleanup(osname, id); (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", osname, id); (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", osname, id); (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", clone1name, id); (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", osname, id); (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", clone1name, id); error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); } error = dmu_objset_clone(clone1name, snap1name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } 
fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); } error = dmu_objset_clone(clone2name, snap3name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); } error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os); if (error) fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); error = dsl_dataset_promote(clone2name, NULL); if (error == ENOSPC) { dmu_objset_disown(os, B_TRUE, FTAG); ztest_record_enospc(FTAG); goto out; } if (error != EBUSY) fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, error); dmu_objset_disown(os, B_TRUE, FTAG); out: ztest_dsl_dataset_cleanup(osname, id); (void) pthread_rwlock_unlock(&ztest_name_lock); umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 4 /* * Verify that dmu_object_{alloc,free} work as expected. */ void ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) { ztest_od_t *od; int batchsize; int size; int b; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); batchsize = OD_ARRAY_SIZE; for (b = 0; b < batchsize; b++) ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0, 0); /* * Destroy the previous batch of objects, create a new batch, * and do some I/O on the new objects. */ if (ztest_object_init(zd, od, size, B_TRUE) != 0) { zd->zd_od = NULL; umem_free(od, size); return; } while (ztest_random(4 * batchsize) != 0) ztest_io(zd, od[ztest_random(batchsize)].od_object, ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); umem_free(od, size); } /* * Rewind the global allocator to verify object allocation backfilling. */ void ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) { (void) id; objset_t *os = zd->zd_os; uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; uint64_t object; /* * Rewind the global allocator randomly back to a lower object number * to force backfilling and reclamation of recently freed dnodes. */ mutex_enter(&os->os_obj_lock); object = ztest_random(os->os_obj_next_chunk); os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t); mutex_exit(&os->os_obj_lock); } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 2 /* * Verify that dmu_{read,write} work as expected. */ void ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) { int size; ztest_od_t *od; objset_t *os = zd->zd_os; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); dmu_tx_t *tx; int freeit, error; uint64_t i, n, s, txg; bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 40; int free_percent = 5; /* * This test uses two objects, packobj and bigobj, that are always * updated together (i.e. in the same tx) so that their contents are * in sync and can be compared. 
Their contents relate to each other * in a simple way: packobj is a dense array of 'bufwad' structures, * while bigobj is a sparse array of the same bufwads. Specifically, * for any index n, there are three bufwads that should be identical: * * packobj, at offset n * sizeof (bufwad_t) * bigobj, at the head of the nth chunk * bigobj, at the tail of the nth chunk * * The chunk size is arbitrary. It doesn't have to be a power of two, * and it doesn't have any relation to the object blocksize. * The only requirement is that it can hold at least two bufwads. * * Normally, we write the bufwad to each of these locations. * However, free_percent of the time we instead write zeroes to * packobj and perform a dmu_free_range() on bigobj. By comparing * bigobj to packobj, we can verify that the DMU is correctly * tracking which parts of an object are allocated and free, * and that the contents of the allocated blocks are correct. */ /* * Read the directory info. If it's the first time, set things up. */ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); if (ztest_object_init(zd, od, size, B_FALSE) != 0) { umem_free(od, size); return; } bigobj = od[0].od_object; packobj = od[1].od_object; chunksize = od[0].od_gen; ASSERT3U(chunksize, ==, od[1].od_gen); /* * Prefetch a random chunk of the big object. * Our aim here is to get some async reads in flight * for blocks that we may free below; the DMU should * handle this race correctly. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(2 * width - 1); dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, ZIO_PRIORITY_SYNC_READ); /* * Pick a random index and compute the offsets into packobj and bigobj. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(width - 1); packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); bigoff = n * chunksize; bigsize = s * chunksize; packbuf = umem_alloc(packsize, UMEM_NOFAIL); bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); /* * free_percent of the time, free a range of bigobj rather than * overwriting it. */ freeit = (ztest_random(100) < free_percent); /* * Read the current contents of our objects. */ error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); ASSERT0(error); /* * Get a tx for the mods to both packobj and bigobj. */ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, packobj, packoff, packsize); if (freeit) dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); else dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); /* This accounts for setting the checksum/compression. */ dmu_tx_hold_bonus(tx, bigobj); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(od, size); return; } enum zio_checksum cksum; do { cksum = (enum zio_checksum) ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); dmu_object_set_checksum(os, bigobj, cksum, tx); enum zio_compress comp; do { comp = (enum zio_compress) ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); dmu_object_set_compress(os, bigobj, comp, tx); /* * For each index from n to n + s, verify that the existing bufwad * in packobj matches the bufwads at the head and tail of the * corresponding chunk in bigobj. 
Then update all three bufwads * with the new values we want to write out. */ for (i = 0; i < s; i++) { /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); if (pack->bw_txg > txg) fatal(B_FALSE, "future leak: got %"PRIx64", open txg is %"PRIx64"", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) fatal(B_FALSE, "wrong index: " "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", pack->bw_index, n, i); if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) fatal(B_FALSE, "pack/bigH mismatch in %p/%p", pack, bigH); if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) fatal(B_FALSE, "pack/bigT mismatch in %p/%p", pack, bigT); if (freeit) { memset(pack, 0, sizeof (bufwad_t)); } else { pack->bw_index = n + i; pack->bw_txg = txg; pack->bw_data = 1 + ztest_random(-2ULL); } *bigH = *pack; *bigT = *pack; } /* * We've verified all the old bufwads, and made new ones. * Now write them out. */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (freeit) { if (ztest_opts.zo_verbose >= 7) { (void) printf("freeing offset %"PRIx64" size %"PRIx64"" " txg %"PRIx64"\n", bigoff, bigsize, txg); } VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); } else { if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %"PRIx64" size %"PRIx64"" " txg %"PRIx64"\n", bigoff, bigsize, txg); } dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); } dmu_tx_commit(tx); /* * Sanity check the stuff we just wrote. */ { void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT0(memcmp(packbuf, packcheck, packsize)); ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); } umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(od, size); } static void compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) { uint64_t i; bufwad_t *pack; bufwad_t *bigH; bufwad_t *bigT; /* * For each index from n to n + s, verify that the existing bufwad * in packobj matches the bufwads at the head and tail of the * corresponding chunk in bigobj. Then update all three bufwads * with the new values we want to write out. 
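 *
 * Unlike the inline verification loop in ztest_dmu_read_write() above,
 * this helper has no dmu_free_range() variant: it always stamps fresh
 * index/txg/data values into every bufwad.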
*/ for (i = 0; i < s; i++) { /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); if (pack->bw_txg > txg) fatal(B_FALSE, "future leak: got %"PRIx64", open txg is %"PRIx64"", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) fatal(B_FALSE, "wrong index: " "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", pack->bw_index, n, i); if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) fatal(B_FALSE, "pack/bigH mismatch in %p/%p", pack, bigH); if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) fatal(B_FALSE, "pack/bigT mismatch in %p/%p", pack, bigT); pack->bw_index = n + i; pack->bw_txg = txg; pack->bw_data = 1 + ztest_random(-2ULL); *bigH = *pack; *bigT = *pack; } } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 2 void ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; dmu_tx_t *tx; uint64_t i; int error; int size; uint64_t n, s, txg; bufwad_t *packbuf, *bigbuf; uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; uint64_t blocksize = ztest_random_blocksize(); uint64_t chunksize = blocksize; uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 9; dmu_buf_t *bonus_db; arc_buf_t **bigbuf_arcbufs; dmu_object_info_t doi; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); /* * This test uses two objects, packobj and bigobj, that are always * updated together (i.e. in the same tx) so that their contents are * in sync and can be compared. Their contents relate to each other * in a simple way: packobj is a dense array of 'bufwad' structures, * while bigobj is a sparse array of the same bufwads. Specifically, * for any index n, there are three bufwads that should be identical: * * packobj, at offset n * sizeof (bufwad_t) * bigobj, at the head of the nth chunk * bigobj, at the tail of the nth chunk * * The chunk size is set equal to bigobj block size so that * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. */ /* * Read the directory info. If it's the first time, set things up. */ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); if (ztest_object_init(zd, od, size, B_FALSE) != 0) { umem_free(od, size); return; } bigobj = od[0].od_object; packobj = od[1].od_object; blocksize = od[0].od_blocksize; chunksize = blocksize; ASSERT3U(chunksize, ==, od[1].od_gen); VERIFY0(dmu_object_info(os, bigobj, &doi)); VERIFY(ISP2(doi.doi_data_block_size)); VERIFY3U(chunksize, ==, doi.doi_data_block_size); VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); /* * Pick a random index and compute the offsets into packobj and bigobj. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(width - 1); packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); bigoff = n * chunksize; bigsize = s * chunksize; packbuf = umem_zalloc(packsize, UMEM_NOFAIL); bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); /* * Iteration 0 test zcopy for DB_UNCACHED dbufs. * Iteration 1 test zcopy to already referenced dbufs. * Iteration 2 test zcopy to dirty dbuf in the same txg. 
* Iteration 3 test zcopy to dbuf dirty in previous txg. * Iteration 4 test zcopy when dbuf is no longer dirty. * Iteration 5 test zcopy when it can't be done. * Iteration 6 one more zcopy write. */ for (i = 0; i < 7; i++) { uint64_t j; uint64_t off; /* * In iteration 5 (i == 5) use arcbufs * that don't match bigobj blksz to test * dmu_assign_arcbuf_by_dbuf() when it can't directly * assign an arcbuf to a dbuf. */ for (j = 0; j < s; j++) { if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bigbuf_arcbufs[j] = dmu_request_arcbuf(bonus_db, chunksize); } else { bigbuf_arcbufs[2 * j] = dmu_request_arcbuf(bonus_db, chunksize / 2); bigbuf_arcbufs[2 * j + 1] = dmu_request_arcbuf(bonus_db, chunksize / 2); } } /* * Get a tx for the mods to both packobj and bigobj. */ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, packobj, packoff, packsize); dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); for (j = 0; j < s; j++) { if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_return_arcbuf(bigbuf_arcbufs[j]); } else { dmu_return_arcbuf( bigbuf_arcbufs[2 * j]); dmu_return_arcbuf( bigbuf_arcbufs[2 * j + 1]); } } umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); umem_free(od, size); dmu_buf_rele(bonus_db, FTAG); return; } /* * 50% of the time don't read objects in the 1st iteration to * test dmu_assign_arcbuf_by_dbuf() for the case when there are * no existing dbufs for the specified offsets. */ if (i != 0 || ztest_random(2) != 0) { error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); ASSERT0(error); } compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, n, chunksize, txg); /* * We've verified all the old bufwads, and made new ones. * Now write them out. */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %"PRIx64" size %"PRIx64"" " txg %"PRIx64"\n", bigoff, bigsize, txg); } for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { memcpy(bigbuf_arcbufs[j]->b_data, (caddr_t)bigbuf + (off - bigoff), chunksize); } else { memcpy(bigbuf_arcbufs[2 * j]->b_data, (caddr_t)bigbuf + (off - bigoff), chunksize / 2); memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, (caddr_t)bigbuf + (off - bigoff) + chunksize / 2, chunksize / 2); } if (i == 1) { VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off, bigbuf_arcbufs[j], tx)); } else { VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off, bigbuf_arcbufs[2 * j], tx)); VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off + chunksize / 2, bigbuf_arcbufs[2 * j + 1], tx)); } if (i == 1) { dmu_buf_rele(dbt, FTAG); } } dmu_tx_commit(tx); /* * Sanity check the stuff we just wrote. 
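 *
 * Both objects are read back in full and compared byte-for-byte against
 * the buffers we just wrote, so any data lost or corrupted by the
 * dmu_assign_arcbuf_by_dbuf() path is caught immediately.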
*/ { void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT0(memcmp(packbuf, packcheck, packsize)); ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); } if (i == 2) { txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); } else if (i == 3) { txg_wait_synced(dmu_objset_pool(os), 0); } } dmu_buf_rele(bonus_db, FTAG); umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); umem_free(od, size); } void ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) { (void) id; ztest_od_t *od; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); uint64_t offset = (1ULL << (ztest_random(20) + 43)) + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); /* * Have multiple threads write to large offsets in an object * to verify that parallel writes to an object -- even to the * same blocks within the object -- doesn't cause any trouble. */ ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) return; while (ztest_random(10) != 0) ztest_io(zd, od->od_object, offset); umem_free(od, sizeof (ztest_od_t)); } void ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) { ztest_od_t *od; uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); uint64_t count = ztest_random(20) + 1; uint64_t blocksize = ztest_random_blocksize(); void *data; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } ztest_prealloc(zd, od->od_object, offset, count * blocksize); data = umem_zalloc(blocksize, UMEM_NOFAIL); while (ztest_random(count) != 0) { uint64_t randoff = offset + (ztest_random(count) * blocksize); if (ztest_write(zd, od->od_object, randoff, blocksize, data) != 0) break; while (ztest_random(4) != 0) ztest_io(zd, od->od_object, randoff); } umem_free(data, blocksize); umem_free(od, sizeof (ztest_od_t)); } /* * Verify that zap_{create,destroy,add,remove,update} work as expected. */ #define ZTEST_ZAP_MIN_INTS 1 #define ZTEST_ZAP_MAX_INTS 4 #define ZTEST_ZAP_MAX_PROPS 1000 void ztest_zap(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t object; uint64_t txg, last_txg; uint64_t value[ZTEST_ZAP_MAX_INTS]; uint64_t zl_ints, zl_intsize, prop; int i, ints; dmu_tx_t *tx; char propname[100], txgname[100]; int error; const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) goto out; object = od->od_object; /* * Generate a known hash collision, and verify that * we can lookup and remove both entries. 
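 *
 * The two names in hc[] are chosen to collide in the ZAP hash, so adding,
 * querying and removing both of them exercises the collision-handling
 * paths in zap_add(), zap_length() and zap_remove().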
*/ tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; for (i = 0; i < 2; i++) { value[i] = i; VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); } for (i = 0; i < 2; i++) { VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); VERIFY0( zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); } for (i = 0; i < 2; i++) { VERIFY0(zap_remove(os, object, hc[i], tx)); } dmu_tx_commit(tx); /* * Generate a bunch of random entries. */ ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); prop = ztest_random(ZTEST_ZAP_MAX_PROPS); (void) sprintf(propname, "prop_%"PRIu64"", prop); (void) sprintf(txgname, "txg_%"PRIu64"", prop); memset(value, 0, sizeof (value)); last_txg = 0; /* * If these zap entries already exist, validate their contents. */ error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); if (error == 0) { ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); VERIFY0(zap_lookup(os, object, txgname, zl_intsize, zl_ints, &last_txg)); VERIFY0(zap_length(os, object, propname, &zl_intsize, &zl_ints)); ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, ints); VERIFY0(zap_lookup(os, object, propname, zl_intsize, zl_ints, value)); for (i = 0; i < ints; i++) { ASSERT3U(value[i], ==, last_txg + object + i); } } else { ASSERT3U(error, ==, ENOENT); } /* * Atomically update two entries in our zap object. * The first is named txg_%llu, and contains the txg * in which the property was last updated. The second * is named prop_%llu, and the nth element of its value * should be txg + object + n. */ tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; if (last_txg > txg) fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", last_txg, txg); for (i = 0; i < ints; i++) value[i] = txg + object + i; VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx)); VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), ints, value, tx)); dmu_tx_commit(tx); /* * Remove a random pair of entries. */ prop = ztest_random(ZTEST_ZAP_MAX_PROPS); (void) sprintf(propname, "prop_%"PRIu64"", prop); (void) sprintf(txgname, "txg_%"PRIu64"", prop); error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); if (error == ENOENT) goto out; ASSERT0(error); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; VERIFY0(zap_remove(os, object, txgname, tx)); VERIFY0(zap_remove(os, object, propname, tx)); dmu_tx_commit(tx); out: umem_free(od, sizeof (ztest_od_t)); } /* * Test case to test the upgrading of a microzap to fatzap. */ void ztest_fzap(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t object, txg, value; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) goto out; object = od->od_object; /* * Add entries to this ZAP and make sure it spills over * and gets upgraded to a fatzap. Also, since we are adding * 2050 entries we should see ptrtbl growth and leaf-block split. 
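 *
 * (A microzap must fit in a single block and each microzap entry occupies
 * 64 bytes, so even a maximally sized microzap block holds fewer than
 * 2050 entries; the upgrade is therefore guaranteed.)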
*/ for (value = 0; value < 2050; value++) { char name[ZFS_MAX_DATASET_NAME_LEN]; dmu_tx_t *tx; int error; (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", id, value); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, name); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; error = zap_add(os, object, name, sizeof (uint64_t), 1, &value, tx); ASSERT(error == 0 || error == EEXIST); dmu_tx_commit(tx); } out: umem_free(od, sizeof (ztest_od_t)); } void ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) { (void) id; objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; dmu_tx_t *tx; int i, namelen, error; int micro = ztest_random(2); char name[20], string_value[20]; void *data; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } object = od->od_object; /* * Generate a random name of the form 'xxx.....' where each * x is a random printable character and the dots are dots. * There are 94 such characters, and the name length goes from * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. */ namelen = ztest_random(sizeof (name) - 5) + 5 + 1; for (i = 0; i < 3; i++) name[i] = '!' + ztest_random('~' - '!' + 1); for (; i < namelen - 1; i++) name[i] = '.'; name[i] = '\0'; if ((namelen & 1) || micro) { wsize = sizeof (txg); wc = 1; data = &txg; } else { wsize = 1; wc = namelen; data = string_value; } count = -1ULL; VERIFY0(zap_count(os, object, &count)); ASSERT3S(count, !=, -1ULL); /* * Select an operation: length, lookup, add, update, remove. */ i = ztest_random(5); if (i >= 2) { tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(od, sizeof (ztest_od_t)); return; } memcpy(string_value, name, namelen); } else { tx = NULL; txg = 0; memset(string_value, 0, namelen); } switch (i) { case 0: error = zap_length(os, object, name, &zl_wsize, &zl_wc); if (error == 0) { ASSERT3U(wsize, ==, zl_wsize); ASSERT3U(wc, ==, zl_wc); } else { ASSERT3U(error, ==, ENOENT); } break; case 1: error = zap_lookup(os, object, name, wsize, wc, data); if (error == 0) { if (data == string_value && memcmp(name, data, namelen) != 0) fatal(B_FALSE, "name '%s' != val '%s' len %d", name, (char *)data, namelen); } else { ASSERT3U(error, ==, ENOENT); } break; case 2: error = zap_add(os, object, name, wsize, wc, data, tx); ASSERT(error == 0 || error == EEXIST); break; case 3: VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); break; case 4: error = zap_remove(os, object, name, tx); ASSERT(error == 0 || error == ENOENT); break; } if (tx != NULL) dmu_tx_commit(tx); umem_free(od, sizeof (ztest_od_t)); } /* * Commit callback data. 
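 *
 * One of these is allocated per registered callback: zcd_txg is the txg
 * the transaction was assigned, zcd_expected_err the error we expect the
 * callback to be invoked with, zcd_added whether it was linked onto
 * zcl.zcl_callbacks, and zcd_called is set by ztest_commit_callback()
 * once the DMU actually calls it.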
*/ typedef struct ztest_cb_data { list_node_t zcd_node; uint64_t zcd_txg; int zcd_expected_err; boolean_t zcd_added; boolean_t zcd_called; spa_t *zcd_spa; } ztest_cb_data_t; /* This is the actual commit callback function */ static void ztest_commit_callback(void *arg, int error) { ztest_cb_data_t *data = arg; uint64_t synced_txg; VERIFY3P(data, !=, NULL); VERIFY3S(data->zcd_expected_err, ==, error); VERIFY(!data->zcd_called); synced_txg = spa_last_synced_txg(data->zcd_spa); if (data->zcd_txg > synced_txg) fatal(B_FALSE, "commit callback of txg %"PRIu64" called prematurely, " "last synced txg = %"PRIu64"\n", data->zcd_txg, synced_txg); data->zcd_called = B_TRUE; if (error == ECANCELED) { ASSERT0(data->zcd_txg); ASSERT(!data->zcd_added); /* * The private callback data should be destroyed here, but * since we are going to check the zcd_called field after * dmu_tx_abort(), we will destroy it there. */ return; } ASSERT(data->zcd_added); ASSERT3U(data->zcd_txg, !=, 0); (void) mutex_enter(&zcl.zcl_callbacks_lock); /* See if this cb was called more quickly */ if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) zc_min_txg_delay = synced_txg - data->zcd_txg; /* Remove our callback from the list */ list_remove(&zcl.zcl_callbacks, data); (void) mutex_exit(&zcl.zcl_callbacks_lock); umem_free(data, sizeof (ztest_cb_data_t)); } /* Allocate and initialize callback data structure */ static ztest_cb_data_t * ztest_create_cb_data(objset_t *os, uint64_t txg) { ztest_cb_data_t *cb_data; cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); cb_data->zcd_txg = txg; cb_data->zcd_spa = dmu_objset_spa(os); list_link_init(&cb_data->zcd_node); return (cb_data); } /* * Commit callback test. */ void ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; dmu_tx_t *tx; ztest_cb_data_t *cb_data[3], *tmp_cb; uint64_t old_txg, txg; int i, error = 0; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } tx = dmu_tx_create(os); cb_data[0] = ztest_create_cb_data(os, 0); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); /* Every once in a while, abort the transaction on purpose */ if (ztest_random(100) == 0) error = -1; if (!error) error = dmu_tx_assign(tx, TXG_NOWAIT); txg = error ? 0 : dmu_tx_get_txg(tx); cb_data[0]->zcd_txg = txg; cb_data[1] = ztest_create_cb_data(os, txg); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); if (error) { /* * It's not a strict requirement to call the registered * callbacks from inside dmu_tx_abort(), but that's what * it's supposed to happen in the current implementation * so we will check for that. */ for (i = 0; i < 2; i++) { cb_data[i]->zcd_expected_err = ECANCELED; VERIFY(!cb_data[i]->zcd_called); } dmu_tx_abort(tx); for (i = 0; i < 2; i++) { VERIFY(cb_data[i]->zcd_called); umem_free(cb_data[i], sizeof (ztest_cb_data_t)); } umem_free(od, sizeof (ztest_od_t)); return; } cb_data[2] = ztest_create_cb_data(os, txg); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); /* * Read existing data to make sure there isn't a future leak. 
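 *
 * A "future leak" here means finding an on-disk txg value newer than the
 * txg we were just assigned, which would indicate that a write from a
 * later txg became visible too early.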
*/ VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), &old_txg, DMU_READ_PREFETCH)); if (old_txg > txg) fatal(B_FALSE, "future leak: got %"PRIu64", open txg is %"PRIu64"", old_txg, txg); dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); (void) mutex_enter(&zcl.zcl_callbacks_lock); /* * Since commit callbacks don't have any ordering requirement and since * it is theoretically possible for a commit callback to be called * after an arbitrary amount of time has elapsed since its txg has been * synced, it is difficult to reliably determine whether a commit * callback hasn't been called due to high load or due to a flawed * implementation. * * In practice, we will assume that if after a certain number of txgs a * commit callback hasn't been called, then most likely there's an * implementation bug.. */ tmp_cb = list_head(&zcl.zcl_callbacks); if (tmp_cb != NULL && tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { fatal(B_FALSE, "Commit callback threshold exceeded, " "oldest txg: %"PRIu64", open txg: %"PRIu64"\n", tmp_cb->zcd_txg, txg); } /* * Let's find the place to insert our callbacks. * * Even though the list is ordered by txg, it is possible for the * insertion point to not be the end because our txg may already be * quiescing at this point and other callbacks in the open txg * (from other objsets) may have sneaked in. */ tmp_cb = list_tail(&zcl.zcl_callbacks); while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); /* Add the 3 callbacks to the list */ for (i = 0; i < 3; i++) { if (tmp_cb == NULL) list_insert_head(&zcl.zcl_callbacks, cb_data[i]); else list_insert_after(&zcl.zcl_callbacks, tmp_cb, cb_data[i]); cb_data[i]->zcd_added = B_TRUE; VERIFY(!cb_data[i]->zcd_called); tmp_cb = cb_data[i]; } zc_cb_counter += 3; (void) mutex_exit(&zcl.zcl_callbacks_lock); dmu_tx_commit(tx); umem_free(od, sizeof (ztest_od_t)); } /* * Visit each object in the dataset. Verify that its properties * are consistent what was stored in the block tag when it was created, * and that its unused bonus buffer space has not been overwritten. 
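 *
 * Only objects whose bonus buffer is large enough to hold a block tag and
 * whose tag carries BT_MAGIC are checked; all other objects are skipped.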
*/ void ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) { (void) id; objset_t *os = zd->zd_os; uint64_t obj; int err = 0; for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { ztest_block_tag_t *bt = NULL; dmu_object_info_t doi; dmu_buf_t *db; ztest_object_lock(zd, obj, ZTRL_READER); if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { ztest_object_unlock(zd, obj); continue; } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_size >= sizeof (*bt)) bt = ztest_bt_bonus(db); if (bt && bt->bt_magic == BT_MAGIC) { ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, bt->bt_offset, bt->bt_gen, bt->bt_txg, bt->bt_crtxg); ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); } dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, obj); } } void ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) { (void) id; zfs_prop_t proplist[] = { ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_COPIES, ZFS_PROP_DEDUP }; (void) pthread_rwlock_rdlock(&ztest_name_lock); for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) { int error = ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); ASSERT(error == 0 || error == ENOSPC); } int error = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, ztest_random_blocksize(), (int)ztest_random(2)); ASSERT(error == 0 || error == ENOSPC); (void) pthread_rwlock_unlock(&ztest_name_lock); } void ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; nvlist_t *props = NULL; (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); VERIFY0(spa_prop_get(ztest_spa, &props)); if (ztest_opts.zo_verbose >= 6) dump_nvlist(props, 4); fnvlist_free(props); (void) pthread_rwlock_unlock(&ztest_name_lock); } static int user_release_one(const char *snapname, const char *holdname) { nvlist_t *snaps, *holds; int error; snaps = fnvlist_alloc(); holds = fnvlist_alloc(); fnvlist_add_boolean(holds, holdname); fnvlist_add_nvlist(snaps, snapname, holds); fnvlist_free(holds); error = dsl_dataset_user_release(snaps, NULL); fnvlist_free(snaps); return (error); } /* * Test snapshot hold/release and deferred destroy. */ void ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) { int error; objset_t *os = zd->zd_os; objset_t *origin; char snapname[100]; char fullname[100]; char clonename[100]; char tag[100]; char osname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *holds; (void) pthread_rwlock_rdlock(&ztest_name_lock); dmu_objset_name(os, osname); (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", osname, id); (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); /* * Clean up from any previous run. */ error = dsl_destroy_head(clonename); if (error != ENOENT) ASSERT0(error); error = user_release_one(fullname, tag); if (error != ESRCH && error != ENOENT) ASSERT0(error); error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != ENOENT) ASSERT0(error); /* * Create snapshot, clone it, mark snap for deferred destroy, * destroy clone, verify snap was also destroyed. 
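 *
 * With deferred destroy the snapshot lingers only while the clone still
 * depends on it; destroying the clone removes that last dependency, so
 * the snapshot should vanish on its own, which the final dmu_objset_hold()
 * confirms by expecting ENOENT.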
*/ error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); goto out; } fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); } error = dmu_objset_clone(clonename, fullname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_clone"); goto out; } fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = dsl_destroy_head(clonename); if (error) fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); error = dmu_objset_hold(fullname, FTAG, &origin); if (error != ENOENT) fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); /* * Create snapshot, add temporary hold, verify that we can't * destroy a held snapshot, mark for deferred destroy, * release hold, verify snapshot was destroyed. */ error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); goto out; } fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); } holds = fnvlist_alloc(); fnvlist_add_string(holds, fullname, tag); error = dsl_dataset_user_hold(holds, 0, NULL); fnvlist_free(holds); if (error == ENOSPC) { ztest_record_enospc("dsl_dataset_user_hold"); goto out; } else if (error) { fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", fullname, tag, error); } error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != EBUSY) { fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", fullname, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = user_release_one(fullname, tag); if (error) fatal(B_FALSE, "user_release_one(%s, %s) = %d", fullname, tag, error); VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); out: (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Inject random faults into the on-disk data. */ void ztest_fault_inject(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; int fd; uint64_t offset; uint64_t leaves; uint64_t bad = 0x1990c0ffeedecadeull; uint64_t top, leaf; uint64_t raidz_children; char *path0; char *pathrand; size_t fsize; int bshift = SPA_MAXBLOCKSHIFT + 2; int iters = 1000; int maxfaults; int mirror_save; vdev_t *vd0 = NULL; uint64_t guid0 = 0; boolean_t islog = B_FALSE; boolean_t injected = B_FALSE; path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); /* * Device removal is in progress, fault injection must be disabled * until it completes and the pool is scrubbed. The fault injection * strategy for damaging blocks does not take in to account evacuated * blocks which may have already been damaged. */ if (ztest_device_removal_active) goto out; /* * The fault injection strategy for damaging blocks cannot be used * if raidz expansion is in progress. The leaves value * (attached raidz children) is variable and strategy for damaging * blocks will corrupt same data blocks on different child vdevs * because of the reflow process. */ if (spa->spa_raidz_expand != NULL) goto out; maxfaults = MAXFAULTS(zs); raidz_children = ztest_get_raidz_children(spa); leaves = MAX(zs->zs_mirrors, 1) * raidz_children; mirror_save = zs->zs_mirrors; ASSERT3U(leaves, >=, 1); /* * While ztest is running the number of leaves will not change. 
This * is critical for the fault injection logic as it determines where * errors can be safely injected such that they are always repairable. * * When restarting ztest a different number of leaves may be requested * which will shift the regions to be damaged. This is fine as long * as the pool has been scrubbed prior to using the new mapping. * Failure to do can result in non-repairable damage being injected. */ if (ztest_pool_scrubbed == B_FALSE) goto out; /* * Grab the name lock as reader. There are some operations * which don't like to have their vdevs changed while * they are in progress (i.e. spa_change_guid). Those * operations will have grabbed the name lock as writer. */ (void) pthread_rwlock_rdlock(&ztest_name_lock); /* * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (ztest_random(2) == 0) { /* * Inject errors on a normal data device or slog device. */ top = ztest_random_vdev_top(spa, B_TRUE); leaf = ztest_random(leaves) + zs->zs_splits; /* * Generate paths to the first leaf in this top-level vdev, * and to the random leaf we selected. We'll induce transient * write failures and random online/offline activity on leaf 0, * and we'll write random garbage to the randomly chosen leaf. */ (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + zs->zs_splits); (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + leaf); vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); if (vd0 != NULL && vd0->vdev_top->vdev_islog) islog = B_TRUE; /* * If the top-level vdev needs to be resilvered * then we only allow faults on the device that is * resilvering. */ if (vd0 != NULL && maxfaults != 1 && (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || vd0->vdev_resilver_txg != 0)) { /* * Make vd0 explicitly claim to be unreadable, * or unwritable, or reach behind its back * and close the underlying fd. We can do this if * maxfaults == 0 because we'll fail and reexecute, * and we can do it if maxfaults >= 2 because we'll * have enough redundancy. If maxfaults == 1, the * combination of this with injection of random data * corruption below exceeds the pool's fault tolerance. */ vdev_file_t *vf = vd0->vdev_tsd; zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", (long long)vd0->vdev_id, (int)maxfaults); if (vf != NULL && ztest_random(3) == 0) { (void) close(vf->vf_file->f_fd); vf->vf_file->f_fd = -1; } else if (ztest_random(2) == 0) { vd0->vdev_cant_read = B_TRUE; } else { vd0->vdev_cant_write = B_TRUE; } guid0 = vd0->vdev_guid; } } else { /* * Inject errors on an l2cache device. */ spa_aux_vdev_t *sav = &spa->spa_l2cache; if (sav->sav_count == 0) { spa_config_exit(spa, SCL_STATE, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); goto out; } vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; guid0 = vd0->vdev_guid; (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); leaf = 0; leaves = 1; maxfaults = INT_MAX; /* no limit on cache devices */ } spa_config_exit(spa, SCL_STATE, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); /* * If we can tolerate two or more faults, or we're dealing * with a slog, randomly online/offline vd0. */ if ((maxfaults >= 2 || islog) && guid0 != 0) { if (ztest_random(10) < 6) { int flags = (ztest_random(2) == 0 ? 
ZFS_OFFLINE_TEMPORARY : 0); /* * We have to grab the zs_name_lock as writer to * prevent a race between offlining a slog and * destroying a dataset. Offlining the slog will * grab a reference on the dataset which may cause * dsl_destroy_head() to fail with EBUSY thus * leaving the dataset in an inconsistent state. */ if (islog) (void) pthread_rwlock_wrlock(&ztest_name_lock); VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); if (islog) (void) pthread_rwlock_unlock(&ztest_name_lock); } else { /* * Ideally we would like to be able to randomly * call vdev_[on|off]line without holding locks * to force unpredictable failures but the side * effects of vdev_[on|off]line prevent us from * doing so. */ (void) vdev_online(spa, guid0, 0, NULL); } } if (maxfaults == 0) goto out; /* * We have at least single-fault tolerance, so inject data corruption. */ fd = open(pathrand, O_RDWR); if (fd == -1) /* we hit a gap in the device namespace */ goto out; fsize = lseek(fd, 0, SEEK_END); while (--iters != 0) { /* * The offset must be chosen carefully to ensure that * we do not inject a given logical block with errors * on two different leaf devices, because ZFS can not * tolerate that (if maxfaults==1). * * To achieve this we divide each leaf device into * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). * Each chunk is further divided into error-injection * ranges (can accept errors) and clear ranges (we do * not inject errors in those). Each error-injection * range can accept errors only for a single leaf vdev. * Error-injection ranges are separated by clear ranges. * * For example, with 3 leaves, each chunk looks like: * 0 to 32M: injection range for leaf 0 * 32M to 64M: clear range - no injection allowed * 64M to 96M: injection range for leaf 1 * 96M to 128M: clear range - no injection allowed * 128M to 160M: injection range for leaf 2 * 160M to 192M: clear range - no injection allowed * * Each clear range must be large enough such that a * single block cannot straddle it. This way a block * can't be a target in two different injection ranges * (on different leaf vdevs). */ offset = ztest_random(fsize / (leaves << bshift)) * (leaves << bshift) + (leaf << bshift) + (ztest_random(1ULL << (bshift - 1)) & -8ULL); /* * Only allow damage to the labels at one end of the vdev. * * If all labels are damaged, the device will be totally * inaccessible, which will result in loss of data, * because we also damage (parts of) the other side of * the mirror/raidz. * * Additionally, we will always have both an even and an * odd label, so that we can handle crashes in the * middle of vdev_config_sync(). */ if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) continue; /* * The two end labels are stored at the "end" of the disk, but * the end of the disk (vdev_psize) is aligned to * sizeof (vdev_label_t). 
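 *
 * Recompute that aligned size here so the end-label exclusion below is
 * based on where the labels actually live rather than on the raw size of
 * the backing file.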
*/ uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t), uint64_t); if ((leaf & 1) == 1 && offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; if (mirror_save != zs->zs_mirrors) { (void) close(fd); goto out; } if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) fatal(B_TRUE, "can't inject bad word at 0x%"PRIx64" in %s", offset, pathrand); if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," " offset 0x%"PRIx64"\n", pathrand, offset); injected = B_TRUE; } (void) close(fd); out: mutex_exit(&ztest_vdev_lock); if (injected && ztest_opts.zo_raid_do_expand) { int error = spa_scan(spa, POOL_SCAN_SCRUB); if (error == 0) { while (dsl_scan_scrubbing(spa_get_dsl(spa))) txg_wait_synced(spa_get_dsl(spa), 0); } } umem_free(path0, MAXPATHLEN); umem_free(pathrand, MAXPATHLEN); } /* * By design ztest will never inject uncorrectable damage in to the pool. * Issue a scrub, wait for it to complete, and verify there is never any * persistent damage. * * Only after a full scrub has been completed is it safe to start injecting * data corruption. See the comment in zfs_fault_inject(). */ static int ztest_scrub_impl(spa_t *spa) { int error = spa_scan(spa, POOL_SCAN_SCRUB); if (error) return (error); while (dsl_scan_scrubbing(spa_get_dsl(spa))) txg_wait_synced(spa_get_dsl(spa), 0); if (spa_approx_errlog_size(spa) > 0) return (ECKSUM); ztest_pool_scrubbed = B_TRUE; return (0); } /* * Scrub the pool. */ void ztest_scrub(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; int error; /* * Scrub in progress by device removal. */ if (ztest_device_removal_active) return; /* * Start a scrub, wait a moment, then force a restart. */ (void) spa_scan(spa, POOL_SCAN_SCRUB); (void) poll(NULL, 0, 100); error = ztest_scrub_impl(spa); if (error == EBUSY) error = 0; ASSERT0(error); } /* * Change the guid for the pool. 
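 *
 * The pool guid must change while the load guid stays the same; both are
 * verified at the end of this function.  The test is a no-op when running
 * with MMP enabled (ztest_opts.zo_mmp_test).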
*/ void ztest_reguid(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; uint64_t orig, load; int error; ztest_shared_t *zs = ztest_shared; if (ztest_opts.zo_mmp_test) return; orig = spa_guid(spa); load = spa_load_guid(spa); (void) pthread_rwlock_wrlock(&ztest_name_lock); error = spa_change_guid(spa, NULL); zs->zs_guid = spa_guid(spa); (void) pthread_rwlock_unlock(&ztest_name_lock); if (error != 0) return; if (ztest_opts.zo_verbose >= 4) { (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", orig, spa_guid(spa)); } VERIFY3U(orig, !=, spa_guid(spa)); VERIFY3U(load, ==, spa_load_guid(spa)); } void ztest_blake3(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; hrtime_t end = gethrtime() + NANOSEC; zio_cksum_salt_t salt; void *salt_ptr = &salt.zcs_bytes; struct abd *abd_data, *abd_meta; void *buf, *templ; int i, *ptr; uint32_t size; BLAKE3_CTX ctx; const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); abd_data = abd_alloc(size, B_FALSE); abd_meta = abd_alloc(size, B_TRUE); for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) *ptr = ztest_random(UINT_MAX); memset(salt_ptr, 'A', 32); abd_copy_from_buf_off(abd_data, buf, 0, size); abd_copy_from_buf_off(abd_meta, buf, 0, size); while (gethrtime() <= end) { int run_count = 100; zio_cksum_t zc_ref1, zc_ref2; zio_cksum_t zc_res1, zc_res2; void *ref1 = &zc_ref1; void *ref2 = &zc_ref2; void *res1 = &zc_res1; void *res2 = &zc_res2; /* BLAKE3_KEY_LEN = 32 */ VERIFY0(blake3->setname("generic")); templ = abd_checksum_blake3_tmpl_init(&salt); Blake3_InitKeyed(&ctx, salt_ptr); Blake3_Update(&ctx, buf, size); Blake3_Final(&ctx, ref1); zc_ref2 = zc_ref1; ZIO_CHECKSUM_BSWAP(&zc_ref2); abd_checksum_blake3_tmpl_free(templ); VERIFY0(blake3->setname("cycle")); while (run_count-- > 0) { /* Test current implementation */ Blake3_InitKeyed(&ctx, salt_ptr); Blake3_Update(&ctx, buf, size); Blake3_Final(&ctx, res1); zc_res2 = zc_res1; ZIO_CHECKSUM_BSWAP(&zc_res2); VERIFY0(memcmp(ref1, res1, 32)); VERIFY0(memcmp(ref2, res2, 32)); /* Test ABD - data */ templ = abd_checksum_blake3_tmpl_init(&salt); abd_checksum_blake3_native(abd_data, size, templ, &zc_res1); abd_checksum_blake3_byteswap(abd_data, size, templ, &zc_res2); VERIFY0(memcmp(ref1, res1, 32)); VERIFY0(memcmp(ref2, res2, 32)); /* Test ABD - metadata */ abd_checksum_blake3_native(abd_meta, size, templ, &zc_res1); abd_checksum_blake3_byteswap(abd_meta, size, templ, &zc_res2); abd_checksum_blake3_tmpl_free(templ); VERIFY0(memcmp(ref1, res1, 32)); VERIFY0(memcmp(ref2, res2, 32)); } } abd_free(abd_data); abd_free(abd_meta); umem_free(buf, size); } void ztest_fletcher(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; hrtime_t end = gethrtime() + NANOSEC; while (gethrtime() <= end) { int run_count = 100; void *buf; struct abd *abd_data, *abd_meta; uint32_t size; int *ptr; int i; zio_cksum_t zc_ref; zio_cksum_t zc_ref_byteswap; size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); abd_data = abd_alloc(size, B_FALSE); abd_meta = abd_alloc(size, B_TRUE); for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) *ptr = ztest_random(UINT_MAX); abd_copy_from_buf_off(abd_data, buf, 0, size); abd_copy_from_buf_off(abd_meta, buf, 0, size); VERIFY0(fletcher_4_impl_set("scalar")); fletcher_4_native(buf, size, NULL, &zc_ref); fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); VERIFY0(fletcher_4_impl_set("cycle")); while (run_count-- > 0) { zio_cksum_t zc; zio_cksum_t zc_byteswap; 
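			/*
			 * Each pass recomputes both checksums with the "cycle"
			 * implementation selector, which rotates through the
			 * compiled-in fletcher_4 implementations, and compares
			 * the results against the scalar reference values
			 * computed above.
			 */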
fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); fletcher_4_native(buf, size, NULL, &zc); VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); /* Test ABD - data */ abd_fletcher_4_byteswap(abd_data, size, NULL, &zc_byteswap); abd_fletcher_4_native(abd_data, size, NULL, &zc); VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); /* Test ABD - metadata */ abd_fletcher_4_byteswap(abd_meta, size, NULL, &zc_byteswap); abd_fletcher_4_native(abd_meta, size, NULL, &zc); VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); } umem_free(buf, size); abd_free(abd_data); abd_free(abd_meta); } } void ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; void *buf; size_t size; int *ptr; int i; zio_cksum_t zc_ref; zio_cksum_t zc_ref_bswap; hrtime_t end = gethrtime() + NANOSEC; while (gethrtime() <= end) { int run_count = 100; size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) *ptr = ztest_random(UINT_MAX); VERIFY0(fletcher_4_impl_set("scalar")); fletcher_4_native(buf, size, NULL, &zc_ref); fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); VERIFY0(fletcher_4_impl_set("cycle")); while (run_count-- > 0) { zio_cksum_t zc; zio_cksum_t zc_bswap; size_t pos = 0; ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); while (pos < size) { size_t inc = 64 * ztest_random(size / 67); /* sometimes add few bytes to test non-simd */ if (ztest_random(100) < 10) inc += P2ALIGN_TYPED(ztest_random(64), sizeof (uint32_t), uint64_t); if (inc > (size - pos)) inc = size - pos; fletcher_4_incremental_native(buf + pos, inc, &zc); fletcher_4_incremental_byteswap(buf + pos, inc, &zc_bswap); pos += inc; } VERIFY3U(pos, ==, size); VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); /* * verify if incremental on the whole buffer is * equivalent to non-incremental version */ ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); fletcher_4_incremental_native(buf, size, &zc); fletcher_4_incremental_byteswap(buf, size, &zc_bswap); VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); } umem_free(buf, size); } } void ztest_pool_prefetch_ddt(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa; (void) pthread_rwlock_rdlock(&ztest_name_lock); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); ddt_prefetch_all(spa); spa_close(spa, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); } static int ztest_set_global_vars(void) { for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { char *kv = ztest_opts.zo_gvars[i]; VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); VERIFY3U(strlen(kv), >, 0); int err = set_global_var(kv); if (ztest_opts.zo_verbose > 0) { (void) printf("setting global var %s ... %s\n", kv, err ? 
"failed" : "ok"); } if (err != 0) { (void) fprintf(stderr, "failed to set global var '%s'\n", kv); return (err); } } return (0); } static char ** ztest_global_vars_to_zdb_args(void) { char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); char **cur = args; if (args == NULL) return (NULL); for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { *cur++ = (char *)"-o"; *cur++ = ztest_opts.zo_gvars[i]; } ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); *cur = NULL; return (args); } /* The end of strings is indicated by a NULL element */ static char * join_strings(char **strings, const char *sep) { size_t totallen = 0; for (char **sp = strings; *sp != NULL; sp++) { totallen += strlen(*sp); totallen += strlen(sep); } if (totallen > 0) { ASSERT(totallen >= strlen(sep)); totallen -= strlen(sep); } size_t buflen = totallen + 1; char *o = umem_alloc(buflen, UMEM_NOFAIL); /* trailing 0 byte */ o[0] = '\0'; for (char **sp = strings; *sp != NULL; sp++) { size_t would; would = strlcat(o, *sp, buflen); VERIFY3U(would, <, buflen); if (*(sp+1) == NULL) { break; } would = strlcat(o, sep, buflen); VERIFY3U(would, <, buflen); } ASSERT3S(strlen(o), ==, totallen); return (o); } static int ztest_check_path(char *path) { struct stat s; /* return true on success */ return (!stat(path, &s)); } static void ztest_get_zdb_bin(char *bin, int len) { char *zdb_path; /* * Try to use $ZDB and in-tree zdb path. If not successful, just * let popen to search through PATH. */ if ((zdb_path = getenv("ZDB"))) { strlcpy(bin, zdb_path, len); /* In env */ if (!ztest_check_path(bin)) { ztest_dump_core = 0; fatal(B_TRUE, "invalid ZDB '%s'", bin); } return; } VERIFY3P(realpath(getexecname(), bin), !=, NULL); if (strstr(bin, ".libs/ztest")) { strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ strcat(bin, "zdb"); if (ztest_check_path(bin)) return; } strcpy(bin, "zdb"); } static vdev_t * ztest_random_concrete_vdev_leaf(vdev_t *vd) { if (vd == NULL) return (NULL); if (vd->vdev_children == 0) return (vd); vdev_t *eligible[vd->vdev_children]; int eligible_idx = 0, i; for (i = 0; i < vd->vdev_children; i++) { vdev_t *cvd = vd->vdev_child[i]; if (cvd->vdev_top->vdev_removing) continue; if (cvd->vdev_children > 0 || (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { eligible[eligible_idx++] = cvd; } } VERIFY3S(eligible_idx, >, 0); uint64_t child_no = ztest_random(eligible_idx); return (ztest_random_concrete_vdev_leaf(eligible[child_no])); } void ztest_initialize(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; int error = 0; mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* Random leaf vdev */ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); if (rand_vd == NULL) { spa_config_exit(spa, SCL_VDEV, FTAG); mutex_exit(&ztest_vdev_lock); return; } /* * The random vdev we've selected may change as soon as we * drop the spa_config_lock. We create local copies of things * we're interested in. 
*/ uint64_t guid = rand_vd->vdev_guid; char *path = strdup(rand_vd->vdev_path); boolean_t active = rand_vd->vdev_initialize_thread != NULL; zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); spa_config_exit(spa, SCL_VDEV, FTAG); uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *vdev_errlist = fnvlist_alloc(); fnvlist_add_uint64(vdev_guids, path, guid); error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); fnvlist_free(vdev_guids); fnvlist_free(vdev_errlist); switch (cmd) { case POOL_INITIALIZE_CANCEL: if (ztest_opts.zo_verbose >= 4) { (void) printf("Cancel initialize %s", path); if (!active) (void) printf(" failed (no initialize active)"); (void) printf("\n"); } break; case POOL_INITIALIZE_START: if (ztest_opts.zo_verbose >= 4) { (void) printf("Start initialize %s", path); if (active && error == 0) (void) printf(" failed (already active)"); else if (error != 0) (void) printf(" failed (error %d)", error); (void) printf("\n"); } break; case POOL_INITIALIZE_SUSPEND: if (ztest_opts.zo_verbose >= 4) { (void) printf("Suspend initialize %s", path); if (!active) (void) printf(" failed (no initialize active)"); (void) printf("\n"); } break; } free(path); mutex_exit(&ztest_vdev_lock); } void ztest_trim(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; spa_t *spa = ztest_spa; int error = 0; mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* Random leaf vdev */ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); if (rand_vd == NULL) { spa_config_exit(spa, SCL_VDEV, FTAG); mutex_exit(&ztest_vdev_lock); return; } /* * The random vdev we've selected may change as soon as we * drop the spa_config_lock. We create local copies of things * we're interested in. */ uint64_t guid = rand_vd->vdev_guid; char *path = strdup(rand_vd->vdev_path); boolean_t active = rand_vd->vdev_trim_thread != NULL; zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); spa_config_exit(spa, SCL_VDEV, FTAG); uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); uint64_t rate = 1 << ztest_random(30); boolean_t partial = (ztest_random(5) > 0); boolean_t secure = (ztest_random(5) > 0); nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *vdev_errlist = fnvlist_alloc(); fnvlist_add_uint64(vdev_guids, path, guid); error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, secure, vdev_errlist); fnvlist_free(vdev_guids); fnvlist_free(vdev_errlist); switch (cmd) { case POOL_TRIM_CANCEL: if (ztest_opts.zo_verbose >= 4) { (void) printf("Cancel TRIM %s", path); if (!active) (void) printf(" failed (no TRIM active)"); (void) printf("\n"); } break; case POOL_TRIM_START: if (ztest_opts.zo_verbose >= 4) { (void) printf("Start TRIM %s", path); if (active && error == 0) (void) printf(" failed (already active)"); else if (error != 0) (void) printf(" failed (error %d)", error); (void) printf("\n"); } break; case POOL_TRIM_SUSPEND: if (ztest_opts.zo_verbose >= 4) { (void) printf("Suspend TRIM %s", path); if (!active) (void) printf(" failed (no TRIM active)"); (void) printf("\n"); } break; } free(path); mutex_exit(&ztest_vdev_lock); } +void +ztest_ddt_prune(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + + spa_t *spa = ztest_spa; + uint64_t pct = ztest_random(15) + 1; + + (void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct); +} + /* * Verify pool integrity by running zdb. 
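 *
 * zdb is run as a separate process via popen(), roughly as:
 *
 *	zdb -bcc[s][v] -G -d -Y -e -y [-o var=val ...] -p <dir> <pool guid>
 *
 * A non-zero exit status, or death by signal, is treated as fatal since it
 * indicates an on-disk inconsistency that ztest itself did not detect.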
*/ static void ztest_run_zdb(uint64_t guid) { int status; char *bin; char *zdb; char *zbuf; const int len = MAXPATHLEN + MAXNAMELEN + 20; FILE *fp; bin = umem_alloc(len, UMEM_NOFAIL); zdb = umem_alloc(len, UMEM_NOFAIL); zbuf = umem_alloc(1024, UMEM_NOFAIL); ztest_get_zdb_bin(bin, len); char **set_gvars_args = ztest_global_vars_to_zdb_args(); if (set_gvars_args == NULL) { fatal(B_FALSE, "Failed to allocate memory in " "ztest_global_vars_to_zdb_args(). Cannot run zdb.\n"); } char *set_gvars_args_joined = join_strings(set_gvars_args, " "); free(set_gvars_args); size_t would = snprintf(zdb, len, "%s -bcc%s%s -G -d -Y -e -y %s -p %s %"PRIu64, bin, ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 4 ? "v" : "", set_gvars_args_joined, ztest_opts.zo_dir, guid); ASSERT3U(would, <, len); umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); if (ztest_opts.zo_verbose >= 5) (void) printf("Executing %s\n", zdb); fp = popen(zdb, "r"); while (fgets(zbuf, 1024, fp) != NULL) if (ztest_opts.zo_verbose >= 3) (void) printf("%s", zbuf); status = pclose(fp); if (status == 0) goto out; ztest_dump_core = 0; if (WIFEXITED(status)) fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); else fatal(B_FALSE, "'%s' died with signal %d", zdb, WTERMSIG(status)); out: umem_free(bin, len); umem_free(zdb, len); umem_free(zbuf, 1024); } static void ztest_walk_pool_directory(const char *header) { spa_t *spa = NULL; if (ztest_opts.zo_verbose >= 6) (void) puts(header); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) if (ztest_opts.zo_verbose >= 6) (void) printf("\t%s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); } static void ztest_spa_import_export(char *oldname, char *newname) { nvlist_t *config, *newconfig; uint64_t pool_guid; spa_t *spa; int error; if (ztest_opts.zo_verbose >= 4) { (void) printf("import/export: old = %s, new = %s\n", oldname, newname); } /* * Clean up from previous runs. */ (void) spa_destroy(newname); /* * Get the pool's configuration and guid. */ VERIFY0(spa_open(oldname, &spa, FTAG)); /* * Kick off a scrub to tickle scrub/export races. */ if (ztest_random(2) == 0) (void) spa_scan(spa, POOL_SCAN_SCRUB); pool_guid = spa_guid(spa); spa_close(spa, FTAG); ztest_walk_pool_directory("pools before export"); /* * Export it. */ VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); ztest_walk_pool_directory("pools after export"); /* * Try to import it. */ newconfig = spa_tryimport(config); ASSERT3P(newconfig, !=, NULL); fnvlist_free(newconfig); /* * Import it under the new name. */ error = spa_import(newname, config, NULL, 0); if (error != 0) { dump_nvlist(config, 0); fatal(B_FALSE, "couldn't import pool %s as %s: error %u", oldname, newname, error); } ztest_walk_pool_directory("pools after import"); /* * Try to import it again -- should fail with EEXIST. */ VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); /* * Try to import it under a different name -- should fail with EEXIST. */ VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); /* * Verify that the pool is no longer visible under the old name. */ VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); /* * Verify that we can open and close the pool using the new name. 
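ztest_run_zdb() above builds the zdb command line, runs it with popen(), echoes its output, and decodes the pclose() status via WIFEXITED()/WEXITSTATUS()/WTERMSIG(). A reduced sketch of that run-and-check pattern is below; the command string and the helper name are illustrative, not part of the patch:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

/* Run `cmd` through the shell, echo its output, and report how it ended. */
static int
run_and_report(const char *cmd)
{
	char line[1024];
	FILE *fp = popen(cmd, "r");

	if (fp == NULL)
		return (-1);

	while (fgets(line, sizeof (line), fp) != NULL)
		(void) fputs(line, stdout);

	int status = pclose(fp);
	if (status == -1)
		return (-1);

	if (WIFEXITED(status))
		(void) fprintf(stderr, "'%s' exit code %d\n",
		    cmd, WEXITSTATUS(status));
	else if (WIFSIGNALED(status))
		(void) fprintf(stderr, "'%s' died with signal %d\n",
		    cmd, WTERMSIG(status));

	return (status);
}

int
main(void)
{
	/* Illustrative only; ztest builds a full "zdb -bcc ... <guid>" line. */
	return (run_and_report("zdb -h 2>&1") == 0 ? 0 : 1);
}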
*/ VERIFY0(spa_open(newname, &spa, FTAG)); ASSERT3U(pool_guid, ==, spa_guid(spa)); spa_close(spa, FTAG); fnvlist_free(config); } static void ztest_resume(spa_t *spa) { if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) (void) printf("resuming from suspended state\n"); spa_vdev_state_enter(spa, SCL_NONE); vdev_clear(spa, NULL); (void) spa_vdev_state_exit(spa, NULL, 0); (void) zio_resume(spa); } static __attribute__((noreturn)) void ztest_resume_thread(void *arg) { spa_t *spa = arg; + /* + * Synthesize aged DDT entries for ddt prune testing + */ + ddt_prune_artificial_age = B_TRUE; + if (ztest_opts.zo_verbose >= 3) + ddt_dump_prune_histogram = B_TRUE; + while (!ztest_exiting) { if (spa_suspended(spa)) ztest_resume(spa); (void) poll(NULL, 0, 100); /* * Periodically change the zfs_compressed_arc_enabled setting. */ if (ztest_random(10) == 0) zfs_compressed_arc_enabled = ztest_random(2); /* * Periodically change the zfs_abd_scatter_enabled setting. */ if (ztest_random(10) == 0) zfs_abd_scatter_enabled = ztest_random(2); } thread_exit(); } static __attribute__((noreturn)) void ztest_deadman_thread(void *arg) { ztest_shared_t *zs = arg; spa_t *spa = ztest_spa; hrtime_t delay, overdue, last_run = gethrtime(); delay = (zs->zs_thread_stop - zs->zs_thread_start) + MSEC2NSEC(zfs_deadman_synctime_ms); while (!ztest_exiting) { /* * Wait for the delay timer while checking occasionally * if we should stop. */ if (gethrtime() < last_run + delay) { (void) poll(NULL, 0, 1000); continue; } /* * If the pool is suspended then fail immediately. Otherwise, * check to see if the pool is making any progress. If * vdev_deadman() discovers that there hasn't been any recent * I/Os then it will end up aborting the tests. */ if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { fatal(B_FALSE, "aborting test after %llu seconds because " "pool has transitioned to a suspended state.", (u_longlong_t)zfs_deadman_synctime_ms / 1000); } vdev_deadman(spa->spa_root_vdev, FTAG); /* * If the process doesn't complete within a grace period of * zfs_deadman_synctime_ms over the expected finish time, * then it may be hung and is terminated. */ overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); if (gethrtime() > overdue) { fatal(B_FALSE, "aborting test after %llu seconds because " "the process is overdue for termination.", (gethrtime() - zs->zs_proc_start) / NANOSEC); } (void) printf("ztest has been running for %lld seconds\n", (gethrtime() - zs->zs_proc_start) / NANOSEC); last_run = gethrtime(); delay = MSEC2NSEC(zfs_deadman_checktime_ms); } thread_exit(); } static void ztest_execute(int test, ztest_info_t *zi, uint64_t id) { ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); hrtime_t functime = gethrtime(); int i; for (i = 0; i < zi->zi_iters; i++) zi->zi_func(zd, id); functime = gethrtime() - functime; atomic_add_64(&zc->zc_count, 1); atomic_add_64(&zc->zc_time, functime); if (ztest_opts.zo_verbose >= 4) (void) printf("%6.2f sec in %s\n", (double)functime / NANOSEC, zi->zi_funcname); } typedef struct ztest_raidz_expand_io { uint64_t rzx_id; uint64_t rzx_amount; uint64_t rzx_bufsize; const void *rzx_buffer; uint64_t rzx_alloc_max; spa_t *rzx_spa; } ztest_expand_io_t; #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 10 /* * Write a request amount of data to some dataset objects. * There will be ztest_opts.zo_threads count of these running in parallel. 
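The deadman thread above is essentially a watchdog: sleep in short intervals, check whether the workload is still making progress, and abort once a grace period past the expected stop time has elapsed. A hedged, portable sketch of that shape, substituting clock_gettime() and a caller-supplied completion flag for gethrtime() and vdev_deadman(), could be:

#include <stdbool.h>
#include <stdio.h>
#include <stdatomic.h>
#include <time.h>

#define	NSEC_PER_SEC	1000000000LL

static long long
now_ns(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((long long)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec);
}

/*
 * Poll every `check_ms` milliseconds; give up once `deadline_ns` has passed
 * without `done` being set.  Returns 0 if the work finished in time.
 */
static int
watchdog(atomic_bool *done, long long deadline_ns, long check_ms)
{
	struct timespec nap = { check_ms / 1000,
	    (check_ms % 1000) * 1000000L };

	while (!atomic_load(done)) {
		if (now_ns() > deadline_ns) {
			(void) fprintf(stderr, "watchdog: overdue, aborting\n");
			return (-1);
		}
		(void) nanosleep(&nap, NULL);
	}
	return (0);
}

int
main(void)
{
	atomic_bool done = false;

	/* Deadline already in the past, so this demonstrates the abort path. */
	return (watchdog(&done, now_ns() - 1, 100) == 0 ? 0 : 1);
}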
*/ static __attribute__((noreturn)) void ztest_rzx_thread(void *arg) { ztest_expand_io_t *info = (ztest_expand_io_t *)arg; ztest_od_t *od; int batchsize; int od_size; ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; spa_t *spa = info->rzx_spa; od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(od_size, UMEM_NOFAIL); batchsize = OD_ARRAY_SIZE; /* Create objects to write to */ for (int b = 0; b < batchsize; b++) { ztest_od_init(od + b, info->rzx_id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0, 0); } if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { umem_free(od, od_size); thread_exit(); } for (uint64_t offset = 0, written = 0; written < info->rzx_amount; offset += info->rzx_bufsize) { /* write to 10 objects */ for (int i = 0; i < batchsize && written < info->rzx_amount; i++) { (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); ztest_write(zd, od[i].od_object, offset, info->rzx_bufsize, info->rzx_buffer); (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); written += info->rzx_bufsize; } txg_wait_synced(spa_get_dsl(spa), 0); /* due to inflation, we'll typically bail here */ if (metaslab_class_get_alloc(spa_normal_class(spa)) > info->rzx_alloc_max) { break; } } /* Remove a few objects to leave some holes in allocation space */ mutex_enter(&zd->zd_dirobj_lock); (void) ztest_remove(zd, od, 2); mutex_exit(&zd->zd_dirobj_lock); umem_free(od, od_size); thread_exit(); } static __attribute__((noreturn)) void ztest_thread(void *arg) { int rand; uint64_t id = (uintptr_t)arg; ztest_shared_t *zs = ztest_shared; uint64_t call_next; hrtime_t now; ztest_info_t *zi; ztest_shared_callstate_t *zc; while ((now = gethrtime()) < zs->zs_thread_stop) { /* * See if it's time to force a crash. */ if (now > zs->zs_thread_kill && raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { ztest_kill(zs); } /* * If we're getting ENOSPC with some regularity, stop. */ if (zs->zs_enospc_count > 10) break; /* * Pick a random function to execute. */ rand = ztest_random(ZTEST_FUNCS); zi = &ztest_info[rand]; zc = ZTEST_GET_SHARED_CALLSTATE(rand); call_next = zc->zc_next; if (now >= call_next && atomic_cas_64(&zc->zc_next, call_next, call_next + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { ztest_execute(rand, zi, id); } } thread_exit(); } static void ztest_dataset_name(char *dsname, const char *pool, int d) { (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); } static void ztest_dataset_destroy(int d) { char name[ZFS_MAX_DATASET_NAME_LEN]; int t; ztest_dataset_name(name, ztest_opts.zo_pool, d); if (ztest_opts.zo_verbose >= 3) (void) printf("Destroying %s to free up space\n", name); /* * Cleanup any non-standard clones and snapshots. In general, * ztest thread t operates on dataset (t % zopt_datasets), * so there may be more than one thing to clean up. */ for (t = d; t < ztest_opts.zo_threads; t += ztest_opts.zo_datasets) ztest_dsl_dataset_cleanup(name, t); (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); } static void ztest_dataset_dirobj_verify(ztest_ds_t *zd) { uint64_t usedobjs, dirobjs, scratch; /* * ZTEST_DIROBJ is the object directory for the entire dataset. * Therefore, the number of objects in use should equal the * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. * If not, we have an object leak. * * Note that we can only check this in ztest_dataset_open(), * when the open-context and syncing-context values agree. 
* That's because zap_count() returns the open-context value, * while dmu_objset_space() returns the rootbp fill count. */ VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); ASSERT3U(dirobjs + 1, ==, usedobjs); } static int ztest_dataset_open(int d) { ztest_ds_t *zd = &ztest_ds[d]; uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; objset_t *os; zilog_t *zilog; char name[ZFS_MAX_DATASET_NAME_LEN]; int error; ztest_dataset_name(name, ztest_opts.zo_pool, d); (void) pthread_rwlock_rdlock(&ztest_name_lock); error = ztest_dataset_create(name); if (error == ENOSPC) { (void) pthread_rwlock_unlock(&ztest_name_lock); ztest_record_enospc(FTAG); return (error); } ASSERT(error == 0 || error == EEXIST); VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, zd, &os)); (void) pthread_rwlock_unlock(&ztest_name_lock); ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); zilog = zd->zd_zilog; if (zilog->zl_header->zh_claim_lr_seq != 0 && zilog->zl_header->zh_claim_lr_seq < committed_seq) fatal(B_FALSE, "missing log records: " "claimed %"PRIu64" < committed %"PRIu64"", zilog->zl_header->zh_claim_lr_seq, committed_seq); ztest_dataset_dirobj_verify(zd); zil_replay(os, zd, ztest_replay_vector); ztest_dataset_dirobj_verify(zd); if (ztest_opts.zo_verbose >= 6) (void) printf("%s replay %"PRIu64" blocks, " "%"PRIu64" records, seq %"PRIu64"\n", zd->zd_name, zilog->zl_parse_blk_count, zilog->zl_parse_lr_count, zilog->zl_replaying_seq); zilog = zil_open(os, ztest_get_data, NULL); if (zilog->zl_replaying_seq != 0 && zilog->zl_replaying_seq < committed_seq) fatal(B_FALSE, "missing log records: " "replayed %"PRIu64" < committed %"PRIu64"", zilog->zl_replaying_seq, committed_seq); return (0); } static void ztest_dataset_close(int d) { ztest_ds_t *zd = &ztest_ds[d]; zil_close(zd->zd_zilog); dmu_objset_disown(zd->zd_os, B_TRUE, zd); ztest_zd_fini(zd); } static int ztest_replay_zil_cb(const char *name, void *arg) { (void) arg; objset_t *os; ztest_ds_t *zdtmp; VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); ztest_zd_init(zdtmp, NULL, os); zil_replay(os, zdtmp, ztest_replay_vector); ztest_zd_fini(zdtmp); if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && ztest_opts.zo_verbose >= 6) { zilog_t *zilog = dmu_objset_zil(os); (void) printf("%s replay %"PRIu64" blocks, " "%"PRIu64" records, seq %"PRIu64"\n", name, zilog->zl_parse_blk_count, zilog->zl_parse_lr_count, zilog->zl_replaying_seq); } umem_free(zdtmp, sizeof (ztest_ds_t)); dmu_objset_disown(os, B_TRUE, FTAG); return (0); } static void ztest_freeze(void) { ztest_ds_t *zd = &ztest_ds[0]; spa_t *spa; int numloops = 0; /* freeze not supported during RAIDZ expansion */ if (ztest_opts.zo_raid_do_expand) return; if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY0(ztest_dataset_open(0)); ztest_spa = spa; /* * Force the first log block to be transactionally allocated. * We have to do this before we freeze the pool -- otherwise * the log chain won't be anchored. */ while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { ztest_dmu_object_alloc_free(zd, 0); zil_commit(zd->zd_zilog, 0); } txg_wait_synced(spa_get_dsl(spa), 0); /* * Freeze the pool. This stops spa_sync() from doing anything, * so that the only way to record changes from now on is the ZIL. 
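In ztest_thread() earlier, a worker only executes the randomly chosen test if it wins an atomic_cas_64() race to advance the shared zc_next slot, so concurrent threads that read the same value cannot all claim the same interval. The following standalone illustration of that claim-by-compare-and-swap idiom uses C11 atomics rather than the ZFS atomic_cas_64() API; the names are hypothetical:

#include <stdint.h>
#include <stdio.h>
#include <stdatomic.h>

/* Shared "next time this test may run" slot, one per test function. */
static _Atomic uint64_t zc_next_demo;

/*
 * Returns 1 if this caller won the right to run the test at `now`,
 * advancing the slot by `interval`; 0 if it is not yet time or another
 * thread advanced the slot first.
 */
static int
claim_call(uint64_t now, uint64_t interval)
{
	uint64_t next = atomic_load(&zc_next_demo);

	if (now < next)
		return (0);

	/* Only one thread that observed `next` can perform this swap. */
	return (atomic_compare_exchange_strong(&zc_next_demo,
	    &next, next + interval));
}

int
main(void)
{
	/*
	 * Simulate two racers that both observed the slot before either
	 * update: with the same expected value, only one CAS succeeds.
	 */
	uint64_t seen_a = 0, seen_b = 0;
	int a = atomic_compare_exchange_strong(&zc_next_demo, &seen_a, 50);
	int b = atomic_compare_exchange_strong(&zc_next_demo, &seen_b, 50);

	(void) printf("racer a won: %d, racer b won: %d\n", a, b);
	(void) printf("later claim at t=200: %d\n", claim_call(200, 50));
	return (0);
}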
*/ spa_freeze(spa); /* * Because it is hard to predict how much space a write will actually * require beforehand, we leave ourselves some fudge space to write over * capacity. */ uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; /* * Run tests that generate log records but don't alter the pool config * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). * We do a txg_wait_synced() after each iteration to force the txg * to increase well beyond the last synced value in the uberblock. * The ZIL should be OK with that. * * Run a random number of times less than zo_maxloops and ensure we do * not run out of space on the pool. */ while (ztest_random(10) != 0 && numloops++ < ztest_opts.zo_maxloops && metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { ztest_od_t od; ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); ztest_io(zd, od.od_object, ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); txg_wait_synced(spa_get_dsl(spa), 0); } /* * Commit all of the changes we just generated. */ zil_commit(zd->zd_zilog, 0); txg_wait_synced(spa_get_dsl(spa), 0); /* * Close our dataset and close the pool. */ ztest_dataset_close(0); spa_close(spa, FTAG); kernel_fini(); /* * Open and close the pool and dataset to induce log replay. */ raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); VERIFY0(ztest_dataset_open(0)); ztest_spa = spa; txg_wait_synced(spa_get_dsl(spa), 0); ztest_dataset_close(0); ztest_reguid(NULL, 0); spa_close(spa, FTAG); kernel_fini(); } static void ztest_import_impl(void) { importargs_t args = { 0 }; nvlist_t *cfg = NULL; int nsearch = 1; char *searchdirs[nsearch]; int flags = ZFS_IMPORT_MISSING_LOG; searchdirs[0] = ztest_opts.zo_dir; args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_FALSE; libpc_handle_t lpch = { .lpc_lib_handle = NULL, .lpc_ops = &libzpool_config_ops, .lpc_printerr = B_TRUE }; VERIFY0(zpool_find_config(&lpch, ztest_opts.zo_pool, &cfg, &args)); VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); fnvlist_free(cfg); } /* * Import a storage pool with the given name. */ static void ztest_import(ztest_shared_t *zs) { spa_t *spa; mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); ztest_import_impl(); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; zs->zs_guid = spa_guid(spa); spa_close(spa, FTAG); kernel_fini(); if (!ztest_opts.zo_mmp_test) { ztest_run_zdb(zs->zs_guid); ztest_freeze(); ztest_run_zdb(zs->zs_guid); } (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } /* * After the expansion was killed, check that the pool is healthy */ static void ztest_raidz_expand_check(spa_t *spa) { ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED); /* * Set pool check done flag, main program will run a zdb check * of the pool when we exit. 
*/ ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; /* Wait for reflow to finish */ if (ztest_opts.zo_verbose >= 1) { (void) printf("\nwaiting for reflow to finish ...\n"); } pool_raidz_expand_stat_t rzx_stats; pool_raidz_expand_stat_t *pres = &rzx_stats; do { txg_wait_synced(spa_get_dsl(spa), 0); (void) poll(NULL, 0, 500); /* wait 1/2 second */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) spa_raidz_expand_get_stats(spa, pres); spa_config_exit(spa, SCL_CONFIG, FTAG); } while (pres->pres_state != DSS_FINISHED && pres->pres_reflowed < pres->pres_to_reflow); if (ztest_opts.zo_verbose >= 1) { (void) printf("verifying an interrupted raidz " "expansion using a pool scrub ...\n"); } /* Will fail here if there is non-recoverable corruption detected */ VERIFY0(ztest_scrub_impl(spa)); if (ztest_opts.zo_verbose >= 1) { (void) printf("raidz expansion scrub check complete\n"); } } /* * Start a raidz expansion test. We run some I/O on the pool for a while * to get some data in the pool. Then we grow the raidz and * kill the test at the requested offset into the reflow, verifying that * doing such does not lead to pool corruption. */ static void ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) { nvlist_t *root; pool_raidz_expand_stat_t rzx_stats; pool_raidz_expand_stat_t *pres = &rzx_stats; kthread_t **run_threads; vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0]; int total_disks = rzvd->vdev_children; int data_disks = total_disks - vdev_get_nparity(rzvd); uint64_t alloc_goal; uint64_t csize; int error, t; int threads = ztest_opts.zo_threads; ztest_expand_io_t *thread_args; ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE); ASSERT3P(rzvd->vdev_ops, ==, &vdev_raidz_ops); ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED; /* Setup a 1 MiB buffer of random data */ uint64_t bufsize = 1024 * 1024; void *buffer = umem_alloc(bufsize, UMEM_NOFAIL); if (read(ztest_fd_rand, buffer, bufsize) != bufsize) { fatal(B_TRUE, "short read from /dev/urandom"); } /* * Put some data in the pool and then attach a vdev to initiate * reflow. */ run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL); thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t), UMEM_NOFAIL); /* Aim for roughly 25% of allocatable space up to 1GB */ alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks; alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024); if (ztest_opts.zo_verbose >= 1) { (void) printf("adding data to pool '%s', goal %llu bytes\n", ztest_opts.zo_pool, (u_longlong_t)alloc_goal); } /* * Kick off all the I/O generators that run in parallel. */ for (t = 0; t < threads; t++) { if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { umem_free(run_threads, threads * sizeof (kthread_t *)); umem_free(buffer, bufsize); return; } thread_args[t].rzx_id = t; thread_args[t].rzx_amount = alloc_goal / threads; thread_args[t].rzx_bufsize = bufsize; thread_args[t].rzx_buffer = buffer; thread_args[t].rzx_alloc_max = alloc_goal; thread_args[t].rzx_spa = spa; run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); } /* * Wait for all of the writers to complete. */ for (t = 0; t < threads; t++) VERIFY0(thread_join(run_threads[t])); /* * Close all datasets. This must be done after all the threads * are joined so we can be sure none of the datasets are in-use * by any of the threads. 
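The reflow-wait loop in ztest_raidz_expand_check() above samples spa_raidz_expand_get_stats() under the config lock, sleeps briefly, and repeats until the expansion reaches its target; ztest_raidz_expand_run() uses the same shape to decide when to kill the test. A generic poll-until-threshold sketch with a caller-supplied sampler (all names illustrative) is:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Sampler returns current progress in bytes; how it does so is the caller's business. */
typedef uint64_t (*progress_fn_t)(void *arg);

/* Poll `fn` every `interval_ms` until it reports at least `target` bytes. */
static void
wait_for_progress(progress_fn_t fn, void *arg, uint64_t target,
    long interval_ms)
{
	struct timespec nap = { interval_ms / 1000,
	    (interval_ms % 1000) * 1000000L };

	while (fn(arg) < target)
		(void) nanosleep(&nap, NULL);
}

/* Demo sampler: pretend 64 MiB of reflow completes per call. */
static uint64_t
fake_reflowed(void *arg)
{
	uint64_t *state = arg;

	*state += 64ULL << 20;
	return (*state);
}

int
main(void)
{
	uint64_t reflowed = 0;

	wait_for_progress(fake_reflowed, &reflowed, 256ULL << 20, 10);
	(void) printf("reached %llu bytes\n", (unsigned long long)reflowed);
	return (0);
}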
*/ for (t = 0; t < ztest_opts.zo_threads; t++) { if (t < ztest_opts.zo_datasets) ztest_dataset_close(t); } txg_wait_synced(spa_get_dsl(spa), 0); zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); umem_free(buffer, bufsize); umem_free(run_threads, threads * sizeof (kthread_t *)); umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); /* Set our reflow target to 25%, 50% or 75% of allocated size */ uint_t multiple = ztest_random(3) + 1; uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; raidz_expand_max_reflow_bytes = reflow_max; if (ztest_opts.zo_verbose >= 1) { (void) printf("running raidz expansion test, killing when " "reflow reaches %llu bytes (%u/4 of allocated space)\n", (u_longlong_t)reflow_max, multiple); } /* XXX - do we want some I/O load during the reflow? */ /* * Use a disk size that is larger than existing ones */ cvd = rzvd->vdev_child[0]; csize = vdev_get_min_asize(cvd); csize += csize / 10; /* * Path to vdev to be attached */ char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); /* * Build the nvlist describing newpath. */ root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), NULL, 0, 0, 1); /* * Expand the raidz vdev by attaching the new disk */ if (ztest_opts.zo_verbose >= 1) { (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, newpath); } error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); nvlist_free(root); if (error != 0) { fatal(0, "raidz expand: attach (%s %llu) returned %d", newpath, (long long)csize, error); } /* * Wait for reflow to begin */ while (spa->spa_raidz_expand == NULL) { txg_wait_synced(spa_get_dsl(spa), 0); (void) poll(NULL, 0, 100); /* wait 1/10 second */ } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) spa_raidz_expand_get_stats(spa, pres); spa_config_exit(spa, SCL_CONFIG, FTAG); while (pres->pres_state != DSS_SCANNING) { txg_wait_synced(spa_get_dsl(spa), 0); (void) poll(NULL, 0, 100); /* wait 1/10 second */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) spa_raidz_expand_get_stats(spa, pres); spa_config_exit(spa, SCL_CONFIG, FTAG); } ASSERT3U(pres->pres_state, ==, DSS_SCANNING); ASSERT3U(pres->pres_to_reflow, !=, 0); /* * Set so when we are killed we go to raidz checking rather than * restarting test. */ ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; if (ztest_opts.zo_verbose >= 1) { (void) printf("raidz expansion reflow started, waiting for " "%llu bytes to be copied\n", (u_longlong_t)reflow_max); } /* * Wait for reflow maximum to be reached and then kill the test */ while (pres->pres_reflowed < reflow_max) { txg_wait_synced(spa_get_dsl(spa), 0); (void) poll(NULL, 0, 100); /* wait 1/10 second */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) spa_raidz_expand_get_stats(spa, pres); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* Reset the reflow pause before killing */ raidz_expand_max_reflow_bytes = 0; if (ztest_opts.zo_verbose >= 1) { (void) printf("killing raidz expansion test after reflow " "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); } /* * Kill ourself to simulate a panic during a reflow. Our parent will * restart the test and the changed flag value will drive the test * through the scrub/check code to verify the pool is not corrupted. 
*/ ztest_kill(zs); } static void ztest_generic_run(ztest_shared_t *zs, spa_t *spa) { kthread_t **run_threads; int t; run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), UMEM_NOFAIL); /* * Kick off all the tests that run in parallel. */ for (t = 0; t < ztest_opts.zo_threads; t++) { if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); return; } run_threads[t] = thread_create(NULL, 0, ztest_thread, (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); } /* * Wait for all of the tests to complete. */ for (t = 0; t < ztest_opts.zo_threads; t++) VERIFY0(thread_join(run_threads[t])); /* * Close all datasets. This must be done after all the threads * are joined so we can be sure none of the datasets are in-use * by any of the threads. */ for (t = 0; t < ztest_opts.zo_threads; t++) { if (t < ztest_opts.zo_datasets) ztest_dataset_close(t); } txg_wait_synced(spa_get_dsl(spa), 0); zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); } /* * Setup our test context and kick off threads to run tests on all datasets * in parallel. */ static void ztest_run(ztest_shared_t *zs) { spa_t *spa; objset_t *os; kthread_t *resume_thread, *deadman_thread; uint64_t object; int error; int t, d; ztest_exiting = B_FALSE; /* * Initialize parent/child shared state. */ mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); zs->zs_thread_start = gethrtime(); zs->zs_thread_stop = zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); zs->zs_thread_kill = zs->zs_thread_stop; if (ztest_random(100) < ztest_opts.zo_killrate) { zs->zs_thread_kill -= ztest_random(ztest_opts.zo_passtime * NANOSEC); } mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), offsetof(ztest_cb_data_t, zcd_node)); /* * Open our pool. It may need to be imported first depending on * what tests were running when the previous pass was terminated. */ raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); error = spa_open(ztest_opts.zo_pool, &spa, FTAG); if (error) { VERIFY3S(error, ==, ENOENT); ztest_import_impl(); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; } metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; /* * XXX - BUGBUG raidz expansion do not run this for generic for now */ if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) VERIFY0(vdev_raidz_impl_set("cycle")); dmu_objset_stats_t dds; VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); dmu_objset_disown(os, B_TRUE, FTAG); /* Give the dedicated raidz expansion test more grace time */ if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) zfs_deadman_synctime_ms *= 2; /* * Create a thread to periodically resume suspended I/O. */ resume_thread = thread_create(NULL, 0, ztest_resume_thread, spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); /* * Create a deadman thread and set to panic if we hang. 
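ztest_generic_run() above is a plain scatter/join: allocate a handle array, start one worker per slot, join them all, and only then tear down shared state. A minimal equivalent with POSIX threads (compile with -pthread; the libzpool thread_create()/thread_join() wrappers are not assumed here) is:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define	NWORKERS	4

static void *
worker(void *arg)
{
	long id = (long)(intptr_t)arg;

	(void) printf("worker %ld running\n", id);
	return (NULL);
}

int
main(void)
{
	pthread_t threads[NWORKERS];

	/* Kick off all the workers that run in parallel. */
	for (long t = 0; t < NWORKERS; t++) {
		if (pthread_create(&threads[t], NULL, worker,
		    (void *)(intptr_t)t) != 0) {
			perror("pthread_create");
			exit(1);
		}
	}

	/* Wait for all of them; shared state is safe to free afterwards. */
	for (int t = 0; t < NWORKERS; t++)
		(void) pthread_join(threads[t], NULL);

	return (0);
}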
*/ deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; /* * Verify that we can safely inquire about any object, * whether it's allocated or not. To make it interesting, * we probe a 5-wide window around each power of two. * This hits all edge cases, including zero and the max. */ for (t = 0; t < 64; t++) { for (d = -5; d <= 5; d++) { error = dmu_object_info(spa->spa_meta_objset, (1ULL << t) + d, NULL); ASSERT(error == 0 || error == ENOENT || error == EINVAL); } } /* * If we got any ENOSPC errors on the previous run, destroy something. */ if (zs->zs_enospc_count != 0) { /* Not expecting ENOSPC errors during raidz expansion tests */ ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_NONE); int d = ztest_random(ztest_opts.zo_datasets); ztest_dataset_destroy(d); } zs->zs_enospc_count = 0; /* * If we were in the middle of ztest_device_removal() and were killed * we need to ensure the removal and scrub complete before running * any tests that check ztest_device_removal_active. The removal will * be restarted automatically when the spa is opened, but we need to * initiate the scrub manually if it is not already in progress. Note * that we always run the scrub whenever an indirect vdev exists * because we have no way of knowing for sure if ztest_device_removal() * fully completed its scrub before the pool was reimported. * * Does not apply for the RAIDZ expansion specific test runs */ if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && (spa->spa_removing_phys.sr_state == DSS_SCANNING || spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { while (spa->spa_removing_phys.sr_state == DSS_SCANNING) txg_wait_synced(spa_get_dsl(spa), 0); error = ztest_scrub_impl(spa); if (error == EBUSY) error = 0; ASSERT0(error); } if (ztest_opts.zo_verbose >= 4) (void) printf("starting main threads...\n"); /* * Replay all logs of all datasets in the pool. This is primarily for * temporary datasets which wouldn't otherwise get replayed, which * can trigger failures when attempting to offline a SLOG in * ztest_fault_inject(). */ (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, NULL, DS_FIND_CHILDREN); if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) ztest_raidz_expand_run(zs, spa); else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) ztest_raidz_expand_check(spa); else ztest_generic_run(zs, spa); /* Kill the resume and deadman threads */ ztest_exiting = B_TRUE; VERIFY0(thread_join(resume_thread)); VERIFY0(thread_join(deadman_thread)); ztest_resume(spa); /* * Right before closing the pool, kick off a bunch of async I/O; * spa_close() should wait for it to complete. */ for (object = 1; object < 50; object++) { dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, ZIO_PRIORITY_SYNC_READ); } /* Verify that at least one commit cb was called in a timely fashion */ if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) VERIFY0(zc_min_txg_delay); spa_close(spa, FTAG); /* * Verify that we can loop over all pools. */ mutex_enter(&spa_namespace_lock); for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) if (ztest_opts.zo_verbose > 3) (void) printf("spa_next: found %s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); /* * Verify that we can export the pool and reimport it under a * different name. 
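The object-probe loop in ztest_run() above walks a 5-wide window around every power of two, which exercises zero, off-by-one boundaries, and values near the 64-bit maximum thanks to unsigned wraparound. The probe-number generation on its own, as a small illustrative sketch, is just:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/*
	 * For each power of two 2^0 .. 2^63, visit object numbers from five
	 * below to five above it; the unsigned wraparound is exactly what
	 * makes the edge cases interesting.
	 */
	for (int t = 0; t < 64; t++) {
		for (int d = -5; d <= 5; d++) {
			uint64_t obj = (1ULL << t) + d;

			if (t < 2 || t > 62)	/* print just the extremes */
				(void) printf("t=%2d d=%+d -> %llu\n",
				    t, d, (unsigned long long)obj);
		}
	}
	return (0);
}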
*/ if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { char name[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s_import", ztest_opts.zo_pool); ztest_spa_import_export(ztest_opts.zo_pool, name); ztest_spa_import_export(name, ztest_opts.zo_pool); } kernel_fini(); list_destroy(&zcl.zcl_callbacks); mutex_destroy(&zcl.zcl_callbacks_lock); (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } static void print_time(hrtime_t t, char *timebuf) { hrtime_t s = t / NANOSEC; hrtime_t m = s / 60; hrtime_t h = m / 60; hrtime_t d = h / 24; s -= m * 60; m -= h * 60; h -= d * 24; timebuf[0] = '\0'; if (d) (void) sprintf(timebuf, "%llud%02lluh%02llum%02llus", d, h, m, s); else if (h) (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); else if (m) (void) sprintf(timebuf, "%llum%02llus", m, s); else (void) sprintf(timebuf, "%llus", s); } static nvlist_t * make_random_pool_props(void) { nvlist_t *props; props = fnvlist_alloc(); /* Twenty percent of the time enable ZPOOL_PROP_DEDUP_TABLE_QUOTA */ if (ztest_random(5) == 0) { fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_DEDUP_TABLE_QUOTA), 2 * 1024 * 1024); } /* Fifty percent of the time enable ZPOOL_PROP_AUTOREPLACE */ if (ztest_random(2) == 0) { fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); } return (props); } /* * Create a storage pool with the given name and initial vdev size. * Then test spa_freeze() functionality. */ static void ztest_init(ztest_shared_t *zs) { spa_t *spa; nvlist_t *nvroot, *props; int i; mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); /* * Create the storage pool. */ (void) spa_destroy(ztest_opts.zo_pool); ztest_shared->zs_vdev_next_leaf = 0; zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); props = make_random_pool_props(); /* * We don't expect the pool to suspend unless maxfaults == 0, * in which case ztest_fault_inject() temporarily takes away * the only valid replica. */ fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); for (i = 0; i < SPA_FEATURES; i++) { char *buf; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; /* * 75% chance of using the log space map feature. We want ztest * to exercise both the code paths that use the log space map * feature and the ones that don't. 
*/ if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) continue; + /* + * split 50/50 between legacy and fast dedup + */ + if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0) + continue; + VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", spa_feature_table[i].fi_uname)); fnvlist_add_uint64(props, buf, 0); free(buf); } VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); fnvlist_free(nvroot); fnvlist_free(props); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; zs->zs_guid = spa_guid(spa); spa_close(spa, FTAG); kernel_fini(); if (!ztest_opts.zo_mmp_test) { ztest_run_zdb(zs->zs_guid); ztest_freeze(); ztest_run_zdb(zs->zs_guid); } (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } static void setup_data_fd(void) { static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; ztest_fd_data = mkstemp(ztest_name_data); ASSERT3S(ztest_fd_data, >=, 0); (void) unlink(ztest_name_data); } static int shared_data_size(ztest_shared_hdr_t *hdr) { int size; size = hdr->zh_hdr_size; size += hdr->zh_opts_size; size += hdr->zh_size; size += hdr->zh_stats_size * hdr->zh_stats_count; size += hdr->zh_ds_size * hdr->zh_ds_count; size += hdr->zh_scratch_state_size; return (size); } static void setup_hdr(void) { int size; ztest_shared_hdr_t *hdr; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); hdr->zh_opts_size = sizeof (ztest_shared_opts_t); hdr->zh_size = sizeof (ztest_shared_t); hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); hdr->zh_stats_count = ZTEST_FUNCS; hdr->zh_ds_size = sizeof (ztest_shared_ds_t); hdr->zh_ds_count = ztest_opts.zo_datasets; hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); size = shared_data_size(hdr); VERIFY0(ftruncate(ztest_fd_data, size)); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); } static void setup_data(void) { int size, offset; ztest_shared_hdr_t *hdr; uint8_t *buf; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); size = shared_data_size(hdr); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); buf = (uint8_t *)hdr; offset = hdr->zh_hdr_size; ztest_shared_opts = (void *)&buf[offset]; offset += hdr->zh_opts_size; ztest_shared = (void *)&buf[offset]; offset += hdr->zh_size; ztest_shared_callstate = (void *)&buf[offset]; offset += hdr->zh_stats_size * hdr->zh_stats_count; ztest_shared_ds = (void *)&buf[offset]; offset += hdr->zh_ds_size * hdr->zh_ds_count; ztest_scratch_state = (void *)&buf[offset]; } static boolean_t exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) { pid_t pid; int status; char *cmdbuf = NULL; pid = fork(); if (cmd == NULL) { cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); cmd = cmdbuf; } if (pid == -1) fatal(B_TRUE, "fork failed"); if (pid == 0) { /* child */ char fd_data_str[12]; VERIFY3S(11, >=, snprintf(fd_data_str, 12, "%d", ztest_fd_data)); VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); if 
(libpath != NULL) { const char *curlp = getenv("LD_LIBRARY_PATH"); if (curlp == NULL) VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); else { char *newlp = NULL; VERIFY3S(-1, !=, asprintf(&newlp, "%s:%s", libpath, curlp)); VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); free(newlp); } } (void) execl(cmd, cmd, (char *)NULL); ztest_dump_core = B_FALSE; fatal(B_TRUE, "exec failed: %s", cmd); } if (cmdbuf != NULL) { umem_free(cmdbuf, MAXPATHLEN); cmd = NULL; } while (waitpid(pid, &status, 0) != pid) continue; if (statusp != NULL) *statusp = status; if (WIFEXITED(status)) { if (WEXITSTATUS(status) != 0) { (void) fprintf(stderr, "child exited with code %d\n", WEXITSTATUS(status)); exit(2); } return (B_FALSE); } else if (WIFSIGNALED(status)) { if (!ignorekill || WTERMSIG(status) != SIGKILL) { (void) fprintf(stderr, "child died with signal %d\n", WTERMSIG(status)); exit(3); } return (B_TRUE); } else { (void) fprintf(stderr, "something strange happened to child\n"); exit(4); } } static void ztest_run_init(void) { int i; ztest_shared_t *zs = ztest_shared; /* * Blow away any existing copy of zpool.cache */ (void) remove(spa_config_path); if (ztest_opts.zo_init == 0) { if (ztest_opts.zo_verbose >= 1) (void) printf("Importing pool %s\n", ztest_opts.zo_pool); ztest_import(zs); return; } /* * Create and initialize our storage pool. */ for (i = 1; i <= ztest_opts.zo_init; i++) { memset(zs, 0, sizeof (*zs)); if (ztest_opts.zo_verbose >= 3 && ztest_opts.zo_init != 1) { (void) printf("ztest_init(), pass %d\n", i); } ztest_init(zs); } } int main(int argc, char **argv) { int kills = 0; int iters = 0; int older = 0; int newer = 0; ztest_shared_t *zs; ztest_info_t *zi; ztest_shared_callstate_t *zc; char timebuf[100]; char numbuf[NN_NUMBUF_SZ]; char *cmd; boolean_t hasalt; int f, err; char *fd_data_str = getenv("ZTEST_FD_DATA"); struct sigaction action; (void) setvbuf(stdout, NULL, _IOLBF, 0); dprintf_setup(&argc, argv); zfs_deadman_synctime_ms = 300000; zfs_deadman_checktime_ms = 30000; /* * As two-word space map entries may not come up often (especially * if pool and vdev sizes are small) we want to force at least some * of them so the feature get tested. */ zfs_force_some_double_word_sm_entries = B_TRUE; /* * Verify that even extensively damaged split blocks with many * segments can be reconstructed in a reasonable amount of time * when reconstruction is known to be possible. * * Note: the lower this value is, the more damage we inflict, and * the more time ztest spends in recovering that damage. We chose * to induce damage 1/100th of the time so recovery is tested but * not so frequently that ztest doesn't get to test other code paths. */ zfs_reconstruct_indirect_damage_fraction = 100; action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); action.sa_flags = 0; if (sigaction(SIGSEGV, &action, NULL) < 0) { (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", strerror(errno)); exit(EXIT_FAILURE); } if (sigaction(SIGABRT, &action, NULL) < 0) { (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", strerror(errno)); exit(EXIT_FAILURE); } /* * Force random_get_bytes() to use /dev/urandom in order to prevent * ztest from needlessly depleting the system entropy pool. 
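exec_child() and the setup_*() helpers above implement ztest's parent/child handoff: an unlinked temp file is mmap()ed as shared state and the descriptor number is handed to the re-exec'd child through the ZTEST_FD_DATA environment variable. A much-reduced sketch of the same mechanism, using fork() so the mapping is simply inherited instead of re-passing the descriptor, could look like this:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int
main(void)
{
	char tmpl[] = "/tmp/shared.data.XXXXXX";
	int fd = mkstemp(tmpl);

	if (fd == -1 || unlink(tmpl) != 0 ||
	    ftruncate(fd, sizeof (long)) != 0) {
		perror("setup");
		return (1);
	}

	/* MAP_SHARED so the child's writes are visible to the parent. */
	long *counter = mmap(NULL, sizeof (long), PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	if (counter == MAP_FAILED) {
		perror("mmap");
		return (1);
	}
	*counter = 0;

	pid_t pid = fork();
	if (pid == 0) {		/* child: write to the shared mapping and exit */
		*counter = 42;
		_exit(0);
	}

	(void) waitpid(pid, NULL, 0);
	(void) printf("counter set by child: %ld\n", *counter);
	return (0);
}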
*/ random_path = "/dev/urandom"; ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC); ASSERT3S(ztest_fd_rand, >=, 0); if (!fd_data_str) { process_options(argc, argv); setup_data_fd(); setup_hdr(); setup_data(); memcpy(ztest_shared_opts, &ztest_opts, sizeof (*ztest_shared_opts)); } else { ztest_fd_data = atoi(fd_data_str); setup_data(); memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); } ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); err = ztest_set_global_vars(); if (err != 0 && !fd_data_str) { /* error message done by ztest_set_global_vars */ exit(EXIT_FAILURE); } else { /* children should not be spawned if setting gvars fails */ VERIFY3S(err, ==, 0); } /* Override location of zpool.cache */ VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", ztest_opts.zo_dir), !=, -1); ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), UMEM_NOFAIL); zs = ztest_shared; if (fd_data_str) { metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; if (zs->zs_do_init) ztest_run_init(); else ztest_run(zs); exit(0); } hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); if (ztest_opts.zo_verbose >= 1) { (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, ztest_opts.zo_raid_children, ztest_opts.zo_raid_type, ztest_opts.zo_raid_parity, ztest_opts.zo_time); } cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); (void) strlcpy(cmd, getexecname(), MAXNAMELEN); zs->zs_do_init = B_TRUE; if (strlen(ztest_opts.zo_alt_ztest) != 0) { if (ztest_opts.zo_verbose >= 1) { (void) printf("Executing older ztest for " "initialization: %s\n", ztest_opts.zo_alt_ztest); } VERIFY(!exec_child(ztest_opts.zo_alt_ztest, ztest_opts.zo_alt_libpath, B_FALSE, NULL)); } else { VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); } zs->zs_do_init = B_FALSE; zs->zs_proc_start = gethrtime(); zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; for (f = 0; f < ZTEST_FUNCS; f++) { zi = &ztest_info[f]; zc = ZTEST_GET_SHARED_CALLSTATE(f); if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) zc->zc_next = UINT64_MAX; else zc->zc_next = zs->zs_proc_start + ztest_random(2 * zi->zi_interval[0] + 1); } /* * Run the tests in a loop. These tests include fault injection * to verify that self-healing data works, and forced crashes * to verify that we never lose on-disk consistency. */ while (gethrtime() < zs->zs_proc_stop) { int status; boolean_t killed; /* * Initialize the workload counters for each function. 
*/ for (f = 0; f < ZTEST_FUNCS; f++) { zc = ZTEST_GET_SHARED_CALLSTATE(f); zc->zc_count = 0; zc->zc_time = 0; } /* Set the allocation switch size */ zs->zs_metaslab_df_alloc_threshold = ztest_random(zs->zs_metaslab_sz / 4) + 1; if (!hasalt || ztest_random(2) == 0) { if (hasalt && ztest_opts.zo_verbose >= 1) { (void) printf("Executing newer ztest: %s\n", cmd); } newer++; killed = exec_child(cmd, NULL, B_TRUE, &status); } else { if (hasalt && ztest_opts.zo_verbose >= 1) { (void) printf("Executing older ztest: %s\n", ztest_opts.zo_alt_ztest); } older++; killed = exec_child(ztest_opts.zo_alt_ztest, ztest_opts.zo_alt_libpath, B_TRUE, &status); } if (killed) kills++; iters++; if (ztest_opts.zo_verbose >= 1) { hrtime_t now = gethrtime(); now = MIN(now, zs->zs_proc_stop); print_time(zs->zs_proc_stop - now, timebuf); nicenum(zs->zs_space, numbuf, sizeof (numbuf)); (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", iters, WIFEXITED(status) ? "Complete" : "SIGKILL", zs->zs_enospc_count, 100.0 * zs->zs_alloc / zs->zs_space, numbuf, 100.0 * (now - zs->zs_proc_start) / (ztest_opts.zo_time * NANOSEC), timebuf); } if (ztest_opts.zo_verbose >= 2) { (void) printf("\nWorkload summary:\n\n"); (void) printf("%7s %9s %s\n", "Calls", "Time", "Function"); (void) printf("%7s %9s %s\n", "-----", "----", "--------"); for (f = 0; f < ZTEST_FUNCS; f++) { zi = &ztest_info[f]; zc = ZTEST_GET_SHARED_CALLSTATE(f); print_time(zc->zc_time, timebuf); (void) printf("%7"PRIu64" %9s %s\n", zc->zc_count, timebuf, zi->zi_funcname); } (void) printf("\n"); } if (!ztest_opts.zo_mmp_test) ztest_run_zdb(zs->zs_guid); if (ztest_shared_opts->zo_raidz_expand_test == RAIDZ_EXPAND_CHECKED) break; /* raidz expand test complete */ } if (ztest_opts.zo_verbose >= 1) { if (hasalt) { (void) printf("%d runs of older ztest: %s\n", older, ztest_opts.zo_alt_ztest); (void) printf("%d runs of newer ztest: %s\n", newer, cmd); } (void) printf("%d killed, %d completed, %.0f%% kill rate\n", kills, iters - kills, (100.0 * kills) / MAX(1, iters)); } umem_free(cmd, MAXNAMELEN); return (0); } diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 10083351abb5..d51e4ef003e6 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -1,137 +1,138 @@ etc/default/zfs etc/zfs/zfs-functions etc/zfs/zpool.d/ lib/systemd/system-generators/ lib/systemd/system-preset/ lib/systemd/system/zfs-import-cache.service lib/systemd/system/zfs-import-scan.service lib/systemd/system/zfs-import.target lib/systemd/system/zfs-load-key.service lib/systemd/system/zfs-mount.service lib/systemd/system/zfs-scrub-monthly@.timer lib/systemd/system/zfs-scrub-weekly@.timer lib/systemd/system/zfs-scrub@.service lib/systemd/system/zfs-trim-monthly@.timer lib/systemd/system/zfs-trim-weekly@.timer lib/systemd/system/zfs-trim@.service lib/systemd/system/zfs-share.service lib/systemd/system/zfs-volume-wait.service lib/systemd/system/zfs-volumes.target lib/systemd/system/zfs.target lib/udev/ sbin/fsck.zfs sbin/mount.zfs sbin/zdb sbin/zfs sbin/zfs_ids_to_path sbin/zgenhostid sbin/zhack sbin/zinject sbin/zpool sbin/zstream sbin/zstreamdump usr/bin/zvol_wait usr/lib/modules-load.d/ lib/ usr/lib/zfs-linux/zpool.d/ usr/lib/zfs-linux/zpool_influxdb usr/lib/zfs-linux/zfs_prepare_disk usr/sbin/arc_summary usr/sbin/arcstat usr/sbin/dbufstat usr/sbin/zilstat usr/share/zfs/compatibility.d/ usr/share/bash-completion/completions usr/share/man/man1/arcstat.1 
usr/share/man/man1/zhack.1 usr/share/man/man1/zvol_wait.1 usr/share/man/man5/ usr/share/man/man8/fsck.zfs.8 usr/share/man/man8/mount.zfs.8 usr/share/man/man8/vdev_id.8 usr/share/man/man8/zdb.8 usr/share/man/man8/zfs-allow.8 usr/share/man/man8/zfs-bookmark.8 usr/share/man/man8/zfs-change-key.8 usr/share/man/man8/zfs-clone.8 usr/share/man/man8/zfs-create.8 usr/share/man/man8/zfs-destroy.8 usr/share/man/man8/zfs-diff.8 usr/share/man/man8/zfs-get.8 usr/share/man/man8/zfs-groupspace.8 usr/share/man/man8/zfs-hold.8 usr/share/man/man8/zfs-inherit.8 usr/share/man/man8/zfs-list.8 usr/share/man/man8/zfs-load-key.8 usr/share/man/man8/zfs-mount-generator.8 usr/share/man/man8/zfs-mount.8 usr/share/man/man8/zfs-program.8 usr/share/man/man8/zfs-project.8 usr/share/man/man8/zfs-projectspace.8 usr/share/man/man8/zfs-promote.8 usr/share/man/man8/zfs-receive.8 usr/share/man/man8/zfs-recv.8 usr/share/man/man8/zfs-redact.8 usr/share/man/man8/zfs-release.8 usr/share/man/man8/zfs-rename.8 usr/share/man/man8/zfs-rollback.8 usr/share/man/man8/zfs-send.8 usr/share/man/man8/zfs-set.8 usr/share/man/man8/zfs-share.8 usr/share/man/man8/zfs-snapshot.8 usr/share/man/man8/zfs-unallow.8 usr/share/man/man8/zfs-unload-key.8 usr/share/man/man8/zfs-unmount.8 usr/share/man/man8/zfs-unzone.8 usr/share/man/man8/zfs-upgrade.8 usr/share/man/man8/zfs-userspace.8 usr/share/man/man8/zfs-wait.8 usr/share/man/man8/zfs-zone.8 usr/share/man/man8/zfs.8 usr/share/man/man8/zfs_ids_to_path.8 usr/share/man/man8/zfs_prepare_disk.8 usr/share/man/man7/zfsconcepts.7 usr/share/man/man7/zfsprops.7 usr/share/man/man8/zgenhostid.8 usr/share/man/man8/zinject.8 usr/share/man/man8/zpool-add.8 usr/share/man/man8/zpool-attach.8 usr/share/man/man8/zpool-checkpoint.8 usr/share/man/man8/zpool-clear.8 usr/share/man/man8/zpool-create.8 usr/share/man/man8/zpool-destroy.8 usr/share/man/man8/zpool-detach.8 +usr/share/man/man8/zpool-ddtprune.8 usr/share/man/man8/zpool-events.8 usr/share/man/man8/zpool-export.8 usr/share/man/man8/zpool-get.8 usr/share/man/man8/zpool-history.8 usr/share/man/man8/zpool-import.8 usr/share/man/man8/zpool-initialize.8 usr/share/man/man8/zpool-iostat.8 usr/share/man/man8/zpool-labelclear.8 usr/share/man/man8/zpool-list.8 usr/share/man/man8/zpool-offline.8 usr/share/man/man8/zpool-online.8 usr/share/man/man8/zpool-prefetch.8 usr/share/man/man8/zpool-reguid.8 usr/share/man/man8/zpool-remove.8 usr/share/man/man8/zpool-reopen.8 usr/share/man/man8/zpool-replace.8 usr/share/man/man8/zpool-resilver.8 usr/share/man/man8/zpool-scrub.8 usr/share/man/man8/zpool-set.8 usr/share/man/man8/zpool-split.8 usr/share/man/man8/zpool-status.8 usr/share/man/man8/zpool-sync.8 usr/share/man/man8/zpool-trim.8 usr/share/man/man8/zpool-upgrade.8 usr/share/man/man8/zpool-wait.8 usr/share/man/man8/zpool.8 usr/share/man/man7/vdevprops.7 usr/share/man/man7/zpoolconcepts.7 usr/share/man/man7/zpoolprops.7 usr/share/man/man8/zstream.8 usr/share/man/man8/zstreamdump.8 usr/share/man/man4/spl.4 usr/share/man/man4/zfs.4 usr/share/man/man7/zpool-features.7 usr/share/man/man8/zpool_influxdb.8 diff --git a/include/libzfs.h b/include/libzfs.h index 2412797541de..01d51999f4eb 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -1,1076 +1,1079 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright Joyent, Inc. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2016, Intel Corporation. * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2021, Colm Buckley */ #ifndef _LIBZFS_H #define _LIBZFS_H extern __attribute__((visibility("default"))) #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Miscellaneous ZFS constants */ #define ZFS_MAXPROPLEN ZAP_MAXVALUELEN #define ZPOOL_MAXPROPLEN ZAP_MAXVALUELEN /* * libzfs errors */ typedef enum zfs_error { EZFS_SUCCESS = 0, /* no error -- success */ EZFS_NOMEM = 2000, /* out of memory */ EZFS_BADPROP, /* invalid property value */ EZFS_PROPREADONLY, /* cannot set readonly property */ EZFS_PROPTYPE, /* property does not apply to dataset type */ EZFS_PROPNONINHERIT, /* property is not inheritable */ EZFS_PROPSPACE, /* bad quota or reservation */ EZFS_BADTYPE, /* dataset is not of appropriate type */ EZFS_BUSY, /* pool or dataset is busy */ EZFS_EXISTS, /* pool or dataset already exists */ EZFS_NOENT, /* no such pool or dataset */ EZFS_BADSTREAM, /* bad backup stream */ EZFS_DSREADONLY, /* dataset is readonly */ EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ EZFS_INVALIDNAME, /* invalid dataset name */ EZFS_BADRESTORE, /* unable to restore to destination */ EZFS_BADBACKUP, /* backup failed */ EZFS_BADTARGET, /* bad attach/detach/replace target */ EZFS_NODEVICE, /* no such device in pool */ EZFS_BADDEV, /* invalid device to add */ EZFS_NOREPLICAS, /* no valid replicas */ EZFS_RESILVERING, /* resilvering (healing reconstruction) */ EZFS_BADVERSION, /* unsupported version */ EZFS_POOLUNAVAIL, /* pool is currently unavailable */ EZFS_DEVOVERFLOW, /* too many devices in one vdev */ EZFS_BADPATH, /* must be an absolute path */ EZFS_CROSSTARGET, /* rename or clone across pool or dataset */ EZFS_ZONED, /* used improperly in local zone */ EZFS_MOUNTFAILED, /* failed to mount dataset */ EZFS_UMOUNTFAILED, /* failed to unmount dataset */ EZFS_UNSHARENFSFAILED, /* failed to unshare over nfs */ EZFS_SHARENFSFAILED, /* failed to share over nfs */ EZFS_PERM, /* permission denied */ EZFS_NOSPC, /* out of space */ EZFS_FAULT, /* bad address */ EZFS_IO, /* I/O error */ EZFS_INTR, /* signal received */ EZFS_ISSPARE, /* device is a hot spare */ EZFS_INVALCONFIG, /* invalid vdev configuration */ EZFS_RECURSIVE, /* recursive dependency */ EZFS_NOHISTORY, /* no history object */ EZFS_POOLPROPS, /* couldn't retrieve pool props */ EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */ EZFS_POOL_INVALARG, /* invalid argument for this pool operation */ EZFS_NAMETOOLONG, /* dataset name is too long */ EZFS_OPENFAILED, /* open of device failed */ EZFS_NOCAP, /* couldn't get capacity */ EZFS_LABELFAILED, /* write of label failed */ EZFS_BADWHO, /* invalid 
permission who */ EZFS_BADPERM, /* invalid permission */ EZFS_BADPERMSET, /* invalid permission set name */ EZFS_NODELEGATION, /* delegated administration is disabled */ EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */ EZFS_SHARESMBFAILED, /* failed to share over smb */ EZFS_BADCACHE, /* bad cache file */ EZFS_ISL2CACHE, /* device is for the level 2 ARC */ EZFS_VDEVNOTSUP, /* unsupported vdev type */ EZFS_NOTSUP, /* ops not supported on this dataset */ EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ EZFS_REFTAG_RELE, /* snapshot release: tag not found */ EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ EZFS_PIPEFAILED, /* pipe create failed */ EZFS_THREADCREATEFAILED, /* thread create failed */ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ EZFS_SCRUBBING, /* currently scrubbing */ EZFS_ERRORSCRUBBING, /* currently error scrubbing */ EZFS_ERRORSCRUB_PAUSED, /* error scrub currently paused */ EZFS_NO_SCRUB, /* no active scrub */ EZFS_DIFF, /* general failure of zfs diff */ EZFS_DIFFDATA, /* bad zfs diff data */ EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_SCRUB_PAUSED, /* scrub currently paused */ EZFS_SCRUB_PAUSED_TO_CANCEL, /* scrub currently paused */ EZFS_ACTIVE_POOL, /* pool is imported on a different system */ EZFS_CRYPTOFAILED, /* failed to setup encryption */ EZFS_NO_PENDING, /* cannot cancel, no operation is pending */ EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */ EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */ EZFS_NO_CHECKPOINT, /* pool has no checkpoint */ EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */ EZFS_VDEV_TOO_BIG, /* a device is too big to be used */ EZFS_IOC_NOTSUPPORTED, /* operation not supported by zfs module */ EZFS_TOOMANY, /* argument list too long */ EZFS_INITIALIZING, /* currently initializing */ EZFS_NO_INITIALIZE, /* no active initialize */ EZFS_WRONG_PARENT, /* invalid parent dataset (e.g ZVOL) */ EZFS_TRIMMING, /* currently trimming */ EZFS_NO_TRIM, /* no active trim */ EZFS_TRIM_NOTSUP, /* device does not support trim */ EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */ EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */ EZFS_REBUILDING, /* resilvering (sequential reconstrution) */ EZFS_VDEV_NOTSUP, /* ops not supported for this type of vdev */ EZFS_NOT_USER_NAMESPACE, /* a file is not a user namespace */ EZFS_CKSUM, /* insufficient replicas */ EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */ EZFS_SHAREFAILED, /* filesystem share failed */ EZFS_RAIDZ_EXPAND_IN_PROGRESS, /* a raidz is currently expanding */ EZFS_ASHIFT_MISMATCH, /* can't add vdevs with different ashifts */ EZFS_UNKNOWN } zfs_error_t; /* * The following data structures are all part * of the zfs_allow_t data structure which is * used for printing 'allow' permissions. * It is a linked list of zfs_allow_t's which * then contain avl tree's for user/group/sets/... * and each one of the entries in those trees have * avl tree's for the permissions they belong to and * whether they are local,descendent or local+descendent * permissions. The AVL trees are used primarily for * sorting purposes, but also so that we can quickly find * a given user and or permission. 
*/ typedef struct zfs_perm_node { avl_node_t z_node; char z_pname[MAXPATHLEN]; } zfs_perm_node_t; typedef struct zfs_allow_node { avl_node_t z_node; char z_key[MAXPATHLEN]; /* name, such as joe */ avl_tree_t z_localdescend; /* local+descendent perms */ avl_tree_t z_local; /* local permissions */ avl_tree_t z_descend; /* descendent permissions */ } zfs_allow_node_t; typedef struct zfs_allow { struct zfs_allow *z_next; char z_setpoint[MAXPATHLEN]; avl_tree_t z_sets; avl_tree_t z_crperms; avl_tree_t z_user; avl_tree_t z_group; avl_tree_t z_everyone; } zfs_allow_t; /* * Basic handle types */ typedef struct zfs_handle zfs_handle_t; typedef struct zpool_handle zpool_handle_t; typedef struct libzfs_handle libzfs_handle_t; _LIBZFS_H int zpool_wait(zpool_handle_t *, zpool_wait_activity_t); _LIBZFS_H int zpool_wait_status(zpool_handle_t *, zpool_wait_activity_t, boolean_t *, boolean_t *); /* * Library initialization */ _LIBZFS_H libzfs_handle_t *libzfs_init(void); _LIBZFS_H void libzfs_fini(libzfs_handle_t *); _LIBZFS_H libzfs_handle_t *zpool_get_handle(zpool_handle_t *); _LIBZFS_H libzfs_handle_t *zfs_get_handle(zfs_handle_t *); _LIBZFS_H void libzfs_print_on_error(libzfs_handle_t *, boolean_t); _LIBZFS_H void zfs_save_arguments(int argc, char **, char *, int); _LIBZFS_H int zpool_log_history(libzfs_handle_t *, const char *); _LIBZFS_H int libzfs_errno(libzfs_handle_t *); _LIBZFS_H const char *libzfs_error_init(int); _LIBZFS_H const char *libzfs_error_action(libzfs_handle_t *); _LIBZFS_H const char *libzfs_error_description(libzfs_handle_t *); _LIBZFS_H int zfs_standard_error(libzfs_handle_t *, int, const char *); _LIBZFS_H void libzfs_mnttab_init(libzfs_handle_t *); _LIBZFS_H void libzfs_mnttab_fini(libzfs_handle_t *); _LIBZFS_H void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); _LIBZFS_H int libzfs_mnttab_find(libzfs_handle_t *, const char *, struct mnttab *); _LIBZFS_H void libzfs_mnttab_add(libzfs_handle_t *, const char *, const char *, const char *); _LIBZFS_H void libzfs_mnttab_remove(libzfs_handle_t *, const char *); /* * Basic handle functions */ _LIBZFS_H zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); _LIBZFS_H zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *); _LIBZFS_H void zpool_close(zpool_handle_t *); _LIBZFS_H const char *zpool_get_name(zpool_handle_t *); _LIBZFS_H int zpool_get_state(zpool_handle_t *); _LIBZFS_H const char *zpool_state_to_name(vdev_state_t, vdev_aux_t); _LIBZFS_H const char *zpool_pool_state_to_name(pool_state_t); _LIBZFS_H void zpool_free_handles(libzfs_handle_t *); /* * Iterate over all active pools in the system. 
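/*
 * Illustrative sketch: walking every imported pool with zpool_iter(), as
 * the comment above describes. The callback matches the zpool_iter_f
 * typedef that follows; it receives an open handle for each pool and is
 * responsible for closing it. print_pool() is a hypothetical name.
 */
#include <stdio.h>
#include <libzfs.h>

static int
print_pool(zpool_handle_t *zhp, void *arg)
{
	(void) arg;
	(void) printf("%s\t%s\n", zpool_get_name(zhp),
	    zpool_get_state_str(zhp));
	zpool_close(zhp);
	return (0);		/* non-zero stops the iteration */
}

int
main(void)
{
	libzfs_handle_t *hdl = libzfs_init();
	if (hdl == NULL)
		return (1);
	int err = zpool_iter(hdl, print_pool, NULL);
	libzfs_fini(hdl);
	return (err);
}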
*/ typedef int (*zpool_iter_f)(zpool_handle_t *, void *); _LIBZFS_H int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); _LIBZFS_H boolean_t zpool_skip_pool(const char *); /* * Functions to create and destroy pools */ _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *); _LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift); typedef struct splitflags { /* do not split, but return the config that would be split off */ unsigned int dryrun : 1; /* after splitting, import the pool */ unsigned int import : 1; int name_flags; } splitflags_t; typedef struct trimflags { /* requested vdevs are for the entire pool */ boolean_t fullpool; /* request a secure trim, requires support from device */ boolean_t secure; /* after starting trim, block until trim completes */ boolean_t wait; /* trim at the requested rate in bytes/second */ uint64_t rate; } trimflags_t; /* * Functions to manipulate pool and vdev state */ _LIBZFS_H int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); _LIBZFS_H int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); _LIBZFS_H int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *, trimflags_t *); _LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); _LIBZFS_H int zpool_reguid(zpool_handle_t *); _LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *); _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *); +_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t, + uint64_t); + _LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); _LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); _LIBZFS_H int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, nvlist_t *, int, boolean_t); _LIBZFS_H int zpool_vdev_detach(zpool_handle_t *, const char *); _LIBZFS_H int zpool_vdev_remove(zpool_handle_t *, const char *); _LIBZFS_H int zpool_vdev_remove_cancel(zpool_handle_t *); _LIBZFS_H int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *); _LIBZFS_H int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, splitflags_t); _LIBZFS_H int zpool_vdev_remove_wanted(zpool_handle_t *, const char *); _LIBZFS_H int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); _LIBZFS_H int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); _LIBZFS_H int zpool_vdev_set_removed_state(zpool_handle_t *, uint64_t, vdev_aux_t); _LIBZFS_H int zpool_vdev_clear(zpool_handle_t *, uint64_t); _LIBZFS_H nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); _LIBZFS_H nvlist_t *zpool_find_parent_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); _LIBZFS_H nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); _LIBZFS_H int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *); _LIBZFS_H int zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv, const char *prepare_str, char **lines[], int *lines_cnt); _LIBZFS_H int zpool_prepare_and_label_disk(libzfs_handle_t *hdl, zpool_handle_t *, const char *, nvlist_t *vdev_nv, const char *prepare_str, char **lines[], int *lines_cnt); _LIBZFS_H char ** 
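/*
 * Illustrative sketch of driving the new zpool_ddt_prune() entry point
 * added by this change. The unit constant used here
 * (ZPOOL_DDT_PRUNE_PERCENTAGE) is assumed to come from the
 * zpool_ddt_prune_unit_t enum in the sys/fs/zfs.h portion of this patch;
 * prune_ten_percent() and the pool name are hypothetical.
 */
#include <stdio.h>
#include <libzfs.h>

static int
prune_ten_percent(libzfs_handle_t *hdl, const char *pool)
{
	zpool_handle_t *zhp = zpool_open(hdl, pool);
	if (zhp == NULL)
		return (-1);

	/* ask the module to prune the oldest 10% of unique DDT entries */
	int err = zpool_ddt_prune(zhp, ZPOOL_DDT_PRUNE_PERCENTAGE, 10);
	if (err != 0)
		(void) fprintf(stderr, "ddt prune failed: %s\n",
		    libzfs_error_description(hdl));

	zpool_close(zhp);
	return (err);
}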
zpool_vdev_script_alloc_env(const char *pool_name, const char *vdev_path, const char *vdev_upath, const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val); _LIBZFS_H void zpool_vdev_script_free_env(char **env); _LIBZFS_H uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path); _LIBZFS_H const char *zpool_get_state_str(zpool_handle_t *); /* * Functions to manage pool properties */ _LIBZFS_H int zpool_set_prop(zpool_handle_t *, const char *, const char *); _LIBZFS_H int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, size_t proplen, zprop_source_t *, boolean_t literal); _LIBZFS_H int zpool_get_userprop(zpool_handle_t *, const char *, char *, size_t proplen, zprop_source_t *); _LIBZFS_H uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, zprop_source_t *); _LIBZFS_H int zpool_props_refresh(zpool_handle_t *); _LIBZFS_H const char *zpool_prop_to_name(zpool_prop_t); _LIBZFS_H const char *zpool_prop_values(zpool_prop_t); /* * Functions to manage vdev properties */ _LIBZFS_H int zpool_get_vdev_prop_value(nvlist_t *, vdev_prop_t, char *, char *, size_t, zprop_source_t *, boolean_t); _LIBZFS_H int zpool_get_vdev_prop(zpool_handle_t *, const char *, vdev_prop_t, char *, char *, size_t, zprop_source_t *, boolean_t); _LIBZFS_H int zpool_get_all_vdev_props(zpool_handle_t *, const char *, nvlist_t **); _LIBZFS_H int zpool_set_vdev_prop(zpool_handle_t *, const char *, const char *, const char *); _LIBZFS_H const char *vdev_prop_to_name(vdev_prop_t); _LIBZFS_H const char *vdev_prop_values(vdev_prop_t); _LIBZFS_H boolean_t vdev_prop_user(const char *name); _LIBZFS_H const char *vdev_prop_column_name(vdev_prop_t); _LIBZFS_H boolean_t vdev_prop_align_right(vdev_prop_t); /* * Pool health statistics. */ typedef enum { /* * The following correspond to faults as defined in the (fault.fs.zfs.*) * event namespace. Each is associated with a corresponding message ID. * This must be kept in sync with the zfs_msgid_table in * lib/libzfs/libzfs_status.c. */ ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */ ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */ ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */ ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */ ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */ ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */ ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */ ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */ ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */ ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */ ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ ZPOOL_STATUS_HOSTID_ACTIVE, /* currently active on another system */ ZPOOL_STATUS_HOSTID_REQUIRED, /* multihost=on and hostid=0 */ ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ ZPOOL_STATUS_IO_FAILURE_MMP, /* failed MMP, failmode not 'panic' */ ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ ZPOOL_STATUS_ERRATA, /* informational errata available */ /* * If the pool has unsupported features but can still be opened in * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the * pool has unsupported features but cannot be opened at all, its * status is ZPOOL_STATUS_UNSUP_FEAT_READ. 
*/ ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */ ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */ /* * These faults have no corresponding message ID. At the time we are * checking the status, the original reason for the FMA fault (I/O or * checksum errors) has been lost. */ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ /* * The following are not faults per se, but still an error possibly * requiring administrative attention. There is no corresponding * message ID. */ ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */ ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ ZPOOL_STATUS_REBUILDING, /* device being rebuilt */ ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */ ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ ZPOOL_STATUS_COMPATIBILITY_ERR, /* bad 'compatibility' property */ ZPOOL_STATUS_INCOMPATIBLE_FEAT, /* feature set outside compatibility */ /* * Finally, the following indicates a healthy pool. */ ZPOOL_STATUS_OK } zpool_status_t; _LIBZFS_H zpool_status_t zpool_get_status(zpool_handle_t *, const char **, zpool_errata_t *); _LIBZFS_H zpool_status_t zpool_import_status(nvlist_t *, const char **, zpool_errata_t *); /* * Statistics and configuration functions. */ _LIBZFS_H nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); _LIBZFS_H nvlist_t *zpool_get_features(zpool_handle_t *); _LIBZFS_H int zpool_refresh_stats(zpool_handle_t *, boolean_t *); _LIBZFS_H int zpool_get_errlog(zpool_handle_t *, nvlist_t **); _LIBZFS_H void zpool_add_propname(zpool_handle_t *, const char *); /* * Import and export functions */ _LIBZFS_H int zpool_export(zpool_handle_t *, boolean_t, const char *); _LIBZFS_H int zpool_export_force(zpool_handle_t *, const char *); _LIBZFS_H int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); _LIBZFS_H int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, nvlist_t *, int); _LIBZFS_H void zpool_collect_unsup_feat(nvlist_t *config, char *buf, size_t size); /* * Miscellaneous pool functions */ struct zfs_cmd; _LIBZFS_H const char *const zfs_history_event_names[]; typedef enum { VDEV_NAME_PATH = 1 << 0, VDEV_NAME_GUID = 1 << 1, VDEV_NAME_FOLLOW_LINKS = 1 << 2, VDEV_NAME_TYPE_ID = 1 << 3, } vdev_name_t; _LIBZFS_H char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, int name_flags); _LIBZFS_H int zpool_upgrade(zpool_handle_t *, uint64_t); _LIBZFS_H int zpool_get_history(zpool_handle_t *, nvlist_t **, uint64_t *, boolean_t *); _LIBZFS_H int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, unsigned, int); _LIBZFS_H int zpool_events_clear(libzfs_handle_t *, int *); _LIBZFS_H int zpool_events_seek(libzfs_handle_t *, uint64_t, int); _LIBZFS_H void zpool_obj_to_path_ds(zpool_handle_t *, uint64_t, uint64_t, char *, size_t); _LIBZFS_H void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t); _LIBZFS_H int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); _LIBZFS_H void zpool_explain_recover(libzfs_handle_t *, const char *, int, nvlist_t *, char *, size_t); _LIBZFS_H int zpool_checkpoint(zpool_handle_t *); _LIBZFS_H int zpool_discard_checkpoint(zpool_handle_t *); _LIBZFS_H boolean_t zpool_is_draid_spare(const char *); _LIBZFS_H int zpool_prefetch(zpool_handle_t *, 
zpool_prefetch_type_t); /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. */ _LIBZFS_H zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); _LIBZFS_H zfs_handle_t *zfs_handle_dup(zfs_handle_t *); _LIBZFS_H void zfs_close(zfs_handle_t *); _LIBZFS_H zfs_type_t zfs_get_type(const zfs_handle_t *); _LIBZFS_H zfs_type_t zfs_get_underlying_type(const zfs_handle_t *); _LIBZFS_H const char *zfs_get_name(const zfs_handle_t *); _LIBZFS_H zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); _LIBZFS_H const char *zfs_get_pool_name(const zfs_handle_t *); /* * Property management functions. Some functions are shared with the kernel, * and are found in sys/fs/zfs.h. */ /* * zfs dataset property management */ _LIBZFS_H const char *zfs_prop_default_string(zfs_prop_t); _LIBZFS_H uint64_t zfs_prop_default_numeric(zfs_prop_t); _LIBZFS_H const char *zfs_prop_column_name(zfs_prop_t); _LIBZFS_H boolean_t zfs_prop_align_right(zfs_prop_t); _LIBZFS_H nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, boolean_t, const char *); _LIBZFS_H const char *zfs_prop_to_name(zfs_prop_t); _LIBZFS_H int zfs_prop_set(zfs_handle_t *, const char *, const char *); _LIBZFS_H int zfs_prop_set_list(zfs_handle_t *, nvlist_t *); _LIBZFS_H int zfs_prop_set_list_flags(zfs_handle_t *, nvlist_t *, int); _LIBZFS_H int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); _LIBZFS_H int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, boolean_t); _LIBZFS_H int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, zprop_source_t *, char *, size_t); _LIBZFS_H int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); _LIBZFS_H int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); _LIBZFS_H int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); _LIBZFS_H int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); _LIBZFS_H int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, char *buf, size_t len); _LIBZFS_H uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, const char **); _LIBZFS_H uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); _LIBZFS_H int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); _LIBZFS_H const char *zfs_prop_values(zfs_prop_t); _LIBZFS_H int zfs_prop_is_string(zfs_prop_t prop); _LIBZFS_H nvlist_t *zfs_get_all_props(zfs_handle_t *); _LIBZFS_H nvlist_t *zfs_get_user_props(zfs_handle_t *); _LIBZFS_H nvlist_t *zfs_get_recvd_props(zfs_handle_t *); _LIBZFS_H nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); _LIBZFS_H int zfs_wait_status(zfs_handle_t *, zfs_wait_activity_t, boolean_t *, boolean_t *); /* * zfs encryption management */ _LIBZFS_H int zfs_crypto_get_encryption_root(zfs_handle_t *, boolean_t *, char *); _LIBZFS_H int zfs_crypto_create(libzfs_handle_t *, char *, nvlist_t *, nvlist_t *, boolean_t stdin_available, uint8_t **, uint_t *); _LIBZFS_H int zfs_crypto_clone_check(libzfs_handle_t *, zfs_handle_t *, char *, nvlist_t *); _LIBZFS_H int zfs_crypto_attempt_load_keys(libzfs_handle_t *, const char *); _LIBZFS_H int zfs_crypto_load_key(zfs_handle_t *, boolean_t, const char *); _LIBZFS_H int zfs_crypto_unload_key(zfs_handle_t *); _LIBZFS_H int zfs_crypto_rewrap(zfs_handle_t *, nvlist_t *, 
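/*
 * Illustrative sketch of the dataset property calls above: open a
 * filesystem, format its "used" value, and set a property through the same
 * handle. ZFS_PROP_USED is the standard zfs_prop_t identifier from
 * sys/fs/zfs.h (not shown in this hunk); show_used() and the dataset name
 * are hypothetical.
 */
#include <stdio.h>
#include <libzfs.h>

static int
show_used(libzfs_handle_t *hdl, const char *dsname)
{
	zfs_handle_t *zhp = zfs_open(hdl, dsname, ZFS_TYPE_FILESYSTEM);
	if (zhp == NULL)
		return (-1);

	char buf[ZFS_MAXPROPLEN];
	if (zfs_prop_get(zhp, ZFS_PROP_USED, buf, sizeof (buf),
	    NULL, NULL, 0, B_FALSE) == 0)
		(void) printf("%s used %s\n", zfs_get_name(zhp), buf);

	/* setting goes through the string-based interface */
	(void) zfs_prop_set(zhp, "compression", "lz4");

	zfs_close(zhp);
	return (0);
}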
boolean_t); typedef struct zprop_list { int pl_prop; char *pl_user_prop; struct zprop_list *pl_next; boolean_t pl_all; size_t pl_width; size_t pl_recvd_width; boolean_t pl_fixed; } zprop_list_t; _LIBZFS_H int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, boolean_t); _LIBZFS_H void zfs_prune_proplist(zfs_handle_t *, uint8_t *); _LIBZFS_H int vdev_expand_proplist(zpool_handle_t *, const char *, zprop_list_t **); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" #define ZFS_FEATURE_DISABLED "disabled" #define ZFS_FEATURE_ENABLED "enabled" #define ZFS_FEATURE_ACTIVE "active" #define ZFS_UNSUPPORTED_INACTIVE "inactive" #define ZFS_UNSUPPORTED_READONLY "readonly" /* * zpool property management */ _LIBZFS_H int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **, zfs_type_t, boolean_t); _LIBZFS_H int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, size_t); _LIBZFS_H const char *zpool_prop_default_string(zpool_prop_t); _LIBZFS_H uint64_t zpool_prop_default_numeric(zpool_prop_t); _LIBZFS_H const char *zpool_prop_column_name(zpool_prop_t); _LIBZFS_H boolean_t zpool_prop_align_right(zpool_prop_t); /* * Functions shared by zfs and zpool property management. */ _LIBZFS_H int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type); _LIBZFS_H int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, zfs_type_t); _LIBZFS_H void zprop_free_list(zprop_list_t *); _LIBZFS_H void zcmd_print_json(nvlist_t *); #define ZFS_GET_NCOLS 5 typedef enum { GET_COL_NONE, GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_RECVD, GET_COL_SOURCE } zfs_get_column_t; /* * Functions for printing zfs or zpool properties */ typedef struct vdev_cbdata { int cb_name_flags; char **cb_names; unsigned int cb_names_count; } vdev_cbdata_t; typedef struct zprop_get_cbdata { int cb_sources; zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; int cb_colwidths[ZFS_GET_NCOLS + 1]; boolean_t cb_scripted; boolean_t cb_literal; boolean_t cb_first; boolean_t cb_json; zprop_list_t *cb_proplist; zfs_type_t cb_type; vdev_cbdata_t cb_vdevs; nvlist_t *cb_jsobj; boolean_t cb_json_as_int; boolean_t cb_json_pool_key_guid; } zprop_get_cbdata_t; #define ZFS_SET_NOMOUNT 1 typedef struct zprop_set_cbdata { int cb_flags; nvlist_t *cb_proplist; } zprop_set_cbdata_t; _LIBZFS_H void zprop_print_one_property(const char *, zprop_get_cbdata_t *, const char *, const char *, zprop_source_t, const char *, const char *); _LIBZFS_H int zprop_nvlist_one_property(const char *, const char *, zprop_source_t, const char *, const char *, nvlist_t *, boolean_t); _LIBZFS_H int zprop_collect_property(const char *, zprop_get_cbdata_t *, const char *, const char *, zprop_source_t, const char *, const char *, nvlist_t *); /* * Iterator functions. 
*/ #define ZFS_ITER_RECURSE (1 << 0) #define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1) #define ZFS_ITER_PROP_LISTSNAPS (1 << 2) #define ZFS_ITER_DEPTH_LIMIT (1 << 3) #define ZFS_ITER_RECVD_PROPS (1 << 4) #define ZFS_ITER_LITERAL_PROPS (1 << 5) #define ZFS_ITER_SIMPLE (1 << 6) typedef int (*zfs_iter_f)(zfs_handle_t *, void *); _LIBZFS_H int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *, uint64_t, uint64_t); _LIBZFS_H int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *, uint64_t, uint64_t); _LIBZFS_H int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_children_v2(zfs_handle_t *, int, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_dependents_v2(zfs_handle_t *, int, boolean_t, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_filesystems_v2(zfs_handle_t *, int, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_snapshots_v2(zfs_handle_t *, int, zfs_iter_f, void *, uint64_t, uint64_t); _LIBZFS_H int zfs_iter_snapshots_sorted_v2(zfs_handle_t *, int, zfs_iter_f, void *, uint64_t, uint64_t); _LIBZFS_H int zfs_iter_snapspec_v2(zfs_handle_t *, int, const char *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_bookmarks_v2(zfs_handle_t *, int, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_mounted(zfs_handle_t *, zfs_iter_f, void *); typedef struct get_all_cb { zfs_handle_t **cb_handles; size_t cb_alloc; size_t cb_used; } get_all_cb_t; _LIBZFS_H void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t, zfs_iter_f, void *, uint_t); _LIBZFS_H void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); /* * Functions to create and destroy datasets. */ _LIBZFS_H int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, nvlist_t *); _LIBZFS_H int zfs_create_ancestors(libzfs_handle_t *, const char *); _LIBZFS_H int zfs_destroy(zfs_handle_t *, boolean_t); _LIBZFS_H int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); _LIBZFS_H int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); _LIBZFS_H int zfs_destroy_snaps_nvl_os(libzfs_handle_t *, nvlist_t *); _LIBZFS_H int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); _LIBZFS_H int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); _LIBZFS_H int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props); _LIBZFS_H int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); typedef struct renameflags { /* recursive rename */ unsigned int recursive : 1; /* don't unmount file systems */ unsigned int nounmount : 1; /* force unmount file systems */ unsigned int forceunmount : 1; } renameflags_t; _LIBZFS_H int zfs_rename(zfs_handle_t *, const char *, renameflags_t); typedef struct sendflags { /* Amount of extra information to print. */ int verbosity; /* recursive send (ie, -R) */ boolean_t replicate; /* for recursive send, skip sending missing snapshots */ boolean_t skipmissing; /* for incrementals, do all intermediate snapshots */ boolean_t doall; /* if dataset is a clone, do incremental from its origin */ boolean_t fromorigin; /* field no longer used, maintained for backwards compatibility */ boolean_t pad; /* send properties (ie, -p) */ boolean_t props; /* do not send (no-op, ie. 
-n) */ boolean_t dryrun; /* parsable verbose output (ie. -P) */ boolean_t parsable; /* show progress (ie. -v) */ boolean_t progress; /* show progress as process title (ie. -V) */ boolean_t progressastitle; /* large blocks (>128K) are permitted */ boolean_t largeblock; /* WRITE_EMBEDDED records of type DATA are permitted */ boolean_t embed_data; /* compressed WRITE records are permitted */ boolean_t compress; /* raw encrypted records are permitted */ boolean_t raw; /* only send received properties (ie. -b) */ boolean_t backup; /* include snapshot holds in send stream */ boolean_t holds; /* stream represents a partially received dataset */ boolean_t saved; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); _LIBZFS_H int zfs_send(zfs_handle_t *, const char *, const char *, sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); _LIBZFS_H int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t *, const char *); _LIBZFS_H int zfs_send_progress(zfs_handle_t *, int, uint64_t *, uint64_t *); _LIBZFS_H int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, const char *); _LIBZFS_H int zfs_send_saved(zfs_handle_t *, sendflags_t *, int, const char *); _LIBZFS_H nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token); _LIBZFS_H int zfs_promote(zfs_handle_t *); _LIBZFS_H int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, int); _LIBZFS_H int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); _LIBZFS_H int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); _LIBZFS_H int zfs_get_holds(zfs_handle_t *, nvlist_t **); _LIBZFS_H uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t, nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); _LIBZFS_H int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, zfs_userspace_cb_t, void *); _LIBZFS_H int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); _LIBZFS_H int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ boolean_t verbose; /* the destination is a prefix, not the exact fs (ie, -d) */ boolean_t isprefix; /* * Only the tail of the sent snapshot path is appended to the * destination to determine the received snapshot name (ie, -e). */ boolean_t istail; /* do not actually do the recv, just check if it would work (ie, -n) */ boolean_t dryrun; /* rollback/destroy filesystems as necessary (eg, -F) */ boolean_t force; /* set "canmount=off" on all modified filesystems */ boolean_t canmountoff; /* * Mark the file systems as "resumable" and do not destroy them if the * receive is interrupted */ boolean_t resumable; /* byteswap flag is used internally; callers need not specify */ boolean_t byteswap; /* do not mount file systems as they are extracted (private) */ boolean_t nomount; /* Was holds flag set in the compound header? 
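/*
 * Illustrative sketch of a full (non-incremental) send using the
 * sendflags_t fields above and zfs_send_one(); passing NULL for the "from"
 * snapshot and for the redaction bookmark requests a plain full stream.
 * send_snapshot() and its arguments are hypothetical.
 */
#include <string.h>
#include <libzfs.h>

static int
send_snapshot(libzfs_handle_t *hdl, const char *snapname, int outfd)
{
	zfs_handle_t *zhp = zfs_open(hdl, snapname, ZFS_TYPE_SNAPSHOT);
	if (zhp == NULL)
		return (-1);

	sendflags_t flags;
	(void) memset(&flags, 0, sizeof (flags));
	flags.props = B_TRUE;		/* like -p: include properties */
	flags.largeblock = B_TRUE;	/* allow records larger than 128K */

	int err = zfs_send_one(zhp, NULL, outfd, &flags, NULL);
	zfs_close(zhp);
	return (err);
}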
*/ boolean_t holds; /* skip receive of snapshot holds */ boolean_t skipholds; /* mount the filesystem unless nomount is specified */ boolean_t domount; /* force unmount while recv snapshot (private) */ boolean_t forceunmount; /* use this recv to check (and heal if needed) an existing snapshot */ boolean_t heal; } recvflags_t; _LIBZFS_H int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, recvflags_t *, int, avl_tree_t *); typedef enum diff_flags { ZFS_DIFF_PARSEABLE = 1 << 0, ZFS_DIFF_TIMESTAMP = 1 << 1, ZFS_DIFF_CLASSIFY = 1 << 2, ZFS_DIFF_NO_MANGLE = 1 << 3 } diff_flags_t; _LIBZFS_H int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, int); /* * Miscellaneous functions. */ _LIBZFS_H const char *zfs_type_to_name(zfs_type_t); _LIBZFS_H void zfs_refresh_properties(zfs_handle_t *); _LIBZFS_H int zfs_name_valid(const char *, zfs_type_t); _LIBZFS_H zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, const char *, zfs_type_t); _LIBZFS_H int zfs_parent_name(zfs_handle_t *, char *, size_t); _LIBZFS_H boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, zfs_type_t); _LIBZFS_H int zfs_spa_version(zfs_handle_t *, int *); _LIBZFS_H boolean_t zfs_bookmark_exists(const char *path); /* * Mount support functions. */ _LIBZFS_H boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); _LIBZFS_H boolean_t zfs_is_mounted(zfs_handle_t *, char **); _LIBZFS_H int zfs_mount(zfs_handle_t *, const char *, int); _LIBZFS_H int zfs_mount_at(zfs_handle_t *, const char *, int, const char *); _LIBZFS_H int zfs_unmount(zfs_handle_t *, const char *, int); _LIBZFS_H int zfs_unmountall(zfs_handle_t *, int); _LIBZFS_H int zfs_mount_delegation_check(void); #if defined(__linux__) || defined(__APPLE__) _LIBZFS_H int zfs_parse_mount_options(const char *mntopts, unsigned long *mntflags, unsigned long *zfsflags, int sloppy, char *badopt, char *mtabopt); _LIBZFS_H void zfs_adjust_mount_options(zfs_handle_t *zhp, const char *mntpoint, char *mntopts, char *mtabopt); #endif /* * Share support functions. * * enum sa_protocol * lists are terminated with SA_NO_PROTOCOL, * NULL means "all/any known to this libzfs". */ #define SA_NO_PROTOCOL -1 _LIBZFS_H boolean_t zfs_is_shared(zfs_handle_t *zhp, char **where, const enum sa_protocol *proto); _LIBZFS_H int zfs_share(zfs_handle_t *zhp, const enum sa_protocol *proto); _LIBZFS_H int zfs_unshare(zfs_handle_t *zhp, const char *mountpoint, const enum sa_protocol *proto); _LIBZFS_H int zfs_unshareall(zfs_handle_t *zhp, const enum sa_protocol *proto); _LIBZFS_H void zfs_commit_shares(const enum sa_protocol *proto); _LIBZFS_H void zfs_truncate_shares(const enum sa_protocol *proto); _LIBZFS_H int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); /* * Utility functions to run an external process. 
*/ #define STDOUT_VERBOSE 0x01 #define STDERR_VERBOSE 0x02 #define NO_DEFAULT_PATH 0x04 /* Don't use $PATH to lookup the command */ _LIBZFS_H int libzfs_run_process(const char *, char **, int); _LIBZFS_H int libzfs_run_process_get_stdout(const char *, char *[], char *[], char **[], int *); _LIBZFS_H int libzfs_run_process_get_stdout_nopath(const char *, char *[], char *[], char **[], int *); _LIBZFS_H void libzfs_free_str_array(char **, int); _LIBZFS_H boolean_t libzfs_envvar_is_set(const char *); /* * Utility functions for zfs version */ _LIBZFS_H const char *zfs_version_userland(void); _LIBZFS_H char *zfs_version_kernel(void); _LIBZFS_H int zfs_version_print(void); _LIBZFS_H nvlist_t *zfs_version_nvlist(void); /* * Given a device or file, determine if it is part of a pool. */ _LIBZFS_H int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, boolean_t *); /* * Label manipulation. */ _LIBZFS_H int zpool_clear_label(int); _LIBZFS_H int zpool_set_bootenv(zpool_handle_t *, const nvlist_t *); _LIBZFS_H int zpool_get_bootenv(zpool_handle_t *, nvlist_t **); /* * Management interfaces for SMB ACL files */ _LIBZFS_H int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); _LIBZFS_H int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); _LIBZFS_H int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); _LIBZFS_H int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); /* * Enable and disable datasets within a pool by mounting/unmounting and * sharing/unsharing them. */ _LIBZFS_H int zpool_enable_datasets(zpool_handle_t *, const char *, int, uint_t); _LIBZFS_H int zpool_disable_datasets(zpool_handle_t *, boolean_t); _LIBZFS_H void zpool_disable_datasets_os(zpool_handle_t *, boolean_t); _LIBZFS_H void zpool_disable_volume_os(const char *); /* * Parse a features file for -o compatibility */ typedef enum { ZPOOL_COMPATIBILITY_OK, ZPOOL_COMPATIBILITY_WARNTOKEN, ZPOOL_COMPATIBILITY_BADTOKEN, ZPOOL_COMPATIBILITY_BADFILE, ZPOOL_COMPATIBILITY_NOFILES } zpool_compat_status_t; _LIBZFS_H zpool_compat_status_t zpool_load_compat(const char *, boolean_t *, char *, size_t); #ifdef __FreeBSD__ /* * Attach/detach the given filesystem to/from the given jail. */ _LIBZFS_H int zfs_jail(zfs_handle_t *zhp, int jailid, int attach); /* * Set loader options for next boot. */ _LIBZFS_H int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *); #endif /* __FreeBSD__ */ #ifdef __linux__ /* * Add or delete the given filesystem to/from the given user namespace. */ _LIBZFS_H int zfs_userns(zfs_handle_t *zhp, const char *nspath, int attach); #endif #ifdef __cplusplus } #endif #endif /* _LIBZFS_H */ diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 206e5e5c2bf6..b1d74fbbc8f5 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -1,168 +1,171 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. */ #ifndef _LIBZFS_CORE_H #define _LIBZFS_CORE_H extern __attribute__((visibility("default"))) #include #include #include #include #ifdef __cplusplus extern "C" { #endif _LIBZFS_CORE_H int libzfs_core_init(void); _LIBZFS_CORE_H void libzfs_core_fini(void); struct zfs_cmd; _LIBZFS_CORE_H int lzc_ioctl_fd(int, unsigned long, struct zfs_cmd *); /* * NB: this type should be kept binary-compatible with dmu_objset_type_t. */ enum lzc_dataset_type { LZC_DATSET_TYPE_ZFS = 2, LZC_DATSET_TYPE_ZVOL }; _LIBZFS_CORE_H int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *, uint8_t *, uint_t); _LIBZFS_CORE_H int lzc_clone(const char *, const char *, nvlist_t *); _LIBZFS_CORE_H int lzc_promote(const char *, char *, int); _LIBZFS_CORE_H int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **); _LIBZFS_CORE_H int lzc_bookmark(nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_get_bookmark_props(const char *, nvlist_t **); _LIBZFS_CORE_H int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_load_key(const char *, boolean_t, uint8_t *, uint_t); _LIBZFS_CORE_H int lzc_unload_key(const char *); _LIBZFS_CORE_H int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t); _LIBZFS_CORE_H int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_trim(const char *, pool_trim_func_t, uint64_t, boolean_t, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_redact(const char *, const char *, nvlist_t *); _LIBZFS_CORE_H int lzc_snaprange_space(const char *, const char *, uint64_t *); _LIBZFS_CORE_H int lzc_hold(nvlist_t *, int, nvlist_t **); _LIBZFS_CORE_H int lzc_release(nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_get_holds(const char *, nvlist_t **); _LIBZFS_CORE_H int lzc_get_props(const char *, nvlist_t **); enum lzc_send_flags { LZC_SEND_FLAG_EMBED_DATA = 1 << 0, LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1, LZC_SEND_FLAG_COMPRESS = 1 << 2, LZC_SEND_FLAG_RAW = 1 << 3, LZC_SEND_FLAG_SAVED = 1 << 4, }; _LIBZFS_CORE_H int lzc_send_wrapper(int (*)(int, void *), int, void *); _LIBZFS_CORE_H int lzc_send(const char *, const char *, int, enum lzc_send_flags); _LIBZFS_CORE_H int lzc_send_resume(const char *, const char *, int, enum lzc_send_flags, uint64_t, uint64_t); _LIBZFS_CORE_H int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); struct dmu_replay_record; _LIBZFS_CORE_H int lzc_send_redacted(const char *, const char *, int, enum lzc_send_flags, const char *); _LIBZFS_CORE_H int lzc_send_resume_redacted(const char *, const char *, int, enum lzc_send_flags, uint64_t, uint64_t, const char *); _LIBZFS_CORE_H int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, boolean_t, int); _LIBZFS_CORE_H int lzc_receive_resumable(const char *, nvlist_t *, const char *, boolean_t, boolean_t, int); _LIBZFS_CORE_H int lzc_receive_with_header(const char *, nvlist_t *, const char *, boolean_t, boolean_t, boolean_t, int, const struct dmu_replay_record *); _LIBZFS_CORE_H 
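/*
 * Illustrative sketch of the libzfs_core calling convention declared here:
 * name-value lists in, an optional error list out. Creates one snapshot if
 * it does not already exist. The fnvlist_* helpers come from libnvpair;
 * snap_once() and the snapshot name are hypothetical.
 */
#include <libnvpair.h>
#include <libzfs_core.h>

static int
snap_once(const char *snapname)		/* e.g. "tank/data@backup" */
{
	int err = libzfs_core_init();
	if (err != 0)
		return (err);

	if (!lzc_exists(snapname)) {
		nvlist_t *snaps = fnvlist_alloc();
		nvlist_t *errlist = NULL;

		fnvlist_add_boolean(snaps, snapname);
		err = lzc_snapshot(snaps, NULL, &errlist);
		fnvlist_free(snaps);
		if (errlist != NULL)
			fnvlist_free(errlist);
	}

	libzfs_core_fini();
	return (err);
}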
int lzc_receive_one(const char *, nvlist_t *, const char *, boolean_t, boolean_t, boolean_t, int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *, uint64_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_receive_with_cmdprops(const char *, nvlist_t *, nvlist_t *, uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *, uint64_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_receive_with_heal(const char *, nvlist_t *, nvlist_t *, uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, boolean_t, int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *, uint64_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); _LIBZFS_CORE_H int lzc_send_space_resume_redacted(const char *, const char *, enum lzc_send_flags, uint64_t, uint64_t, uint64_t, const char *, int, uint64_t *); _LIBZFS_CORE_H uint64_t lzc_send_progress(int); _LIBZFS_CORE_H boolean_t lzc_exists(const char *); _LIBZFS_CORE_H int lzc_rollback(const char *, char *, int); _LIBZFS_CORE_H int lzc_rollback_to(const char *, const char *); _LIBZFS_CORE_H int lzc_rename(const char *, const char *); _LIBZFS_CORE_H int lzc_destroy(const char *); _LIBZFS_CORE_H int lzc_channel_program(const char *, const char *, uint64_t, uint64_t, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_channel_program_nosync(const char *, const char *, uint64_t, uint64_t, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_sync(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_reopen(const char *, boolean_t); _LIBZFS_CORE_H int lzc_pool_checkpoint(const char *); _LIBZFS_CORE_H int lzc_pool_checkpoint_discard(const char *); _LIBZFS_CORE_H int lzc_wait(const char *, zpool_wait_activity_t, boolean_t *); _LIBZFS_CORE_H int lzc_wait_tag(const char *, zpool_wait_activity_t, uint64_t, boolean_t *); _LIBZFS_CORE_H int lzc_pool_prefetch(const char *, zpool_prefetch_type_t); _LIBZFS_CORE_H int lzc_wait_fs(const char *, zfs_wait_activity_t, boolean_t *); _LIBZFS_CORE_H int lzc_set_bootenv(const char *, const nvlist_t *); _LIBZFS_CORE_H int lzc_get_bootenv(const char *, nvlist_t **); _LIBZFS_CORE_H int lzc_get_vdev_prop(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t, + uint64_t); + #ifdef __cplusplus } #endif #endif /* _LIBZFS_CORE_H */ diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 93abad85af44..4e5ccd46318e 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -1,412 +1,415 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright (c) 2023, Klara Inc. */ #ifndef _SYS_DDT_H #define _SYS_DDT_H #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct abd; /* * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). */ #define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */ #define DDT_FLAG_LOG (1 << 1) /* dedup log (journal) */ #define DDT_FLAG_MASK (DDT_FLAG_FLAT|DDT_FLAG_LOG) /* * DDT on-disk storage object types. Each one corresponds to specific * implementation, see ddt_ops_t. The value itself is not stored on disk. * * When searching for an entry, objects types will be searched in this order. * * Note that DDT_TYPES is used as the "no type" for new entries that have not * yet been written to a storage object. */ typedef enum { DDT_TYPE_ZAP = 0, /* ZAP storage object, ddt_zap */ DDT_TYPES } ddt_type_t; _Static_assert(DDT_TYPES <= UINT8_MAX, "ddt_type_t must fit in a uint8_t"); /* New and updated entries recieve this type, see ddt_sync_entry() */ #define DDT_TYPE_DEFAULT (DDT_TYPE_ZAP) /* * DDT storage classes. Each class has a separate storage object for each type. * The value itself is not stored on disk. * * When search for an entry, object classes will be searched in this order. * * Note that DDT_CLASSES is used as the "no class" for new entries that have not * yet been written to a storage object. */ typedef enum { DDT_CLASS_DITTO = 0, /* entry has ditto blocks (obsolete) */ DDT_CLASS_DUPLICATE, /* entry has multiple references */ DDT_CLASS_UNIQUE, /* entry has a single reference */ DDT_CLASSES } ddt_class_t; _Static_assert(DDT_CLASSES < UINT8_MAX, "ddt_class_t must fit in a uint8_t"); /* * The "key" part of an on-disk entry. This is the unique "name" for a block, * that is, that parts of the block pointer that will always be the same for * the same data. */ typedef struct { zio_cksum_t ddk_cksum; /* 256-bit block checksum */ /* * Encoded with logical & physical size, encryption, and compression, * as follows: * +-------+-------+-------+-------+-------+-------+-------+-------+ * | 0 | 0 | 0 |X| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ */ uint64_t ddk_prop; } ddt_key_t; /* * Macros for accessing parts of a ddt_key_t. These are similar to their BP_* * counterparts. */ #define DDK_GET_LSIZE(ddk) \ BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) #define DDK_SET_LSIZE(ddk, x) \ BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) #define DDK_GET_PSIZE(ddk) \ BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) #define DDK_SET_PSIZE(ddk, x) \ BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) #define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 7) #define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 7, x) #define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1) #define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x) /* * The "value" part for an on-disk entry. These are the "physical" * characteristics of the stored block, such as its location on disk (DVAs), * birth txg and ref count. 
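/*
 * Illustrative sketch of decoding ddk_prop with the accessor macros above,
 * the sort of thing zdb-style code does when printing a DDT key. Assumes
 * the in-tree sys/ddt.h environment; ddt_key_describe() is a hypothetical
 * helper name.
 */
static void
ddt_key_describe(const ddt_key_t *ddk, char *buf, size_t len)
{
	(void) snprintf(buf, len, "L=%llu P=%llu comp=%llu crypt=%llu",
	    (u_longlong_t)DDK_GET_LSIZE(ddk),	/* logical size, bytes */
	    (u_longlong_t)DDK_GET_PSIZE(ddk),	/* physical size, bytes */
	    (u_longlong_t)DDK_GET_COMPRESS(ddk),	/* compression algorithm */
	    (u_longlong_t)DDK_GET_CRYPT(ddk));	/* encryption bit */
}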
* * The "traditional" entry has an array of four, one for each number of DVAs * (copies= property) and another for additional "ditto" copies. Users of the * traditional struct will specify the variant (index) of the one they want. * * The newer "flat" entry has only a single form that is specified using the * DDT_PHYS_FLAT variant. * * Since the value size varies, use one of the size macros when interfacing * with the ddt zap. */ #define DDT_PHYS_MAX (4) /* * Note - this can be used in a flexible array and allocated for * a specific size (ddp_trad or ddp_flat). So be careful not to * copy using "=" assignment but instead use ddt_phys_copy(). */ typedef union { /* * Traditional physical payload value for DDT zap (256 bytes) */ struct { dva_t ddp_dva[SPA_DVAS_PER_BP]; uint64_t ddp_refcnt; uint64_t ddp_phys_birth; } ddp_trad[DDT_PHYS_MAX]; /* * Flat physical payload value for DDT zap (72 bytes) */ struct { dva_t ddp_dva[SPA_DVAS_PER_BP]; uint64_t ddp_refcnt; uint64_t ddp_phys_birth; /* txg based from BP */ uint64_t ddp_class_start; /* in realtime seconds */ } ddp_flat; } ddt_univ_phys_t; /* * This enum denotes which variant of a ddt_univ_phys_t to target. For * a traditional DDT entry, it represents the indexes into the ddp_trad * array. Any consumer of a ddt_univ_phys_t needs to know which variant * is being targeted. * * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, * we maintain the ability to free existing dedup-ditto blocks. */ typedef enum { DDT_PHYS_DITTO = 0, DDT_PHYS_SINGLE = 1, DDT_PHYS_DOUBLE = 2, DDT_PHYS_TRIPLE = 3, DDT_PHYS_FLAT = 4, DDT_PHYS_NONE = 5 } ddt_phys_variant_t; #define DDT_PHYS_VARIANT(ddt, p) \ (ASSERT((p) < DDT_PHYS_NONE), \ ((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p))) #define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad) #define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat) #define _DDT_PHYS_SWITCH(ddt, flat, trad) \ (((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad)) #define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \ DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE) #define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX) #define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p) #define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p == 0)) /* * A "live" entry, holding changes to an entry made this txg, and other data to * support loading, updating and repairing the entry. */ /* State flags for dde_flags */ #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ #define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ #define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */ /* * Additional data to support entry update or repair. This is fixed size * because its relatively rarely used. 
*/ typedef struct { /* copy of data after a repair read, to be rewritten */ abd_t *dde_repair_abd; /* original phys contents before update, for error handling */ ddt_univ_phys_t dde_orig_phys; /* in-flight update IOs */ zio_t *dde_lead_zio[DDT_PHYS_MAX]; } ddt_entry_io_t; typedef struct { /* key must be first for ddt_key_compare */ ddt_key_t dde_key; /* ddt_tree key */ avl_node_t dde_node; /* ddt_tree_node */ /* storage type and class the entry was loaded from */ ddt_type_t dde_type; ddt_class_t dde_class; uint8_t dde_flags; /* load state flags */ kcondvar_t dde_cv; /* signaled when load completes */ uint64_t dde_waiters; /* count of waiters on dde_cv */ ddt_entry_io_t *dde_io; /* IO support, when required */ ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */ } ddt_entry_t; /* * A lightweight entry is for short-lived or transient uses, like iterating or * inspecting, when you don't care where it came from. */ typedef struct { ddt_key_t ddlwe_key; ddt_type_t ddlwe_type; ddt_class_t ddlwe_class; ddt_univ_phys_t ddlwe_phys; } ddt_lightweight_entry_t; /* * In-core DDT log. A separate struct to make it easier to switch between the * appending and flushing logs. */ typedef struct { avl_tree_t ddl_tree; /* logged entries */ uint32_t ddl_flags; /* flags for this log */ uint64_t ddl_object; /* log object id */ uint64_t ddl_length; /* on-disk log size */ uint64_t ddl_first_txg; /* txg log became active */ ddt_key_t ddl_checkpoint; /* last checkpoint */ } ddt_log_t; /* * In-core DDT object. This covers all entries and stats for a the whole pool * for a given checksum type. */ typedef struct { kmutex_t ddt_lock; /* protects changes to all fields */ avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ avl_tree_t ddt_log_tree; /* logged entries */ avl_tree_t ddt_repair_tree; /* entries being repaired */ ddt_log_t ddt_log[2]; /* active/flushing logs */ ddt_log_t *ddt_log_active; /* pointers into ddt_log */ ddt_log_t *ddt_log_flushing; /* swapped when flush starts */ hrtime_t ddt_flush_start; /* log flush start this txg */ uint32_t ddt_flush_pass; /* log flush pass this txg */ int32_t ddt_flush_count; /* entries flushed this txg */ int32_t ddt_flush_min; /* min rem entries to flush */ int32_t ddt_log_ingest_rate; /* rolling log ingest rate */ int32_t ddt_log_flush_rate; /* rolling log flush rate */ int32_t ddt_log_flush_time_rate; /* avg time spent flushing */ uint64_t ddt_flush_force_txg; /* flush hard before this txg */ kstat_t *ddt_ksp; /* kstats context */ enum zio_checksum ddt_checksum; /* checksum algorithm in use */ spa_t *ddt_spa; /* pool this ddt is on */ objset_t *ddt_os; /* ddt objset (always MOS) */ uint64_t ddt_dir_object; /* MOS dir holding ddt objects */ uint64_t ddt_version; /* DDT version */ uint64_t ddt_flags; /* FDT option flags */ /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; /* object ids for stored, logged and per-type/per-class stats */ uint64_t ddt_stat_object; ddt_object_t ddt_log_stats; ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; /* type/class stats by power-2-sized referenced blocks */ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; /* log stats power-2-sized referenced blocks */ ddt_histogram_t ddt_log_histogram; } ddt_t; /* * In-core and on-disk bookmark for DDT walks. 
This is a cursor for ddt_walk(), * and is stable across calls, even if the DDT is updated, the pool is * restarted or loaded on another system, or OpenZFS is upgraded. */ typedef struct { uint64_t ddb_class; uint64_t ddb_type; uint64_t ddb_checksum; uint64_t ddb_cursor; } ddt_bookmark_t; extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp, uint64_t txg); extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp); extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp); extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, ddt_phys_variant_t v); extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v); extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp); extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v); extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, boolean_t encrypted); extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh, const ddt_lightweight_entry_t *ddlwe); extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh, const ddt_lightweight_entry_t *ddlwe); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh); extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); extern uint64_t ddt_get_ddt_dsize(spa_t *spa); extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); extern uint64_t ddt_get_dedup_dspace(spa_t *spa); extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); extern int ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize); extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); extern void ddt_init(void); extern void ddt_fini(void); extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_prefetch_all(spa_t *spa); extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp); extern void ddt_alloc_entry_io(ddt_entry_t *dde); extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); extern int ddt_key_compare(const void *x1, const void *x2); extern void ddt_create(spa_t *spa); extern int ddt_load(spa_t *spa); extern void ddt_unload(spa_t *spa); extern void ddt_sync(spa_t *spa, uint64_t txg); extern void ddt_walk_init(spa_t *spa, uint64_t txg); extern boolean_t ddt_walk_ready(spa_t *spa); extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe); extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); +extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, + uint64_t amount); + #ifdef __cplusplus } #endif #endif /* _SYS_DDT_H */ diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 
6f11cd90c1d8..4d3c0cae072e 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -1,238 +1,282 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright (c) 2023, Klara Inc. */ #ifndef _SYS_DDT_IMPL_H #define _SYS_DDT_IMPL_H #include #include #ifdef __cplusplus extern "C" { #endif /* DDT version numbers */ -#define DDT_VERSION_LEGACY (0) -#define DDT_VERSION_FDT (1) +#define DDT_VERSION_LEGACY (0) +#define DDT_VERSION_FDT (1) + +/* Dummy version to signal that configure is still necessary */ +#define DDT_VERSION_UNCONFIGURED (UINT64_MAX) /* Names of interesting objects in the DDT root dir */ #define DDT_DIR_VERSION "version" #define DDT_DIR_FLAGS "flags" /* Fill a lightweight entry from a live entry. */ #define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \ memset((ddlwe), 0, sizeof (*ddlwe)); \ (ddlwe)->ddlwe_key = (dde)->dde_key; \ (ddlwe)->ddlwe_type = (dde)->dde_type; \ (ddlwe)->ddlwe_class = (dde)->dde_class; \ memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \ } while (0) #define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \ memset((ddlwe), 0, sizeof (*ddlwe)); \ (ddlwe)->ddlwe_key = (ddle)->ddle_key; \ (ddlwe)->ddlwe_type = (ddle)->ddle_type; \ (ddlwe)->ddlwe_class = (ddle)->ddle_class; \ memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \ } while (0) /* * An entry on the log tree. These are "frozen", and a record of what's in * the on-disk log. They can't be used in place, but can be "loaded" back into * the live tree. */ typedef struct { ddt_key_t ddle_key; /* ddt_log_tree key */ avl_node_t ddle_node; /* ddt_log_tree node */ ddt_type_t ddle_type; /* storage type */ ddt_class_t ddle_class; /* storage class */ /* extra allocation for flat/trad phys */ ddt_univ_phys_t ddle_phys[]; } ddt_log_entry_t; /* On-disk log record types. */ typedef enum { DLR_INVALID = 0, /* end of block marker */ DLR_ENTRY = 1, /* an entry to add or replace in the log tree */ } ddt_log_record_type_t; /* On-disk log record header. */ typedef struct { /* * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to * access it. 
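/*
 * Illustrative sketch of filling in a log record header with the
 * packed-field macros defined below. Starting from a zeroed dlr_info keeps
 * the reserved bits clear; dlr_init_entry() and its arguments are
 * hypothetical.
 */
static void
dlr_init_entry(ddt_log_record_t *dlr, size_t payload_len,
    ddt_type_t type, ddt_class_t clazz)
{
	dlr->dlr_info = 0;
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, sizeof (*dlr) + payload_len);	/* header+payload */
	DLR_SET_ENTRY_TYPE(dlr, type);
	DLR_SET_ENTRY_CLASS(dlr, clazz);
}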
* * bits 0-7: record type (ddt_log_record_type_t) * bits 8-15: length of record header+payload * bits 16-47: reserved, all zero * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type) * otherwise all zero * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class) * otherwise all zero */ uint64_t dlr_info; uint8_t dlr_payload[]; } ddt_log_record_t; #define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8) #define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v) #define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16) #define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v) #define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8) #define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v) #define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8) #define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v) /* Payload for DLR_ENTRY. */ typedef struct { ddt_key_t dlre_key; ddt_univ_phys_t dlre_phys[]; } ddt_log_record_entry_t; /* Log flags (ddl_flags, dlh_flags) */ #define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */ #define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */ /* On-disk log header, stored in the bonus buffer. */ typedef struct { /* * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to * access it. * * bits 0-7: log version * bits 8-15: log flags * bits 16-63: reserved, all zero */ uint64_t dlh_info; uint64_t dlh_length; /* log size in bytes */ uint64_t dlh_first_txg; /* txg this log went active */ ddt_key_t dlh_checkpoint; /* last checkpoint */ } ddt_log_header_t; #define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8) #define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v) #define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8) #define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v) /* DDT log update state */ typedef struct { dmu_tx_t *dlu_tx; /* tx the update is being applied to */ dnode_t *dlu_dn; /* log object dnode */ dmu_buf_t **dlu_dbp; /* array of block buffer pointers */ int dlu_ndbp; /* number of block buffer pointers */ uint16_t dlu_reclen; /* cached length of record */ uint64_t dlu_block; /* block for next entry */ uint64_t dlu_offset; /* offset for next entry */ } ddt_log_update_t; /* * Ops vector to access a specific DDT object type. 
*/ typedef struct { char ddt_op_name[32]; int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, boolean_t prehash); int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); int (*ddt_op_lookup)(objset_t *os, uint64_t object, const ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_contains)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object); int (*ddt_op_update)(objset_t *os, uint64_t object, const ddt_key_t *ddk, const void *phys, size_t psize, dmu_tx_t *tx); int (*ddt_op_remove)(objset_t *os, uint64_t object, const ddt_key_t *ddk, dmu_tx_t *tx); int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); } ddt_ops_t; extern const ddt_ops_t ddt_zap_ops; /* Dedup log API */ extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu); extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde, ddt_log_update_t *dlu); extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu); extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe); -extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, - const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe); + +extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe); +extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, + const ddt_key_t *ddk); extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx); extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx); extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx); extern int ddt_log_load(ddt_t *ddt); extern void ddt_log_alloc(ddt_t *ddt); extern void ddt_log_free(ddt_t *ddt); extern void ddt_log_init(void); extern void ddt_log_fini(void); /* * These are only exposed so that zdb can access them. Try not to use them * outside of the DDT implementation proper, and if you do, consider moving * them up. */ +/* + * We use a histogram to convert a percentage request into a + * cutoff value where entries older than the cutoff get pruned. + * + * The histogram bins represent hours in power-of-two increments. + * 16 bins covers up to four years. + */ +#define HIST_BINS 16 + +typedef struct ddt_age_histo { + uint64_t dah_entries; + uint64_t dah_age_histo[HIST_BINS]; +} ddt_age_histo_t; + +void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram); + +#if defined(_KERNEL) || !defined(ZFS_DEBUG) +#define ddt_dump_age_histogram(histo, cutoff) ((void)0) +#else +static inline void +ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff) +{ + if (histogram->dah_entries == 0) + return; + + (void) printf("DDT prune unique class age, %llu hour cutoff\n", + (u_longlong_t)(gethrestime_sec() - cutoff)/3600); + (void) printf("%5s %9s %4s\n", "age", "blocks", "amnt"); + (void) printf("%5s %9s %4s\n", "-----", "---------", "----"); + for (int i = 0; i < HIST_BINS; i++) { + (void) printf("%5d %9llu %4d%%\n", 1<<i, + (u_longlong_t)histogram->dah_age_histo[i], + (int)((histogram->dah_age_histo[i] * 100) / + histogram->dah_entries)); + } +} +#endif + /* * Enough room to expand DMU_POOL_DDT format for all possible DDT * checksum/class/type combinations.
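 * (For example, an expanded name is expected to look something like
 * "DDT-sha256-zap-unique", i.e. the checksum, backing type and class names
 * joined by the DMU_POOL_DDT format, which fits comfortably in 32 bytes.)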
*/ #define DDT_NAMELEN 32 extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, char *name); extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, uint64_t *walk, ddt_lightweight_entry_t *ddlwe); extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, uint64_t *count); extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, dmu_object_info_t *); #ifdef __cplusplus } #endif #endif /* _SYS_DDT_H */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 73d686a002ee..fc4f22cd5304 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1,1935 +1,1948 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. * Portions Copyright 2010 Robert Milkowski * Copyright (c) 2021, Colm Buckley * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H extern __attribute__((visibility("default"))) #include #include #ifdef __cplusplus extern "C" { #endif /* * Types and constants shared between userland and the kernel. */ /* * Each dataset can be one of the following types. These constants can be * combined into masks that can be passed to various functions. */ typedef enum { ZFS_TYPE_INVALID = 0, ZFS_TYPE_FILESYSTEM = (1 << 0), ZFS_TYPE_SNAPSHOT = (1 << 1), ZFS_TYPE_VOLUME = (1 << 2), ZFS_TYPE_POOL = (1 << 3), ZFS_TYPE_BOOKMARK = (1 << 4), ZFS_TYPE_VDEV = (1 << 5), } zfs_type_t; /* * NB: lzc_dataset_type should be updated whenever a new objset type is added, * if it represents a real type of a dataset that can be created from userland. */ typedef enum dmu_objset_type { DMU_OST_NONE, DMU_OST_META, DMU_OST_ZFS, DMU_OST_ZVOL, DMU_OST_OTHER, /* For testing only! */ DMU_OST_ANY, /* Be careful! */ DMU_OST_NUMTYPES } dmu_objset_type_t; #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) /* * All of these include the terminating NUL byte. */ #define ZAP_MAXNAMELEN 256 #define ZAP_MAXVALUELEN (1024 * 8) #define ZAP_OLDMAXVALUELEN 1024 #define ZFS_MAX_DATASET_NAME_LEN 256 /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected * by the change. 
If you make any changes to this list, be sure to update * the property table in module/zcommon/zfs_prop.c. */ typedef enum { ZPROP_CONT = -2, ZPROP_INVAL = -1, ZPROP_USERPROP = ZPROP_INVAL, ZFS_PROP_TYPE = 0, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, ZFS_PROP_REFERENCED, ZFS_PROP_COMPRESSRATIO, ZFS_PROP_MOUNTED, ZFS_PROP_ORIGIN, ZFS_PROP_QUOTA, ZFS_PROP_RESERVATION, ZFS_PROP_VOLSIZE, ZFS_PROP_VOLBLOCKSIZE, ZFS_PROP_RECORDSIZE, ZFS_PROP_MOUNTPOINT, ZFS_PROP_SHARENFS, ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_ATIME, ZFS_PROP_DEVICES, ZFS_PROP_EXEC, ZFS_PROP_SETUID, ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, ZFS_PROP_NORMALIZE, ZFS_PROP_CASE, ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, ZFS_PROP_REFQUOTA, ZFS_PROP_REFRESERVATION, ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, ZFS_PROP_USEDSNAP, ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, ZFS_PROP_UNIQUE, /* not exposed to the user */ ZFS_PROP_OBJSETID, ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, ZFS_PROP_DNODESIZE, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ ZFS_PROP_VOLMODE, ZFS_PROP_FILESYSTEM_LIMIT, ZFS_PROP_SNAPSHOT_LIMIT, ZFS_PROP_FILESYSTEM_COUNT, ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_SNAPDEV, ZFS_PROP_ACLTYPE, ZFS_PROP_SELINUX_CONTEXT, ZFS_PROP_SELINUX_FSCONTEXT, ZFS_PROP_SELINUX_DEFCONTEXT, ZFS_PROP_SELINUX_ROOTCONTEXT, ZFS_PROP_RELATIME, ZFS_PROP_REDUNDANT_METADATA, ZFS_PROP_OVERLAY, ZFS_PROP_PREV_SNAP, ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_PROP_ENCRYPTION, ZFS_PROP_KEYLOCATION, ZFS_PROP_KEYFORMAT, ZFS_PROP_PBKDF2_SALT, ZFS_PROP_PBKDF2_ITERS, ZFS_PROP_ENCRYPTION_ROOT, ZFS_PROP_KEY_GUID, ZFS_PROP_KEYSTATUS, ZFS_PROP_REMAPTXG, /* obsolete - no longer used */ ZFS_PROP_SPECIAL_SMALL_BLOCKS, ZFS_PROP_IVSET_GUID, /* not exposed to the user */ ZFS_PROP_REDACTED, ZFS_PROP_REDACT_SNAPS, ZFS_PROP_SNAPSHOTS_CHANGED, ZFS_PROP_PREFETCH, ZFS_PROP_VOLTHREADING, ZFS_NUM_PROPS } zfs_prop_t; typedef enum { ZFS_PROP_USERUSED, ZFS_PROP_USERQUOTA, ZFS_PROP_GROUPUSED, ZFS_PROP_GROUPQUOTA, ZFS_PROP_USEROBJUSED, ZFS_PROP_USEROBJQUOTA, ZFS_PROP_GROUPOBJUSED, ZFS_PROP_GROUPOBJQUOTA, ZFS_PROP_PROJECTUSED, ZFS_PROP_PROJECTQUOTA, ZFS_PROP_PROJECTOBJUSED, ZFS_PROP_PROJECTOBJQUOTA, ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; _SYS_FS_ZFS_H const char *const zfs_userquota_prop_prefixes[ ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. Properties must be registered in zfs_prop_init(). 
*/ typedef enum { ZPOOL_PROP_INVAL = -1, ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, ZPOOL_PROP_GUID, ZPOOL_PROP_VERSION, ZPOOL_PROP_BOOTFS, ZPOOL_PROP_DELEGATION, ZPOOL_PROP_AUTOREPLACE, ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_AUTOEXPAND, ZPOOL_PROP_DEDUPDITTO, ZPOOL_PROP_DEDUPRATIO, ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, ZPOOL_PROP_ASHIFT, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, ZPOOL_PROP_FRAGMENTATION, ZPOOL_PROP_LEAKED, ZPOOL_PROP_MAXBLOCKSIZE, ZPOOL_PROP_TNAME, ZPOOL_PROP_MAXDNODESIZE, ZPOOL_PROP_MULTIHOST, ZPOOL_PROP_CHECKPOINT, ZPOOL_PROP_LOAD_GUID, ZPOOL_PROP_AUTOTRIM, ZPOOL_PROP_COMPATIBILITY, ZPOOL_PROP_BCLONEUSED, ZPOOL_PROP_BCLONESAVED, ZPOOL_PROP_BCLONERATIO, ZPOOL_PROP_DEDUP_TABLE_SIZE, ZPOOL_PROP_DEDUP_TABLE_QUOTA, ZPOOL_PROP_DEDUPCACHED, ZPOOL_NUM_PROPS } zpool_prop_t; /* Small enough to not hog a whole line of printout in zpool(8). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_BOOLEAN_NA 2 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" typedef enum { ZPROP_SRC_NONE = 0x1, ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, ZPROP_SRC_INHERITED = 0x10, ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; #define ZPROP_SRC_ALL 0x3f #define ZPROP_SOURCE_VAL_RECVD "$recvd" #define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" /* * Dataset flag implemented as a special entry in the props zap object * indicating that the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties * just as it did in earlier versions, and thereafter, local properties are * preserved. */ #define ZPROP_HAS_RECVD "$hasrecvd" typedef enum { ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ } zprop_errflags_t; typedef int (*zprop_func)(int, void *); /* * Properties to be set on the root file system of a new pool * are stuffed into their own nvlist, which is then included in * the properties nvlist with the pool properties. */ #define ZPOOL_ROOTFS_PROPS "root-props-nvl" /* * Length of 'written@' and 'written#' */ #define ZFS_WRITTEN_PROP_PREFIX_LEN 8 /* * VDEV properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zpool_prop.c. 
*/ typedef enum { VDEV_PROP_INVAL = -1, VDEV_PROP_USERPROP = VDEV_PROP_INVAL, VDEV_PROP_NAME, VDEV_PROP_CAPACITY, VDEV_PROP_STATE, VDEV_PROP_GUID, VDEV_PROP_ASIZE, VDEV_PROP_PSIZE, VDEV_PROP_ASHIFT, VDEV_PROP_SIZE, VDEV_PROP_FREE, VDEV_PROP_ALLOCATED, VDEV_PROP_COMMENT, VDEV_PROP_EXPANDSZ, VDEV_PROP_FRAGMENTATION, VDEV_PROP_BOOTSIZE, VDEV_PROP_PARITY, VDEV_PROP_PATH, VDEV_PROP_DEVID, VDEV_PROP_PHYS_PATH, VDEV_PROP_ENC_PATH, VDEV_PROP_FRU, VDEV_PROP_PARENT, VDEV_PROP_CHILDREN, VDEV_PROP_NUMCHILDREN, VDEV_PROP_READ_ERRORS, VDEV_PROP_WRITE_ERRORS, VDEV_PROP_CHECKSUM_ERRORS, VDEV_PROP_INITIALIZE_ERRORS, VDEV_PROP_OPS_NULL, VDEV_PROP_OPS_READ, VDEV_PROP_OPS_WRITE, VDEV_PROP_OPS_FREE, VDEV_PROP_OPS_CLAIM, VDEV_PROP_OPS_TRIM, VDEV_PROP_BYTES_NULL, VDEV_PROP_BYTES_READ, VDEV_PROP_BYTES_WRITE, VDEV_PROP_BYTES_FREE, VDEV_PROP_BYTES_CLAIM, VDEV_PROP_BYTES_TRIM, VDEV_PROP_REMOVING, VDEV_PROP_ALLOCATING, VDEV_PROP_FAILFAST, VDEV_PROP_CHECKSUM_N, VDEV_PROP_CHECKSUM_T, VDEV_PROP_IO_N, VDEV_PROP_IO_T, VDEV_PROP_RAIDZ_EXPANDING, VDEV_PROP_SLOW_IO_N, VDEV_PROP_SLOW_IO_T, VDEV_PROP_TRIM_SUPPORT, VDEV_PROP_TRIM_ERRORS, VDEV_PROP_SLOW_IOS, VDEV_NUM_PROPS } vdev_prop_t; /* * Dataset property functions shared between libzfs and kernel. */ _SYS_FS_ZFS_H const char *zfs_prop_default_string(zfs_prop_t); _SYS_FS_ZFS_H uint64_t zfs_prop_default_numeric(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_readonly(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_visible(zfs_prop_t prop); _SYS_FS_ZFS_H boolean_t zfs_prop_inheritable(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_setonce(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_encryption_key_param(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_valid_keylocation(const char *, boolean_t); _SYS_FS_ZFS_H const char *zfs_prop_to_name(zfs_prop_t); _SYS_FS_ZFS_H zfs_prop_t zfs_name_to_prop(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_user(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_userquota(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_written(const char *); _SYS_FS_ZFS_H int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); _SYS_FS_ZFS_H boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t); /* * Pool property functions shared between libzfs and kernel. */ _SYS_FS_ZFS_H zpool_prop_t zpool_name_to_prop(const char *); _SYS_FS_ZFS_H const char *zpool_prop_to_name(zpool_prop_t); _SYS_FS_ZFS_H const char *zpool_prop_default_string(zpool_prop_t); _SYS_FS_ZFS_H uint64_t zpool_prop_default_numeric(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_readonly(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_setonce(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_feature(const char *); _SYS_FS_ZFS_H boolean_t zpool_prop_unsupported(const char *); _SYS_FS_ZFS_H int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * VDEV property functions shared between libzfs and kernel. 
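 *
 * Usage sketch, for illustration only (the property name is just an
 * example): the name/id helpers round-trip, e.g.
 *
 *	vdev_prop_t prop = vdev_name_to_prop("failfast");
 *	const char *name = vdev_prop_to_name(prop);
 *
 * and the zfs_prop_* and zpool_prop_* helpers above behave the same way.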
*/ _SYS_FS_ZFS_H vdev_prop_t vdev_name_to_prop(const char *); _SYS_FS_ZFS_H boolean_t vdev_prop_user(const char *name); _SYS_FS_ZFS_H const char *vdev_prop_to_name(vdev_prop_t); _SYS_FS_ZFS_H const char *vdev_prop_default_string(vdev_prop_t); _SYS_FS_ZFS_H uint64_t vdev_prop_default_numeric(vdev_prop_t); _SYS_FS_ZFS_H boolean_t vdev_prop_readonly(vdev_prop_t prop); _SYS_FS_ZFS_H int vdev_prop_index_to_string(vdev_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int vdev_prop_string_to_index(vdev_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H boolean_t zpool_prop_vdev(const char *name); _SYS_FS_ZFS_H uint64_t vdev_prop_random_value(vdev_prop_t prop, uint64_t seed); /* * Definitions for the Delegation. */ typedef enum { ZFS_DELEG_WHO_UNKNOWN = 0, ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U', ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G', ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E', ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C', ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S' } zfs_deleg_who_type_t; typedef enum { ZFS_DELEG_NONE = 0, ZFS_DELEG_PERM_LOCAL = 1, ZFS_DELEG_PERM_DESCENDENT = 2, ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ZFS_DELEG_PERM_CREATE = 4 } zfs_deleg_inherit_t; #define ZFS_DELEG_PERM_UID "uid" #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_MLSLABEL_DEFAULT "none" #define ZFS_SMB_ACL_SRC "src" #define ZFS_SMB_ACL_TARGET "target" typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; typedef enum { ZFS_LOGBIAS_LATENCY = 0, ZFS_LOGBIAS_THROUGHPUT = 1 } zfs_logbias_op_t; typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, ZFS_SHARE_SMB = 2, ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; typedef enum zfs_smb_acl_op { ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE, ZFS_SMB_ACL_RENAME, ZFS_SMB_ACL_PURGE } zfs_smb_acl_op_t; typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, ZFS_CACHE_ALL = 2 } zfs_cache_type_t; typedef enum { ZFS_SYNC_STANDARD = 0, ZFS_SYNC_ALWAYS = 1, ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; typedef enum { ZFS_XATTR_OFF = 0, ZFS_XATTR_DIR = 1, ZFS_XATTR_SA = 2 } zfs_xattr_type_t; typedef enum { ZFS_DNSIZE_LEGACY = 0, ZFS_DNSIZE_AUTO = 1, ZFS_DNSIZE_1K = 1024, ZFS_DNSIZE_2K = 2048, ZFS_DNSIZE_4K = 4096, ZFS_DNSIZE_8K = 8192, ZFS_DNSIZE_16K = 16384 } zfs_dnsize_type_t; typedef enum { ZFS_REDUNDANT_METADATA_ALL, ZFS_REDUNDANT_METADATA_MOST, ZFS_REDUNDANT_METADATA_SOME, ZFS_REDUNDANT_METADATA_NONE } zfs_redundant_metadata_type_t; typedef enum { ZFS_VOLMODE_DEFAULT = 0, ZFS_VOLMODE_GEOM = 1, ZFS_VOLMODE_DEV = 2, ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; typedef enum zfs_keystatus { ZFS_KEYSTATUS_NONE = 0, ZFS_KEYSTATUS_UNAVAILABLE, ZFS_KEYSTATUS_AVAILABLE, } zfs_keystatus_t; typedef enum zfs_keyformat { ZFS_KEYFORMAT_NONE = 0, ZFS_KEYFORMAT_RAW, ZFS_KEYFORMAT_HEX, ZFS_KEYFORMAT_PASSPHRASE, ZFS_KEYFORMAT_FORMATS } zfs_keyformat_t; typedef enum zfs_key_location { ZFS_KEYLOCATION_NONE = 0, ZFS_KEYLOCATION_PROMPT, ZFS_KEYLOCATION_URI, ZFS_KEYLOCATION_LOCATIONS } zfs_keylocation_t; typedef enum { ZFS_PREFETCH_NONE = 0, ZFS_PREFETCH_METADATA = 1, ZFS_PREFETCH_ALL = 2 } zfs_prefetch_type_t; #define DEFAULT_PBKDF2_ITERATIONS 350000 #define MIN_PBKDF2_ITERATIONS 100000 /* * On-disk version number. 
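 * (A worked consequence of the SPA_VERSION_IS_SUPPORTED() macro below:
 * versions 1 through 28 and version 5000 are accepted, while anything in
 * between is rejected.)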
*/ #define SPA_VERSION_1 1ULL #define SPA_VERSION_2 2ULL #define SPA_VERSION_3 3ULL #define SPA_VERSION_4 4ULL #define SPA_VERSION_5 5ULL #define SPA_VERSION_6 6ULL #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL #define SPA_VERSION_17 17ULL #define SPA_VERSION_18 18ULL #define SPA_VERSION_19 19ULL #define SPA_VERSION_20 20ULL #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL #define SPA_VERSION_5000 5000ULL /* * The incrementing pool version number has been replaced by pool feature * flags. For more details, see zfeature.c. */ #define SPA_VERSION SPA_VERSION_5000 #define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. * Used in the code when checking for presence or absence of a feature. * Feel free to define multiple symbolic names for each version if there * were multiple changes to on-disk structures during that version. * * NOTE: When checking the current SPA_VERSION in your code, be sure * to use spa_version() since it reports the version of the * last synced uberblock. Checking the in-flight version can * be dangerous in some cases. */ #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 #define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 #define SPA_VERSION_BOOTFS SPA_VERSION_6 #define SPA_VERSION_SLOGS SPA_VERSION_7 #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 #define SPA_VERSION_L2CACHE SPA_VERSION_10 #define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 #define SPA_VERSION_RAIDZ3 SPA_VERSION_17 #define SPA_VERSION_USERREFS SPA_VERSION_18 #define SPA_VERSION_HOLES SPA_VERSION_19 #define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 #define SPA_VERSION_DIR_CLONES SPA_VERSION_26 #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 #define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 #define SPA_VERSION_FEATURES SPA_VERSION_5000 #define SPA_VERSION_IS_SUPPORTED(v) \ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk 
format change * occurs. This is independent of SPA/DMU/ZAP versioning. You must * also update the version_table[] and help message in zfs_prop.c. */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL #define ZPL_VERSION_5 5ULL #define ZPL_VERSION ZPL_VERSION_5 #define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 /* Persistent L2ARC version */ #define L2ARC_PERSISTENT_VERSION_1 1ULL #define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1 #define L2ARC_PERSISTENT_VERSION_STRING "1" /* Rewind policy information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ #define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ #define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ #define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ #define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ #define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ typedef struct zpool_load_policy { uint32_t zlp_rewind; /* rewind policy requested */ uint64_t zlp_maxmeta; /* max acceptable meta-data errors */ uint64_t zlp_maxdata; /* max acceptable data errors */ uint64_t zlp_txg; /* specific txg to load */ } zpool_load_policy_t; /* * The following are configuration names used in the nvlist describing a pool's * configuration. New on-disk names should be prefixed with "<reverse-DNS>:" * (e.g. "org.openzfs:") to avoid conflicting names being developed * independently.
*/ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" #define ZPOOL_CONFIG_POOL_STATE "state" #define ZPOOL_CONFIG_POOL_TXG "txg" #define ZPOOL_CONFIG_POOL_GUID "pool_guid" #define ZPOOL_CONFIG_CREATE_TXG "create_txg" #define ZPOOL_CONFIG_TOP_GUID "top_guid" #define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" #define ZPOOL_CONFIG_TYPE "type" #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" #define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" #define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_SPARE_ID "spareid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ #define ZPOOL_CONFIG_RAIDZ_EXPAND_STATS "raidz_expand_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ /* container nvlist of extended stats */ #define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex" /* Active queue read/write stats */ #define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" #define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue" #define ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE "vdev_rebuild_active_queue" /* Queue sizes */ #define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" #define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue" #define ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE "vdev_rebuild_pend_queue" /* Latency read/write histogram stats */ #define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" #define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo" #define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo" #define ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO "vdev_rebuild_histo" /* Request size histograms */ #define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo" #define ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo" #define 
ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo" #define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo" #define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo" #define ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO "vdev_ind_rebuild_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" #define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" #define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo" #define ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO "vdev_agg_rebuild_histo" /* Number of slow IOs */ #define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" /* vdev enclosure sysfs path */ #define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_RAIDZ_EXPANDING "raidz_expanding" #define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" #define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" #define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" #define ZPOOL_CONFIG_IS_HOLE "is_hole" #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" #define ZPOOL_CONFIG_SPLIT "splitcfg" #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_NONALLOCATING "non_allocating" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" #define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ #define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_ROOT_ZAP "com.klarasystems:vdev_zap_root" #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" #define ZPOOL_CONFIG_RESILVER_DEFER "com.datto:resilver_defer" #define 
ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ #define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ #define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" #define ZPOOL_CONFIG_COMPATIBILITY "compatibility" /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such * as offline and degraded. */ #define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" /* Pool load policy parameters */ #define ZPOOL_LOAD_POLICY "load-policy" #define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy" #define ZPOOL_LOAD_REQUEST_TXG "load-request-txg" #define ZPOOL_LOAD_META_THRESH "load-meta-thresh" #define ZPOOL_LOAD_DATA_THRESH "load-data-thresh" /* Rewind data discovered */ #define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" #define ZPOOL_CONFIG_LOAD_META_ERRORS "verify_meta_errors" #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" /* dRAID configuration */ #define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata" #define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" #define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" #define VDEV_TYPE_DRAID "draid" #define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" #define VDEV_RAIDZ_MAXPARITY 3 #define VDEV_DRAID_MAXPARITY 3 #define VDEV_DRAID_MIN_CHILDREN 2 #define VDEV_DRAID_MAX_CHILDREN UINT8_MAX /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. 
*/ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" #define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ "com.delphix:obsolete_counts_are_precise" #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" #define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ "com.delphix:ms_unflushed_phys_txgs" #define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \ "org.openzfs:vdev_rebuild" #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE \ "org.openzfs:raidz_expand_state" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME \ "org.openzfs:raidz_expand_start_time" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME \ "org.openzfs:raidz_expand_end_time" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED \ "org.openzfs:raidz_expand_bytes_copied" /* vdev metaslab allocation bias */ #define VDEV_ALLOC_BIAS_LOG "log" #define VDEV_ALLOC_BIAS_SPECIAL "special" #define VDEV_ALLOC_BIAS_DEDUP "dedup" /* vdev initialize state */ #define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ "com.delphix:next_offset_to_initialize" #define VDEV_LEAF_ZAP_INITIALIZE_STATE \ "com.delphix:vdev_initialize_state" #define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ "com.delphix:vdev_initialize_action_time" /* vdev TRIM state */ #define VDEV_LEAF_ZAP_TRIM_LAST_OFFSET \ "org.zfsonlinux:next_offset_to_trim" #define VDEV_LEAF_ZAP_TRIM_STATE \ "org.zfsonlinux:vdev_trim_state" #define VDEV_LEAF_ZAP_TRIM_ACTION_TIME \ "org.zfsonlinux:vdev_trim_action_time" #define VDEV_LEAF_ZAP_TRIM_RATE \ "org.zfsonlinux:vdev_trim_rate" #define VDEV_LEAF_ZAP_TRIM_PARTIAL \ "org.zfsonlinux:vdev_trim_partial" #define VDEV_LEAF_ZAP_TRIM_SECURE \ "org.zfsonlinux:vdev_trim_secure" /* * This is needed in userland to report the minimum necessary device size. */ #define SPA_MINDEVSIZE (64ULL << 20) /* * Set if the fragmentation has not yet been calculated. This can happen * because the space maps have not been upgraded or the histogram feature * is not enabled. */ #define ZFS_FRAG_INVALID UINT64_MAX /* * The location of the pool configuration repository, shared between kernel and * userland. */ #define ZPOOL_CACHE_BOOT "/boot/zfs/zpool.cache" #define ZPOOL_CACHE "/etc/zfs/zpool.cache" /* * Settings for zpool compatibility features files */ #define ZPOOL_SYSCONF_COMPAT_D SYSCONFDIR "/zfs/compatibility.d" #define ZPOOL_DATA_COMPAT_D PKGDATADIR "/compatibility.d" #define ZPOOL_COMPAT_MAXSIZE 16384 /* * Hard-wired compatibility settings */ #define ZPOOL_COMPAT_LEGACY "legacy" #define ZPOOL_COMPAT_OFF "off" /* * vdev states are ordered from least to most healthy. * A vdev that's CANT_OPEN or below is considered unusable. */ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; #define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. 
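 *
 * Consumption sketch, for illustration only (vs stands in for a
 * vdev_stat_t, defined further below, taken from the config nvlist):
 *
 *	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
 *	    vs->vs_aux == VDEV_AUX_TOO_SMALL)
 *		(report why the open failed; here, the device was too small)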
*/ typedef enum vdev_aux { VDEV_AUX_NONE, /* no error */ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis or forced fault */ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */ VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ VDEV_AUX_ACTIVE, /* vdev active on a different host */ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ } vdev_aux_t; /* * pool state. The following states are written to disk as part of the normal * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining * states are software abstractions used at various levels to communicate * pool state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ } pool_state_t; /* * mmp state. The following states provide additional detail describing * why a pool couldn't be safely imported. */ typedef enum mmp_state { MMP_STATE_ACTIVE = 0, /* In active use */ MMP_STATE_INACTIVE, /* Inactive and safe to import */ MMP_STATE_NO_HOSTID /* System hostid is not set */ } mmp_state_t; /* * Scan Functions. */ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, POOL_SCAN_ERRORSCRUB, POOL_SCAN_FUNCS } pool_scan_func_t; /* * Used to control scrub pause and resume. */ typedef enum pool_scrub_cmd { POOL_SCRUB_NORMAL = 0, POOL_SCRUB_PAUSE, POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; typedef enum { CS_NONE, CS_CHECKPOINT_EXISTS, CS_CHECKPOINT_DISCARDING, CS_NUM_STATES } checkpoint_state_t; typedef struct pool_checkpoint_stat { uint64_t pcs_state; /* checkpoint_state_t */ uint64_t pcs_start_time; /* time checkpoint/discard started */ uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; /* * ZIO types. Needed to interpret vdev statistics below. */ typedef enum zio_type { ZIO_TYPE_NULL = 0, ZIO_TYPE_READ, ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, ZIO_TYPE_FLUSH, ZIO_TYPE_TRIM, ZIO_TYPES } zio_type_t; /* * Compatibility: _IOCTL was renamed to _FLUSH; keep the old name available to * user programs. */ #define ZIO_TYPE_IOCTL ZIO_TYPE_FLUSH /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. 
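 *
 * Decoding sketch, for illustration (assumes a userland caller; nvroot, ps
 * and c are the caller's own variables):
 *
 *	pool_scan_stat_t *ps;
 *	uint_t c;
 *	(void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
 *	    (uint64_t **)&ps, &c);
 *
 * The cast only stays safe as long as every member remains 64 bits wide.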
*/ typedef struct pool_scan_stat { /* values stored on disk */ uint64_t pss_func; /* pool_scan_func_t */ uint64_t pss_state; /* dsl_scan_state_t */ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ uint64_t pss_skipped; /* total bytes skipped by scanner */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; uint64_t pss_pass_issued; /* issued bytes per scan pass */ uint64_t pss_issued; /* total bytes checked by scanner */ /* error scrub values stored on disk */ uint64_t pss_error_scrub_func; /* pool_scan_func_t */ uint64_t pss_error_scrub_state; /* dsl_scan_state_t */ uint64_t pss_error_scrub_start; /* error scrub start time */ uint64_t pss_error_scrub_end; /* error scrub end time */ uint64_t pss_error_scrub_examined; /* error blocks issued I/O */ /* error blocks to be issued I/O */ uint64_t pss_error_scrub_to_be_examined; /* error scrub values not stored on disk */ /* error scrub pause time in milliseconds */ uint64_t pss_pass_error_scrub_pause; } pool_scan_stat_t; typedef struct pool_removal_stat { uint64_t prs_state; /* dsl_scan_state_t */ uint64_t prs_removing_vdev; uint64_t prs_start_time; uint64_t prs_end_time; uint64_t prs_to_copy; /* bytes that need to be copied */ uint64_t prs_copied; /* bytes copied so far */ /* * bytes of memory used for indirect mappings. * This includes all removed vdevs. */ uint64_t prs_mapping_memory; } pool_removal_stat_t; typedef struct pool_raidz_expand_stat { uint64_t pres_state; /* dsl_scan_state_t */ uint64_t pres_expanding_vdev; uint64_t pres_start_time; uint64_t pres_end_time; uint64_t pres_to_reflow; /* bytes that need to be moved */ uint64_t pres_reflowed; /* bytes moved so far */ uint64_t pres_waiting_for_resilver; } pool_raidz_expand_stat_t; typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, DSS_ERRORSCRUBBING, DSS_NUM_STATES } dsl_scan_state_t; typedef struct vdev_rebuild_stat { uint64_t vrs_state; /* vdev_rebuild_state_t */ uint64_t vrs_start_time; /* time_t */ uint64_t vrs_end_time; /* time_t */ uint64_t vrs_scan_time_ms; /* total run time (millisecs) */ uint64_t vrs_bytes_scanned; /* allocated bytes scanned */ uint64_t vrs_bytes_issued; /* read bytes issued */ uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */ uint64_t vrs_bytes_est; /* total bytes to scan */ uint64_t vrs_errors; /* scanning errors */ uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */ } vdev_rebuild_stat_t; /* * Errata described by https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-ER. * The ordering of this enum must be maintained to ensure the errata identifiers * map to the correct documentation. New errata may only be appended to the * list and must contain corresponding documentation at the above link. 
*/ typedef enum zpool_errata { ZPOOL_ERRATA_NONE, ZPOOL_ERRATA_ZOL_2094_SCRUB, ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY, ZPOOL_ERRATA_ZOL_6845_ENCRYPTION, ZPOOL_ERRATA_ZOL_8308_ENCRYPTION, } zpool_errata_t; /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and user land as an nvlist uint64 array. * * The vs_ops[] and vs_bytes[] arrays must always be an array size of 6 in * order to keep subsequent members at their known fixed offsets. When * adding a new field it must be added to the end the structure. */ #define VS_ZIO_TYPES 6 typedef struct vdev_stat { hrtime_t vs_timestamp; /* time since vdev load */ uint64_t vs_state; /* vdev state */ uint64_t vs_aux; /* see vdev_aux_t */ uint64_t vs_alloc; /* space allocated */ uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[VS_ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[VS_ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_initialize_errors; /* initializing errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_initialize_bytes_done; /* bytes initialized */ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ uint64_t vs_initialize_state; /* vdev_initializing_state_t */ uint64_t vs_initialize_action_time; /* time_t */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ uint64_t vs_resilver_deferred; /* resilver deferred */ uint64_t vs_slow_ios; /* slow IOs */ uint64_t vs_trim_errors; /* trimming errors */ uint64_t vs_trim_notsup; /* supported by device */ uint64_t vs_trim_bytes_done; /* bytes trimmed */ uint64_t vs_trim_bytes_est; /* total bytes to trim */ uint64_t vs_trim_state; /* vdev_trim_state_t */ uint64_t vs_trim_action_time; /* time_t */ uint64_t vs_rebuild_processed; /* bytes rebuilt */ uint64_t vs_configured_ashift; /* TLV vdev_ashift */ uint64_t vs_logical_ashift; /* vdev_logical_ashift */ uint64_t vs_physical_ashift; /* vdev_physical_ashift */ uint64_t vs_noalloc; /* allocations halted? */ uint64_t vs_pspace; /* physical capacity */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ ((uint64_t_field_count * sizeof (uint64_t)) >= \ (offsetof(vdev_stat_t, field) + sizeof (((vdev_stat_t *)NULL)->field))) /* * Extended stats * * These are stats which aren't included in the original iostat output. For * convenience, they are grouped together in vdev_stat_ex, although each stat * is individually exported as an nvlist. */ typedef struct vdev_stat_ex { /* Number of ZIOs issued to disk and waiting to finish */ uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* Number of ZIOs pending to be issued to disk */ uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* * Below are the histograms for various latencies. Buckets are in * units of nanoseconds. */ /* * 2^37 nanoseconds = 134s. Timeouts will probably start kicking in * before this. */ #define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */ #define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */ /* Amount of time in ZIO queue (ns) */ uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_L_HISTO_BUCKETS]; /* Total ZIO latency (ns). 
Includes queuing and disk access time */ uint64_t vsx_total_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* Amount of time to read/write the disk (ns) */ uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* "lookup the bucket for a value" histogram macros */ #define HISTO(val, buckets) (val != 0 ? MIN(highbit64(val) - 1, \ buckets - 1) : 0) #define L_HISTO(a) HISTO(a, VDEV_L_HISTO_BUCKETS) #define RQ_HISTO(a) HISTO(a, VDEV_RQ_HISTO_BUCKETS) /* Physical IO histogram */ uint64_t vsx_ind_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; /* Delegated (aggregated) physical IO histogram */ uint64_t vsx_agg_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; } vdev_stat_ex_t; /* * Initialize functions. */ typedef enum pool_initialize_func { POOL_INITIALIZE_START, POOL_INITIALIZE_CANCEL, POOL_INITIALIZE_SUSPEND, POOL_INITIALIZE_UNINIT, POOL_INITIALIZE_FUNCS } pool_initialize_func_t; /* * TRIM functions. */ typedef enum pool_trim_func { POOL_TRIM_START, POOL_TRIM_CANCEL, POOL_TRIM_SUSPEND, POOL_TRIM_FUNCS } pool_trim_func_t; /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct ddt_object { uint64_t ddo_count; /* number of elements in ddt */ uint64_t ddo_dspace; /* size of ddt on disk */ uint64_t ddo_mspace; /* size of ddt in-core */ } ddt_object_t; typedef struct ddt_stat { uint64_t dds_blocks; /* blocks */ uint64_t dds_lsize; /* logical size */ uint64_t dds_psize; /* physical size */ uint64_t dds_dsize; /* deflated allocated size */ uint64_t dds_ref_blocks; /* referenced blocks */ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ uint64_t dds_ref_psize; /* referenced psize * refcnt */ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ } ddt_stat_t; typedef struct ddt_histogram { ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ } ddt_histogram_t; #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" #define ZFS_DEVDIR "/dev" #define ZFS_SUPER_MAGIC 0x2fc12fc1 /* general zvol path */ #define ZVOL_DIR "/dev/zvol/" #define ZVOL_MAJOR 230 #define ZVOL_MINOR_BITS 4 #define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) #define ZVOL_MINORS (1 << 4) #define ZVOL_DEV_NAME "zd" #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 16384 typedef enum { VDEV_INITIALIZE_NONE, VDEV_INITIALIZE_ACTIVE, VDEV_INITIALIZE_CANCELED, VDEV_INITIALIZE_SUSPENDED, VDEV_INITIALIZE_COMPLETE } vdev_initializing_state_t; typedef enum { VDEV_TRIM_NONE, VDEV_TRIM_ACTIVE, VDEV_TRIM_CANCELED, VDEV_TRIM_SUSPENDED, VDEV_TRIM_COMPLETE, } vdev_trim_state_t; typedef enum { VDEV_REBUILD_NONE, VDEV_REBUILD_ACTIVE, VDEV_REBUILD_CANCELED, VDEV_REBUILD_COMPLETE, } vdev_rebuild_state_t; /* * nvlist name constants. Facilitate restricting snapshot iteration range for * the "list next snapshot" ioctl */ #define SNAP_ITER_MIN_TXG "snap_iter_min_txg" #define SNAP_ITER_MAX_TXG "snap_iter_max_txg" /* * /dev/zfs ioctl numbers. * * These numbers cannot change over time. New ioctl numbers must be appended. */ typedef enum zfs_ioc { /* - * Core features - 88/128 numbers reserved. + * Core features - 89/128 numbers reserved. 
*/ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, #else ZFS_IOC_FIRST = ('Z' << 8), #endif ZFS_IOC = ZFS_IOC_FIRST, ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, /* 0x5a00 */ ZFS_IOC_POOL_DESTROY, /* 0x5a01 */ ZFS_IOC_POOL_IMPORT, /* 0x5a02 */ ZFS_IOC_POOL_EXPORT, /* 0x5a03 */ ZFS_IOC_POOL_CONFIGS, /* 0x5a04 */ ZFS_IOC_POOL_STATS, /* 0x5a05 */ ZFS_IOC_POOL_TRYIMPORT, /* 0x5a06 */ ZFS_IOC_POOL_SCAN, /* 0x5a07 */ ZFS_IOC_POOL_FREEZE, /* 0x5a08 */ ZFS_IOC_POOL_UPGRADE, /* 0x5a09 */ ZFS_IOC_POOL_GET_HISTORY, /* 0x5a0a */ ZFS_IOC_VDEV_ADD, /* 0x5a0b */ ZFS_IOC_VDEV_REMOVE, /* 0x5a0c */ ZFS_IOC_VDEV_SET_STATE, /* 0x5a0d */ ZFS_IOC_VDEV_ATTACH, /* 0x5a0e */ ZFS_IOC_VDEV_DETACH, /* 0x5a0f */ ZFS_IOC_VDEV_SETPATH, /* 0x5a10 */ ZFS_IOC_VDEV_SETFRU, /* 0x5a11 */ ZFS_IOC_OBJSET_STATS, /* 0x5a12 */ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x5a13 */ ZFS_IOC_DATASET_LIST_NEXT, /* 0x5a14 */ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x5a15 */ ZFS_IOC_SET_PROP, /* 0x5a16 */ ZFS_IOC_CREATE, /* 0x5a17 */ ZFS_IOC_DESTROY, /* 0x5a18 */ ZFS_IOC_ROLLBACK, /* 0x5a19 */ ZFS_IOC_RENAME, /* 0x5a1a */ ZFS_IOC_RECV, /* 0x5a1b */ ZFS_IOC_SEND, /* 0x5a1c */ ZFS_IOC_INJECT_FAULT, /* 0x5a1d */ ZFS_IOC_CLEAR_FAULT, /* 0x5a1e */ ZFS_IOC_INJECT_LIST_NEXT, /* 0x5a1f */ ZFS_IOC_ERROR_LOG, /* 0x5a20 */ ZFS_IOC_CLEAR, /* 0x5a21 */ ZFS_IOC_PROMOTE, /* 0x5a22 */ ZFS_IOC_SNAPSHOT, /* 0x5a23 */ ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x5a24 */ ZFS_IOC_OBJ_TO_PATH, /* 0x5a25 */ ZFS_IOC_POOL_SET_PROPS, /* 0x5a26 */ ZFS_IOC_POOL_GET_PROPS, /* 0x5a27 */ ZFS_IOC_SET_FSACL, /* 0x5a28 */ ZFS_IOC_GET_FSACL, /* 0x5a29 */ ZFS_IOC_SHARE, /* 0x5a2a */ ZFS_IOC_INHERIT_PROP, /* 0x5a2b */ ZFS_IOC_SMB_ACL, /* 0x5a2c */ ZFS_IOC_USERSPACE_ONE, /* 0x5a2d */ ZFS_IOC_USERSPACE_MANY, /* 0x5a2e */ ZFS_IOC_USERSPACE_UPGRADE, /* 0x5a2f */ ZFS_IOC_HOLD, /* 0x5a30 */ ZFS_IOC_RELEASE, /* 0x5a31 */ ZFS_IOC_GET_HOLDS, /* 0x5a32 */ ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x5a33 */ ZFS_IOC_VDEV_SPLIT, /* 0x5a34 */ ZFS_IOC_NEXT_OBJ, /* 0x5a35 */ ZFS_IOC_DIFF, /* 0x5a36 */ ZFS_IOC_TMP_SNAPSHOT, /* 0x5a37 */ ZFS_IOC_OBJ_TO_STATS, /* 0x5a38 */ ZFS_IOC_SPACE_WRITTEN, /* 0x5a39 */ ZFS_IOC_SPACE_SNAPS, /* 0x5a3a */ ZFS_IOC_DESTROY_SNAPS, /* 0x5a3b */ ZFS_IOC_POOL_REGUID, /* 0x5a3c */ ZFS_IOC_POOL_REOPEN, /* 0x5a3d */ ZFS_IOC_SEND_PROGRESS, /* 0x5a3e */ ZFS_IOC_LOG_HISTORY, /* 0x5a3f */ ZFS_IOC_SEND_NEW, /* 0x5a40 */ ZFS_IOC_SEND_SPACE, /* 0x5a41 */ ZFS_IOC_CLONE, /* 0x5a42 */ ZFS_IOC_BOOKMARK, /* 0x5a43 */ ZFS_IOC_GET_BOOKMARKS, /* 0x5a44 */ ZFS_IOC_DESTROY_BOOKMARKS, /* 0x5a45 */ ZFS_IOC_RECV_NEW, /* 0x5a46 */ ZFS_IOC_POOL_SYNC, /* 0x5a47 */ ZFS_IOC_CHANNEL_PROGRAM, /* 0x5a48 */ ZFS_IOC_LOAD_KEY, /* 0x5a49 */ ZFS_IOC_UNLOAD_KEY, /* 0x5a4a */ ZFS_IOC_CHANGE_KEY, /* 0x5a4b */ ZFS_IOC_REMAP, /* 0x5a4c */ ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */ ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */ ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */ ZFS_IOC_POOL_TRIM, /* 0x5a50 */ ZFS_IOC_REDACT, /* 0x5a51 */ ZFS_IOC_GET_BOOKMARK_PROPS, /* 0x5a52 */ ZFS_IOC_WAIT, /* 0x5a53 */ ZFS_IOC_WAIT_FS, /* 0x5a54 */ ZFS_IOC_VDEV_GET_PROPS, /* 0x5a55 */ ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */ + ZFS_IOC_DDT_PRUNE, /* 0x5a59 */ /* * Per-platform (Optional) - 8/128 numbers reserved. 
*/ ZFS_IOC_PLATFORM = ZFS_IOC_FIRST + 0x80, ZFS_IOC_EVENTS_NEXT, /* 0x81 (Linux) */ ZFS_IOC_EVENTS_CLEAR, /* 0x82 (Linux) */ ZFS_IOC_EVENTS_SEEK, /* 0x83 (Linux) */ ZFS_IOC_NEXTBOOT, /* 0x84 (FreeBSD) */ ZFS_IOC_JAIL, /* 0x85 (FreeBSD) */ ZFS_IOC_USERNS_ATTACH = ZFS_IOC_JAIL, /* 0x85 (Linux) */ ZFS_IOC_UNJAIL, /* 0x86 (FreeBSD) */ ZFS_IOC_USERNS_DETACH = ZFS_IOC_UNJAIL, /* 0x86 (Linux) */ ZFS_IOC_SET_BOOTENV, /* 0x87 */ ZFS_IOC_GET_BOOTENV, /* 0x88 */ ZFS_IOC_LAST } zfs_ioc_t; /* * zvol ioctl to get dataset name */ #define BLKZNAME _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN]) #ifdef __linux__ /* * IOCTLs to update and retrieve additional file level attributes on * Linux. */ #define ZFS_IOC_GETDOSFLAGS _IOR(0x83, 1, uint64_t) #define ZFS_IOC_SETDOSFLAGS _IOW(0x83, 2, uint64_t) /* * Additional file level attributes, that are stored * in the upper half of z_pflags */ #define ZFS_READONLY 0x0000000100000000ull #define ZFS_HIDDEN 0x0000000200000000ull #define ZFS_SYSTEM 0x0000000400000000ull #define ZFS_ARCHIVE 0x0000000800000000ull #define ZFS_IMMUTABLE 0x0000001000000000ull #define ZFS_NOUNLINK 0x0000002000000000ull #define ZFS_APPENDONLY 0x0000004000000000ull #define ZFS_NODUMP 0x0000008000000000ull #define ZFS_OPAQUE 0x0000010000000000ull #define ZFS_AV_QUARANTINED 0x0000020000000000ull #define ZFS_AV_MODIFIED 0x0000040000000000ull #define ZFS_REPARSE 0x0000080000000000ull #define ZFS_OFFLINE 0x0000100000000000ull #define ZFS_SPARSE 0x0000200000000000ull #define ZFS_DOS_FL_USER_VISIBLE (ZFS_IMMUTABLE | ZFS_APPENDONLY | \ ZFS_NOUNLINK | ZFS_ARCHIVE | ZFS_NODUMP | ZFS_SYSTEM | \ ZFS_HIDDEN | ZFS_READONLY | ZFS_REPARSE | ZFS_OFFLINE | \ ZFS_SPARSE) #endif /* * ZFS-specific error codes used for returning descriptive errors * to the userland through zfs ioctls. * * The enum implicitly includes all the error codes from errno.h. * New code should use and extend this enum for errors that are * not described precisely by generic errno codes. * * These numbers should not change over time. New entries should be appended. * * (Keep in sync with contrib/pyzfs/libzfs_core/_constants.py) */ typedef enum { ZFS_ERR_CHECKPOINT_EXISTS = 1024, ZFS_ERR_DISCARDING_CHECKPOINT, ZFS_ERR_NO_CHECKPOINT, ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_IOC_CMD_UNAVAIL, ZFS_ERR_IOC_ARG_UNAVAIL, ZFS_ERR_IOC_ARG_REQUIRED, ZFS_ERR_IOC_ARG_BADTYPE, ZFS_ERR_WRONG_PARENT, ZFS_ERR_FROM_IVSET_GUID_MISSING, ZFS_ERR_FROM_IVSET_GUID_MISMATCH, ZFS_ERR_SPILL_BLOCK_FLAG_MISSING, ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE, ZFS_ERR_EXPORT_IN_PROGRESS, ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, ZFS_ERR_STREAM_TRUNCATED, ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, ZFS_ERR_RESILVER_IN_PROGRESS, ZFS_ERR_REBUILD_IN_PROGRESS, ZFS_ERR_BADPROP, ZFS_ERR_VDEV_NOTSUP, ZFS_ERR_NOT_USER_NAMESPACE, ZFS_ERR_RESUME_EXISTS, ZFS_ERR_CRYPTO_NOTSUP, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, ZFS_ERR_ASHIFT_MISMATCH, } zfs_errno_t; /* * Internal SPA load state. Used by FMA diagnosis engine. 
*/ typedef enum { SPA_LOAD_NONE, /* no load in progress */ SPA_LOAD_OPEN, /* normal open */ SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ SPA_LOAD_ERROR, /* load failed */ SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; typedef enum { ZPOOL_WAIT_CKPT_DISCARD, ZPOOL_WAIT_FREE, ZPOOL_WAIT_INITIALIZE, ZPOOL_WAIT_REPLACE, ZPOOL_WAIT_REMOVE, ZPOOL_WAIT_RESILVER, ZPOOL_WAIT_SCRUB, ZPOOL_WAIT_TRIM, ZPOOL_WAIT_RAIDZ_EXPAND, ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; typedef enum { ZFS_WAIT_DELETEQ, ZFS_WAIT_NUM_ACTIVITIES } zfs_wait_activity_t; typedef enum { ZPOOL_PREFETCH_NONE = 0, ZPOOL_PREFETCH_DDT } zpool_prefetch_type_t; +typedef enum { + ZPOOL_DDT_PRUNE_NONE, + ZPOOL_DDT_PRUNE_AGE, /* in seconds */ + ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */ +} zpool_ddt_prune_unit_t; + /* * Bookmark name values. */ #define ZPOOL_ERR_LIST "error list" #define ZPOOL_ERR_DATASET "dataset" #define ZPOOL_ERR_OBJECT "object" #define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) /* * The following are names used in the nvlist describing * the pool's history log. */ #define ZPOOL_HIST_RECORD "history record" #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" #define ZPOOL_HIST_WHO "history who" #define ZPOOL_HIST_ZONE "history zone" #define ZPOOL_HIST_HOST "history hostname" #define ZPOOL_HIST_TXG "history txg" #define ZPOOL_HIST_INT_EVENT "history internal event" #define ZPOOL_HIST_INT_STR "history internal str" #define ZPOOL_HIST_INT_NAME "internal_name" #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" #define ZPOOL_HIST_OUTPUT_SIZE "out_size" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" #define ZPOOL_HIST_ERRNO "errno" #define ZPOOL_HIST_ELAPSED_NS "elapsed_ns" /* * Special nvlist name that will not have its args recorded in the pool's * history log. */ #define ZPOOL_HIDDEN_ARGS "hidden_args" /* * The following is used when invoking ZFS_IOC_POOL_GET_PROPS. */ #define ZPOOL_GET_PROPS_NAMES "get_props_names" /* * Opt-in property names used with ZPOOL_GET_PROPS_NAMES. * For example, properties that are hidden or expensive to compute. */ #define ZPOOL_DEDUPCACHED_PROP_NAME "dedupcached" /* * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. */ #define ZPOOL_INITIALIZE_COMMAND "initialize_command" #define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" /* * The following are names used when invoking ZFS_IOC_POOL_REGUID. */ #define ZPOOL_REGUID_GUID "guid" /* * The following are names used when invoking ZFS_IOC_POOL_TRIM. */ #define ZPOOL_TRIM_COMMAND "trim_command" #define ZPOOL_TRIM_VDEVS "trim_vdevs" #define ZPOOL_TRIM_RATE "trim_rate" #define ZPOOL_TRIM_SECURE "trim_secure" /* * The following are names used when invoking ZFS_IOC_POOL_WAIT. */ #define ZPOOL_WAIT_ACTIVITY "wait_activity" #define ZPOOL_WAIT_TAG "wait_tag" #define ZPOOL_WAIT_WAITED "wait_waited" /* * The following are names used when invoking ZFS_IOC_VDEV_GET_PROP. */ #define ZPOOL_VDEV_PROPS_GET_VDEV "vdevprops_get_vdev" #define ZPOOL_VDEV_PROPS_GET_PROPS "vdevprops_get_props" /* * The following are names used when invoking ZFS_IOC_VDEV_SET_PROP. */ #define ZPOOL_VDEV_PROPS_SET_VDEV "vdevprops_set_vdev" #define ZPOOL_VDEV_PROPS_SET_PROPS "vdevprops_set_props" /* * The following are names used when invoking ZFS_IOC_WAIT_FS. 
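 * ZFS_WAIT_ACTIVITY names the zfs_wait_activity_t to wait on (input nvlist);
 * ZFS_WAIT_WAITED reports whether the call actually waited (output nvlist).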
*/ #define ZFS_WAIT_ACTIVITY "wait_activity" #define ZFS_WAIT_WAITED "wait_waited" /* * The following are names used when invoking ZFS_IOC_POOL_PREFETCH. */ #define ZPOOL_PREFETCH_TYPE "prefetch_type" +/* + * The following are names used when invoking ZFS_IOC_DDT_PRUNE. + */ +#define DDT_PRUNE_UNIT "ddt_prune_unit" +#define DDT_PRUNE_AMOUNT "ddt_prune_amount" + /* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_EXPAND 0x8 #define ZFS_ONLINE_SPARE 0x10 #define ZFS_OFFLINE_TEMPORARY 0x1 /* * Flags for ZFS_IOC_POOL_IMPORT */ #define ZFS_IMPORT_NORMAL 0x0 #define ZFS_IMPORT_VERBATIM 0x1 #define ZFS_IMPORT_ANY_HOST 0x2 #define ZFS_IMPORT_MISSING_LOG 0x4 #define ZFS_IMPORT_ONLY 0x8 #define ZFS_IMPORT_TEMP_NAME 0x10 #define ZFS_IMPORT_SKIP_MMP 0x20 #define ZFS_IMPORT_LOAD_KEYS 0x40 #define ZFS_IMPORT_CHECKPOINT 0x80 /* * Channel program argument/return nvlist keys and defaults. */ #define ZCP_ARG_PROGRAM "program" #define ZCP_ARG_ARGLIST "arg" #define ZCP_ARG_SYNC "sync" #define ZCP_ARG_INSTRLIMIT "instrlimit" #define ZCP_ARG_MEMLIMIT "memlimit" #define ZCP_ARG_CLIARGV "argv" #define ZCP_RET_ERROR "error" #define ZCP_RET_RETURN "return" #define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) #define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) #define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) #define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) /* * Sysevent payload members. ZFS will generate the following sysevents with the * given payloads: * * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_FINISH * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING * * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * * ESC_ZFS_VDEV_REMOVE * ESC_ZFS_VDEV_CLEAR * ESC_ZFS_VDEV_CHECK * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 * * ESC_ZFS_HISTORY_EVENT * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) * * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the * history log nvlist. The keynames will be free of any spaces or other * characters that could be potentially unexpected to consumers of the * sysevents. 
*/ #define ZFS_EV_POOL_NAME "pool_name" #define ZFS_EV_POOL_GUID "pool_guid" #define ZFS_EV_VDEV_PATH "vdev_path" #define ZFS_EV_VDEV_GUID "vdev_guid" #define ZFS_EV_HIST_TIME "history_time" #define ZFS_EV_HIST_CMD "history_command" #define ZFS_EV_HIST_WHO "history_who" #define ZFS_EV_HIST_ZONE "history_zone" #define ZFS_EV_HIST_HOST "history_hostname" #define ZFS_EV_HIST_TXG "history_txg" #define ZFS_EV_HIST_INT_EVENT "history_internal_event" #define ZFS_EV_HIST_INT_STR "history_internal_str" #define ZFS_EV_HIST_INT_NAME "history_internal_name" #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" #define ZFS_EV_RESILVER_TYPE "resilver_type" /* * We currently support block sizes from 512 bytes to 16MB. * The benefits of larger blocks, and thus larger IO, need to be weighed * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * * The recordsize property can not be set larger than zfs_max_recordsize * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near * zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. */ #define SPA_MINBLOCKSHIFT 9 #define SPA_OLD_MAXBLOCKSHIFT 17 #define SPA_MAXBLOCKSHIFT 24 #define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) #define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) #define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) /* supported encryption algorithms */ enum zio_encrypt { ZIO_CRYPT_INHERIT = 0, ZIO_CRYPT_ON, ZIO_CRYPT_OFF, ZIO_CRYPT_AES_128_CCM, ZIO_CRYPT_AES_192_CCM, ZIO_CRYPT_AES_256_CCM, ZIO_CRYPT_AES_128_GCM, ZIO_CRYPT_AES_192_GCM, ZIO_CRYPT_AES_256_GCM, ZIO_CRYPT_FUNCTIONS }; #define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM #define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF /* * xattr namespace prefixes. These are forbidden in xattr names. * * For cross-platform compatibility, xattrs in the user namespace should not be * prefixed with the namespace name, but for backwards compatibility with older * ZFS on Linux versions we do prefix the namespace. */ #define ZFS_XA_NS_FREEBSD_PREFIX "freebsd:" #define ZFS_XA_NS_FREEBSD_PREFIX_LEN strlen("freebsd:") #define ZFS_XA_NS_LINUX_SECURITY_PREFIX "security." #define ZFS_XA_NS_LINUX_SECURITY_PREFIX_LEN strlen("security.") #define ZFS_XA_NS_LINUX_SYSTEM_PREFIX "system." #define ZFS_XA_NS_LINUX_SYSTEM_PREFIX_LEN strlen("system.") #define ZFS_XA_NS_LINUX_TRUSTED_PREFIX "trusted." #define ZFS_XA_NS_LINUX_TRUSTED_PREFIX_LEN strlen("trusted.") #define ZFS_XA_NS_LINUX_USER_PREFIX "user." 
#define ZFS_XA_NS_LINUX_USER_PREFIX_LEN strlen("user.") #define ZFS_XA_NS_PREFIX_MATCH(ns, name) \ (strncmp(name, ZFS_XA_NS_##ns##_PREFIX, \ ZFS_XA_NS_##ns##_PREFIX_LEN) == 0) #define ZFS_XA_NS_PREFIX_FORBIDDEN(name) \ (ZFS_XA_NS_PREFIX_MATCH(FREEBSD, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_SECURITY, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_SYSTEM, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_TRUSTED, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_USER, name)) #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_H */ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 4fc6f22fcb50..7811abbb9ce3 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -1,504 +1,505 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. */ #ifndef _SYS_SPA_IMPL_H #define _SYS_SPA_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct spa_alloc { kmutex_t spaa_lock; avl_tree_t spaa_tree; } ____cacheline_aligned spa_alloc_t; typedef struct spa_allocs_use { kmutex_t sau_lock; uint_t sau_rotor; boolean_t sau_inuse[]; } spa_allocs_use_t; typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; avl_node_t se_avl; zbookmark_err_phys_t se_zep; /* not accounted in avl_find */ } spa_error_entry_t; typedef struct spa_history_phys { uint64_t sh_pool_create_len; /* ending offset of zpool create */ uint64_t sh_phys_max_off; /* physical EOF */ uint64_t sh_bof; /* logical BOF */ uint64_t sh_eof; /* logical EOF */ uint64_t sh_records_lost; /* num of records overwritten */ } spa_history_phys_t; /* * All members must be uint64_t, for byteswap purposes. */ typedef struct spa_removing_phys { uint64_t sr_state; /* dsl_scan_state_t */ /* * The vdev ID that we most recently attempted to remove, * or -1 if no removal has been attempted. */ uint64_t sr_removing_vdev; /* * The vdev ID that we most recently successfully removed, * or -1 if no devices have been removed. 
*/ uint64_t sr_prev_indirect_vdev; uint64_t sr_start_time; uint64_t sr_end_time; /* * Note that we can not use the space map's or indirect mapping's * accounting as a substitute for these values, because we need to * count frees of not-yet-copied data as though it did the copy. * Otherwise, we could get into a situation where copied > to_copy, * or we complete before copied == to_copy. */ uint64_t sr_to_copy; /* bytes that need to be copied */ uint64_t sr_copied; /* bytes that have been copied or freed */ } spa_removing_phys_t; /* * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense * of an indirect vdev's mapping object is in progress. */ typedef struct spa_condensing_indirect_phys { /* * The vdev ID of the indirect vdev whose indirect mapping is * being condensed. */ uint64_t scip_vdev; /* * The vdev's old obsolete spacemap. This spacemap's contents are * being integrated into the new mapping. */ uint64_t scip_prev_obsolete_sm_object; /* * The new mapping object that is being created. */ uint64_t scip_next_mapping_object; } spa_condensing_indirect_phys_t; struct spa_aux_vdev { uint64_t sav_object; /* MOS object for device list */ nvlist_t *sav_config; /* cached device config */ vdev_t **sav_vdevs; /* devices */ int sav_count; /* number devices */ boolean_t sav_sync; /* sync the device list */ boolean_t sav_label_sync; /* sync aux labels */ nvlist_t **sav_pending; /* pending device additions */ uint_t sav_npending; /* # pending devices */ }; typedef struct spa_config_lock { kmutex_t scl_lock; kthread_t *scl_writer; int scl_write_wanted; int scl_count; kcondvar_t scl_cv; } ____cacheline_aligned spa_config_lock_t; typedef struct spa_config_dirent { list_node_t scd_link; char *scd_path; } spa_config_dirent_t; typedef enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_INTERRUPT, ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_TYPES } zio_taskq_type_t; /* * State machine for the zpool-poolname process. The states transitions * are done as follows: * * From To Routine * PROC_NONE -> PROC_CREATED spa_activate() * PROC_CREATED -> PROC_ACTIVE spa_thread() * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() * PROC_DEACTIVATE -> PROC_GONE spa_thread() * PROC_GONE -> PROC_NONE spa_deactivate() */ typedef enum spa_proc_state { SPA_PROC_NONE, /* spa_proc = &p0, no process created */ SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ } spa_proc_state_t; typedef struct spa_taskqs { uint_t stqs_count; taskq_t **stqs_taskq; } spa_taskqs_t; /* one for each thread in the spa sync taskq */ typedef struct spa_syncthread_info { kthread_t *sti_thread; uint_t sti_allocator; } spa_syncthread_info_t; typedef enum spa_all_vdev_zap_action { AVZ_ACTION_NONE = 0, AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. 
*/ AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */ AVZ_ACTION_INITIALIZE } spa_avz_action_t; typedef enum spa_config_source { SPA_CONFIG_SRC_NONE = 0, SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */ SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */ SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */ SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */ SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */ } spa_config_source_t; struct spa { /* * Fields protected by spa_namespace_lock. */ char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */ char *spa_comment; /* comment */ avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ nvlist_t *spa_config_splitting; /* config for splitting */ nvlist_t *spa_load_info; /* info and errors from load */ uint64_t spa_config_txg; /* txg of last config change */ uint32_t spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ boolean_t spa_trust_config; /* do we trust vdev tree? */ boolean_t spa_is_splitting; /* in the middle of a split? */ spa_config_source_t spa_config_source; /* where config comes from? */ uint64_t spa_import_flags; /* import specific flags */ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ kthread_t *spa_export_thread; /* valid during pool export */ kthread_t *spa_load_thread; /* loading, no namespace lock */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ metaslab_class_t *spa_special_class; /* special allocation class */ metaslab_class_t *spa_dedup_class; /* dedup allocation class */ uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ uint64_t spa_load_max_txg; /* best initial ub_txg */ uint64_t spa_claim_max_txg; /* highest claimed birth txg */ inode_timespec_t spa_loaded_ts; /* 1st successful open time */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ list_t spa_evicting_os_list; /* Objsets being evicted. */ kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ /* * spa_allocs is an array, whose lengths is stored in spa_alloc_count. * There is one tree and one lock for each allocator, to help improve * allocation performance in write-heavy workloads. 
*/ spa_alloc_t *spa_allocs; spa_allocs_use_t *spa_allocs_use; int spa_alloc_count; int spa_active_allocator; /* selectable allocator */ /* per-allocator sync thread taskqs */ taskq_t *spa_sync_tq; spa_syncthread_info_t *spa_syncthreads; spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ boolean_t spa_aux_sync_uber; /* need to sync aux uber */ nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */ /* checksum context templates */ kmutex_t spa_cksum_tmpls_lock; void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub bytes */ /* in-flight verification bytes */ uint64_t spa_load_verify_bytes; kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ uint8_t spa_scrub_finished; /* indicator to rotate logs */ uint8_t spa_scrub_started; /* started since last boot */ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ uint64_t spa_scan_pass_start; /* start time per pass/reboot */ uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */ uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */ uint64_t spa_scan_pass_exam; /* examined bytes per pass */ uint64_t spa_scan_pass_issued; /* issued bytes per pass */ /* error scrub pause time in milliseconds */ uint64_t spa_scan_pass_errorscrub_pause; /* total error scrub paused time in milliseconds */ uint64_t spa_scan_pass_errorscrub_spent_paused; /* * We are in the middle of a resilver, and another resilver * is needed once this one completes. This is set iff any * vdev_resilver_deferred is set. */ boolean_t spa_resilver_deferred; kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ uint64_t spa_missing_tvds; /* unopenable tvds on load */ uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ uint64_t spa_nonallocating_dspace; spa_removing_phys_t spa_removing_phys; spa_vdev_removal_t *spa_vdev_removal; spa_condensing_indirect_phys_t spa_condensing_indirect_phys; spa_condensing_indirect_t *spa_condensing_indirect; zthr_t *spa_condense_zthr; /* zthr doing condense. 
*/ vdev_raidz_expand_t *spa_raidz_expand; zthr_t *spa_raidz_expand_zthr; uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; space_map_t *spa_syncing_log_sm; /* current log space map */ avl_tree_t spa_sm_logs_by_txg; kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */ avl_tree_t spa_metaslabs_by_flushed; spa_unflushed_stats_t spa_unflushed_stats; list_t spa_log_summary; uint64_t spa_log_flushall_txg; zthr_t *spa_livelist_delete_zthr; /* deleting livelists */ zthr_t *spa_livelist_condense_zthr; /* condensing livelists */ uint64_t spa_livelists_to_delete; /* set of livelists to free */ livelist_condense_entry_t spa_to_condense; /* next to condense */ char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ uint64_t spa_load_txg; /* ub txg that loaded */ uint64_t spa_load_txg_ts; /* timestamp from that ub */ uint64_t spa_load_meta_errors; /* verify metadata err count */ uint64_t spa_load_data_errors; /* verify data err count */ uint64_t spa_verify_min_txg; /* start txg of verify scrub */ kmutex_t spa_errlog_lock; /* error log lock */ uint64_t spa_errlog_last; /* last error log object */ uint64_t spa_errlog_scrub; /* scrub error log object */ kmutex_t spa_errlist_lock; /* error list/ereport lock */ avl_tree_t spa_errlist_last; /* last error list */ avl_tree_t spa_errlist_scrub; /* scrub error list */ avl_tree_t spa_errlist_healed; /* list of healed blocks */ uint64_t spa_deflate; /* should we deflate? */ uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ vdev_t *spa_pending_vdev; /* pending vdev additions */ kmutex_t spa_props_lock; /* property lock */ uint64_t spa_pool_props_object; /* object for properties */ uint64_t spa_bootfs; /* default boot filesystem */ uint64_t spa_failmode; /* failure mode for the pool */ uint64_t spa_deadman_failmode; /* failure mode for deadman */ uint64_t spa_delegation; /* delegation on/off */ list_t spa_config_list; /* previous cache file(s) */ /* per-CPU array of root of async I/O: */ zio_t **spa_async_zio_root; zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ zio_suspend_reason_t spa_suspended; /* pool is suspended */ uint8_t spa_claiming; /* pool is doing zil_claim() */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ spa_mode_t spa_mode; /* SPA_MODE_{READ|WRITE} */ boolean_t spa_read_spacemaps; /* spacemaps available if ro */ spa_log_state_t spa_log_state; /* log state */ uint64_t spa_autoexpand; /* lun expansion on/off */ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ uint64_t spa_ddt_stat_object; /* DDT statistics */ uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ + boolean_t spa_active_ddt_prune; /* ddt prune process active */ struct brt *spa_brt; /* in-core BRT */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ 
spa_proc_state_t spa_proc_state; /* see definition */ proc_t *spa_proc; /* "zpool-poolname" process */ uintptr_t spa_did; /* if procp != p0, did of t1 */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ uint64_t spa_prev_software_version; /* See ub_software_version */ uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */ nvlist_t *spa_feat_stats; /* Cache of enabled features */ /* cache feature refcounts */ uint64_t spa_feat_refcount_cache[SPA_FEATURES]; taskqid_t spa_deadman_tqid; /* Task id */ uint64_t spa_deadman_calls; /* number of deadman calls */ hrtime_t spa_sync_starttime; /* starting time of spa_sync */ uint64_t spa_deadman_synctime; /* deadman sync expiration */ uint64_t spa_deadman_ziotime; /* deadman zio expiration */ uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */ spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */ uint64_t spa_autotrim; /* automatic background trim? */ uint64_t spa_errata; /* errata issues detected */ spa_stats_t spa_stats; /* assorted spa statistics */ spa_keystore_t spa_keystore; /* loaded crypto keys */ /* arc_memory_throttle() parameters during low memory condition */ uint64_t spa_lowmem_page_load; /* memory load during txg */ uint64_t spa_lowmem_last_txg; /* txg window start */ hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ taskq_t *spa_zvol_taskq; /* Taskq for minor management */ taskq_t *spa_metaslab_taskq; /* Taskq for metaslab preload */ taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */ taskq_t *spa_upgrade_taskq; /* Taskq for upgrade jobs */ uint64_t spa_multihost; /* multihost aware (mmp) */ mmp_thread_t spa_mmp; /* multihost mmp thread */ list_t spa_leaf_list; /* list of leaf vdevs */ uint64_t spa_leaf_list_gen; /* track leaf_list changes */ uint32_t spa_hostid; /* cached system hostid */ /* synchronization for threads in spa_wait */ kmutex_t spa_activities_lock; kcondvar_t spa_activities_cv; kcondvar_t spa_waiters_cv; int spa_waiters; /* number of waiting threads */ boolean_t spa_waiters_cancel; /* waiters should return */ char *spa_compatibility; /* compatibility file(s) */ uint64_t spa_dedup_table_quota; /* property DDT maximum size */ uint64_t spa_dedup_dsize; /* cached on-disk size of DDT */ uint64_t spa_dedup_class_full_txg; /* txg dedup class was full */ /* * spa_refcount & spa_config_lock must be the last elements * because zfs_refcount_t changes size based on compilation options. * In order for the MDB module to function correctly, the other * fields must remain in the same location. 
*/ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ zfs_refcount_t spa_refcount; /* number of opens */ }; extern char *spa_config_path; extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; extern void spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, zio_t *zio, boolean_t cutinline); extern void spa_load_spares(spa_t *spa); extern void spa_load_l2cache(spa_t *spa); extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name); extern void spa_event_post(sysevent_t *ev); extern int param_set_deadman_failmode_common(const char *val); extern void spa_set_deadman_synctime(hrtime_t ns); extern void spa_set_deadman_ziotime(hrtime_t ns); extern const char *spa_history_zone(void); extern const char *zfs_active_allocator; extern int param_set_active_allocator_common(const char *val); #ifdef __cplusplus } #endif #endif /* _SYS_SPA_IMPL_H */ diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 87c5c4380be3..88dd8b3c679d 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -1,9847 +1,9890 @@ diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index dfa7c4db6881..14410b153130 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -1,5651 +1,5679 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2018, loli10K * Copyright (c) 2021, Colm Buckley * Copyright (c) 2021, 2023, Klara Inc.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_comutil.h" #include "zfeature_common.h" static boolean_t zpool_vdev_is_interior(const char *name); typedef struct prop_flags { unsigned int create:1; /* Validate property on creation */ unsigned int import:1; /* Validate property on import */ unsigned int vdevprop:1; /* Validate property as a VDEV property */ } prop_flags_t; /* * ==================================================================== * zpool property functions * ==================================================================== */ static int zpool_get_all_props(zpool_handle_t *zhp) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zhp->zpool_n_propnames > 0) { nvlist_t *innvl = fnvlist_alloc(); fnvlist_add_string_array(innvl, ZPOOL_GET_PROPS_NAMES, zhp->zpool_propnames, zhp->zpool_n_propnames); zcmd_write_src_nvlist(hdl, &zc, innvl); } zcmd_alloc_dst_nvlist(hdl, &zc, 0); while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { if (errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); else { zcmd_free_nvlists(&zc); return (-1); } } if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) { zcmd_free_nvlists(&zc); return (-1); } zcmd_free_nvlists(&zc); return (0); } int zpool_props_refresh(zpool_handle_t *zhp) { nvlist_t *old_props; old_props = zhp->zpool_props; if (zpool_get_all_props(zhp) != 0) return (-1); nvlist_free(old_props); return (0); } static const char * zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) { nvlist_t *nv, *nvl; const char *value; zprop_source_t source; nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { source = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); value = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { source = ZPROP_SRC_DEFAULT; if ((value = zpool_prop_default_string(prop)) == NULL) value = "-"; } if (src) *src = source; return (value); } uint64_t zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) { nvlist_t *nv, *nvl; uint64_t value; zprop_source_t source; if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) { /* * zpool_get_all_props() has most likely failed because * the pool is faulted, but if all we need is the top level * vdev's guid then get it from the zhp config nvlist. */ if ((prop == ZPOOL_PROP_GUID) && (nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) && (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value) == 0)) { return (value); } return (zpool_prop_default_numeric(prop)); } nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { source = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); value = fnvlist_lookup_uint64(nv, ZPROP_VALUE); } else { source = ZPROP_SRC_DEFAULT; value = zpool_prop_default_numeric(prop); } if (src) *src = source; return (value); } /* * Map VDEV STATE to printed strings. 
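 * For example, VDEV_STATE_HEALTHY maps to "ONLINE", and VDEV_STATE_CANT_OPEN
 * with VDEV_AUX_SPLIT_POOL maps to "SPLIT".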
*/ const char * zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) { switch (state) { case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: return (gettext("OFFLINE")); case VDEV_STATE_REMOVED: return (gettext("REMOVED")); case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return (gettext("FAULTED")); else if (aux == VDEV_AUX_SPLIT_POOL) return (gettext("SPLIT")); else return (gettext("UNAVAIL")); case VDEV_STATE_FAULTED: return (gettext("FAULTED")); case VDEV_STATE_DEGRADED: return (gettext("DEGRADED")); case VDEV_STATE_HEALTHY: return (gettext("ONLINE")); default: break; } return (gettext("UNKNOWN")); } /* * Map POOL STATE to printed strings. */ const char * zpool_pool_state_to_name(pool_state_t state) { switch (state) { default: break; case POOL_STATE_ACTIVE: return (gettext("ACTIVE")); case POOL_STATE_EXPORTED: return (gettext("EXPORTED")); case POOL_STATE_DESTROYED: return (gettext("DESTROYED")); case POOL_STATE_SPARE: return (gettext("SPARE")); case POOL_STATE_L2CACHE: return (gettext("L2CACHE")); case POOL_STATE_UNINITIALIZED: return (gettext("UNINITIALIZED")); case POOL_STATE_UNAVAIL: return (gettext("UNAVAIL")); case POOL_STATE_POTENTIALLY_ACTIVE: return (gettext("POTENTIALLY_ACTIVE")); } return (gettext("UNKNOWN")); } /* * Given a pool handle, return the pool health string ("ONLINE", "DEGRADED", * "SUSPENDED", etc). */ const char * zpool_get_state_str(zpool_handle_t *zhp) { zpool_errata_t errata; zpool_status_t status; const char *str; status = zpool_get_status(zhp, NULL, &errata); if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { str = gettext("FAULTED"); } else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT || status == ZPOOL_STATUS_IO_FAILURE_CONTINUE || status == ZPOOL_STATUS_IO_FAILURE_MMP) { str = gettext("SUSPENDED"); } else { nvlist_t *nvroot = fnvlist_lookup_nvlist( zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); uint_t vsc; vdev_stat_t *vs = (vdev_stat_t *)fnvlist_lookup_uint64_array( nvroot, ZPOOL_CONFIG_VDEV_STATS, &vsc); str = zpool_state_to_name(vs->vs_state, vs->vs_aux); } return (str); } /* * Get a zpool property value for 'prop' and return the value in * a pre-allocated buffer. 
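 * Returns 0 on success. When 'literal' is B_FALSE, numeric values may be
 * rendered in human-readable form (e.g. via zfs_nicenum()) instead of as
 * raw numbers.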
*/ int zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, zprop_source_t *srctype, boolean_t literal) { uint64_t intval; const char *strval; zprop_source_t src = ZPROP_SRC_NONE; if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { switch (prop) { case ZPOOL_PROP_NAME: (void) strlcpy(buf, zpool_get_name(zhp), len); break; case ZPOOL_PROP_HEALTH: (void) strlcpy(buf, zpool_get_state_str(zhp), len); break; case ZPOOL_PROP_GUID: intval = zpool_get_prop_int(zhp, prop, &src); (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); break; case ZPOOL_PROP_ALTROOT: case ZPOOL_PROP_CACHEFILE: case ZPOOL_PROP_COMMENT: case ZPOOL_PROP_COMPATIBILITY: if (zhp->zpool_props != NULL || zpool_get_all_props(zhp) == 0) { (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), len); break; } zfs_fallthrough; default: (void) strlcpy(buf, "-", len); break; } if (srctype != NULL) *srctype = src; return (0); } /* * ZPOOL_PROP_DEDUPCACHED can be fetched by name only using * the ZPOOL_GET_PROPS_NAMES mechanism */ if (prop == ZPOOL_PROP_DEDUPCACHED) { zpool_add_propname(zhp, ZPOOL_DEDUPCACHED_PROP_NAME); (void) zpool_get_all_props(zhp); } if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) && prop != ZPOOL_PROP_NAME) return (-1); switch (zpool_prop_get_type(prop)) { case PROP_TYPE_STRING: (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), len); break; case PROP_TYPE_NUMBER: intval = zpool_get_prop_int(zhp, prop, &src); switch (prop) { case ZPOOL_PROP_DEDUP_TABLE_QUOTA: /* * If dedup quota is 0, we translate this into 'none' * (unless literal is set). And if it is UINT64_MAX * we translate that as 'automatic' (limit to the size of * the dedicated dedup VDEV). Otherwise, fall through * into the regular number formatting. */ if (intval == 0) { (void) strlcpy(buf, literal ?
"0" : "none", len); break; } else if (intval == UINT64_MAX) { (void) strlcpy(buf, "auto", len); break; } zfs_fallthrough; case ZPOOL_PROP_SIZE: case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_FREE: case ZPOOL_PROP_FREEING: case ZPOOL_PROP_LEAKED: case ZPOOL_PROP_ASHIFT: case ZPOOL_PROP_MAXBLOCKSIZE: case ZPOOL_PROP_MAXDNODESIZE: case ZPOOL_PROP_BCLONESAVED: case ZPOOL_PROP_BCLONEUSED: case ZPOOL_PROP_DEDUP_TABLE_SIZE: case ZPOOL_PROP_DEDUPCACHED: if (literal) (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); else (void) zfs_nicenum(intval, buf, len); break; case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: if (intval == 0) { (void) strlcpy(buf, "-", len); } else if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicebytes(intval, buf, len); } break; case ZPOOL_PROP_CAPACITY: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case ZPOOL_PROP_FRAGMENTATION: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case ZPOOL_PROP_BCLONERATIO: case ZPOOL_PROP_DEDUPRATIO: if (literal) (void) snprintf(buf, len, "%llu.%02llu", (u_longlong_t)(intval / 100), (u_longlong_t)(intval % 100)); else (void) snprintf(buf, len, "%llu.%02llux", (u_longlong_t)(intval / 100), (u_longlong_t)(intval % 100)); break; case ZPOOL_PROP_HEALTH: (void) strlcpy(buf, zpool_get_state_str(zhp), len); break; case ZPOOL_PROP_VERSION: if (intval >= SPA_VERSION_FEATURES) { (void) snprintf(buf, len, "-"); break; } zfs_fallthrough; default: (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } break; case PROP_TYPE_INDEX: intval = zpool_get_prop_int(zhp, prop, &src); if (zpool_prop_index_to_string(prop, intval, &strval) != 0) return (-1); (void) strlcpy(buf, strval, len); break; default: abort(); } if (srctype) *srctype = src; return (0); } /* * Get a zpool property value for 'propname' and return the value in * a pre-allocated buffer. */ int zpool_get_userprop(zpool_handle_t *zhp, const char *propname, char *buf, size_t len, zprop_source_t *srctype) { nvlist_t *nv, *nvl; uint64_t ival; const char *value; zprop_source_t source = ZPROP_SRC_LOCAL; nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, propname, &nv) == 0) { if (nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0) source = ival; verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); } else { source = ZPROP_SRC_DEFAULT; value = "-"; } if (srctype) *srctype = source; (void) strlcpy(buf, value, len); return (0); } /* * Check if the bootfs name has the same pool name as it is set to. * Assuming bootfs is a valid dataset name. */ static boolean_t bootfs_name_valid(const char *pool, const char *bootfs) { int len = strlen(pool); if (bootfs[0] == '\0') return (B_TRUE); if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT)) return (B_FALSE); if (strncmp(pool, bootfs, len) == 0 && (bootfs[len] == '/' || bootfs[len] == '\0')) return (B_TRUE); return (B_FALSE); } /* * Given an nvlist of zpool properties to be set, validate that they are * correct, and parse any numeric properties (index, boolean, etc) if they are * specified as strings. 
*/ static nvlist_t * zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf) { nvpair_t *elem; nvlist_t *retprops; zpool_prop_t prop; const char *strval; uint64_t intval; const char *check; struct stat64 statbuf; zpool_handle_t *zhp; char *parent, *slash; char report[1024]; if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { const char *propname = nvpair_name(elem); if (flags.vdevprop && zpool_prop_vdev(propname)) { vdev_prop_t vprop = vdev_name_to_prop(propname); if (vdev_prop_readonly(vprop)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (zprop_parse_value(hdl, elem, vprop, ZFS_TYPE_VDEV, retprops, &strval, &intval, errbuf) != 0) goto error; continue; } else if (flags.vdevprop && vdev_prop_user(propname)) { if (nvlist_add_nvpair(retprops, elem) != 0) { (void) no_memory(hdl); goto error; } continue; } else if (flags.vdevprop) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property: '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } prop = zpool_name_to_prop(propname); if (prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname)) { int err; char *fname = strchr(propname, '@') + 1; err = zfeature_lookup_name(fname, NULL); if (err != 0) { ASSERT3U(err, ==, ENOENT); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "feature '%s' unsupported by kernel"), fname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0 && strcmp(strval, ZFS_FEATURE_DISABLED) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set to " "'enabled' or 'disabled'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (!flags.create && strcmp(strval, ZFS_FEATURE_DISABLED) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set to " "'disabled' at creation time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvlist_add_uint64(retprops, propname, 0) != 0) { (void) no_memory(hdl); goto error; } continue; } else if (prop == ZPOOL_PROP_INVAL && zfs_prop_user(propname)) { /* * This is a user property: make sure it's a * string, and that it's less than ZAP_MAXNAMELEN. */ if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property name '%s' is too long"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (strlen(strval) >= ZFS_MAXPROPLEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property value '%s' is too long"), strval); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvlist_add_string(retprops, propname, strval) != 0) { (void) no_memory(hdl); goto error; } continue; } /* * Make sure this property is valid and applies to this type. 
*/ if (prop == ZPOOL_PROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zpool_prop_readonly(prop)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (!flags.create && zpool_prop_setonce(prop)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set at " "creation time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops, &strval, &intval, errbuf) != 0) goto error; /* * Perform additional checking for specific properties. */ switch (prop) { case ZPOOL_PROP_VERSION: if (intval < version || !SPA_VERSION_IS_SUPPORTED(intval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' number %llu is invalid."), propname, (unsigned long long)intval); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); goto error; } break; case ZPOOL_PROP_ASHIFT: if (intval != 0 && (intval < ASHIFT_MIN || intval > ASHIFT_MAX)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' number %llu is invalid, " "only values between %" PRId32 " and %" PRId32 " are allowed."), propname, (unsigned long long)intval, ASHIFT_MIN, ASHIFT_MAX); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_BOOTFS: if (flags.create || flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' cannot be set at creation " "or import time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (version < SPA_VERSION_BOOTFS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to support " "'%s' property"), propname); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); goto error; } /* * bootfs property value has to be a dataset name and * the dataset has to be in the same pool as it sets to. 
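 * For example, bootfs=tank/ROOT/default is only accepted when the property
 * is being set on the pool 'tank' (see bootfs_name_valid()).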
*/ if (!bootfs_name_valid(poolname, strval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is an invalid name"), strval); (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto error; } if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "could not open pool '%s'"), poolname); (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); goto error; } zpool_close(zhp); break; case ZPOOL_PROP_ALTROOT: if (!flags.create && !flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set during pool " "creation or import"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strval[0] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bad alternate root '%s'"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } break; case ZPOOL_PROP_CACHEFILE: if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' must be empty, an " "absolute path, or 'none'"), propname); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } parent = strdup(strval); if (parent == NULL) { (void) zfs_error(hdl, EZFS_NOMEM, errbuf); goto error; } slash = strrchr(parent, '/'); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is not a valid file"), parent); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); free(parent); goto error; } *slash = '\0'; if (parent[0] != '\0' && (stat64(parent, &statbuf) != 0 || !S_ISDIR(statbuf.st_mode))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is not a valid directory"), parent); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); free(parent); goto error; } free(parent); break; case ZPOOL_PROP_COMPATIBILITY: switch (zpool_load_compat(strval, NULL, report, 1024)) { case ZPOOL_COMPATIBILITY_OK: case ZPOOL_COMPATIBILITY_WARNTOKEN: break; case ZPOOL_COMPATIBILITY_BADFILE: case ZPOOL_COMPATIBILITY_BADTOKEN: case ZPOOL_COMPATIBILITY_NOFILES: zfs_error_aux(hdl, "%s", report); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_COMMENT: for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "comment may only have printable " "characters")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } if (strlen(strval) > ZPROP_MAX_COMMENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "comment must not exceed %d characters"), ZPROP_MAX_COMMENT); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_READONLY: if (!flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set at " "import time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_MULTIHOST: if (get_system_hostid() == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "requires a non-zero system hostid")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_DEDUPDITTO: printf("Note: property '%s' no longer has " "any effect\n", propname); break; default: break; } } return (retprops); error: nvlist_free(retprops); return (NULL); } /* * Set zpool property : propname=propval. 
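 * For example, zpool_set_prop(zhp, "autotrim", "on") validates the pair via
 * zpool_valid_proplist() and then issues ZFS_IOC_POOL_SET_PROPS.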
*/ int zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) { zfs_cmd_t zc = {"\0"}; int ret = -1; char errbuf[ERRBUFLEN]; nvlist_t *nvl = NULL; nvlist_t *realprops; uint64_t version; prop_flags_t flags = { 0 }; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zpool_name); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); if (nvlist_add_string(nvl, propname, propval) != 0) { nvlist_free(nvl); return (no_memory(zhp->zpool_hdl)); } version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) { nvlist_free(nvl); return (-1); } nvlist_free(nvl); nvl = realprops; /* * Execute the corresponding ioctl() to set this property. */ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl); ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc); zcmd_free_nvlists(&zc); nvlist_free(nvl); if (ret) (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); else (void) zpool_props_refresh(zhp); return (ret); } int zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp, zfs_type_t type, boolean_t literal) { libzfs_handle_t *hdl = zhp->zpool_hdl; zprop_list_t *entry; char buf[ZFS_MAXPROPLEN]; nvlist_t *features = NULL; nvpair_t *nvp; zprop_list_t **last; boolean_t firstexpand = (NULL == *plp); int i; if (zprop_expand_list(hdl, plp, type) != 0) return (-1); if (type == ZFS_TYPE_VDEV) return (0); last = plp; while (*last != NULL) last = &(*last)->pl_next; if ((*plp)->pl_all) features = zpool_get_features(zhp); if ((*plp)->pl_all && firstexpand) { /* Handle userprops in the all properties case */ if (zhp->zpool_props == NULL && zpool_props_refresh(zhp)) return (-1); nvp = NULL; while ((nvp = nvlist_next_nvpair(zhp->zpool_props, nvp)) != NULL) { const char *propname = nvpair_name(nvp); if (!zfs_prop_user(propname)) continue; entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_USERPROP; entry->pl_user_prop = zfs_strdup(hdl, propname); entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } for (i = 0; i < SPA_FEATURES; i++) { entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_USERPROP; entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s", spa_feature_table[i].fi_uname); entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } } /* add any unsupported features */ for (nvp = nvlist_next_nvpair(features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) { char *propname; boolean_t found; if (zfeature_is_supported(nvpair_name(nvp))) continue; propname = zfs_asprintf(hdl, "unsupported@%s", nvpair_name(nvp)); /* * Before adding the property to the list make sure that no * other pool already added the same property. 
*/ found = B_FALSE; entry = *plp; while (entry != NULL) { if (entry->pl_user_prop != NULL && strcmp(propname, entry->pl_user_prop) == 0) { found = B_TRUE; break; } entry = entry->pl_next; } if (found) { free(propname); continue; } entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_USERPROP; entry->pl_user_prop = propname; entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed && !literal) continue; if (entry->pl_prop != ZPROP_USERPROP && zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf), NULL, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } else if (entry->pl_prop == ZPROP_INVAL && zfs_prop_user(entry->pl_user_prop) && zpool_get_userprop(zhp, entry->pl_user_prop, buf, sizeof (buf), NULL) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } } return (0); } int vdev_expand_proplist(zpool_handle_t *zhp, const char *vdevname, zprop_list_t **plp) { zprop_list_t *entry; char buf[ZFS_MAXPROPLEN]; const char *strval = NULL; int err = 0; nvpair_t *elem = NULL; nvlist_t *vprops = NULL; nvlist_t *propval = NULL; const char *propname; vdev_prop_t prop; zprop_list_t **last; for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed) continue; if (zpool_get_vdev_prop(zhp, vdevname, entry->pl_prop, entry->pl_user_prop, buf, sizeof (buf), NULL, B_FALSE) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } if (entry->pl_prop == VDEV_PROP_NAME && strlen(vdevname) > entry->pl_width) entry->pl_width = strlen(vdevname); } /* Handle the all properties case */ last = plp; if (*last != NULL && (*last)->pl_all == B_TRUE) { while (*last != NULL) last = &(*last)->pl_next; err = zpool_get_all_vdev_props(zhp, vdevname, &vprops); if (err != 0) return (err); while ((elem = nvlist_next_nvpair(vprops, elem)) != NULL) { propname = nvpair_name(elem); /* Skip properties that are not user defined */ if ((prop = vdev_name_to_prop(propname)) != VDEV_PROP_USERPROP) continue; if (nvpair_value_nvlist(elem, &propval) != 0) continue; strval = fnvlist_lookup_string(propval, ZPROP_VALUE); entry = zfs_alloc(zhp->zpool_hdl, sizeof (zprop_list_t)); entry->pl_prop = prop; entry->pl_user_prop = zfs_strdup(zhp->zpool_hdl, propname); entry->pl_width = strlen(strval); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } } return (0); } /* * Get the state for the given feature on the given ZFS pool. */ int zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf, size_t len) { uint64_t refcount; boolean_t found = B_FALSE; nvlist_t *features = zpool_get_features(zhp); boolean_t supported; const char *feature = strchr(propname, '@') + 1; supported = zpool_prop_feature(propname); ASSERT(supported || zpool_prop_unsupported(propname)); /* * Convert from feature name to feature guid. This conversion is * unnecessary for unsupported@... properties because they already * use guids. 
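 * For example, the feature name "async_destroy" is translated to its guid
 * "com.delphix:async_destroy" before the lookup in the features nvlist.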
*/ if (supported) { int ret; spa_feature_t fid; ret = zfeature_lookup_name(feature, &fid); if (ret != 0) { (void) strlcpy(buf, "-", len); return (ENOTSUP); } feature = spa_feature_table[fid].fi_guid; } if (nvlist_lookup_uint64(features, feature, &refcount) == 0) found = B_TRUE; if (supported) { if (!found) { (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len); } else { if (refcount == 0) (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len); else (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len); } } else { if (found) { if (refcount == 0) { (void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE); } else { (void) strcpy(buf, ZFS_UNSUPPORTED_READONLY); } } else { (void) strlcpy(buf, "-", len); return (ENOTSUP); } } return (0); } /* * Validate the given pool name, optionally putting an extended error message in * 'buf'. */ boolean_t zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) { namecheck_err_t why; char what; int ret; ret = pool_namecheck(pool, &why, &what); /* * The rules for reserved pool names were extended at a later point. * But we need to support users with existing pools that may now be * invalid. So we only check for this expanded set of names during a * create (or import), and only in userland. */ if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); return (B_FALSE); } if (ret != 0) { if (hdl != NULL) { switch (why) { case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is too long")); break; case NAME_ERR_INVALCHAR: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character " "'%c' in pool name"), what); break; case NAME_ERR_NOLETTER: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name must begin with a letter")); break; case NAME_ERR_RESERVED: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); break; case NAME_ERR_DISKLIKE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool name is reserved")); break; case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "leading slash in name")); break; case NAME_ERR_EMPTY_COMPONENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty component in name")); break; case NAME_ERR_TRAILING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "trailing slash in name")); break; case NAME_ERR_MULTIPLE_DELIMITERS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple '@' and/or '#' delimiters in " "name")); break; case NAME_ERR_NO_AT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "permission set is missing '@'")); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } } return (B_FALSE); } return (B_TRUE); } /* * Open a handle to the given pool, even if the pool is currently in the FAULTED * state. */ zpool_handle_t * zpool_open_canfail(libzfs_handle_t *hdl, const char *pool) { zpool_handle_t *zhp; boolean_t missing; /* * Make sure the pool name is valid. 
*/ if (!zpool_name_valid(hdl, B_TRUE, pool)) { (void) zfs_error_fmt(hdl, EZFS_INVALIDNAME, dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); return (NULL); } zhp = zfs_alloc(hdl, sizeof (zpool_handle_t)); zhp->zpool_hdl = hdl; (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); if (zpool_refresh_stats(zhp, &missing) != 0) { zpool_close(zhp); return (NULL); } if (missing) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool")); (void) zfs_error_fmt(hdl, EZFS_NOENT, dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); zpool_close(zhp); return (NULL); } return (zhp); } /* * Like the above, but silent on error. Used when iterating over pools (because * the configuration cache may be out of date). */ int zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret) { zpool_handle_t *zhp; boolean_t missing; zhp = zfs_alloc(hdl, sizeof (zpool_handle_t)); zhp->zpool_hdl = hdl; (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); if (zpool_refresh_stats(zhp, &missing) != 0) { zpool_close(zhp); return (-1); } if (missing) { zpool_close(zhp); *ret = NULL; return (0); } *ret = zhp; return (0); } /* * Similar to zpool_open_canfail(), but refuses to open pools in the faulted * state. */ zpool_handle_t * zpool_open(libzfs_handle_t *hdl, const char *pool) { zpool_handle_t *zhp; if ((zhp = zpool_open_canfail(hdl, pool)) == NULL) return (NULL); if (zhp->zpool_state == POOL_STATE_UNAVAIL) { (void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name); zpool_close(zhp); return (NULL); } return (zhp); } /* * Close the handle. Simply frees the memory associated with the handle. */ void zpool_close(zpool_handle_t *zhp) { nvlist_free(zhp->zpool_config); nvlist_free(zhp->zpool_old_config); nvlist_free(zhp->zpool_props); free(zhp); } /* * Return the name of the pool. */ const char * zpool_get_name(zpool_handle_t *zhp) { return (zhp->zpool_name); } /* * Return the state of the pool (ACTIVE or UNAVAILABLE) */ int zpool_get_state(zpool_handle_t *zhp) { return (zhp->zpool_state); } /* * Check if vdev list contains a special vdev */ static boolean_t zpool_has_special_vdev(nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (uint_t c = 0; c < children; c++) { const char *bias; if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 && strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) { return (B_TRUE); } } } return (B_FALSE); } /* * Check if vdev list contains a dRAID vdev */ static boolean_t zpool_has_draid_vdev(nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (uint_t c = 0; c < children; c++) { const char *type; if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) == 0 && strcmp(type, VDEV_TYPE_DRAID) == 0) { return (B_TRUE); } } } return (B_FALSE); } /* * Output a dRAID top-level vdev name in to the provided buffer. */ static char * zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity, uint64_t spares, uint64_t children) { snprintf(name, len, "%s%llu:%llud:%lluc:%llus", VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data, (u_longlong_t)children, (u_longlong_t)spares); return (name); } /* * Return B_TRUE if the provided name is a dRAID spare name. 
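 *
 * Purely as an illustration of the name format parsed below (the sscanf
 * expects "draid<parity>-<top-level vdev id>-<spare id>"), a hedged usage
 * sketch could look like this; the spare name is hypothetical:
 *
 *	if (zpool_is_draid_spare("draid1-0-0"))		// hypothetical name
 *		(void) printf("distributed spare\n");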
*/ boolean_t zpool_is_draid_spare(const char *name) { uint64_t spare_id, parity, vdev_id; if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, (u_longlong_t *)&spare_id) == 3) { return (B_TRUE); } return (B_FALSE); } /* * Create the named pool, using the provided vdev list. It is assumed * that the consumer has already validated the contents of the nvlist, so we * don't have to worry about error semantics. */ int zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *fsprops) { zfs_cmd_t zc = {"\0"}; nvlist_t *zc_fsprops = NULL; nvlist_t *zc_props = NULL; nvlist_t *hidden_args = NULL; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; char errbuf[ERRBUFLEN]; int ret = -1; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), pool); if (!zpool_name_valid(hdl, B_FALSE, pool)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); zcmd_write_conf_nvlist(hdl, &zc, nvroot); if (props) { prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE }; if ((zc_props = zpool_valid_proplist(hdl, pool, props, SPA_VERSION_1, flags, errbuf)) == NULL) { goto create_failed; } } if (fsprops) { uint64_t zoned; const char *zonestr; zoned = ((nvlist_lookup_string(fsprops, zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) && strcmp(zonestr, "on") == 0); if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM, fsprops, zoned, NULL, NULL, B_TRUE, errbuf)) == NULL) { goto create_failed; } if (nvlist_exists(zc_fsprops, zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) && !zpool_has_special_vdev(nvroot)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s property requires a special vdev"), zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto create_failed; } if (!zc_props && (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) { goto create_failed; } if (zfs_crypto_create(hdl, NULL, zc_fsprops, props, B_TRUE, &wkeydata, &wkeylen) != 0) { zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); goto create_failed; } if (nvlist_add_nvlist(zc_props, ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) { goto create_failed; } if (wkeydata != NULL) { if (nvlist_alloc(&hidden_args, NV_UNIQUE_NAME, 0) != 0) goto create_failed; if (nvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen) != 0) goto create_failed; if (nvlist_add_nvlist(zc_props, ZPOOL_HIDDEN_ARGS, hidden_args) != 0) goto create_failed; } } if (zc_props) zcmd_write_src_nvlist(hdl, &zc, zc_props); (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) { zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(zc_fsprops); nvlist_free(hidden_args); if (wkeydata != NULL) free(wkeydata); switch (errno) { case EBUSY: /* * This can happen if the user has specified the same * device multiple times. We can't reliably detect this * until we try to add it and see we already have a * label. This can also happen under if the device is * part of an active md or lvm device. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more vdevs refer to the same device, or " "one of\nthe devices is part of an active md or " "lvm device")); return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case ERANGE: /* * This happens if the record size is smaller or larger * than the allowed size range, or not a power of 2. 
* * NOTE: although zfs_valid_proplist is called earlier, * this case may have slipped through since the * pool does not exist yet and it is therefore * impossible to read properties e.g. max blocksize * from the pool. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "record size invalid")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); case EOVERFLOW: /* * This occurs when one of the devices is below * SPA_MINDEVSIZE. Unfortunately, we can't detect which * device was the problem device since there's no * reliable way to determine device size from userland. */ { char buf[64]; zfs_nicebytes(SPA_MINDEVSIZE, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is less than the " "minimum size (%s)"), buf); } return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case ENOSPC: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is out of space")); return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case EINVAL: if (zpool_has_draid_vdev(nvroot) && zfeature_lookup_name("draid", NULL) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID vdevs are unsupported by the " "kernel")); return (zfs_error(hdl, EZFS_BADDEV, errbuf)); } else { return (zpool_standard_error(hdl, errno, errbuf)); } default: return (zpool_standard_error(hdl, errno, errbuf)); } } create_failed: zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(zc_fsprops); nvlist_free(hidden_args); if (wkeydata != NULL) free(wkeydata); return (ret); } /* * Destroy the given pool. It is up to the caller to ensure that there are no * datasets left in the pool. */ int zpool_destroy(zpool_handle_t *zhp, const char *log_str) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zfp = NULL; libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; if (zhp->zpool_state == POOL_STATE_ACTIVE && (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL) return (-1); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zpool_name); if (errno == EROFS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is read only")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); } else { (void) zpool_standard_error(hdl, errno, errbuf); } if (zfp) zfs_close(zfp); return (-1); } if (zfp) { remove_mountpoint(zfp); zfs_close(zfp); } return (0); } /* * Create a checkpoint in the given pool. */ int zpool_checkpoint(zpool_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; int error; error = lzc_pool_checkpoint(zhp->zpool_name); if (error != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot checkpoint '%s'"), zhp->zpool_name); (void) zpool_standard_error(hdl, error, errbuf); return (-1); } return (0); } /* * Discard the checkpoint from the given pool. */ int zpool_discard_checkpoint(zpool_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; int error; error = lzc_pool_checkpoint_discard(zhp->zpool_name); if (error != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot discard checkpoint in '%s'"), zhp->zpool_name); (void) zpool_standard_error(hdl, error, errbuf); return (-1); } return (0); } /* * Load data type for the given pool. 
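 *
 * Illustrative sketch only, not part of the original interface comment:
 * prefetch the DDT of an already-open pool.  The handle "zhp" is assumed
 * to come from zpool_open() elsewhere.
 *
 *	int err = zpool_prefetch(zhp, ZPOOL_PREFETCH_DDT);
 *	if (err != 0)
 *		return (err);	// error already reported via libzfs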
*/ int zpool_prefetch(zpool_handle_t *zhp, zpool_prefetch_type_t type) { libzfs_handle_t *hdl = zhp->zpool_hdl; char msg[1024]; int error; error = lzc_pool_prefetch(zhp->zpool_name, type); if (error != 0) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot prefetch %s in '%s'"), type == ZPOOL_PREFETCH_DDT ? "ddt" : "", zhp->zpool_name); (void) zpool_standard_error(hdl, error, msg); return (-1); } return (0); } /* * Add the given vdevs to the pool. The caller must have already performed the * necessary verification to ensure that the vdev specification is well-formed. */ int zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot, boolean_t check_ashift) { zfs_cmd_t zc = {"\0"}; int ret; libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot add to '%s'"), zhp->zpool_name); if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_SPARES && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " "upgraded to add hot spares")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); } if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_L2CACHE && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " "upgraded to add cache devices")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); } zcmd_write_conf_nvlist(hdl, &zc, nvroot); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_flags = check_ashift; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { switch (errno) { case EBUSY: /* * This can happen if the user has specified the same * device multiple times. We can't reliably detect this * until we try to add it and see we already have a * label. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more vdevs refer to the same device")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EINVAL: if (zpool_has_draid_vdev(nvroot) && zfeature_lookup_name("draid", NULL) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID vdevs are unsupported by the " "kernel")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; a pool with removing/" "removed vdevs does not support adding " "raidz or dRAID vdevs")); } (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EOVERFLOW: /* * This occurs when one of the devices is below * SPA_MINDEVSIZE. Unfortunately, we can't detect which * device was the problem device since there's no * reliable way to determine device size from userland. */ { char buf[64]; zfs_nicebytes(SPA_MINDEVSIZE, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device is less than the minimum " "size (%s)"), buf); } (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to add these vdevs")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; default: (void) zpool_standard_error(hdl, errno, errbuf); } ret = -1; } else { ret = 0; } zcmd_free_nvlists(&zc); return (ret); } /* * Exports the pool from the system. The caller must ensure that there are no * mounted datasets in the pool. 
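 *
 * Hedged caller sketch (illustrative only); the history string is an
 * assumption of the example and "zhp" is an already-open handle:
 *
 *	if (zpool_export(zhp, B_FALSE, "zpool export tank") != 0)
 *		return (-1);	// extended error set on zhp->zpool_hdl
 *
 * zpool_export_force() below is the hard-force variant of the same call.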
*/ static int zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, const char *log_str) { zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = force; zc.zc_guid = hardforce; zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { switch (errno) { case EXDEV: zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, "use '-f' to override the following errors:\n" "'%s' has an active shared spare which could be" " used by other pools once '%s' is exported."), zhp->zpool_name, zhp->zpool_name); return (zfs_error_fmt(zhp->zpool_hdl, EZFS_ACTIVE_SPARE, dgettext(TEXT_DOMAIN, "cannot export '%s'"), zhp->zpool_name)); default: return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, dgettext(TEXT_DOMAIN, "cannot export '%s'"), zhp->zpool_name)); } } return (0); } int zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) { return (zpool_export_common(zhp, force, B_FALSE, log_str)); } int zpool_export_force(zpool_handle_t *zhp, const char *log_str) { return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str)); } static void zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, nvlist_t *config) { nvlist_t *nv = NULL; uint64_t rewindto; int64_t loss = -1; struct tm t; char timestr[128]; if (!hdl->libzfs_printerr || config == NULL) return; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) { return; } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) return; (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); if (localtime_r((time_t *)&rewindto, &t) != NULL && ctime_r((time_t *)&rewindto, timestr) != NULL) { timestr[24] = 0; if (dryrun) { (void) printf(dgettext(TEXT_DOMAIN, "Would be able to return %s " "to its state as of %s.\n"), name, timestr); } else { (void) printf(dgettext(TEXT_DOMAIN, "Pool %s returned to its state as of %s.\n"), name, timestr); } if (loss > 120) { (void) printf(dgettext(TEXT_DOMAIN, "%s approximately %lld "), dryrun ? "Would discard" : "Discarded", ((longlong_t)loss + 30) / 60); (void) printf(dgettext(TEXT_DOMAIN, "minutes of transactions.\n")); } else if (loss > 0) { (void) printf(dgettext(TEXT_DOMAIN, "%s approximately %lld "), dryrun ? 
"Would discard" : "Discarded", (longlong_t)loss); (void) printf(dgettext(TEXT_DOMAIN, "seconds of transactions.\n")); } } } void zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, nvlist_t *config, char *buf, size_t size) { nvlist_t *nv = NULL; int64_t loss = -1; uint64_t edata = UINT64_MAX; uint64_t rewindto; struct tm t; char timestr[128], temp[1024]; if (!hdl->libzfs_printerr) return; /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 || nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) goto no_info; (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS, &edata); (void) snprintf(buf, size, dgettext(TEXT_DOMAIN, "Recovery is possible, but will result in some data loss.\n")); if (localtime_r((time_t *)&rewindto, &t) != NULL && ctime_r((time_t *)&rewindto, timestr) != NULL) { timestr[24] = 0; (void) snprintf(temp, 1024, dgettext(TEXT_DOMAIN, "\tReturning the pool to its state as of %s\n" "\tshould correct the problem. "), timestr); (void) strlcat(buf, temp, size); } else { (void) strlcat(buf, dgettext(TEXT_DOMAIN, "\tReverting the pool to an earlier state " "should correct the problem.\n\t"), size); } if (loss > 120) { (void) snprintf(temp, 1024, dgettext(TEXT_DOMAIN, "Approximately %lld minutes of data\n" "\tmust be discarded, irreversibly. "), ((longlong_t)loss + 30) / 60); (void) strlcat(buf, temp, size); } else if (loss > 0) { (void) snprintf(temp, 1024, dgettext(TEXT_DOMAIN, "Approximately %lld seconds of data\n" "\tmust be discarded, irreversibly. "), (longlong_t)loss); (void) strlcat(buf, temp, size); } if (edata != 0 && edata != UINT64_MAX) { if (edata == 1) { (void) strlcat(buf, dgettext(TEXT_DOMAIN, "After rewind, at least\n" "\tone persistent user-data error will remain. "), size); } else { (void) strlcat(buf, dgettext(TEXT_DOMAIN, "After rewind, several\n" "\tpersistent user-data errors will remain. "), size); } } (void) snprintf(temp, 1024, dgettext(TEXT_DOMAIN, "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "), reason >= 0 ? "clear" : "import", name); (void) strlcat(buf, temp, size); (void) strlcat(buf, dgettext(TEXT_DOMAIN, "A scrub of the pool\n" "\tis strongly recommended after recovery.\n"), size); return; no_info: (void) strlcat(buf, dgettext(TEXT_DOMAIN, "Destroy and re-create the pool from\n\ta backup source.\n"), size); } /* * zpool_import() is a contracted interface. Should be kept the same * if possible. * * Applications should use zpool_import_props() to import a pool with * new properties value to be set. 
*/ int zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, char *altroot) { nvlist_t *props = NULL; int ret; if (altroot != NULL) { if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { return (zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); } if (nvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 || nvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) { nvlist_free(props); return (zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); } } ret = zpool_import_props(hdl, config, newname, props, ZFS_IMPORT_NORMAL); nvlist_free(props); return (ret); } static void print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; char *vname; uint64_t is_log = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); if (name != NULL) (void) printf("\t%*s%s%s\n", indent, "", name, is_log ? " [log]" : ""); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { vname = zpool_vdev_name(hdl, NULL, child[c], VDEV_NAME_TYPE_ID); print_vdev_tree(hdl, vname, child[c], indent + 2); free(vname); } } void zpool_collect_unsup_feat(nvlist_t *config, char *buf, size_t size) { nvlist_t *nvinfo, *unsup_feat; char temp[512]; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); unsup_feat = fnvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT); for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL; nvp = nvlist_next_nvpair(unsup_feat, nvp)) { const char *desc = fnvpair_value_string(nvp); if (strlen(desc) > 0) { (void) snprintf(temp, 512, "\t%s (%s)\n", nvpair_name(nvp), desc); (void) strlcat(buf, temp, size); } else { (void) snprintf(temp, 512, "\t%s\n", nvpair_name(nvp)); (void) strlcat(buf, temp, size); } } } /* * Import the given pool using the known configuration and a list of * properties to be set. The configuration should have come from * zpool_find_import(). The 'newname' parameters control whether the pool * is imported with a different name. 
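 *
 * Hedged example of a property-carrying import (the cachefile value is
 * illustrative; other ZFS_IMPORT_* flags exist but are not shown):
 *
 *	nvlist_t *props = fnvlist_alloc();
 *	fnvlist_add_string(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none");
 *	int err = zpool_import_props(hdl, config, NULL, props,
 *	    ZFS_IMPORT_NORMAL);
 *	fnvlist_free(props);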
*/ int zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, nvlist_t *props, int flags) { zfs_cmd_t zc = {"\0"}; zpool_load_policy_t policy; nvlist_t *nv = NULL; nvlist_t *nvinfo = NULL; nvlist_t *missing = NULL; const char *thename; const char *origname; int ret; int error = 0; char buf[2048]; char errbuf[ERRBUFLEN]; origname = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot import pool '%s'"), origname); if (newname != NULL) { if (!zpool_name_valid(hdl, B_FALSE, newname)) return (zfs_error_fmt(hdl, EZFS_INVALIDNAME, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); thename = newname; } else { thename = origname; } if (props != NULL) { uint64_t version; prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if ((props = zpool_valid_proplist(hdl, origname, props, version, flags, errbuf)) == NULL) return (-1); zcmd_write_src_nvlist(hdl, &zc, props); nvlist_free(props); } (void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name)); zc.zc_guid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID); zcmd_write_conf_nvlist(hdl, &zc, config); zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2); zc.zc_cookie = flags; while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 && errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); if (ret != 0) error = errno; (void) zcmd_read_dst_nvlist(hdl, &zc, &nv); zcmd_free_nvlists(&zc); zpool_get_load_policy(config, &policy); if (error) { char desc[1024]; char aux[256]; /* * Dry-run failed, but we print out what success * looks like if we found a best txg */ if (policy.zlp_rewind & ZPOOL_TRY_REWIND) { zpool_rewind_exclaim(hdl, newname ? origname : thename, B_TRUE, nv); nvlist_free(nv); return (-1); } if (newname == NULL) (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s'"), thename); else (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"), origname, thename); switch (error) { case ENOTSUP: if (nv != NULL && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) { (void) printf(dgettext(TEXT_DOMAIN, "This " "pool uses the following feature(s) not " "supported by this system:\n")); memset(buf, 0, 2048); zpool_collect_unsup_feat(nv, buf, 2048); (void) printf("%s", buf); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY)) { (void) printf(dgettext(TEXT_DOMAIN, "All unsupported features are only " "required for writing to the pool." "\nThe pool can be imported using " "'-o readonly=on'.\n")); } } /* * Unsupported version. 
*/ (void) zfs_error(hdl, EZFS_BADVERSION, desc); break; case EREMOTEIO: if (nv != NULL && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0) { const char *hostname = ""; uint64_t hostid = 0; mmp_state_t mmp_state; mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); if (mmp_state == MMP_STATE_ACTIVE) { (void) snprintf(aux, sizeof (aux), dgettext(TEXT_DOMAIN, "pool is imp" "orted on host '%s' (hostid=%lx).\n" "Export the pool on the other " "system, then run 'zpool import'."), hostname, (unsigned long) hostid); } else if (mmp_state == MMP_STATE_NO_HOSTID) { (void) snprintf(aux, sizeof (aux), dgettext(TEXT_DOMAIN, "pool has " "the multihost property on and " "the\nsystem's hostid is not set. " "Set a unique system hostid with " "the zgenhostid(8) command.\n")); } (void) zfs_error_aux(hdl, "%s", aux); } (void) zfs_error(hdl, EZFS_ACTIVE_POOL, desc); break; case EINVAL: (void) zfs_error(hdl, EZFS_INVALCONFIG, desc); break; case EROFS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is read only")); (void) zfs_error(hdl, EZFS_BADDEV, desc); break; case ENXIO: if (nv && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) { (void) printf(dgettext(TEXT_DOMAIN, "The devices below are missing or " "corrupted, use '-m' to import the pool " "anyway:\n")); print_vdev_tree(hdl, NULL, missing, 2); (void) printf("\n"); } (void) zpool_standard_error(hdl, error, desc); break; case EEXIST: (void) zpool_standard_error(hdl, error, desc); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices are already in use\n")); (void) zfs_error(hdl, EZFS_BADDEV, desc); break; case ENAMETOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new name of at least one dataset is longer than " "the maximum allowable length")); (void) zfs_error(hdl, EZFS_NAMETOOLONG, desc); break; default: (void) zpool_standard_error(hdl, error, desc); memset(buf, 0, 2048); zpool_explain_recover(hdl, newname ? origname : thename, -error, nv, buf, 2048); (void) printf("\t%s", buf); break; } nvlist_free(nv); ret = -1; } else { zpool_handle_t *zhp; /* * This should never fail, but play it safe anyway. */ if (zpool_open_silent(hdl, thename, &zhp) != 0) ret = -1; else if (zhp != NULL) zpool_close(zhp); if (policy.zlp_rewind & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { zpool_rewind_exclaim(hdl, newname ? origname : thename, ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nv); } nvlist_free(nv); } return (ret); } /* * Translate vdev names to guids. If a vdev_path is determined to be * unsuitable then a vd_errlist is allocated and the vdev path and errno * are added to it. */ static int zpool_translate_vdev_guids(zpool_handle_t *zhp, nvlist_t *vds, nvlist_t *vdev_guids, nvlist_t *guids_to_paths, nvlist_t **vd_errlist) { nvlist_t *errlist = NULL; int error = 0; for (nvpair_t *elem = nvlist_next_nvpair(vds, NULL); elem != NULL; elem = nvlist_next_nvpair(vds, elem)) { boolean_t spare, cache; const char *vd_path = nvpair_name(elem); nvlist_t *tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL); if ((tgt == NULL) || cache || spare) { if (errlist == NULL) { errlist = fnvlist_alloc(); error = EINVAL; } uint64_t err = (tgt == NULL) ? EZFS_NODEVICE : (spare ? 
EZFS_ISSPARE : EZFS_ISL2CACHE); fnvlist_add_int64(errlist, vd_path, err); continue; } uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); fnvlist_add_uint64(vdev_guids, vd_path, guid); char msg[MAXNAMELEN]; (void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid); fnvlist_add_string(guids_to_paths, msg, vd_path); } if (error != 0) { verify(errlist != NULL); if (vd_errlist != NULL) *vd_errlist = errlist; else fnvlist_free(errlist); } return (error); } static int xlate_init_err(int err) { switch (err) { case ENODEV: return (EZFS_NODEVICE); case EINVAL: case EROFS: return (EZFS_BADDEV); case EBUSY: return (EZFS_INITIALIZING); case ESRCH: return (EZFS_NO_INITIALIZE); } return (err); } /* * Begin, suspend, cancel, or uninit (clear) the initialization (initializing * of all free blocks) for the given vdevs in the given pool. */ static int zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds, boolean_t wait) { int err; nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *guids_to_paths = fnvlist_alloc(); nvlist_t *vd_errlist = NULL; nvlist_t *errlist; nvpair_t *elem; err = zpool_translate_vdev_guids(zhp, vds, vdev_guids, guids_to_paths, &vd_errlist); if (err != 0) { verify(vd_errlist != NULL); goto list_errors; } err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids, &errlist); if (err != 0) { if (errlist != NULL && nvlist_lookup_nvlist(errlist, ZPOOL_INITIALIZE_VDEVS, &vd_errlist) == 0) { goto list_errors; } if (err == EINVAL && cmd_type == POOL_INITIALIZE_UNINIT) { zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, "uninitialize is not supported by kernel")); } (void) zpool_standard_error(zhp->zpool_hdl, err, dgettext(TEXT_DOMAIN, "operation failed")); goto out; } if (wait) { for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL; elem = nvlist_next_nvpair(vdev_guids, elem)) { uint64_t guid = fnvpair_value_uint64(elem); err = lzc_wait_tag(zhp->zpool_name, ZPOOL_WAIT_INITIALIZE, guid, NULL); if (err != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, err, dgettext(TEXT_DOMAIN, "error " "waiting for '%s' to initialize"), nvpair_name(elem)); goto out; } } } goto out; list_errors: for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL; elem = nvlist_next_nvpair(vd_errlist, elem)) { int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem)); const char *path; if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem), &path) != 0) path = nvpair_name(elem); (void) zfs_error_fmt(zhp->zpool_hdl, vd_error, "cannot initialize '%s'", path); } out: fnvlist_free(vdev_guids); fnvlist_free(guids_to_paths); if (vd_errlist != NULL) fnvlist_free(vd_errlist); return (err == 0 ? 
0 : -1); } int zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds) { return (zpool_initialize_impl(zhp, cmd_type, vds, B_FALSE)); } int zpool_initialize_wait(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds) { return (zpool_initialize_impl(zhp, cmd_type, vds, B_TRUE)); } static int xlate_trim_err(int err) { switch (err) { case ENODEV: return (EZFS_NODEVICE); case EINVAL: case EROFS: return (EZFS_BADDEV); case EBUSY: return (EZFS_TRIMMING); case ESRCH: return (EZFS_NO_TRIM); case EOPNOTSUPP: return (EZFS_TRIM_NOTSUP); } return (err); } static int zpool_trim_wait(zpool_handle_t *zhp, nvlist_t *vdev_guids) { int err; nvpair_t *elem; for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL; elem = nvlist_next_nvpair(vdev_guids, elem)) { uint64_t guid = fnvpair_value_uint64(elem); err = lzc_wait_tag(zhp->zpool_name, ZPOOL_WAIT_TRIM, guid, NULL); if (err != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, err, dgettext(TEXT_DOMAIN, "error " "waiting to trim '%s'"), nvpair_name(elem)); return (err); } } return (0); } /* * Check errlist and report any errors, omitting ones which should be * suppressed. Returns B_TRUE if any errors were reported. */ static boolean_t check_trim_errs(zpool_handle_t *zhp, trimflags_t *trim_flags, nvlist_t *guids_to_paths, nvlist_t *vds, nvlist_t *errlist) { nvpair_t *elem; boolean_t reported_errs = B_FALSE; int num_vds = 0; int num_suppressed_errs = 0; for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL; elem = nvlist_next_nvpair(vds, elem)) { num_vds++; } for (elem = nvlist_next_nvpair(errlist, NULL); elem != NULL; elem = nvlist_next_nvpair(errlist, elem)) { int64_t vd_error = xlate_trim_err(fnvpair_value_int64(elem)); const char *path; /* * If only the pool was specified, and it was not a secure * trim then suppress warnings for individual vdevs which * do not support trimming. */ if (vd_error == EZFS_TRIM_NOTSUP && trim_flags->fullpool && !trim_flags->secure) { num_suppressed_errs++; continue; } reported_errs = B_TRUE; if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem), &path) != 0) path = nvpair_name(elem); (void) zfs_error_fmt(zhp->zpool_hdl, vd_error, "cannot trim '%s'", path); } if (num_suppressed_errs == num_vds) { (void) zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, "no devices in pool support trim operations")); (void) (zfs_error(zhp->zpool_hdl, EZFS_TRIM_NOTSUP, dgettext(TEXT_DOMAIN, "cannot trim"))); reported_errs = B_TRUE; } return (reported_errs); } /* * Begin, suspend, or cancel the TRIM (discarding of all free blocks) for * the given vdevs in the given pool. 
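 *
 * Hedged usage sketch.  The flag fields mirror the ones consumed below
 * (rate, secure, wait, fullpool); POOL_TRIM_START as the "begin" command
 * and the device path are assumptions of the example:
 *
 *	trimflags_t flags = { .fullpool = B_FALSE, .secure = B_FALSE,
 *	    .wait = B_FALSE, .rate = 0 };
 *	nvlist_t *vds = fnvlist_alloc();
 *	fnvlist_add_boolean(vds, "/dev/sda");	// hypothetical vdev path
 *	int err = zpool_trim(zhp, POOL_TRIM_START, vds, &flags);
 *	fnvlist_free(vds);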
*/ int zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds, trimflags_t *trim_flags) { int err; int retval = 0; nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *guids_to_paths = fnvlist_alloc(); nvlist_t *errlist = NULL; err = zpool_translate_vdev_guids(zhp, vds, vdev_guids, guids_to_paths, &errlist); if (err != 0) { check_trim_errs(zhp, trim_flags, guids_to_paths, vds, errlist); retval = -1; goto out; } err = lzc_trim(zhp->zpool_name, cmd_type, trim_flags->rate, trim_flags->secure, vdev_guids, &errlist); if (err != 0) { nvlist_t *vd_errlist; if (errlist != NULL && nvlist_lookup_nvlist(errlist, ZPOOL_TRIM_VDEVS, &vd_errlist) == 0) { if (check_trim_errs(zhp, trim_flags, guids_to_paths, vds, vd_errlist)) { retval = -1; goto out; } } else { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "operation failed")); zpool_standard_error(zhp->zpool_hdl, err, errbuf); retval = -1; goto out; } } if (trim_flags->wait) retval = zpool_trim_wait(zhp, vdev_guids); out: if (errlist != NULL) fnvlist_free(errlist); fnvlist_free(vdev_guids); fnvlist_free(guids_to_paths); return (retval); } /* * Scan the pool. */ int zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) { char errbuf[ERRBUFLEN]; int err; libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *args = fnvlist_alloc(); fnvlist_add_uint64(args, "scan_type", (uint64_t)func); fnvlist_add_uint64(args, "scan_command", (uint64_t)cmd); err = lzc_scrub(ZFS_IOC_POOL_SCRUB, zhp->zpool_name, args, NULL); fnvlist_free(args); if (err == 0) { return (0); } else if (err == ZFS_ERR_IOC_CMD_UNAVAIL) { zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = func; zc.zc_flags = cmd; if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0) return (0); } /* * An ECANCELED on a scrub means one of the following: * 1. we resumed a paused scrub. * 2. we resumed a paused error scrub. * 3. Error scrub is not run because of no error log. */ if (err == ECANCELED && (func == POOL_SCAN_SCRUB || func == POOL_SCAN_ERRORSCRUB) && cmd == POOL_SCRUB_NORMAL) return (0); /* * The following cases have been handled here: * 1. Paused a scrub/error scrub if there is none in progress. */ if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_PAUSE) { return (0); } ASSERT3U(func, >=, POOL_SCAN_NONE); ASSERT3U(func, <, POOL_SCAN_FUNCS); if (func == POOL_SCAN_SCRUB || func == POOL_SCAN_ERRORSCRUB) { if (cmd == POOL_SCRUB_PAUSE) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot pause scrubbing %s"), zhp->zpool_name); } else { assert(cmd == POOL_SCRUB_NORMAL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot scrub %s"), zhp->zpool_name); } } else if (func == POOL_SCAN_RESILVER) { assert(cmd == POOL_SCRUB_NORMAL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot restart resilver on %s"), zhp->zpool_name); } else if (func == POOL_SCAN_NONE) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"), zhp->zpool_name); } else { assert(!"unexpected result"); } /* * With EBUSY, five cases are possible: * * Current state Requested * 1. Normal Scrub Running Normal Scrub or Error Scrub * 2. Normal Scrub Paused Error Scrub * 3. Normal Scrub Paused Pause Normal Scrub * 4. Error Scrub Running Normal Scrub or Error Scrub * 5. Error Scrub Paused Pause Error Scrub * 6. 
Resilvering Anything else */ if (err == EBUSY) { nvlist_t *nvroot; pool_scan_stat_t *ps = NULL; uint_t psc; nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); if (ps && ps->pss_func == POOL_SCAN_SCRUB && ps->pss_state == DSS_SCANNING) { if (ps->pss_pass_scrub_pause == 0) { /* handles case 1 */ assert(cmd == POOL_SCRUB_NORMAL); return (zfs_error(hdl, EZFS_SCRUBBING, errbuf)); } else { if (func == POOL_SCAN_ERRORSCRUB) { /* handles case 2 */ ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL); return (zfs_error(hdl, EZFS_SCRUB_PAUSED_TO_CANCEL, errbuf)); } else { /* handles case 3 */ ASSERT3U(func, ==, POOL_SCAN_SCRUB); ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE); return (zfs_error(hdl, EZFS_SCRUB_PAUSED, errbuf)); } } } else if (ps && ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB && ps->pss_error_scrub_state == DSS_ERRORSCRUBBING) { if (ps->pss_pass_error_scrub_pause == 0) { /* handles case 4 */ ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL); return (zfs_error(hdl, EZFS_ERRORSCRUBBING, errbuf)); } else { /* handles case 5 */ ASSERT3U(func, ==, POOL_SCAN_ERRORSCRUB); ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE); return (zfs_error(hdl, EZFS_ERRORSCRUB_PAUSED, errbuf)); } } else { /* handles case 6 */ return (zfs_error(hdl, EZFS_RESILVERING, errbuf)); } } else if (err == ENOENT) { return (zfs_error(hdl, EZFS_NO_SCRUB, errbuf)); } else if (err == ENOTSUP && func == POOL_SCAN_RESILVER) { return (zfs_error(hdl, EZFS_NO_RESILVER_DEFER, errbuf)); } else { return (zpool_standard_error(hdl, err, errbuf)); } } /* * Find a vdev that matches the search criteria specified. We use the * the nvpair name to determine how we should look for the device. * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL * spare; but FALSE if its an INUSE spare. * * If 'return_parent' is set, then return the *parent* of the vdev you're * searching for rather than the vdev itself. */ static nvlist_t * vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log, boolean_t return_parent) { uint_t c, children; nvlist_t **child; nvlist_t *ret; uint64_t is_log; const char *srchkey; nvpair_t *pair = nvlist_next_nvpair(search, NULL); const char *tmp = NULL; boolean_t is_root; /* Nothing to look for */ if (search == NULL || pair == NULL) return (NULL); /* Obtain the key we will use to search */ srchkey = nvpair_name(pair); nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &tmp); if (strcmp(tmp, "root") == 0) is_root = B_TRUE; else is_root = B_FALSE; switch (nvpair_type(pair)) { case DATA_TYPE_UINT64: if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) { uint64_t srchval = fnvpair_value_uint64(pair); uint64_t theguid = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID); if (theguid == srchval) return (nv); } break; case DATA_TYPE_STRING: { const char *srchval, *val; srchval = fnvpair_value_string(pair); if (nvlist_lookup_string(nv, srchkey, &val) != 0) break; /* * Search for the requested value. Special cases: * * - ZPOOL_CONFIG_PATH for whole disk entries. These end in * "-part1", or "p1". The suffix is hidden from the user, * but included in the string, so this matches around it. * - ZPOOL_CONFIG_PATH for short names zfs_strcmp_shortname() * is used to check all possible expanded paths. * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE). * * Otherwise, all other searches are simple string compares. 
*/ if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0) { uint64_t wholedisk = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); if (zfs_strcmp_pathname(srchval, val, wholedisk) == 0) return (nv); } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0) { char *type, *idx, *end, *p; uint64_t id, vdev_id; /* * Determine our vdev type, keeping in mind * that the srchval is composed of a type and * vdev id pair (i.e. mirror-4). */ if ((type = strdup(srchval)) == NULL) return (NULL); if ((p = strrchr(type, '-')) == NULL) { free(type); break; } idx = p + 1; *p = '\0'; /* * If the types don't match then keep looking. */ if (strncmp(val, type, strlen(val)) != 0) { free(type); break; } verify(zpool_vdev_is_interior(type)); id = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID); errno = 0; vdev_id = strtoull(idx, &end, 10); /* * If we are looking for a raidz and a parity is * specified, make sure it matches. */ int rzlen = strlen(VDEV_TYPE_RAIDZ); assert(rzlen == strlen(VDEV_TYPE_DRAID)); int typlen = strlen(type); if ((strncmp(type, VDEV_TYPE_RAIDZ, rzlen) == 0 || strncmp(type, VDEV_TYPE_DRAID, rzlen) == 0) && typlen != rzlen) { uint64_t vdev_parity; int parity = *(type + rzlen) - '0'; if (parity <= 0 || parity > 3 || (typlen - rzlen) != 1) { /* * Nonsense parity specified, can * never match */ free(type); return (NULL); } vdev_parity = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); if ((int)vdev_parity != parity) { free(type); break; } } free(type); if (errno != 0) return (NULL); /* * Now verify that we have the correct vdev id. */ if (vdev_id == id) return (nv); } /* * Common case */ if (strcmp(srchval, val) == 0) return (nv); break; } default: break; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (NULL); for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL, return_parent)) != NULL) { /* * The 'is_log' value is only set for the toplevel * vdev, not the leaf vdevs. So we always lookup the * log device from the root of the vdev tree (where * 'log' is non-NULL). */ if (log != NULL && nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log) == 0 && is_log) { *log = B_TRUE; } return (ret && return_parent && !is_root ? nv : ret); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL, return_parent)) != NULL) { *avail_spare = B_TRUE; return (ret && return_parent && !is_root ? nv : ret); } } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL, return_parent)) != NULL) { *l2cache = B_TRUE; return (ret && return_parent && !is_root ? nv : ret); } } } return (NULL); } /* * Given a physical path or guid, find the associated vdev. 
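 *
 * Illustrative call; the physical path is hypothetical, and a decimal or
 * hex guid string is accepted instead, as parsed below:
 *
 *	boolean_t spare, l2cache, log;
 *	nvlist_t *nv = zpool_find_vdev_by_physpath(zhp,
 *	    "pci-0000:00:1f.2-ata-1", &spare, &l2cache, &log);
 *	if (nv == NULL)
 *		return (ENOENT);	// no matching vdev in this pool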
*/ nvlist_t * zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { nvlist_t *search, *nvroot, *ret; uint64_t guid; char *end; search = fnvlist_alloc(); guid = strtoull(ppath, &end, 0); if (guid != 0 && *end == '\0') { fnvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid); } else { fnvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath); } nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE); *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log, B_FALSE); fnvlist_free(search); return (ret); } /* * Determine if we have an "interior" top-level vdev (i.e mirror/raidz). */ static boolean_t zpool_vdev_is_interior(const char *name) { if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 || strncmp(name, VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || strncmp(name, VDEV_TYPE_ROOT, strlen(VDEV_TYPE_ROOT)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 && !zpool_is_draid_spare(name)) return (B_TRUE); return (B_FALSE); } /* * Lookup the nvlist for a given vdev or vdev's parent (depending on * if 'return_parent' is set). */ static nvlist_t * __zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log, boolean_t return_parent) { char *end; nvlist_t *nvroot, *search, *ret; uint64_t guid; boolean_t __avail_spare, __l2cache, __log; search = fnvlist_alloc(); guid = strtoull(path, &end, 0); if (guid != 0 && *end == '\0') { fnvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid); } else if (zpool_vdev_is_interior(path)) { fnvlist_add_string(search, ZPOOL_CONFIG_TYPE, path); } else { fnvlist_add_string(search, ZPOOL_CONFIG_PATH, path); } nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE); /* * User can pass NULL for avail_spare, l2cache, and log, but * we still need to provide variables to vdev_to_nvlist_iter(), so * just point them to junk variables here. */ if (!avail_spare) avail_spare = &__avail_spare; if (!l2cache) l2cache = &__l2cache; if (!log) log = &__log; *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log, return_parent); fnvlist_free(search); return (ret); } nvlist_t * zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { return (__zpool_find_vdev(zhp, path, avail_spare, l2cache, log, B_FALSE)); } /* Given a vdev path, return its parent's nvlist */ nvlist_t * zpool_find_parent_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { return (__zpool_find_vdev(zhp, path, avail_spare, l2cache, log, B_TRUE)); } /* * Convert a vdev path to a GUID. Returns GUID or 0 on error. * * If is_spare, is_l2cache, or is_log is non-NULL, then store within it * if the VDEV is a spare, l2cache, or log device. If they're NULL then * ignore them. 
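 *
 * Hedged example of the public wrapper that follows ("/dev/sdb1" is a
 * hypothetical path):
 *
 *	uint64_t guid = zpool_vdev_path_to_guid(zhp, "/dev/sdb1");
 *	if (guid == 0)
 *		return (ENOENT);	// path did not resolve to a vdev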
*/ static uint64_t zpool_vdev_path_to_guid_impl(zpool_handle_t *zhp, const char *path, boolean_t *is_spare, boolean_t *is_l2cache, boolean_t *is_log) { boolean_t spare = B_FALSE, l2cache = B_FALSE, log = B_FALSE; nvlist_t *tgt; if ((tgt = zpool_find_vdev(zhp, path, &spare, &l2cache, &log)) == NULL) return (0); if (is_spare != NULL) *is_spare = spare; if (is_l2cache != NULL) *is_l2cache = l2cache; if (is_log != NULL) *is_log = log; return (fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID)); } /* Convert a vdev path to a GUID. Returns GUID or 0 on error. */ uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path) { return (zpool_vdev_path_to_guid_impl(zhp, path, NULL, NULL, NULL)); } /* * Bring the specified vdev online. The 'flags' parameter is a set of the * ZFS_ONLINE_* flags. */ int zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, vdev_state_t *newstate) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; if (flags & ZFS_ONLINE_EXPAND) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot expand %s"), path); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot online %s"), path); } (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (!(flags & ZFS_ONLINE_SPARE) && avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); #ifndef __FreeBSD__ const char *pathname; if ((flags & ZFS_ONLINE_EXPAND || zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) && nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0) { uint64_t wholedisk = 0; (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); /* * XXX - L2ARC 1.0 devices can't support expansion. */ if (l2cache) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot expand cache devices")); return (zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf)); } if (wholedisk) { const char *fullpath = path; char buf[MAXPATHLEN]; int error; if (path[0] != '/') { error = zfs_resolve_shortname(path, buf, sizeof (buf)); if (error != 0) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); fullpath = buf; } error = zpool_relabel_disk(hdl, fullpath, errbuf); if (error != 0) return (error); } } #endif zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_obj = flags; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { if (errno == EINVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split " "from this pool into a new one. 
Use '%s' " "instead"), "zpool detach"); return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, errbuf)); } return (zpool_standard_error(hdl, errno, errbuf)); } *newstate = zc.zc_cookie; return (0); } /* * Take the specified vdev offline */ int zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot offline %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); zc.zc_cookie = VDEV_STATE_OFFLINE; zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { case EBUSY: /* * There are no other replicas of this device. */ return (zfs_error(hdl, EZFS_NOREPLICAS, errbuf)); case EEXIST: /* * The log device has unplayed logs */ return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, errbuf)); default: return (zpool_standard_error(hdl, errno, errbuf)); } } /* * Remove the specified vdev asynchronously from the configuration, so * that it may come ONLINE if reinserted. This is called from zed on * Udev remove event. * Note: We also have a similar function zpool_vdev_remove() that * removes the vdev from the pool. */ int zpool_vdev_remove_wanted(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); zc.zc_cookie = VDEV_STATE_REMOVED; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Mark the given vdev faulted. */ int zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_FAULTED; zc.zc_obj = aux; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { case EBUSY: /* * There are no other replicas of this device. 
*/ return (zfs_error(hdl, EZFS_NOREPLICAS, errbuf)); default: return (zpool_standard_error(hdl, errno, errbuf)); } } /* * Generic set vdev state function */ static int zpool_vdev_set_state(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux, vdev_state_t state) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set %s %llu"), zpool_state_to_name(state, aux), (u_longlong_t)guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = state; zc.zc_obj = aux; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Mark the given vdev degraded. */ int zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { return (zpool_vdev_set_state(zhp, guid, aux, VDEV_STATE_DEGRADED)); } /* * Mark the given vdev as in a removed state (as if the device does not exist). * * This is different than zpool_vdev_remove() which does a removal of a device * from the pool (but the device does exist). */ int zpool_vdev_set_removed_state(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { return (zpool_vdev_set_state(zhp, guid, aux, VDEV_STATE_REMOVED)); } /* * Returns TRUE if the given nvlist is a vdev that was originally swapped in as * a hot spare. */ static boolean_t is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { const char *type = fnvlist_lookup_string(search, ZPOOL_CONFIG_TYPE); if ((strcmp(type, VDEV_TYPE_SPARE) == 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) && children == 2 && child[which] == tgt) return (B_TRUE); for (c = 0; c < children; c++) if (is_replacing_spare(child[c], tgt, which)) return (B_TRUE); } return (B_FALSE); } /* * Attach new_disk (fully described by nvroot) to old_disk. * If 'replacing' is specified, the new disk will replace the old one. 
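 *
 * Hedged sketch of a replacement.  "nvroot" is assumed to already describe
 * the single new disk (zpool(8) builds it from the command line); the
 * device paths are hypothetical:
 *
 *	int err = zpool_vdev_attach(zhp, "/dev/sda", "/dev/sdb", nvroot,
 *	    B_TRUE, B_FALSE);	// replacing = TRUE, sequential rebuild = FALSE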
*/ int zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing, boolean_t rebuild) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; int ret; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; uint64_t val; char *newname; const char *type; nvlist_t **child; uint_t children; nvlist_t *config_root; libzfs_handle_t *hdl = zhp->zpool_hdl; if (replacing) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot replace %s with %s"), old_disk, new_disk); else (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot attach %s to %s"), new_disk, old_disk); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); if (l2cache) return (zfs_error(hdl, EZFS_ISL2CACHE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); zc.zc_cookie = replacing; zc.zc_simple = rebuild; if (rebuild && zfeature_lookup_guid("org.openzfs:device_rebuild", NULL) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs module doesn't support device rebuilds")); return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf)); } type = fnvlist_lookup_string(tgt, ZPOOL_CONFIG_TYPE); if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 && zfeature_lookup_guid("org.openzfs:raidz_expansion", NULL) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs module doesn't support raidz expansion")); return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf)); } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0 || children != 1) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device must be a single disk")); return (zfs_error(hdl, EZFS_INVALCONFIG, errbuf)); } config_root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); if ((newname = zpool_vdev_name(NULL, NULL, child[0], 0)) == NULL) return (-1); /* * If the target is a hot spare that has been swapped in, we can only * replace it with another hot spare. */ if (replacing && nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 && (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache, NULL) == NULL || !avail_spare) && is_replacing_spare(config_root, tgt, 1)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only be replaced by another hot spare")); free(newname); return (zfs_error(hdl, EZFS_BADTARGET, errbuf)); } free(newname); zcmd_write_conf_nvlist(hdl, &zc, nvroot); ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc); zcmd_free_nvlists(&zc); if (ret == 0) return (0); switch (errno) { case ENOTSUP: /* * Can't attach to or replace this type of vdev. 
*/ if (replacing) { uint64_t version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); } else if (rebuild) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only mirror and dRAID vdevs support " "sequential reconstruction")); } else if (zpool_is_draid_spare(new_disk)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID spares can only replace child " "devices in their parent's dRAID vdev")); } else if (version >= SPA_VERSION_MULTI_REPLACE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " "for completion or use 'zpool detach'")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a replacing device")); } } else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "raidz_expansion feature must be enabled " "in order to attach a device to raidz")); } else { char status[64] = {0}; zpool_prop_get_feature(zhp, "feature@device_rebuild", status, 63); if (rebuild && strncmp(status, ZFS_FEATURE_DISABLED, 64) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device_rebuild feature must be enabled " "in order to use sequential " "reconstruction")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only attach to mirrors and top-level " "disks")); } } (void) zfs_error(hdl, EZFS_BADTARGET, errbuf); break; case EINVAL: /* * The new device must be a single disk. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device must be a single disk")); (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EOVERFLOW: /* * The new device is too small. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device is too small")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EDOM: /* * The new device has a different optimal sector size. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device has a different optimal sector size; use the " "option '-o ashift=N' to override the optimal size")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case ENAMETOOLONG: /* * The resulting top-level vdev spec won't fit in the label. */ (void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf); break; case ENXIO: /* * The existing raidz vdev has offline children */ if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "raidz vdev has devices that are are offline or " "being replaced")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; } else { (void) zpool_standard_error(hdl, errno, errbuf); } break; case EADDRINUSE: /* * The boot reserved area is already being used (FreeBSD) */ if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the reserved boot area needed for the expansion " "is already being used by a boot loader")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); } else { (void) zpool_standard_error(hdl, errno, errbuf); } break; case ZFS_ERR_ASHIFT_MISMATCH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "The new device cannot have a higher alignment requirement " "than the top-level vdev.")); (void) zfs_error(hdl, EZFS_BADTARGET, errbuf); break; default: (void) zpool_standard_error(hdl, errno, errbuf); } return (-1); } /* * Detach the specified device. 
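 *
 * Illustrative call ("/dev/sdb" is hypothetical).  On failure an extended
 * error message has already been set on the pool's libzfs handle:
 *
 *	if (zpool_vdev_detach(zhp, "/dev/sdb") != 0)
 *		return (-1);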
*/ int zpool_vdev_detach(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot detach %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); if (l2cache) return (zfs_error(hdl, EZFS_ISL2CACHE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0) return (0); switch (errno) { case ENOTSUP: /* * Can't detach from this type of vdev. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only " "applicable to mirror and replacing vdevs")); (void) zfs_error(hdl, EZFS_BADTARGET, errbuf); break; case EBUSY: /* * There are no other replicas of this device. */ (void) zfs_error(hdl, EZFS_NOREPLICAS, errbuf); break; default: (void) zpool_standard_error(hdl, errno, errbuf); } return (-1); } /* * Find a mirror vdev in the source nvlist. * * The mchild array contains a list of disks in one of the top-level mirrors * of the source pool. The schild array contains a list of disks that the * user specified on the command line. We loop over the mchild array to * see if any entry in the schild array matches. * * If a disk in the mchild array is found in the schild array, we return * the index of that entry. Otherwise we return -1. */ static int find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren, nvlist_t **schild, uint_t schildren) { uint_t mc; for (mc = 0; mc < mchildren; mc++) { uint_t sc; char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp, mchild[mc], 0); for (sc = 0; sc < schildren; sc++) { char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp, schild[sc], 0); boolean_t result = (strcmp(mpath, spath) == 0); free(spath); if (result) { free(mpath); return (mc); } } free(mpath); } return (-1); } /* * Split a mirror pool. If newroot points to null, then a new nvlist * is generated and it is the responsibility of the caller to free it. 
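 *
 * A minimal usage sketch (illustrative only; the handle, the new pool name
 * and the dry-run flag are assumptions): let the library build the new vdev
 * tree, then free it when done.
 *
 *	nvlist_t *newroot = NULL;
 *	splitflags_t flags = { .dryrun = 1 };
 *	if (zpool_vdev_split(zhp, "tanksplit", &newroot, NULL, flags) == 0 &&
 *	    newroot != NULL)
 *		nvlist_free(newroot);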
*/ int zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, nvlist_t *props, splitflags_t flags) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; const char *bias; nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; nvlist_t **varray = NULL, *zc_props = NULL; uint_t c, children, newchildren, lastlog = 0, vcount, found = 0; libzfs_handle_t *hdl = zhp->zpool_hdl; uint64_t vers, readonly = B_FALSE; boolean_t freelist = B_FALSE, memory_err = B_TRUE; int retval = 0; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name); if (!zpool_name_valid(hdl, B_FALSE, newname)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("Internal error: unable to " "retrieve pool configuration\n")); return (-1); } tree = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); vers = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if (props) { prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name, props, vers, flags, errbuf)) == NULL) return (-1); (void) nvlist_lookup_uint64(zc_props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property %s can only be set at import time"), zpool_prop_to_name(ZPOOL_PROP_READONLY)); return (-1); } } if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool is missing vdev tree")); nvlist_free(zc_props); return (-1); } varray = zfs_alloc(hdl, children * sizeof (nvlist_t *)); vcount = 0; if (*newroot == NULL || nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, &newchild, &newchildren) != 0) newchildren = 0; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; boolean_t is_special = B_FALSE, is_dedup = B_FALSE; const char *type; nvlist_t **mchild, *vdev; uint_t mchildren; int entry; /* * Unlike cache & spares, slogs are stored in the * ZPOOL_CONFIG_CHILDREN array. We filter them out here. */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_log || is_hole) { /* * Create a hole vdev and put it in the config. 
*/ if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0) goto out; if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) != 0) goto out; if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE, 1) != 0) goto out; if (lastlog == 0) lastlog = vcount; varray[vcount++] = vdev; continue; } lastlog = 0; type = fnvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) { vdev = child[c]; if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) goto out; continue; } else if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool must be composed only of mirrors\n")); retval = zfs_error(hdl, EZFS_INVALCONFIG, errbuf); goto out; } if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) is_special = B_TRUE; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) is_dedup = B_TRUE; } verify(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); /* find or add an entry for this top-level vdev */ if (newchildren > 0 && (entry = find_vdev_entry(zhp, mchild, mchildren, newchild, newchildren)) >= 0) { /* We found a disk that the user specified. */ vdev = mchild[entry]; ++found; } else { /* User didn't specify a disk for this vdev. */ vdev = mchild[mchildren - 1]; } if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) goto out; if (flags.dryrun != 0) { if (is_dedup == B_TRUE) { if (nvlist_add_string(varray[vcount - 1], ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_DEDUP) != 0) goto out; } else if (is_special == B_TRUE) { if (nvlist_add_string(varray[vcount - 1], ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_SPECIAL) != 0) goto out; } } } /* did we find every disk the user specified? */ if (found != newchildren) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must " "include at most one disk from each mirror")); retval = zfs_error(hdl, EZFS_INVALCONFIG, errbuf); goto out; } /* Prepare the nvlist for populating. */ if (*newroot == NULL) { if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0) goto out; freelist = B_TRUE; if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0) goto out; } else { verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0); } /* Add all the children we found */ if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)varray, lastlog == 0 ? vcount : lastlog) != 0) goto out; /* * If we're just doing a dry run, exit now with success. */ if (flags.dryrun) { memory_err = B_FALSE; freelist = B_FALSE; goto out; } /* now build up the config list & call the ioctl */ if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0) goto out; if (nvlist_add_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 || nvlist_add_string(newconfig, ZPOOL_CONFIG_POOL_NAME, newname) != 0 || nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0) goto out; /* * The new pool is automatically part of the namespace unless we * explicitly export it. 
*/ if (!flags.import) zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string)); zcmd_write_conf_nvlist(hdl, &zc, newconfig); if (zc_props != NULL) zcmd_write_src_nvlist(hdl, &zc, zc_props); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) { retval = zpool_standard_error(hdl, errno, errbuf); goto out; } freelist = B_FALSE; memory_err = B_FALSE; out: if (varray != NULL) { int v; for (v = 0; v < vcount; v++) nvlist_free(varray[v]); free(varray); } zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(newconfig); if (freelist) { nvlist_free(*newroot); *newroot = NULL; } if (retval != 0) return (retval); if (memory_err) return (no_memory(hdl)); return (0); } /* * Remove the given device. */ int zpool_vdev_remove(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; uint64_t version; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); if (zpool_is_draid_spare(path)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID spares cannot be removed")); return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); } (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog && version < SPA_VERSION_HOLES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to support log removal")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); } zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) return (0); switch (errno) { case EALREADY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "removal for this vdev is already in progress.")); (void) zfs_error(hdl, EZFS_BUSY, errbuf); break; case EINVAL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; all top-level vdevs must " "have the same sector size and not be raidz.")); (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf); break; case EBUSY: if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Mount encrypted datasets to replay logs.")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Pool busy; removal may already be in progress")); } (void) zfs_error(hdl, EZFS_BUSY, errbuf); break; case EACCES: if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Mount encrypted datasets to replay logs.")); (void) zfs_error(hdl, EZFS_BUSY, errbuf); } else { (void) zpool_standard_error(hdl, errno, errbuf); } break; default: (void) zpool_standard_error(hdl, errno, errbuf); } return (-1); } int zpool_vdev_remove_cancel(zpool_handle_t *zhp) { zfs_cmd_t zc = {{0}}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot cancel removal")); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = 1; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } int zpool_vdev_indirect_size(zpool_handle_t *zhp, const char *path, uint64_t *sizep) { char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"), path); if ((tgt = 
zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); if (avail_spare || l2cache || islog) { *sizep = 0; return (0); } if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "indirect size not available")); return (zfs_error(hdl, EINVAL, errbuf)); } return (0); } /* * Clear the errors for the pool, or the particular device if specified. */ int zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; zpool_load_policy_t policy; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *nvi = NULL; int error; if (path) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), path); else (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (path) { if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); /* * Don't allow error clearing for hot spares. Do allow * error clearing for l2cache devices. */ if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); } zpool_get_load_policy(rewindnvl, &policy); zc.zc_cookie = policy.zlp_rewind; zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2); zcmd_write_src_nvlist(hdl, &zc, rewindnvl); while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 && errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); if (!error || ((policy.zlp_rewind & ZPOOL_TRY_REWIND) && errno != EPERM && errno != EACCES)) { if (policy.zlp_rewind & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); zpool_rewind_exclaim(hdl, zc.zc_name, ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nvi); nvlist_free(nvi); } zcmd_free_nvlists(&zc); return (0); } zcmd_free_nvlists(&zc); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Similar to zpool_clear(), but takes a GUID (used by fmd). */ int zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"), (u_longlong_t)guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = ZPOOL_NO_REWIND; if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Change the GUID for a pool. * * Similar to zpool_reguid(), but may take a GUID. * * If the guid argument is NULL, then no GUID is passed in the nvlist to the * ioctl(). 
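 *
 * A minimal usage sketch (illustrative only; the handle and the literal GUID
 * are assumptions): set an explicit GUID, or pass NULL to have the kernel
 * generate one, which is what zpool_reguid() does.
 *
 *	uint64_t guid = 0x123456789abcdefULL;
 *	if (zpool_set_guid(zhp, &guid) != 0)
 *		(void) fprintf(stderr, "reguid failed\n");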
*/ int zpool_set_guid(zpool_handle_t *zhp, const uint64_t *guid) { char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *nvl = NULL; zfs_cmd_t zc = {"\0"}; int error = -1; if (guid != NULL) { if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(hdl)); if (nvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, *guid) != 0) { nvlist_free(nvl); return (no_memory(hdl)); } zcmd_write_src_nvlist(hdl, &zc, nvl); } (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); error = zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc); if (error) { return (zpool_standard_error(hdl, errno, errbuf)); } if (guid != NULL) { zcmd_free_nvlists(&zc); nvlist_free(nvl); } return (0); } /* * Change the GUID for a pool. */ int zpool_reguid(zpool_handle_t *zhp) { return (zpool_set_guid(zhp, NULL)); } /* * Reopen the pool. */ int zpool_reopen_one(zpool_handle_t *zhp, void *data) { libzfs_handle_t *hdl = zpool_get_handle(zhp); const char *pool_name = zpool_get_name(zhp); boolean_t *scrub_restart = data; int error; error = lzc_reopen(pool_name, *scrub_restart); if (error) { return (zpool_standard_error_fmt(hdl, error, dgettext(TEXT_DOMAIN, "cannot reopen '%s'"), pool_name)); } return (0); } /* call into libzfs_core to execute the sync IOCTL per pool */ int zpool_sync_one(zpool_handle_t *zhp, void *data) { int ret; libzfs_handle_t *hdl = zpool_get_handle(zhp); const char *pool_name = zpool_get_name(zhp); boolean_t *force = data; nvlist_t *innvl = fnvlist_alloc(); fnvlist_add_boolean_value(innvl, "force", *force); if ((ret = lzc_sync(pool_name, innvl, NULL)) != 0) { nvlist_free(innvl); return (zpool_standard_error_fmt(hdl, ret, dgettext(TEXT_DOMAIN, "sync '%s' failed"), pool_name)); } nvlist_free(innvl); return (0); } #define PATH_BUF_LEN 64 /* * Given a vdev, return the name to display in iostat. If the vdev has a path, * we use that, stripping off any leading "/dev/dsk/"; if not, we use the type. * We also check if this is a whole disk, in which case we strip off the * trailing 's0' slice name. * * This routine is also responsible for identifying when disks have been * reconfigured in a new location. The kernel will have opened the device by * devid, but the path will still refer to the old location. To catch this, we * first do a path -> devid translation (which is fast for the common case). If * the devid matches, we're done. If not, we do a reverse devid -> path * translation and issue the appropriate ioctl() to update the path of the vdev. * If 'zhp' is NULL, then this is an exported pool, and we don't need to do any * of these checks. */ char * zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, int name_flags) { const char *type, *tpath; const char *path; uint64_t value; char buf[PATH_BUF_LEN]; char tmpbuf[PATH_BUF_LEN * 2]; /* * vdev_name will be "root"/"root-0" for the root vdev, but it is the * zpool name that will be displayed to the user. 
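 *
 * A minimal usage sketch (illustrative only; 'hdl', 'zhp' and the vdev
 * nvlist 'nv' are assumptions): the returned string is allocated and must
 * be freed by the caller.
 *
 *	char *name = zpool_vdev_name(hdl, zhp, nv, VDEV_NAME_TYPE_ID);
 *	if (name != NULL) {
 *		(void) printf("%s\n", name);
 *		free(name);
 *	}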
*/ type = fnvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE); if (zhp != NULL && strcmp(type, "root") == 0) return (zfs_strdup(hdl, zpool_get_name(zhp))); if (libzfs_envvar_is_set("ZPOOL_VDEV_NAME_PATH")) name_flags |= VDEV_NAME_PATH; if (libzfs_envvar_is_set("ZPOOL_VDEV_NAME_GUID")) name_flags |= VDEV_NAME_GUID; if (libzfs_envvar_is_set("ZPOOL_VDEV_NAME_FOLLOW_LINKS")) name_flags |= VDEV_NAME_FOLLOW_LINKS; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 || name_flags & VDEV_NAME_GUID) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value); (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value); path = buf; } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tpath) == 0) { path = tpath; if (name_flags & VDEV_NAME_FOLLOW_LINKS) { char *rp = realpath(path, NULL); if (rp) { strlcpy(buf, rp, sizeof (buf)); path = buf; free(rp); } } /* * For a block device only use the name. */ if ((strcmp(type, VDEV_TYPE_DISK) == 0) && !(name_flags & VDEV_NAME_PATH)) { path = zfs_strip_path(path); } /* * Remove the partition from the path if this is a whole disk. */ if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 && nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value && !(name_flags & VDEV_NAME_PATH)) { return (zfs_strip_partition(path)); } } else { path = type; /* * If it's a raidz device, we need to stick in the parity level. */ if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { value = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); (void) snprintf(buf, sizeof (buf), "%s%llu", path, (u_longlong_t)value); path = buf; } /* * If it's a dRAID device, we add parity, groups, and spares. */ if (strcmp(path, VDEV_TYPE_DRAID) == 0) { uint64_t ndata, nparity, nspares; nvlist_t **child; uint_t children; verify(nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); nparity = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); ndata = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA); nspares = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES); path = zpool_draid_name(buf, sizeof (buf), ndata, nparity, nspares, children); } /* * We identify each top-level vdev by using a * naming convention. */ if (name_flags & VDEV_NAME_TYPE_ID) { uint64_t id = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID); (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s-%llu", path, (u_longlong_t)id); path = tmpbuf; } } return (zfs_strdup(hdl, path)); } static int zbookmark_mem_compare(const void *a, const void *b) { return (memcmp(a, b, sizeof (zbookmark_phys_t))); } void zpool_add_propname(zpool_handle_t *zhp, const char *propname) { assert(zhp->zpool_n_propnames < ZHP_MAX_PROPNAMES); zhp->zpool_propnames[zhp->zpool_n_propnames] = propname; zhp->zpool_n_propnames++; } /* * Retrieve the persistent error log, uniquify the members, and return to the * caller. */ int zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; zbookmark_phys_t *buf; uint64_t buflen = 10000; /* approx. 1MB of RAM */ if (fnvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT) == 0) return (0); /* * Retrieve the raw error list from the kernel. If it doesn't fit, * allocate a larger buffer and retry. 
*/ (void) strcpy(zc.zc_name, zhp->zpool_name); for (;;) { buf = zfs_alloc(zhp->zpool_hdl, buflen * sizeof (zbookmark_phys_t)); zc.zc_nvlist_dst = (uintptr_t)buf; zc.zc_nvlist_dst_size = buflen; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG, &zc) != 0) { free(buf); if (errno == ENOMEM) { buflen *= 2; } else { return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "errors: List of " "errors unavailable"))); } } else { break; } } /* * Sort the resulting bookmarks. This is a little confusing due to the * implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last * to first, and 'zc_nvlist_dst_size' indicates the number of bookmarks * _not_ copied as part of the process. So we point the start of our * array appropriate and decrement the total number of elements. */ zbookmark_phys_t *zb = buf + zc.zc_nvlist_dst_size; uint64_t zblen = buflen - zc.zc_nvlist_dst_size; qsort(zb, zblen, sizeof (zbookmark_phys_t), zbookmark_mem_compare); verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); /* * Fill in the nverrlistp with nvlist's of dataset and object numbers. */ for (uint64_t i = 0; i < zblen; i++) { nvlist_t *nv; /* ignoring zb_blkid and zb_level for now */ if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset && zb[i-1].zb_object == zb[i].zb_object) continue; if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0) goto nomem; if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET, zb[i].zb_objset) != 0) { nvlist_free(nv); goto nomem; } if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT, zb[i].zb_object) != 0) { nvlist_free(nv); goto nomem; } if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) { nvlist_free(nv); goto nomem; } nvlist_free(nv); } free(buf); return (0); nomem: free(buf); return (no_memory(zhp->zpool_hdl)); } /* * Upgrade a ZFS pool to the latest on-disk version. */ int zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strcpy(zc.zc_name, zhp->zpool_name); zc.zc_cookie = new_version; if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0) return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"), zhp->zpool_name)); return (0); } void zfs_save_arguments(int argc, char **argv, char *string, int len) { int i; (void) strlcpy(string, zfs_basename(argv[0]), len); for (i = 1; i < argc; i++) { (void) strlcat(string, " ", len); (void) strlcat(string, argv[i], len); } } int zpool_log_history(libzfs_handle_t *hdl, const char *message) { zfs_cmd_t zc = {"\0"}; nvlist_t *args; args = fnvlist_alloc(); fnvlist_add_string(args, "message", message); zcmd_write_src_nvlist(hdl, &zc, args); int err = zfs_ioctl(hdl, ZFS_IOC_LOG_HISTORY, &zc); nvlist_free(args); zcmd_free_nvlists(&zc); return (err); } /* * Perform ioctl to get some command history of a pool. * * 'buf' is the buffer to fill up to 'len' bytes. 'off' is the * logical offset of the history buffer to start reading from. * * Upon return, 'off' is the next logical offset to read from and * 'len' is the actual amount of bytes read into 'buf'. 
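 *
 * A minimal caller loop (illustrative only; the buffer size is an
 * assumption), matching the pattern used by zpool_get_history() below:
 *
 *	char buf[128 * 1024];
 *	uint64_t off = 0, len;
 *	do {
 *		len = sizeof (buf);
 *		if (get_history(zhp, buf, &off, &len) != 0)
 *			break;
 *	} while (len != 0);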
*/ static int get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_history = (uint64_t)(uintptr_t)buf; zc.zc_history_len = *len; zc.zc_history_offset = *off; if (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) { switch (errno) { case EPERM: return (zfs_error_fmt(hdl, EZFS_PERM, dgettext(TEXT_DOMAIN, "cannot show history for pool '%s'"), zhp->zpool_name)); case ENOENT: return (zfs_error_fmt(hdl, EZFS_NOHISTORY, dgettext(TEXT_DOMAIN, "cannot get history for pool " "'%s'"), zhp->zpool_name)); case ENOTSUP: return (zfs_error_fmt(hdl, EZFS_BADVERSION, dgettext(TEXT_DOMAIN, "cannot get history for pool " "'%s', pool must be upgraded"), zhp->zpool_name)); default: return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get history for '%s'"), zhp->zpool_name)); } } *len = zc.zc_history_len; *off = zc.zc_history_offset; return (0); } /* * Retrieve the command history of a pool. */ int zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp, uint64_t *off, boolean_t *eof) { libzfs_handle_t *hdl = zhp->zpool_hdl; char *buf; int buflen = 128 * 1024; nvlist_t **records = NULL; uint_t numrecords = 0; int err = 0, i; uint64_t start = *off; buf = zfs_alloc(hdl, buflen); /* process about 1MiB a time */ while (*off - start < 1024 * 1024) { uint64_t bytes_read = buflen; uint64_t leftover; if ((err = get_history(zhp, buf, off, &bytes_read)) != 0) break; /* if nothing else was read in, we're at EOF, just return */ if (!bytes_read) { *eof = B_TRUE; break; } if ((err = zpool_history_unpack(buf, bytes_read, &leftover, &records, &numrecords)) != 0) { zpool_standard_error_fmt(hdl, err, dgettext(TEXT_DOMAIN, "cannot get history for '%s'"), zhp->zpool_name); break; } *off -= leftover; if (leftover == bytes_read) { /* * no progress made, because buffer is not big enough * to hold this record; resize and retry. */ buflen *= 2; free(buf); buf = zfs_alloc(hdl, buflen); } } free(buf); if (!err) { *nvhisp = fnvlist_alloc(); fnvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD, (const nvlist_t **)records, numrecords); } for (i = 0; i < numrecords; i++) nvlist_free(records[i]); free(records); return (err); } /* * Retrieve the next event given the passed 'zevent_fd' file descriptor. * If there is a new event available 'nvp' will contain a newly allocated * nvlist and 'dropped' will be set to the number of missed events since * the last call to this function. When 'nvp' is set to NULL it indicates * no new events are available. In either case the function returns 0 and * it is up to the caller to free 'nvp'. In the case of a fatal error the * function will return a non-zero value. When the function is called in * blocking mode (the default, unless the ZEVENT_NONBLOCK flag is passed), * it will not return until a new event is available. 
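 *
 * A minimal consumer loop (illustrative only; opening ZFS_DEV directly and
 * the non-blocking drain are assumptions): fetch pending events until none
 * remain.
 *
 *	int fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC);
 *	nvlist_t *nvl;
 *	int dropped;
 *	while (zpool_events_next(hdl, &nvl, &dropped,
 *	    ZEVENT_NONBLOCK, fd) == 0 && nvl != NULL)
 *		nvlist_free(nvl);
 *	(void) close(fd);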
*/ int zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp, int *dropped, unsigned flags, int zevent_fd) { zfs_cmd_t zc = {"\0"}; int error = 0; *nvp = NULL; *dropped = 0; zc.zc_cleanup_fd = zevent_fd; if (flags & ZEVENT_NONBLOCK) zc.zc_guid = ZEVENT_NONBLOCK; zcmd_alloc_dst_nvlist(hdl, &zc, ZEVENT_SIZE); retry: if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_NEXT, &zc) != 0) { switch (errno) { case ESHUTDOWN: error = zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, dgettext(TEXT_DOMAIN, "zfs shutdown")); goto out; case ENOENT: /* Blocking error case should not occur */ if (!(flags & ZEVENT_NONBLOCK)) error = zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get event")); goto out; case ENOMEM: zcmd_expand_dst_nvlist(hdl, &zc); goto retry; default: error = zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get event")); goto out; } } error = zcmd_read_dst_nvlist(hdl, &zc, nvp); if (error != 0) goto out; *dropped = (int)zc.zc_cookie; out: zcmd_free_nvlists(&zc); return (error); } /* * Clear all events. */ int zpool_events_clear(libzfs_handle_t *hdl, int *count) { zfs_cmd_t zc = {"\0"}; if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_CLEAR, &zc) != 0) return (zpool_standard_error(hdl, errno, dgettext(TEXT_DOMAIN, "cannot clear events"))); if (count != NULL) *count = (int)zc.zc_cookie; /* # of events cleared */ return (0); } /* * Seek to a specific EID, ZEVENT_SEEK_START, or ZEVENT_SEEK_END for * the passed zevent_fd file handle. On success zero is returned, * otherwise -1 is returned and hdl->libzfs_error is set to the errno. */ int zpool_events_seek(libzfs_handle_t *hdl, uint64_t eid, int zevent_fd) { zfs_cmd_t zc = {"\0"}; int error = 0; zc.zc_guid = eid; zc.zc_cleanup_fd = zevent_fd; if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_SEEK, &zc) != 0) { switch (errno) { case ENOENT: error = zfs_error_fmt(hdl, EZFS_NOENT, dgettext(TEXT_DOMAIN, "cannot get event")); break; case ENOMEM: error = zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot get event")); break; default: error = zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get event")); break; } } return (error); } static void zpool_obj_to_path_impl(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len, boolean_t always_unmounted) { zfs_cmd_t zc = {"\0"}; boolean_t mounted = B_FALSE; char *mntpnt = NULL; char dsname[ZFS_MAX_DATASET_NAME_LEN]; if (dsobj == 0) { /* special case for the MOS */ (void) snprintf(pathname, len, ":<0x%llx>", (longlong_t)obj); return; } /* get the dataset's name */ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_obj = dsobj; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) { /* just write out a path of two object numbers */ (void) snprintf(pathname, len, "<0x%llx>:<0x%llx>", (longlong_t)dsobj, (longlong_t)obj); return; } (void) strlcpy(dsname, zc.zc_value, sizeof (dsname)); /* find out if the dataset is mounted */ mounted = !always_unmounted && is_mounted(zhp->zpool_hdl, dsname, &mntpnt); /* get the corrupted object's path */ (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); zc.zc_obj = obj; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_OBJ_TO_PATH, &zc) == 0) { if (mounted) { (void) snprintf(pathname, len, "%s%s", mntpnt, zc.zc_value); } else { (void) snprintf(pathname, len, "%s:%s", dsname, zc.zc_value); } } else { (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, (longlong_t)obj); } free(mntpnt); } void zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len) { zpool_obj_to_path_impl(zhp, dsobj, 
obj, pathname, len, B_FALSE); } void zpool_obj_to_path_ds(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len) { zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_TRUE); } /* * Wait while the specified activity is in progress in the pool. */ int zpool_wait(zpool_handle_t *zhp, zpool_wait_activity_t activity) { boolean_t missing; int error = zpool_wait_status(zhp, activity, &missing, NULL); if (missing) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, ENOENT, dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"), zhp->zpool_name); return (ENOENT); } else { return (error); } } /* * Wait for the given activity and return the status of the wait (whether or not * any waiting was done) in the 'waited' parameter. Non-existent pools are * reported via the 'missing' parameter, rather than by printing an error * message. This is convenient when this function is called in a loop over a * long period of time (as it is, for example, by zpool's wait cmd). In that * scenario, a pool being exported or destroyed should be considered a normal * event, so we don't want to print an error when we find that the pool doesn't * exist. */ int zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity, boolean_t *missing, boolean_t *waited) { int error = lzc_wait(zhp->zpool_name, activity, waited); *missing = (error == ENOENT); if (*missing) return (0); if (error != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"), zhp->zpool_name); } return (error); } int zpool_set_bootenv(zpool_handle_t *zhp, const nvlist_t *envmap) { int error = lzc_set_bootenv(zhp->zpool_name, envmap); if (error != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, dgettext(TEXT_DOMAIN, "error setting bootenv in pool '%s'"), zhp->zpool_name); } return (error); } int zpool_get_bootenv(zpool_handle_t *zhp, nvlist_t **nvlp) { nvlist_t *nvl; int error; nvl = NULL; error = lzc_get_bootenv(zhp->zpool_name, &nvl); if (error != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, dgettext(TEXT_DOMAIN, "error getting bootenv in pool '%s'"), zhp->zpool_name); } else { *nvlp = nvl; } return (error); } /* * Attempt to read and parse feature file(s) (from "compatibility" property). * Files contain zpool feature names, comma or whitespace-separated. * Comments (# character to next newline) are discarded. * * Arguments: * compatibility : string containing feature filenames * features : either NULL or pointer to array of boolean * report : either NULL or pointer to string buffer * rlen : length of "report" buffer * * compatibility is NULL (unset), "", "off", "legacy", or list of * comma-separated filenames. filenames should either be absolute, * or relative to: * 1) ZPOOL_SYSCONF_COMPAT_D (eg: /etc/zfs/compatibility.d) or * 2) ZPOOL_DATA_COMPAT_D (eg: /usr/share/zfs/compatibility.d). * (Unset), "" or "off" => enable all features * "legacy" => disable all features * * Any feature names read from files which match unames in spa_feature_table * will have the corresponding boolean set in the features array (if non-NULL). * If more than one feature set specified, only features present in *all* of * them will be set. * * "report" if not NULL will be populated with a suitable status message. 
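 *
 * An example compatibility file (illustrative only; the file name and the
 * feature subset shown are assumptions) could look like:
 *
 *	# /etc/zfs/compatibility.d/example
 *	async_destroy, bookmarks
 *	embedded_data
 *	empty_bpobj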
* * Return values: * ZPOOL_COMPATIBILITY_OK : files read and parsed ok * ZPOOL_COMPATIBILITY_BADFILE : file too big or not a text file * ZPOOL_COMPATIBILITY_BADTOKEN : SYSCONF file contains invalid feature name * ZPOOL_COMPATIBILITY_WARNTOKEN : DATA file contains invalid feature name * ZPOOL_COMPATIBILITY_NOFILES : no feature files found */ zpool_compat_status_t zpool_load_compat(const char *compat, boolean_t *features, char *report, size_t rlen) { int sdirfd, ddirfd, featfd; struct stat fs; char *fc; char *ps, *ls, *ws; char *file, *line, *word; char l_compat[ZFS_MAXPROPLEN]; boolean_t ret_nofiles = B_TRUE; boolean_t ret_badfile = B_FALSE; boolean_t ret_badtoken = B_FALSE; boolean_t ret_warntoken = B_FALSE; /* special cases (unset), "" and "off" => enable all features */ if (compat == NULL || compat[0] == '\0' || strcmp(compat, ZPOOL_COMPAT_OFF) == 0) { if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] = B_TRUE; if (report != NULL) strlcpy(report, gettext("all features enabled"), rlen); return (ZPOOL_COMPATIBILITY_OK); } /* Final special case "legacy" => disable all features */ if (strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) { if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] = B_FALSE; if (report != NULL) strlcpy(report, gettext("all features disabled"), rlen); return (ZPOOL_COMPATIBILITY_OK); } /* * Start with all true; will be ANDed with results from each file */ if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] = B_TRUE; char err_badfile[ZFS_MAXPROPLEN] = ""; char err_badtoken[ZFS_MAXPROPLEN] = ""; /* * We ignore errors from the directory open() * as they're only needed if the filename is relative * which will be checked during the openat(). */ /* O_PATH safer than O_RDONLY if system allows it */ #if defined(O_PATH) #define ZC_DIR_FLAGS (O_DIRECTORY | O_CLOEXEC | O_PATH) #else #define ZC_DIR_FLAGS (O_DIRECTORY | O_CLOEXEC | O_RDONLY) #endif sdirfd = open(ZPOOL_SYSCONF_COMPAT_D, ZC_DIR_FLAGS); ddirfd = open(ZPOOL_DATA_COMPAT_D, ZC_DIR_FLAGS); (void) strlcpy(l_compat, compat, ZFS_MAXPROPLEN); for (file = strtok_r(l_compat, ",", &ps); file != NULL; file = strtok_r(NULL, ",", &ps)) { boolean_t l_features[SPA_FEATURES]; enum { Z_SYSCONF, Z_DATA } source; /* try sysconfdir first, then datadir */ source = Z_SYSCONF; if ((featfd = openat(sdirfd, file, O_RDONLY | O_CLOEXEC)) < 0) { featfd = openat(ddirfd, file, O_RDONLY | O_CLOEXEC); source = Z_DATA; } /* File readable and correct size? */ if (featfd < 0 || fstat(featfd, &fs) < 0 || fs.st_size < 1 || fs.st_size > ZPOOL_COMPAT_MAXSIZE) { (void) close(featfd); strlcat(err_badfile, file, ZFS_MAXPROPLEN); strlcat(err_badfile, " ", ZFS_MAXPROPLEN); ret_badfile = B_TRUE; continue; } /* Prefault the file if system allows */ #if defined(MAP_POPULATE) #define ZC_MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE) #elif defined(MAP_PREFAULT_READ) #define ZC_MMAP_FLAGS (MAP_PRIVATE | MAP_PREFAULT_READ) #else #define ZC_MMAP_FLAGS (MAP_PRIVATE) #endif /* private mmap() so we can strtok safely */ fc = (char *)mmap(NULL, fs.st_size, PROT_READ | PROT_WRITE, ZC_MMAP_FLAGS, featfd, 0); (void) close(featfd); /* map ok, and last character == newline? 
*/ if (fc == MAP_FAILED || fc[fs.st_size - 1] != '\n') { (void) munmap((void *) fc, fs.st_size); strlcat(err_badfile, file, ZFS_MAXPROPLEN); strlcat(err_badfile, " ", ZFS_MAXPROPLEN); ret_badfile = B_TRUE; continue; } ret_nofiles = B_FALSE; for (uint_t i = 0; i < SPA_FEATURES; i++) l_features[i] = B_FALSE; /* replace final newline with NULL to ensure string ends */ fc[fs.st_size - 1] = '\0'; for (line = strtok_r(fc, "\n", &ls); line != NULL; line = strtok_r(NULL, "\n", &ls)) { /* discard comments */ char *r = strchr(line, '#'); if (r != NULL) *r = '\0'; for (word = strtok_r(line, ", \t", &ws); word != NULL; word = strtok_r(NULL, ", \t", &ws)) { /* Find matching feature name */ uint_t f; for (f = 0; f < SPA_FEATURES; f++) { zfeature_info_t *fi = &spa_feature_table[f]; if (strcmp(word, fi->fi_uname) == 0) { l_features[f] = B_TRUE; break; } } if (f < SPA_FEATURES) continue; /* found an unrecognized word */ /* lightly sanitize it */ if (strlen(word) > 32) word[32] = '\0'; for (char *c = word; *c != '\0'; c++) if (!isprint(*c)) *c = '?'; strlcat(err_badtoken, word, ZFS_MAXPROPLEN); strlcat(err_badtoken, " ", ZFS_MAXPROPLEN); if (source == Z_SYSCONF) ret_badtoken = B_TRUE; else ret_warntoken = B_TRUE; } } (void) munmap((void *) fc, fs.st_size); if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] &= l_features[i]; } (void) close(sdirfd); (void) close(ddirfd); /* Return the most serious error */ if (ret_badfile) { if (report != NULL) snprintf(report, rlen, gettext("could not read/" "parse feature file(s): %s"), err_badfile); return (ZPOOL_COMPATIBILITY_BADFILE); } if (ret_nofiles) { if (report != NULL) strlcpy(report, gettext("no valid compatibility files specified"), rlen); return (ZPOOL_COMPATIBILITY_NOFILES); } if (ret_badtoken) { if (report != NULL) snprintf(report, rlen, gettext("invalid feature " "name(s) in local compatibility files: %s"), err_badtoken); return (ZPOOL_COMPATIBILITY_BADTOKEN); } if (ret_warntoken) { if (report != NULL) snprintf(report, rlen, gettext("unrecognized feature " "name(s) in distribution compatibility files: %s"), err_badtoken); return (ZPOOL_COMPATIBILITY_WARNTOKEN); } if (report != NULL) strlcpy(report, gettext("compatibility set ok"), rlen); return (ZPOOL_COMPATIBILITY_OK); } static int zpool_vdev_guid(zpool_handle_t *zhp, const char *vdevname, uint64_t *vdev_guid) { nvlist_t *tgt; boolean_t avail_spare, l2cache; verify(zhp != NULL); if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "pool is in an unavailable state")); return (zfs_error(zhp->zpool_hdl, EZFS_POOLUNAVAIL, errbuf)); } if ((tgt = zpool_find_vdev(zhp, vdevname, &avail_spare, &l2cache, NULL)) == NULL) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "can not find %s in %s"), vdevname, zhp->zpool_name); return (zfs_error(zhp->zpool_hdl, EZFS_NODEVICE, errbuf)); } *vdev_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); return (0); } /* * Get a vdev property value for 'prop' and return the value in * a pre-allocated buffer. 
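 *
 * Callers normally reach this through zpool_get_vdev_prop(); a minimal
 * usage sketch of that entry point (illustrative only; the vdev name is an
 * assumption):
 *
 *	char buf[ZFS_MAXPROPLEN];
 *	if (zpool_get_vdev_prop(zhp, "/dev/sdb", VDEV_PROP_SIZE, NULL,
 *	    buf, sizeof (buf), NULL, B_FALSE) == 0)
 *		(void) printf("size: %s\n", buf);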
*/ int zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, char *buf, size_t len, zprop_source_t *srctype, boolean_t literal) { nvlist_t *nv; const char *strval; uint64_t intval; zprop_source_t src = ZPROP_SRC_NONE; if (prop == VDEV_PROP_USERPROP) { /* user property, prop_name must contain the property name */ assert(prop_name != NULL); if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); strval = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { /* user prop not found */ return (-1); } (void) strlcpy(buf, strval, len); if (srctype) *srctype = src; return (0); } if (prop_name == NULL) prop_name = (char *)vdev_prop_to_name(prop); switch (vdev_prop_get_type(prop)) { case PROP_TYPE_STRING: if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); strval = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { src = ZPROP_SRC_DEFAULT; if ((strval = vdev_prop_default_string(prop)) == NULL) strval = "-"; } (void) strlcpy(buf, strval, len); break; case PROP_TYPE_NUMBER: if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); intval = fnvlist_lookup_uint64(nv, ZPROP_VALUE); } else { src = ZPROP_SRC_DEFAULT; intval = vdev_prop_default_numeric(prop); } switch (prop) { case VDEV_PROP_ASIZE: case VDEV_PROP_PSIZE: case VDEV_PROP_SIZE: case VDEV_PROP_BOOTSIZE: case VDEV_PROP_ALLOCATED: case VDEV_PROP_FREE: case VDEV_PROP_READ_ERRORS: case VDEV_PROP_WRITE_ERRORS: case VDEV_PROP_CHECKSUM_ERRORS: case VDEV_PROP_INITIALIZE_ERRORS: case VDEV_PROP_TRIM_ERRORS: case VDEV_PROP_SLOW_IOS: case VDEV_PROP_OPS_NULL: case VDEV_PROP_OPS_READ: case VDEV_PROP_OPS_WRITE: case VDEV_PROP_OPS_FREE: case VDEV_PROP_OPS_CLAIM: case VDEV_PROP_OPS_TRIM: case VDEV_PROP_BYTES_NULL: case VDEV_PROP_BYTES_READ: case VDEV_PROP_BYTES_WRITE: case VDEV_PROP_BYTES_FREE: case VDEV_PROP_BYTES_CLAIM: case VDEV_PROP_BYTES_TRIM: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicenum(intval, buf, len); } break; case VDEV_PROP_EXPANDSZ: if (intval == 0) { (void) strlcpy(buf, "-", len); } else if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicenum(intval, buf, len); } break; case VDEV_PROP_CAPACITY: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: case VDEV_PROP_SLOW_IO_N: case VDEV_PROP_SLOW_IO_T: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } break; case VDEV_PROP_FRAGMENTATION: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case VDEV_PROP_STATE: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) strlcpy(buf, zpool_state_to_name(intval, VDEV_AUX_NONE), len); } break; default: (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } break; case PROP_TYPE_INDEX: if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); intval = fnvlist_lookup_uint64(nv, ZPROP_VALUE); } else { /* 'trim_support' only valid for leaf vdevs */ if (prop == VDEV_PROP_TRIM_SUPPORT) { (void) strlcpy(buf, "-", len); break; } src = ZPROP_SRC_DEFAULT; intval = 
vdev_prop_default_numeric(prop); /* Only use if provided by the RAIDZ VDEV above */ if (prop == VDEV_PROP_RAIDZ_EXPANDING) return (ENOENT); } if (vdev_prop_index_to_string(prop, intval, (const char **)&strval) != 0) return (-1); (void) strlcpy(buf, strval, len); break; default: abort(); } if (srctype) *srctype = src; return (0); } /* * Get a vdev property value for 'prop_name' and return the value in * a pre-allocated buffer. */ int zpool_get_vdev_prop(zpool_handle_t *zhp, const char *vdevname, vdev_prop_t prop, char *prop_name, char *buf, size_t len, zprop_source_t *srctype, boolean_t literal) { nvlist_t *reqnvl, *reqprops; nvlist_t *retprops = NULL; uint64_t vdev_guid = 0; int ret; if ((ret = zpool_vdev_guid(zhp, vdevname, &vdev_guid)) != 0) return (ret); if (nvlist_alloc(&reqnvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); if (nvlist_alloc(&reqprops, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); fnvlist_add_uint64(reqnvl, ZPOOL_VDEV_PROPS_GET_VDEV, vdev_guid); if (prop != VDEV_PROP_USERPROP) { /* prop_name overrides prop value */ if (prop_name != NULL) prop = vdev_name_to_prop(prop_name); else prop_name = (char *)vdev_prop_to_name(prop); assert(prop < VDEV_NUM_PROPS); } assert(prop_name != NULL); if (nvlist_add_uint64(reqprops, prop_name, prop) != 0) { nvlist_free(reqnvl); nvlist_free(reqprops); return (no_memory(zhp->zpool_hdl)); } fnvlist_add_nvlist(reqnvl, ZPOOL_VDEV_PROPS_GET_PROPS, reqprops); ret = lzc_get_vdev_prop(zhp->zpool_name, reqnvl, &retprops); if (ret == 0) { ret = zpool_get_vdev_prop_value(retprops, prop, prop_name, buf, len, srctype, literal); } else { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get vdev property %s from" " %s in %s"), prop_name, vdevname, zhp->zpool_name); (void) zpool_standard_error(zhp->zpool_hdl, ret, errbuf); } nvlist_free(reqnvl); nvlist_free(reqprops); nvlist_free(retprops); return (ret); } /* * Get all vdev properties */ int zpool_get_all_vdev_props(zpool_handle_t *zhp, const char *vdevname, nvlist_t **outnvl) { nvlist_t *nvl = NULL; uint64_t vdev_guid = 0; int ret; if ((ret = zpool_vdev_guid(zhp, vdevname, &vdev_guid)) != 0) return (ret); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); fnvlist_add_uint64(nvl, ZPOOL_VDEV_PROPS_GET_VDEV, vdev_guid); ret = lzc_get_vdev_prop(zhp->zpool_name, nvl, outnvl); nvlist_free(nvl); if (ret) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get vdev properties for" " %s in %s"), vdevname, zhp->zpool_name); (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); } return (ret); } /* * Set vdev property */ int zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname, const char *propname, const char *propval) { int ret; nvlist_t *nvl = NULL; nvlist_t *outnvl = NULL; nvlist_t *props; nvlist_t *realprops; prop_flags_t flags = { 0 }; uint64_t version; uint64_t vdev_guid; if ((ret = zpool_vdev_guid(zhp, vdevname, &vdev_guid)) != 0) return (ret); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); fnvlist_add_uint64(nvl, ZPOOL_VDEV_PROPS_SET_VDEV, vdev_guid); if (nvlist_add_string(props, propname, propval) != 0) { nvlist_free(props); return (no_memory(zhp->zpool_hdl)); } char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property %s for %s on %s"), propname, vdevname, 
zhp->zpool_name); flags.vdevprop = 1; version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, zhp->zpool_name, props, version, flags, errbuf)) == NULL) { nvlist_free(props); nvlist_free(nvl); return (-1); } nvlist_free(props); props = realprops; fnvlist_add_nvlist(nvl, ZPOOL_VDEV_PROPS_SET_PROPS, props); ret = lzc_set_vdev_prop(zhp->zpool_name, nvl, &outnvl); nvlist_free(props); nvlist_free(nvl); nvlist_free(outnvl); if (ret) (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); return (ret); } + +/* + * Prune older entries from the DDT to reclaim space under the quota + */ +int +zpool_ddt_prune(zpool_handle_t *zhp, zpool_ddt_prune_unit_t unit, + uint64_t amount) +{ + int error = lzc_ddt_prune(zhp->zpool_name, unit, amount); + if (error != 0) { + libzfs_handle_t *hdl = zhp->zpool_hdl; + char errbuf[ERRBUFLEN]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot prune dedup table on '%s'"), zhp->zpool_name); + + if (error == EALREADY) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "a prune operation is already in progress")); + (void) zfs_error(hdl, EZFS_BUSY, errbuf); + } else { + (void) zpool_standard_error(hdl, errno, errbuf); + } + return (-1); + } + + return (0); +} diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 1062a6b52dff..5ee6b8e09d6d 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -1,3036 +1,3051 @@ + + + + + + + + + + + + + + + diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index ec8b0ff4f61c..d07fca6cebad 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -1,1929 +1,1951 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019 Datto Inc. */ /* * LibZFS_Core (lzc) is intended to replace most functionality in libzfs. * It has the following characteristics: * * - Thread Safe. libzfs_core is accessible concurrently from multiple * threads. This is accomplished primarily by avoiding global data * (e.g. caching). Since it's thread-safe, there is no reason for a * process to have multiple libzfs "instances". Therefore, we store * our few pieces of data (e.g. the file descriptor) in global * variables. The fd is reference-counted so that the libzfs_core * library can be "initialized" multiple times (e.g. by different * consumers within the same process). * * - Committed Interface. 
The libzfs_core interface will be committed, * therefore consumers can compile against it and be confident that * their code will continue to work on future releases of this code. * Currently, the interface is Evolving (not Committed), but we intend * to commit to it once it is more complete and we determine that it * meets the needs of all consumers. * * - Programmatic Error Handling. libzfs_core communicates errors with * defined error numbers, and doesn't print anything to stdout/stderr. * * - Thin Layer. libzfs_core is a thin layer, marshaling arguments * to/from the kernel ioctls. There is generally a 1:1 correspondence * between libzfs_core functions and ioctls to ZFS_DEV. * * - Clear Atomicity. Because libzfs_core functions are generally 1:1 * with kernel ioctls, and kernel ioctls are general atomic, each * libzfs_core function is atomic. For example, creating multiple * snapshots with a single call to lzc_snapshot() is atomic -- it * can't fail with only some of the requested snapshots created, even * in the event of power loss or system crash. * * - Continued libzfs Support. Some higher-level operations (e.g. * support for "zfs send -R") are too complicated to fit the scope of * libzfs_core. This functionality will continue to live in libzfs. * Where appropriate, libzfs will use the underlying atomic operations * of libzfs_core. For example, libzfs may implement "zfs send -R | * zfs receive" by using individual "send one snapshot", rename, * destroy, and "receive one snapshot" operations in libzfs_core. * /sbin/zfs and /sbin/zpool will link with both libzfs and * libzfs_core. Other consumers should aim to use only libzfs_core, * since that will be the supported, stable interface going forwards. */ #include #include #include #include #include #ifdef ZFS_DEBUG #include #endif #include #include #include #include #include #include #include #include #include #if __FreeBSD__ #define BIG_PIPE_SIZE (64 * 1024) /* From sys/pipe.h */ #endif static int g_fd = -1; static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; static int g_refcount; #ifdef ZFS_DEBUG static zfs_ioc_t fail_ioc_cmd = ZFS_IOC_LAST; static zfs_errno_t fail_ioc_err; static void libzfs_core_debug_ioc(void) { /* * To test running newer user space binaries with kernel's * that don't yet support an ioctl or a new ioctl arg we * provide an override to intentionally fail an ioctl. * * USAGE: * The override variable, ZFS_IOC_TEST, is of the form "cmd:err" * * For example, to fail a ZFS_IOC_POOL_CHECKPOINT with a * ZFS_ERR_IOC_CMD_UNAVAIL, the string would be "0x5a4d:1029" * * $ sudo sh -c "ZFS_IOC_TEST=0x5a4d:1029 zpool checkpoint tank" * cannot checkpoint 'tank': the loaded zfs module does not support * this operation. A reboot may be required to enable this operation. 
*/ if (fail_ioc_cmd == ZFS_IOC_LAST) { char *ioc_test = getenv("ZFS_IOC_TEST"); unsigned int ioc_num = 0, ioc_err = 0; if (ioc_test != NULL && sscanf(ioc_test, "%i:%i", &ioc_num, &ioc_err) == 2 && ioc_num < ZFS_IOC_LAST) { fail_ioc_cmd = ioc_num; fail_ioc_err = ioc_err; } } } #endif int libzfs_core_init(void) { (void) pthread_mutex_lock(&g_lock); if (g_refcount == 0) { g_fd = open(ZFS_DEV, O_RDWR|O_CLOEXEC); if (g_fd < 0) { (void) pthread_mutex_unlock(&g_lock); return (errno); } } g_refcount++; #ifdef ZFS_DEBUG libzfs_core_debug_ioc(); #endif (void) pthread_mutex_unlock(&g_lock); return (0); } void libzfs_core_fini(void) { (void) pthread_mutex_lock(&g_lock); ASSERT3S(g_refcount, >, 0); g_refcount--; if (g_refcount == 0 && g_fd != -1) { (void) close(g_fd); g_fd = -1; } (void) pthread_mutex_unlock(&g_lock); } static int lzc_ioctl(zfs_ioc_t ioc, const char *name, nvlist_t *source, nvlist_t **resultp) { zfs_cmd_t zc = {"\0"}; int error = 0; char *packed = NULL; size_t size = 0; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); #ifdef ZFS_DEBUG if (ioc == fail_ioc_cmd) return (fail_ioc_err); #endif if (name != NULL) (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); if (source != NULL) { packed = fnvlist_pack(source, &size); zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; zc.zc_nvlist_src_size = size; } if (resultp != NULL) { *resultp = NULL; if (ioc == ZFS_IOC_CHANNEL_PROGRAM) { zc.zc_nvlist_dst_size = fnvlist_lookup_uint64(source, ZCP_ARG_MEMLIMIT); } else { zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024); } zc.zc_nvlist_dst = (uint64_t)(uintptr_t) malloc(zc.zc_nvlist_dst_size); if (zc.zc_nvlist_dst == (uint64_t)0) { error = ENOMEM; goto out; } } while (lzc_ioctl_fd(g_fd, ioc, &zc) != 0) { /* * If ioctl exited with ENOMEM, we retry the ioctl after * increasing the size of the destination nvlist. * * Channel programs that exit with ENOMEM ran over the * lua memory sandbox; they should not be retried. 
*/ if (errno == ENOMEM && resultp != NULL && ioc != ZFS_IOC_CHANNEL_PROGRAM) { free((void *)(uintptr_t)zc.zc_nvlist_dst); zc.zc_nvlist_dst_size *= 2; zc.zc_nvlist_dst = (uint64_t)(uintptr_t) malloc(zc.zc_nvlist_dst_size); if (zc.zc_nvlist_dst == (uint64_t)0) { error = ENOMEM; goto out; } } else { error = errno; break; } } if (zc.zc_nvlist_dst_filled && resultp != NULL) { *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, zc.zc_nvlist_dst_size); } out: if (packed != NULL) fnvlist_pack_free(packed, size); free((void *)(uintptr_t)zc.zc_nvlist_dst); return (error); } int lzc_scrub(zfs_ioc_t ioc, const char *name, nvlist_t *source, nvlist_t **resultp) { return (lzc_ioctl(ioc, name, source, resultp)); } int lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props, uint8_t *wkeydata, uint_t wkeylen) { int error; nvlist_t *hidden_args = NULL; nvlist_t *args = fnvlist_alloc(); fnvlist_add_int32(args, "type", (dmu_objset_type_t)type); if (props != NULL) fnvlist_add_nvlist(args, "props", props); if (wkeydata != NULL) { hidden_args = fnvlist_alloc(); fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen); fnvlist_add_nvlist(args, ZPOOL_HIDDEN_ARGS, hidden_args); } error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL); nvlist_free(hidden_args); nvlist_free(args); return (error); } int lzc_clone(const char *fsname, const char *origin, nvlist_t *props) { int error; nvlist_t *hidden_args = NULL; nvlist_t *args = fnvlist_alloc(); fnvlist_add_string(args, "origin", origin); if (props != NULL) fnvlist_add_nvlist(args, "props", props); error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL); nvlist_free(hidden_args); nvlist_free(args); return (error); } int lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen) { /* * The promote ioctl is still legacy, so we need to construct our * own zfs_cmd_t rather than using lzc_ioctl(). */ zfs_cmd_t zc = {"\0"}; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); if (lzc_ioctl_fd(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) { int error = errno; if (error == EEXIST && snapnamebuf != NULL) (void) strlcpy(snapnamebuf, zc.zc_string, snapnamelen); return (error); } return (0); } int lzc_rename(const char *source, const char *target) { zfs_cmd_t zc = {"\0"}; int error; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); (void) strlcpy(zc.zc_name, source, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); error = lzc_ioctl_fd(g_fd, ZFS_IOC_RENAME, &zc); if (error != 0) error = errno; return (error); } int lzc_destroy(const char *fsname) { int error; nvlist_t *args = fnvlist_alloc(); error = lzc_ioctl(ZFS_IOC_DESTROY, fsname, args, NULL); nvlist_free(args); return (error); } /* * Creates snapshots. * * The keys in the snaps nvlist are the snapshots to be created. * They must all be in the same pool. * * The props nvlist is properties to set. Currently only user properties * are supported. { user:prop_name -> string value } * * The returned results nvlist will have an entry for each snapshot that failed. * The value will be the (int32) error code. * * The return value will be 0 if all snapshots were created, otherwise it will * be the errno of a (unspecified) snapshot that failed. 
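 *
 * A minimal usage sketch (illustrative only; the snapshot names are
 * assumptions): create two snapshots atomically.
 *
 *	nvlist_t *snaps = fnvlist_alloc();
 *	nvlist_t *errlist = NULL;
 *	fnvlist_add_boolean(snaps, "tank/fs1@backup");
 *	fnvlist_add_boolean(snaps, "tank/fs2@backup");
 *	int err = lzc_snapshot(snaps, NULL, &errlist);
 *	nvlist_free(snaps);
 *	nvlist_free(errlist);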
*/ int lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist) { nvpair_t *elem; nvlist_t *args; int error; char pool[ZFS_MAX_DATASET_NAME_LEN]; *errlist = NULL; /* determine the pool name */ elem = nvlist_next_nvpair(snaps, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; args = fnvlist_alloc(); fnvlist_add_nvlist(args, "snaps", snaps); if (props != NULL) fnvlist_add_nvlist(args, "props", props); error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist); nvlist_free(args); return (error); } /* * Destroys snapshots. * * The keys in the snaps nvlist are the snapshots to be destroyed. * They must all be in the same pool. * * Snapshots that do not exist will be silently ignored. * * If 'defer' is not set, and a snapshot has user holds or clones, the * destroy operation will fail and none of the snapshots will be * destroyed. * * If 'defer' is set, and a snapshot has user holds or clones, it will be * marked for deferred destruction, and will be destroyed when the last hold * or clone is removed/destroyed. * * The return value will be 0 if all snapshots were destroyed (or marked for * later destruction if 'defer' is set) or didn't exist to begin with. * * Otherwise the return value will be the errno of a (unspecified) snapshot * that failed, no snapshots will be destroyed, and the errlist will have an * entry for each snapshot that failed. The value in the errlist will be * the (int32) error code. */ int lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist) { nvpair_t *elem; nvlist_t *args; int error; char pool[ZFS_MAX_DATASET_NAME_LEN]; /* determine the pool name */ elem = nvlist_next_nvpair(snaps, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; args = fnvlist_alloc(); fnvlist_add_nvlist(args, "snaps", snaps); if (defer) fnvlist_add_boolean(args, "defer"); error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist); nvlist_free(args); return (error); } int lzc_snaprange_space(const char *firstsnap, const char *lastsnap, uint64_t *usedp) { nvlist_t *args; nvlist_t *result; int err; char fs[ZFS_MAX_DATASET_NAME_LEN]; char *atp; /* determine the fs name */ (void) strlcpy(fs, firstsnap, sizeof (fs)); atp = strchr(fs, '@'); if (atp == NULL) return (EINVAL); *atp = '\0'; args = fnvlist_alloc(); fnvlist_add_string(args, "firstsnap", firstsnap); err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result); nvlist_free(args); if (err == 0) *usedp = fnvlist_lookup_uint64(result, "used"); fnvlist_free(result); return (err); } boolean_t lzc_exists(const char *dataset) { /* * The objset_stats ioctl is still legacy, so we need to construct our * own zfs_cmd_t rather than using lzc_ioctl(). */ zfs_cmd_t zc = {"\0"}; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); return (lzc_ioctl_fd(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0); } /* * outnvl is unused. * It was added to preserve the function signature in case it is * needed in the future. */ int lzc_sync(const char *pool_name, nvlist_t *innvl, nvlist_t **outnvl) { (void) outnvl; return (lzc_ioctl(ZFS_IOC_POOL_SYNC, pool_name, innvl, NULL)); } /* * Create "user holds" on snapshots. If there is a hold on a snapshot, * the snapshot can not be destroyed. (However, it can be marked for deletion * by lzc_destroy_snaps(defer=B_TRUE).) * * The keys in the nvlist are snapshot names. 
* The snapshots must all be in the same pool. * The value is the name of the hold (string type). * * If cleanup_fd is not -1, it must be the result of open(ZFS_DEV, O_EXCL). * In this case, when the cleanup_fd is closed (including on process * termination), the holds will be released. If the system is shut down * uncleanly, the holds will be released when the pool is next opened * or imported. * * Holds for snapshots which don't exist will be skipped and have an entry * added to errlist, but will not cause an overall failure. * * The return value will be 0 if all holds, for snapshots that existed, * were successfully created. * * Otherwise the return value will be the errno of a (unspecified) hold that * failed and no holds will be created. * * In all cases the errlist will have an entry for each hold that failed * (name = snapshot), with its value being the error code (int32). */ int lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist) { char pool[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *args; nvpair_t *elem; int error; /* determine the pool name */ elem = nvlist_next_nvpair(holds, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; args = fnvlist_alloc(); fnvlist_add_nvlist(args, "holds", holds); if (cleanup_fd != -1) fnvlist_add_int32(args, "cleanup_fd", cleanup_fd); error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist); nvlist_free(args); return (error); } /* * Release "user holds" on snapshots. If the snapshot has been marked for * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have * any clones, and all the user holds are removed, then the snapshot will be * destroyed. * * The keys in the nvlist are snapshot names. * The snapshots must all be in the same pool. * The value is an nvlist whose keys are the holds to remove. * * Holds which failed to release because they didn't exist will have an entry * added to errlist, but will not cause an overall failure. * * The return value will be 0 if the nvl holds was empty or all holds that * existed, were successfully removed. * * Otherwise the return value will be the errno of a (unspecified) hold that * failed to release and no holds will be released. * * In all cases the errlist will have an entry for each hold that failed to * to release. */ int lzc_release(nvlist_t *holds, nvlist_t **errlist) { char pool[ZFS_MAX_DATASET_NAME_LEN]; nvpair_t *elem; /* determine the pool name */ elem = nvlist_next_nvpair(holds, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist)); } /* * Retrieve list of user holds on the specified snapshot. * * On success, *holdsp will be set to an nvlist which the caller must free. * The keys are the names of the holds, and the value is the creation time * of the hold (uint64) in seconds since the epoch. 
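 *
 * Illustrative sketch of the hold round trip (hypothetical names,
 * error handling omitted; -1 means no cleanup fd):
 *
 *	nvlist_t *holds = fnvlist_alloc();
 *	fnvlist_add_string(holds, "tank/fs@snap", "my-hold");
 *	nvlist_t *errlist = NULL;
 *	int err = lzc_hold(holds, -1, &errlist);
 *	fnvlist_free(holds);
 *	nvlist_free(errlist);
 *
 *	nvlist_t *got = NULL;
 *	err = lzc_get_holds("tank/fs@snap", &got);
 *	nvlist_free(got);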
*/ int lzc_get_holds(const char *snapname, nvlist_t **holdsp) { return (lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, NULL, holdsp)); } int lzc_get_props(const char *poolname, nvlist_t **props) { return (lzc_ioctl(ZFS_IOC_POOL_GET_PROPS, poolname, NULL, props)); } static unsigned int max_pipe_buffer(int infd) { #if __linux__ static unsigned int max; if (max == 0) { max = 1048576; /* fs/pipe.c default */ FILE *procf = fopen("/proc/sys/fs/pipe-max-size", "re"); if (procf != NULL) { if (fscanf(procf, "%u", &max) <= 0) { /* ignore error: max untouched if parse fails */ } fclose(procf); } } unsigned int cur = fcntl(infd, F_GETPIPE_SZ); /* * Sadly, Linux has an unfixed deadlock if you do SETPIPE_SZ on a pipe * with data in it. * cf. #13232, https://bugzilla.kernel.org/show_bug.cgi?id=212295 * * And since the problem is in waking up the writer, there's nothing * we can do about it from here. * * So if people want to, they can set this, but they * may regret it... */ if (getenv("ZFS_SET_PIPE_MAX") == NULL) return (cur); if (cur < max && fcntl(infd, F_SETPIPE_SZ, max) != -1) cur = max; return (cur); #else /* FreeBSD automatically resizes */ (void) infd; return (BIG_PIPE_SIZE); #endif } #if __linux__ struct send_worker_ctx { int from; /* read end of pipe, with send data; closed on exit */ int to; /* original arbitrary output fd; mustn't be a pipe */ }; static void * send_worker(void *arg) { struct send_worker_ctx *ctx = arg; unsigned int bufsiz = max_pipe_buffer(ctx->from); ssize_t rd; for (;;) { rd = splice(ctx->from, NULL, ctx->to, NULL, bufsiz, SPLICE_F_MOVE | SPLICE_F_MORE); if ((rd == -1 && errno != EINTR) || rd == 0) break; } int err = (rd == -1) ? errno : 0; close(ctx->from); return ((void *)(uintptr_t)err); } #endif /* * Since Linux 5.10, 4d03e3cc59828c82ee89ea6e27a2f3cdf95aaadf * ("fs: don't allow kernel reads and writes without iter ops"), * ZFS_IOC_SEND* will EINVAL when writing to /dev/null, /dev/zero, &c. * * This wrapper transparently executes func() with a pipe * by spawning a thread to copy from that pipe to the original output * in the background. * * Returns the error from func(), if nonzero, * otherwise the error from the thread. * * No-op if orig_fd is -1, already a pipe (but the buffer size is bumped), * and on not-Linux; as such, it is safe to wrap/call wrapped functions * in a wrapped context. */ int lzc_send_wrapper(int (*func)(int, void *), int orig_fd, void *data) { #if __linux__ struct stat sb; if (orig_fd != -1 && fstat(orig_fd, &sb) == -1) return (errno); if (orig_fd == -1 || S_ISFIFO(sb.st_mode)) { if (orig_fd != -1) (void) max_pipe_buffer(orig_fd); return (func(orig_fd, data)); } if ((fcntl(orig_fd, F_GETFL) & O_ACCMODE) == O_RDONLY) return (errno = EBADF); int rw[2]; if (pipe2(rw, O_CLOEXEC) == -1) return (errno); int err; pthread_t send_thread; struct send_worker_ctx ctx = {.from = rw[0], .to = orig_fd}; if ((err = pthread_create(&send_thread, NULL, send_worker, &ctx)) != 0) { close(rw[0]); close(rw[1]); return (errno = err); } err = func(rw[1], data); void *send_err; close(rw[1]); pthread_join(send_thread, &send_err); if (err == 0 && send_err != 0) errno = err = (uintptr_t)send_err; return (err); #else return (func(orig_fd, data)); #endif } /* * Generate a zfs send stream for the specified snapshot and write it to * the specified file descriptor. * * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap") * * If "from" is NULL, a full (non-incremental) stream will be sent. 
* If "from" is non-NULL, it must be the full name of a snapshot or * bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or * "pool/fs#earlier_bmark"). If non-NULL, the specified snapshot or * bookmark must represent an earlier point in the history of "snapname"). * It can be an earlier snapshot in the same filesystem or zvol as "snapname", * or it can be the origin of "snapname"'s filesystem, or an earlier * snapshot in the origin, etc. * * "fd" is the file descriptor to write the send stream to. * * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT * records with drr_blksz > 128K. * * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA, * which the receiving system must support (as indicated by support * for the "embedded_data" feature). * * If "flags" contains LZC_SEND_FLAG_COMPRESS, the stream is generated by using * compressed WRITE records for blocks which are compressed on disk and in * memory. If the lz4_compress feature is active on the sending system, then * the receiving system must have that feature enabled as well. * * If "flags" contains LZC_SEND_FLAG_RAW, the stream is generated, for encrypted * datasets, by sending data exactly as it exists on disk. This allows backups * to be taken even if encryption keys are not currently loaded. */ int lzc_send(const char *snapname, const char *from, int fd, enum lzc_send_flags flags) { return (lzc_send_resume_redacted(snapname, from, fd, flags, 0, 0, NULL)); } int lzc_send_redacted(const char *snapname, const char *from, int fd, enum lzc_send_flags flags, const char *redactbook) { return (lzc_send_resume_redacted(snapname, from, fd, flags, 0, 0, redactbook)); } int lzc_send_resume(const char *snapname, const char *from, int fd, enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff) { return (lzc_send_resume_redacted(snapname, from, fd, flags, resumeobj, resumeoff, NULL)); } /* * snapname: The name of the "tosnap", or the snapshot whose contents we are * sending. * from: The name of the "fromsnap", or the incremental source. * fd: File descriptor to write the stream to. * flags: flags that determine features to be used by the stream. * resumeobj: Object to resume from, for resuming send * resumeoff: Offset to resume from, for resuming send. * redactnv: nvlist of string -> boolean(ignored) containing the names of all * the snapshots that we should redact with respect to. * redactbook: Name of the redaction bookmark to create. * * Pre-wrapped. 
*/ static int lzc_send_resume_redacted_cb_impl(const char *snapname, const char *from, int fd, enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, const char *redactbook) { nvlist_t *args; int err; args = fnvlist_alloc(); fnvlist_add_int32(args, "fd", fd); if (from != NULL) fnvlist_add_string(args, "fromsnap", from); if (flags & LZC_SEND_FLAG_LARGE_BLOCK) fnvlist_add_boolean(args, "largeblockok"); if (flags & LZC_SEND_FLAG_EMBED_DATA) fnvlist_add_boolean(args, "embedok"); if (flags & LZC_SEND_FLAG_COMPRESS) fnvlist_add_boolean(args, "compressok"); if (flags & LZC_SEND_FLAG_RAW) fnvlist_add_boolean(args, "rawok"); if (flags & LZC_SEND_FLAG_SAVED) fnvlist_add_boolean(args, "savedok"); if (resumeobj != 0 || resumeoff != 0) { fnvlist_add_uint64(args, "resume_object", resumeobj); fnvlist_add_uint64(args, "resume_offset", resumeoff); } if (redactbook != NULL) fnvlist_add_string(args, "redactbook", redactbook); err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); nvlist_free(args); return (err); } struct lzc_send_resume_redacted { const char *snapname; const char *from; enum lzc_send_flags flags; uint64_t resumeobj; uint64_t resumeoff; const char *redactbook; }; static int lzc_send_resume_redacted_cb(int fd, void *arg) { struct lzc_send_resume_redacted *zsrr = arg; return (lzc_send_resume_redacted_cb_impl(zsrr->snapname, zsrr->from, fd, zsrr->flags, zsrr->resumeobj, zsrr->resumeoff, zsrr->redactbook)); } int lzc_send_resume_redacted(const char *snapname, const char *from, int fd, enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, const char *redactbook) { struct lzc_send_resume_redacted zsrr = { .snapname = snapname, .from = from, .flags = flags, .resumeobj = resumeobj, .resumeoff = resumeoff, .redactbook = redactbook, }; return (lzc_send_wrapper(lzc_send_resume_redacted_cb, fd, &zsrr)); } /* * "from" can be NULL, a snapshot, or a bookmark. * * If from is NULL, a full (non-incremental) stream will be estimated. This * is calculated very efficiently. * * If from is a snapshot, lzc_send_space uses the deadlists attached to * each snapshot to efficiently estimate the stream size. * * If from is a bookmark, the indirect blocks in the destination snapshot * are traversed, looking for blocks with a birth time since the creation TXG of * the snapshot this bookmark was created from. This will result in * significantly more I/O and be less efficient than a send space estimation on * an equivalent snapshot. This process is also used if redact_snaps is * non-null. * * Pre-wrapped. 
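 *
 * Illustrative sketch of the simple lzc_send_space() wrapper
 * (hypothetical names, error handling omitted):
 *
 *	uint64_t bytes = 0;
 *	int err = lzc_send_space("tank/fs@today", "tank/fs@yesterday",
 *	    0, &bytes);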
*/ static int lzc_send_space_resume_redacted_cb_impl(const char *snapname, const char *from, enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, uint64_t resume_bytes, const char *redactbook, int fd, uint64_t *spacep) { nvlist_t *args; nvlist_t *result; int err; args = fnvlist_alloc(); if (from != NULL) fnvlist_add_string(args, "from", from); if (flags & LZC_SEND_FLAG_LARGE_BLOCK) fnvlist_add_boolean(args, "largeblockok"); if (flags & LZC_SEND_FLAG_EMBED_DATA) fnvlist_add_boolean(args, "embedok"); if (flags & LZC_SEND_FLAG_COMPRESS) fnvlist_add_boolean(args, "compressok"); if (flags & LZC_SEND_FLAG_RAW) fnvlist_add_boolean(args, "rawok"); if (resumeobj != 0 || resumeoff != 0) { fnvlist_add_uint64(args, "resume_object", resumeobj); fnvlist_add_uint64(args, "resume_offset", resumeoff); fnvlist_add_uint64(args, "bytes", resume_bytes); } if (redactbook != NULL) fnvlist_add_string(args, "redactbook", redactbook); if (fd != -1) fnvlist_add_int32(args, "fd", fd); err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); nvlist_free(args); if (err == 0) *spacep = fnvlist_lookup_uint64(result, "space"); nvlist_free(result); return (err); } struct lzc_send_space_resume_redacted { const char *snapname; const char *from; enum lzc_send_flags flags; uint64_t resumeobj; uint64_t resumeoff; uint64_t resume_bytes; const char *redactbook; uint64_t *spacep; }; static int lzc_send_space_resume_redacted_cb(int fd, void *arg) { struct lzc_send_space_resume_redacted *zssrr = arg; return (lzc_send_space_resume_redacted_cb_impl(zssrr->snapname, zssrr->from, zssrr->flags, zssrr->resumeobj, zssrr->resumeoff, zssrr->resume_bytes, zssrr->redactbook, fd, zssrr->spacep)); } int lzc_send_space_resume_redacted(const char *snapname, const char *from, enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, uint64_t resume_bytes, const char *redactbook, int fd, uint64_t *spacep) { struct lzc_send_space_resume_redacted zssrr = { .snapname = snapname, .from = from, .flags = flags, .resumeobj = resumeobj, .resumeoff = resumeoff, .resume_bytes = resume_bytes, .redactbook = redactbook, .spacep = spacep, }; return (lzc_send_wrapper(lzc_send_space_resume_redacted_cb, fd, &zssrr)); } int lzc_send_space(const char *snapname, const char *from, enum lzc_send_flags flags, uint64_t *spacep) { return (lzc_send_space_resume_redacted(snapname, from, flags, 0, 0, 0, NULL, -1, spacep)); } static int recv_read(int fd, void *buf, int ilen) { char *cp = buf; int rv; int len = ilen; do { rv = read(fd, cp, len); cp += rv; len -= rv; } while (rv > 0); if (rv < 0 || len != 0) return (EIO); return (0); } /* * Linux adds ZFS_IOC_RECV_NEW for resumable and raw streams and preserves the * legacy ZFS_IOC_RECV user/kernel interface. The new interface supports all * stream options but is currently only used for resumable streams. This way * updated user space utilities will interoperate with older kernel modules. * * Non-Linux OpenZFS platforms have opted to modify the legacy interface. 
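 *
 * As a rough summary of the dispatch below: resumable, healing, raw, and
 * raw-key (wkeydata) receives, as well as begin records carrying a
 * payload, are routed to ZFS_IOC_RECV_NEW; all other receives use the
 * legacy ZFS_IOC_RECV path.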
*/ static int recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, boolean_t force, boolean_t heal, boolean_t resumable, boolean_t raw, int input_fd, const dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { dmu_replay_record_t drr; char fsname[MAXPATHLEN]; char *atp; int error; boolean_t payload = B_FALSE; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); /* Set 'fsname' to the name of containing filesystem */ (void) strlcpy(fsname, snapname, sizeof (fsname)); atp = strchr(fsname, '@'); if (atp == NULL) return (EINVAL); *atp = '\0'; /* If the fs does not exist, try its parent. */ if (!lzc_exists(fsname)) { char *slashp = strrchr(fsname, '/'); if (slashp == NULL) return (ENOENT); *slashp = '\0'; } /* * It is not uncommon for gigabytes to be processed by zfs receive. * Speculatively increase the buffer size if supported by the platform. */ struct stat sb; if (fstat(input_fd, &sb) == -1) return (errno); if (S_ISFIFO(sb.st_mode)) (void) max_pipe_buffer(input_fd); /* * The begin_record is normally a non-byteswapped BEGIN record. * For resumable streams it may be set to any non-byteswapped * dmu_replay_record_t. */ if (begin_record == NULL) { error = recv_read(input_fd, &drr, sizeof (drr)); if (error != 0) return (error); } else { drr = *begin_record; payload = (begin_record->drr_payloadlen != 0); } /* * All receives with a payload should use the new interface. */ if (resumable || heal || raw || wkeydata != NULL || payload) { nvlist_t *outnvl = NULL; nvlist_t *innvl = fnvlist_alloc(); fnvlist_add_string(innvl, "snapname", snapname); if (recvdprops != NULL) fnvlist_add_nvlist(innvl, "props", recvdprops); if (localprops != NULL) fnvlist_add_nvlist(innvl, "localprops", localprops); if (wkeydata != NULL) { /* * wkeydata must be placed in the special * ZPOOL_HIDDEN_ARGS nvlist so that it * will not be printed to the zpool history. 
*/ nvlist_t *hidden_args = fnvlist_alloc(); fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen); fnvlist_add_nvlist(innvl, ZPOOL_HIDDEN_ARGS, hidden_args); nvlist_free(hidden_args); } if (origin != NULL && strlen(origin)) fnvlist_add_string(innvl, "origin", origin); fnvlist_add_byte_array(innvl, "begin_record", (uchar_t *)&drr, sizeof (drr)); fnvlist_add_int32(innvl, "input_fd", input_fd); if (force) fnvlist_add_boolean(innvl, "force"); if (resumable) fnvlist_add_boolean(innvl, "resumable"); if (heal) fnvlist_add_boolean(innvl, "heal"); error = lzc_ioctl(ZFS_IOC_RECV_NEW, fsname, innvl, &outnvl); if (error == 0 && read_bytes != NULL) error = nvlist_lookup_uint64(outnvl, "read_bytes", read_bytes); if (error == 0 && errflags != NULL) error = nvlist_lookup_uint64(outnvl, "error_flags", errflags); if (error == 0 && errors != NULL) { nvlist_t *nvl; error = nvlist_lookup_nvlist(outnvl, "errors", &nvl); if (error == 0) *errors = fnvlist_dup(nvl); } fnvlist_free(innvl); fnvlist_free(outnvl); } else { zfs_cmd_t zc = {"\0"}; char *rp_packed = NULL; char *lp_packed = NULL; size_t size; ASSERT3S(g_refcount, >, 0); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); if (recvdprops != NULL) { rp_packed = fnvlist_pack(recvdprops, &size); zc.zc_nvlist_src = (uint64_t)(uintptr_t)rp_packed; zc.zc_nvlist_src_size = size; } if (localprops != NULL) { lp_packed = fnvlist_pack(localprops, &size); zc.zc_nvlist_conf = (uint64_t)(uintptr_t)lp_packed; zc.zc_nvlist_conf_size = size; } if (origin != NULL) (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string)); ASSERT3S(drr.drr_type, ==, DRR_BEGIN); zc.zc_begin_record = drr.drr_u.drr_begin; zc.zc_guid = force; zc.zc_cookie = input_fd; zc.zc_cleanup_fd = -1; zc.zc_action_handle = 0; zc.zc_nvlist_dst_size = 128 * 1024; zc.zc_nvlist_dst = (uint64_t)(uintptr_t) malloc(zc.zc_nvlist_dst_size); error = lzc_ioctl_fd(g_fd, ZFS_IOC_RECV, &zc); if (error != 0) { error = errno; } else { if (read_bytes != NULL) *read_bytes = zc.zc_cookie; if (errflags != NULL) *errflags = zc.zc_obj; if (errors != NULL) VERIFY0(nvlist_unpack( (void *)(uintptr_t)zc.zc_nvlist_dst, zc.zc_nvlist_dst_size, errors, KM_SLEEP)); } if (rp_packed != NULL) fnvlist_pack_free(rp_packed, size); if (lp_packed != NULL) fnvlist_pack_free(lp_packed, size); free((void *)(uintptr_t)zc.zc_nvlist_dst); } return (error); } /* * The simplest receive case: receive from the specified fd, creating the * specified snapshot. Apply the specified properties as "received" properties * (which can be overridden by locally-set properties). If the stream is a * clone, its origin snapshot must be specified by 'origin'. The 'force' * flag will cause the target filesystem to be rolled back or destroyed if * necessary to receive. * * Return 0 on success or an errno on failure. * * Note: this interface does not work on dedup'd streams * (those with DMU_BACKUP_FEATURE_DEDUP). */ int lzc_receive(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t raw, int fd) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, B_FALSE, B_FALSE, raw, fd, NULL, NULL, NULL, NULL)); } /* * Like lzc_receive, but if the receive fails due to premature stream * termination, the intermediate state will be preserved on disk. In this * case, ECKSUM will be returned. The receive may subsequently be resumed * with a resuming send stream generated by lzc_send_resume(). 
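 *
 * Illustrative sketch of the non-resumable case (hypothetical names;
 * "fd" is assumed to be an open file descriptor positioned at the start
 * of a send stream):
 *
 *	int err = lzc_receive("tank/restored@today", NULL, NULL,
 *	    B_FALSE, B_FALSE, fd);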
*/ int lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t raw, int fd) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, B_FALSE, B_TRUE, raw, fd, NULL, NULL, NULL, NULL)); } /* * Like lzc_receive, but allows the caller to read the begin record and then to * pass it in. That could be useful if the caller wants to derive, for example, * the snapname or the origin parameters based on the information contained in * the begin record. * The begin record must be in its original form as read from the stream, * in other words, it should not be byteswapped. * * The 'resumable' parameter allows to obtain the same behavior as with * lzc_receive_resumable. */ int lzc_receive_with_header(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, int fd, const dmu_replay_record_t *begin_record) { if (begin_record == NULL) return (EINVAL); return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, B_FALSE, resumable, raw, fd, begin_record, NULL, NULL, NULL)); } /* * Like lzc_receive, but allows the caller to pass all supported arguments * and retrieve all values returned. The only additional input parameter * is 'cleanup_fd' which is used to set a cleanup-on-exit file descriptor. * * The following parameters all provide return values. Several may be set * in the failure case and will contain additional information. * * The 'read_bytes' value will be set to the total number of bytes read. * * The 'errflags' value will contain zprop_errflags_t flags which are * used to describe any failures. * * The 'action_handle' and 'cleanup_fd' are no longer used, and are ignored. * * The 'errors' nvlist contains an entry for each unapplied received * property. Callers are responsible for freeing this nvlist. */ int lzc_receive_one(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, nvlist_t **errors) { (void) action_handle, (void) cleanup_fd; return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, B_FALSE, resumable, raw, input_fd, begin_record, read_bytes, errflags, errors)); } /* * Like lzc_receive_one, but allows the caller to pass an additional 'cmdprops' * argument. * * The 'cmdprops' nvlist contains both override ('zfs receive -o') and * exclude ('zfs receive -x') properties. Callers are responsible for freeing * this nvlist */ int lzc_receive_with_cmdprops(const char *snapname, nvlist_t *props, nvlist_t *cmdprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, nvlist_t **errors) { (void) action_handle, (void) cleanup_fd; return (recv_impl(snapname, props, cmdprops, wkeydata, wkeylen, origin, force, B_FALSE, resumable, raw, input_fd, begin_record, read_bytes, errflags, errors)); } /* * Like lzc_receive_with_cmdprops, but allows the caller to pass an additional * 'heal' argument. 
* * The heal arguments tells us to heal the provided snapshot using the provided * send stream */ int lzc_receive_with_heal(const char *snapname, nvlist_t *props, nvlist_t *cmdprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, boolean_t force, boolean_t heal, boolean_t resumable, boolean_t raw, int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, nvlist_t **errors) { (void) action_handle, (void) cleanup_fd; return (recv_impl(snapname, props, cmdprops, wkeydata, wkeylen, origin, force, heal, resumable, raw, input_fd, begin_record, read_bytes, errflags, errors)); } /* * Roll back this filesystem or volume to its most recent snapshot. * If snapnamebuf is not NULL, it will be filled in with the name * of the most recent snapshot. * Note that the latest snapshot may change if a new one is concurrently * created or the current one is destroyed. lzc_rollback_to can be used * to roll back to a specific latest snapshot. * * Return 0 on success or an errno on failure. */ int lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen) { nvlist_t *args; nvlist_t *result; int err; args = fnvlist_alloc(); err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result); nvlist_free(args); if (err == 0 && snapnamebuf != NULL) { const char *snapname = fnvlist_lookup_string(result, "target"); (void) strlcpy(snapnamebuf, snapname, snapnamelen); } nvlist_free(result); return (err); } /* * Roll back this filesystem or volume to the specified snapshot, * if possible. * * Return 0 on success or an errno on failure. */ int lzc_rollback_to(const char *fsname, const char *snapname) { nvlist_t *args; nvlist_t *result; int err; args = fnvlist_alloc(); fnvlist_add_string(args, "target", snapname); err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result); nvlist_free(args); nvlist_free(result); return (err); } /* * Creates new bookmarks from existing snapshot or bookmark. * * The bookmarks nvlist maps from the full name of the new bookmark to * the full name of the source snapshot or bookmark. * All the bookmarks and snapshots must be in the same pool. * The new bookmarks names must be unique. * => see function dsl_bookmark_create_nvl_validate * * The returned results nvlist will have an entry for each bookmark that failed. * The value will be the (int32) error code. * * The return value will be 0 if all bookmarks were created, otherwise it will * be the errno of a (undetermined) bookmarks that failed. */ int lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist) { nvpair_t *elem; int error; char pool[ZFS_MAX_DATASET_NAME_LEN]; /* determine pool name from first bookmark */ elem = nvlist_next_nvpair(bookmarks, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/#")] = '\0'; error = lzc_ioctl(ZFS_IOC_BOOKMARK, pool, bookmarks, errlist); return (error); } /* * Retrieve bookmarks. * * Retrieve the list of bookmarks for the given file system. The props * parameter is an nvlist of property names (with no values) that will be * returned for each bookmark. 
* * The following are valid properties on bookmarks, most of which are numbers * (represented as uint64 in the nvlist), except redact_snaps, which is a * uint64 array, and redact_complete, which is a boolean * * "guid" - globally unique identifier of the snapshot it refers to * "createtxg" - txg when the snapshot it refers to was created * "creation" - timestamp when the snapshot it refers to was created * "ivsetguid" - IVset guid for identifying encrypted snapshots * "redact_snaps" - list of guids of the redaction snapshots for the specified * bookmark. If the bookmark is not a redaction bookmark, the nvlist will * not contain an entry for this value. If it is redacted with respect to * no snapshots, it will contain value -> NULL uint64 array * "redact_complete" - boolean value; true if the redaction bookmark is * complete, false otherwise. * * The format of the returned nvlist as follows: * -> { * -> { * "value" -> uint64 * } * ... * "redact_snaps" -> { * "value" -> uint64 array * } * "redact_complete" -> { * "value" -> boolean value * } * } */ int lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks) { return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks)); } /* * Get bookmark properties. * * Given a bookmark's full name, retrieve all properties for the bookmark. * * The format of the returned property list is as follows: * { * -> { * "value" -> uint64 * } * ... * "redact_snaps" -> { * "value" -> uint64 array * } */ int lzc_get_bookmark_props(const char *bookmark, nvlist_t **props) { int error; nvlist_t *innvl = fnvlist_alloc(); error = lzc_ioctl(ZFS_IOC_GET_BOOKMARK_PROPS, bookmark, innvl, props); fnvlist_free(innvl); return (error); } /* * Destroys bookmarks. * * The keys in the bmarks nvlist are the bookmarks to be destroyed. * They must all be in the same pool. Bookmarks are specified as * #. * * Bookmarks that do not exist will be silently ignored. * * The return value will be 0 if all bookmarks that existed were destroyed. * * Otherwise the return value will be the errno of a (undetermined) bookmark * that failed, no bookmarks will be destroyed, and the errlist will have an * entry for each bookmarks that failed. The value in the errlist will be * the (int32) error code. */ int lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist) { nvpair_t *elem; int error; char pool[ZFS_MAX_DATASET_NAME_LEN]; /* determine the pool name */ elem = nvlist_next_nvpair(bmarks, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/#")] = '\0'; error = lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, pool, bmarks, errlist); return (error); } static int lzc_channel_program_impl(const char *pool, const char *program, boolean_t sync, uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) { int error; nvlist_t *args; args = fnvlist_alloc(); fnvlist_add_string(args, ZCP_ARG_PROGRAM, program); fnvlist_add_nvlist(args, ZCP_ARG_ARGLIST, argnvl); fnvlist_add_boolean_value(args, ZCP_ARG_SYNC, sync); fnvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT, instrlimit); fnvlist_add_uint64(args, ZCP_ARG_MEMLIMIT, memlimit); error = lzc_ioctl(ZFS_IOC_CHANNEL_PROGRAM, pool, args, outnvl); fnvlist_free(args); return (error); } /* * Executes a channel program. * * If this function returns 0 the channel program was successfully loaded and * ran without failing. 
Note that individual commands the channel program ran * may have failed and the channel program is responsible for reporting such * errors through outnvl if they are important. * * This method may also return: * * EINVAL The program contains syntax errors, or an invalid memory or time * limit was given. No part of the channel program was executed. * If caused by syntax errors, 'outnvl' contains information about the * errors. * * ECHRNG The program was executed, but encountered a runtime error, such as * calling a function with incorrect arguments, invoking the error() * function directly, failing an assert() command, etc. Some portion * of the channel program may have executed and committed changes. * Information about the failure can be found in 'outnvl'. * * ENOMEM The program fully executed, but the output buffer was not large * enough to store the returned value. No output is returned through * 'outnvl'. * * ENOSPC The program was terminated because it exceeded its memory usage * limit. Some portion of the channel program may have executed and * committed changes to disk. No output is returned through 'outnvl'. * * ETIME The program was terminated because it exceeded its Lua instruction * limit. Some portion of the channel program may have executed and * committed changes to disk. No output is returned through 'outnvl'. */ int lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) { return (lzc_channel_program_impl(pool, program, B_TRUE, instrlimit, memlimit, argnvl, outnvl)); } /* * Creates a checkpoint for the specified pool. * * If this function returns 0 the pool was successfully checkpointed. * * This method may also return: * * ZFS_ERR_CHECKPOINT_EXISTS * The pool already has a checkpoint. A pools can only have one * checkpoint at most, at any given time. * * ZFS_ERR_DISCARDING_CHECKPOINT * ZFS is in the middle of discarding a checkpoint for this pool. * The pool can be checkpointed again once the discard is done. * * ZFS_DEVRM_IN_PROGRESS * A vdev is currently being removed. The pool cannot be * checkpointed until the device removal is done. * * ZFS_VDEV_TOO_BIG * One or more top-level vdevs exceed the maximum vdev size * supported for this feature. */ int lzc_pool_checkpoint(const char *pool) { int error; nvlist_t *result = NULL; nvlist_t *args = fnvlist_alloc(); error = lzc_ioctl(ZFS_IOC_POOL_CHECKPOINT, pool, args, &result); fnvlist_free(args); fnvlist_free(result); return (error); } /* * Discard the checkpoint from the specified pool. * * If this function returns 0 the checkpoint was successfully discarded. * * This method may also return: * * ZFS_ERR_NO_CHECKPOINT * The pool does not have a checkpoint. * * ZFS_ERR_DISCARDING_CHECKPOINT * ZFS is already in the middle of discarding the checkpoint. */ int lzc_pool_checkpoint_discard(const char *pool) { int error; nvlist_t *result = NULL; nvlist_t *args = fnvlist_alloc(); error = lzc_ioctl(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, args, &result); fnvlist_free(args); fnvlist_free(result); return (error); } /* * Load the requested data type for the specified pool. */ int lzc_pool_prefetch(const char *pool, zpool_prefetch_type_t type) { int error; nvlist_t *result = NULL; nvlist_t *args = fnvlist_alloc(); fnvlist_add_int32(args, ZPOOL_PREFETCH_TYPE, type); error = lzc_ioctl(ZFS_IOC_POOL_PREFETCH, pool, args, &result); fnvlist_free(args); fnvlist_free(result); return (error); } /* * Executes a read-only channel program. 
* * A read-only channel program works programmatically the same way as a * normal channel program executed with lzc_channel_program(). The only * difference is it runs exclusively in open-context and therefore can * return faster. The downside to that, is that the program cannot change * on-disk state by calling functions from the zfs.sync submodule. * * The return values of this function (and their meaning) are exactly the * same as the ones described in lzc_channel_program(). */ int lzc_channel_program_nosync(const char *pool, const char *program, uint64_t timeout, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) { return (lzc_channel_program_impl(pool, program, B_FALSE, timeout, memlimit, argnvl, outnvl)); } int lzc_get_vdev_prop(const char *poolname, nvlist_t *innvl, nvlist_t **outnvl) { return (lzc_ioctl(ZFS_IOC_VDEV_GET_PROPS, poolname, innvl, outnvl)); } int lzc_set_vdev_prop(const char *poolname, nvlist_t *innvl, nvlist_t **outnvl) { return (lzc_ioctl(ZFS_IOC_VDEV_SET_PROPS, poolname, innvl, outnvl)); } /* * Performs key management functions * * crypto_cmd should be a value from dcp_cmd_t. If the command specifies to * load or change a wrapping key, the key should be specified in the * hidden_args nvlist so that it is not logged. */ int lzc_load_key(const char *fsname, boolean_t noop, uint8_t *wkeydata, uint_t wkeylen) { int error; nvlist_t *ioc_args; nvlist_t *hidden_args; if (wkeydata == NULL) return (EINVAL); ioc_args = fnvlist_alloc(); hidden_args = fnvlist_alloc(); fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen); fnvlist_add_nvlist(ioc_args, ZPOOL_HIDDEN_ARGS, hidden_args); if (noop) fnvlist_add_boolean(ioc_args, "noop"); error = lzc_ioctl(ZFS_IOC_LOAD_KEY, fsname, ioc_args, NULL); nvlist_free(hidden_args); nvlist_free(ioc_args); return (error); } int lzc_unload_key(const char *fsname) { return (lzc_ioctl(ZFS_IOC_UNLOAD_KEY, fsname, NULL, NULL)); } int lzc_change_key(const char *fsname, uint64_t crypt_cmd, nvlist_t *props, uint8_t *wkeydata, uint_t wkeylen) { int error; nvlist_t *ioc_args = fnvlist_alloc(); nvlist_t *hidden_args = NULL; fnvlist_add_uint64(ioc_args, "crypt_cmd", crypt_cmd); if (wkeydata != NULL) { hidden_args = fnvlist_alloc(); fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen); fnvlist_add_nvlist(ioc_args, ZPOOL_HIDDEN_ARGS, hidden_args); } if (props != NULL) fnvlist_add_nvlist(ioc_args, "props", props); error = lzc_ioctl(ZFS_IOC_CHANGE_KEY, fsname, ioc_args, NULL); nvlist_free(hidden_args); nvlist_free(ioc_args); return (error); } int lzc_reopen(const char *pool_name, boolean_t scrub_restart) { nvlist_t *args = fnvlist_alloc(); int error; fnvlist_add_boolean_value(args, "scrub_restart", scrub_restart); error = lzc_ioctl(ZFS_IOC_POOL_REOPEN, pool_name, args, NULL); nvlist_free(args); return (error); } /* * Changes initializing state. * * vdevs should be a list of (, guid) where guid is a uint64 vdev GUID. * The key is ignored. * * If there are errors related to vdev arguments, per-vdev errors are returned * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where * guid is stringified with PRIu64, and errno is one of the following as * an int64_t: * - ENODEV if the device was not found * - EINVAL if the devices is not a leaf or is not concrete (e.g. 
missing) * - EROFS if the device is not writeable * - EBUSY start requested but the device is already being either * initialized or trimmed * - ESRCH cancel/suspend requested but device is not being initialized * * If the errlist is empty, then return value will be: * - EINVAL if one or more arguments was invalid * - Other spa_open failures * - 0 if the operation succeeded */ int lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type, nvlist_t *vdevs, nvlist_t **errlist) { int error; nvlist_t *args = fnvlist_alloc(); fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type); fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs); error = lzc_ioctl(ZFS_IOC_POOL_INITIALIZE, poolname, args, errlist); fnvlist_free(args); return (error); } /* * Changes TRIM state. * * vdevs should be a list of (, guid) where guid is a uint64 vdev GUID. * The key is ignored. * * If there are errors related to vdev arguments, per-vdev errors are returned * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where * guid is stringified with PRIu64, and errno is one of the following as * an int64_t: * - ENODEV if the device was not found * - EINVAL if the devices is not a leaf or is not concrete (e.g. missing) * - EROFS if the device is not writeable * - EBUSY start requested but the device is already being either trimmed * or initialized * - ESRCH cancel/suspend requested but device is not being initialized * - EOPNOTSUPP if the device does not support TRIM (or secure TRIM) * * If the errlist is empty, then return value will be: * - EINVAL if one or more arguments was invalid * - Other spa_open failures * - 0 if the operation succeeded */ int lzc_trim(const char *poolname, pool_trim_func_t cmd_type, uint64_t rate, boolean_t secure, nvlist_t *vdevs, nvlist_t **errlist) { int error; nvlist_t *args = fnvlist_alloc(); fnvlist_add_uint64(args, ZPOOL_TRIM_COMMAND, (uint64_t)cmd_type); fnvlist_add_nvlist(args, ZPOOL_TRIM_VDEVS, vdevs); fnvlist_add_uint64(args, ZPOOL_TRIM_RATE, rate); fnvlist_add_boolean_value(args, ZPOOL_TRIM_SECURE, secure); error = lzc_ioctl(ZFS_IOC_POOL_TRIM, poolname, args, errlist); fnvlist_free(args); return (error); } /* * Create a redaction bookmark named bookname by redacting snapshot with respect * to all the snapshots in snapnv. 
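 *
 * Illustrative sketch only (hypothetical snapshot and bookmark names,
 * error handling omitted):
 *
 *	nvlist_t *snapnv = fnvlist_alloc();
 *	fnvlist_add_boolean(snapnv, "tank/fs@no_secrets");
 *	int err = lzc_redact("tank/fs@full", "book1", snapnv);
 *	fnvlist_free(snapnv);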
*/ int lzc_redact(const char *snapshot, const char *bookname, nvlist_t *snapnv) { nvlist_t *args = fnvlist_alloc(); fnvlist_add_string(args, "bookname", bookname); fnvlist_add_nvlist(args, "snapnv", snapnv); int error = lzc_ioctl(ZFS_IOC_REDACT, snapshot, args, NULL); fnvlist_free(args); return (error); } static int wait_common(const char *pool, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *waited) { nvlist_t *args = fnvlist_alloc(); nvlist_t *result = NULL; fnvlist_add_int32(args, ZPOOL_WAIT_ACTIVITY, activity); if (use_tag) fnvlist_add_uint64(args, ZPOOL_WAIT_TAG, tag); int error = lzc_ioctl(ZFS_IOC_WAIT, pool, args, &result); if (error == 0 && waited != NULL) *waited = fnvlist_lookup_boolean_value(result, ZPOOL_WAIT_WAITED); fnvlist_free(args); fnvlist_free(result); return (error); } int lzc_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) { return (wait_common(pool, activity, B_FALSE, 0, waited)); } int lzc_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited) { return (wait_common(pool, activity, B_TRUE, tag, waited)); } int lzc_wait_fs(const char *fs, zfs_wait_activity_t activity, boolean_t *waited) { nvlist_t *args = fnvlist_alloc(); nvlist_t *result = NULL; fnvlist_add_int32(args, ZFS_WAIT_ACTIVITY, activity); int error = lzc_ioctl(ZFS_IOC_WAIT_FS, fs, args, &result); if (error == 0 && waited != NULL) *waited = fnvlist_lookup_boolean_value(result, ZFS_WAIT_WAITED); fnvlist_free(args); fnvlist_free(result); return (error); } /* * Set the bootenv contents for the given pool. */ int lzc_set_bootenv(const char *pool, const nvlist_t *env) { return (lzc_ioctl(ZFS_IOC_SET_BOOTENV, pool, (nvlist_t *)env, NULL)); } /* * Get the contents of the bootenv of the given pool. */ int lzc_get_bootenv(const char *pool, nvlist_t **outnvl) { return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl)); } + +/* + * Prune the specified amount from the pool's dedup table. 
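+ *
+ * Illustrative sketch only (the pool name is hypothetical, and the unit
+ * value is assumed to be the percentage-based enumerator from
+ * zpool_ddt_prune_unit_t):
+ *
+ *	int err = lzc_ddt_prune("tank", ZPOOL_DDT_PRUNE_PERCENTAGE, 10);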
+ */ +int +lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount) +{ + int error; + + nvlist_t *result = NULL; + nvlist_t *args = fnvlist_alloc(); + + fnvlist_add_int32(args, DDT_PRUNE_UNIT, unit); + fnvlist_add_uint64(args, DDT_PRUNE_AMOUNT, amount); + + error = lzc_ioctl(ZFS_IOC_DDT_PRUNE, pool, args, &result); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} diff --git a/man/Makefile.am b/man/Makefile.am index 194bb4721619..fde704933764 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -1,137 +1,138 @@ dist_noinst_man_MANS = \ %D%/man1/cstyle.1 dist_man_MANS = \ %D%/man1/arcstat.1 \ %D%/man1/raidz_test.1 \ %D%/man1/test-runner.1 \ %D%/man1/zhack.1 \ %D%/man1/ztest.1 \ %D%/man1/zvol_wait.1 \ \ %D%/man5/vdev_id.conf.5 \ \ %D%/man4/spl.4 \ %D%/man4/zfs.4 \ \ %D%/man7/dracut.zfs.7 \ %D%/man7/vdevprops.7 \ %D%/man7/zfsconcepts.7 \ %D%/man7/zfsprops.7 \ %D%/man7/zpool-features.7 \ %D%/man7/zpoolconcepts.7 \ %D%/man7/zpoolprops.7 \ \ %D%/man8/fsck.zfs.8 \ %D%/man8/mount.zfs.8 \ %D%/man8/vdev_id.8 \ %D%/man8/zdb.8 \ %D%/man8/zfs.8 \ %D%/man8/zfs-allow.8 \ %D%/man8/zfs-bookmark.8 \ %D%/man8/zfs-change-key.8 \ %D%/man8/zfs-clone.8 \ %D%/man8/zfs-create.8 \ %D%/man8/zfs-destroy.8 \ %D%/man8/zfs-diff.8 \ %D%/man8/zfs-get.8 \ %D%/man8/zfs-groupspace.8 \ %D%/man8/zfs-hold.8 \ %D%/man8/zfs-inherit.8 \ %D%/man8/zfs-list.8 \ %D%/man8/zfs-load-key.8 \ %D%/man8/zfs-mount.8 \ %D%/man8/zfs-program.8 \ %D%/man8/zfs-project.8 \ %D%/man8/zfs-projectspace.8 \ %D%/man8/zfs-promote.8 \ %D%/man8/zfs-receive.8 \ %D%/man8/zfs-recv.8 \ %D%/man8/zfs-redact.8 \ %D%/man8/zfs-release.8 \ %D%/man8/zfs-rename.8 \ %D%/man8/zfs-rollback.8 \ %D%/man8/zfs-send.8 \ %D%/man8/zfs-set.8 \ %D%/man8/zfs-share.8 \ %D%/man8/zfs-snapshot.8 \ %D%/man8/zfs-unallow.8 \ %D%/man8/zfs-unload-key.8 \ %D%/man8/zfs-unmount.8 \ %D%/man8/zfs-upgrade.8 \ %D%/man8/zfs-userspace.8 \ %D%/man8/zfs-wait.8 \ %D%/man8/zfs_ids_to_path.8 \ %D%/man8/zgenhostid.8 \ %D%/man8/zinject.8 \ %D%/man8/zpool.8 \ %D%/man8/zpool-add.8 \ %D%/man8/zpool-attach.8 \ %D%/man8/zpool-checkpoint.8 \ %D%/man8/zpool-clear.8 \ %D%/man8/zpool-create.8 \ %D%/man8/zpool-destroy.8 \ %D%/man8/zpool-detach.8 \ + %D%/man8/zpool-ddtprune.8 \ %D%/man8/zpool-events.8 \ %D%/man8/zpool-export.8 \ %D%/man8/zpool-get.8 \ %D%/man8/zpool-history.8 \ %D%/man8/zpool-import.8 \ %D%/man8/zpool-initialize.8 \ %D%/man8/zpool-iostat.8 \ %D%/man8/zpool-labelclear.8 \ %D%/man8/zpool-list.8 \ %D%/man8/zpool-offline.8 \ %D%/man8/zpool-online.8 \ %D%/man8/zpool-prefetch.8 \ %D%/man8/zpool-reguid.8 \ %D%/man8/zpool-remove.8 \ %D%/man8/zpool-reopen.8 \ %D%/man8/zpool-replace.8 \ %D%/man8/zpool-resilver.8 \ %D%/man8/zpool-scrub.8 \ %D%/man8/zpool-set.8 \ %D%/man8/zpool-split.8 \ %D%/man8/zpool-status.8 \ %D%/man8/zpool-sync.8 \ %D%/man8/zpool-trim.8 \ %D%/man8/zpool-upgrade.8 \ %D%/man8/zpool-wait.8 \ %D%/man8/zstream.8 \ %D%/man8/zstreamdump.8 \ %D%/man8/zpool_influxdb.8 if BUILD_FREEBSD dist_man_MANS += \ %D%/man8/zfs-jail.8 \ %D%/man8/zfs-unjail.8 endif if BUILD_LINUX dist_man_MANS += \ %D%/man8/zfs-unzone.8 \ %D%/man8/zfs-zone.8 endif nodist_man_MANS = \ %D%/man8/zed.8 \ %D%/man8/zfs-mount-generator.8 \ %D%/man8/zfs_prepare_disk.8 dist_noinst_DATA += $(dist_noinst_man_MANS) $(dist_man_MANS) SUBSTFILES += $(nodist_man_MANS) CHECKS += mancheck mancheck: $(top_srcdir)/scripts/mancheck.sh $(srcdir)/%D% if BUILD_LINUX # The manual pager in most Linux distros defaults to "BSD" when .Os is blank, # but leaving it blank makes things a lot easier on # FreeBSD when 
OpenZFS is vendored in the base system. INSTALL_DATA_HOOKS += man-install-data-hook man-install-data-hook: cd $(DESTDIR)$(mandir) && $(SED) $(ac_inplace) 's/^\.Os$$/.Os OpenZFS/' $(subst %D%/,,$(dist_man_MANS) $(nodist_man_MANS)) endif diff --git a/man/man8/zpool-ddtprune.8 b/man/man8/zpool-ddtprune.8 new file mode 100644 index 000000000000..1ab7d3982c3e --- /dev/null +++ b/man/man8/zpool-ddtprune.8 @@ -0,0 +1,48 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2024, Klara Inc. +.\" +.Dd June 17, 2024 +.Dt ZPOOL-DDTPRUNE 8 +.Os +. +.Sh NAME +.Nm zpool-ddtprune +.Nd Prunes the oldest entries from the single reference dedup table(s) +.Sh SYNOPSIS +.Nm zpool +.Cm ddtprune +.Fl d Ar days | Fl p Ar percentage +.Ar pool +.Sh DESCRIPTION +This command prunes older unique entries from the dedup table. +As a complement to the dedup quota feature, +.Sy ddtprune +allows removal of older non-duplicate entries to make room for +newer duplicate entries. +.Pp +The amount to prune can be based on a target percentage of the unique entries +or based on the age (i.e., every unique entry older than N days). +. +.Sh SEE ALSO +.Xr zdb 8 , +.Xr zpool-status 8 diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index c55644d9ecea..02a258f66708 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -1,621 +1,622 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" .Dd February 14, 2024 .Dt ZPOOL 8 .Os . .Sh NAME .Nm zpool .Nd configure ZFS storage pools .Sh SYNOPSIS .Nm .Fl ?V .Nm .Cm version .Op Fl j .Nm .Cm subcommand .Op Ar arguments . 
.Sh DESCRIPTION The .Nm command configures ZFS storage pools. A storage pool is a collection of devices that provides physical storage and data replication for ZFS datasets. All datasets within a storage pool share the same space. See .Xr zfs 8 for information on managing datasets. .Pp For an overview of creating and managing ZFS storage pools see the .Xr zpoolconcepts 7 manual page. . .Sh SUBCOMMANDS All subcommands that modify state are logged persistently to the pool in their original form. .Pp The .Nm command provides subcommands to create and destroy storage pools, add capacity to storage pools, and provide information about the storage pools. The following subcommands are supported: .Bl -tag -width Ds .It Xo .Nm .Fl ?\& .Xc Displays a help message. .It Xo .Nm .Fl V , -version .Xc .It Xo .Nm .Cm version .Op Fl j .Xc Displays the software version of the .Nm userland utility and the ZFS kernel module. Use .Fl j option to output in JSON format. .El . .Ss Creation .Bl -tag -width Ds .It Xr zpool-create 8 Creates a new storage pool containing the virtual devices specified on the command line. .It Xr zpool-initialize 8 Begins initializing by writing to all unallocated regions on the specified devices, or all eligible devices in the pool if no individual devices are specified. .El . .Ss Destruction .Bl -tag -width Ds .It Xr zpool-destroy 8 Destroys the given pool, freeing up any devices for other use. .It Xr zpool-labelclear 8 Removes ZFS label information from the specified .Ar device . .El . .Ss Virtual Devices .Bl -tag -width Ds .It Xo .Xr zpool-attach 8 Ns / Ns Xr zpool-detach 8 .Xc Converts a non-redundant disk into a mirror, or increases the redundancy level of an existing mirror .Cm ( attach Ns ), or performs the inverse operation ( .Cm detach Ns ). .It Xo .Xr zpool-add 8 Ns / Ns Xr zpool-remove 8 .Xc Adds the specified virtual devices to the given pool, or removes the specified device from the pool. .It Xr zpool-replace 8 Replaces an existing device (which may be faulted) with a new one. .It Xr zpool-split 8 Creates a new pool by splitting all mirrors in an existing pool (which decreases its redundancy). .El . .Ss Properties Available pool properties listed in the .Xr zpoolprops 7 manual page. .Bl -tag -width Ds .It Xr zpool-list 8 Lists the given pools along with a health status and space usage. .It Xo .Xr zpool-get 8 Ns / Ns Xr zpool-set 8 .Xc Retrieves the given list of properties .Po or all properties if .Sy all is used .Pc for the specified storage pool(s). .El . .Ss Monitoring .Bl -tag -width Ds .It Xr zpool-status 8 Displays the detailed health status for the given pools. .It Xr zpool-iostat 8 Displays logical I/O statistics for the given pools/vdevs. Physical I/O operations may be observed via .Xr iostat 1 . .It Xr zpool-events 8 Lists all recent events generated by the ZFS kernel modules. These events are consumed by the .Xr zed 8 and used to automate administrative tasks such as replacing a failed device with a hot spare. That manual page also describes the subclasses and event payloads that can be generated. .It Xr zpool-history 8 Displays the command history of the specified pool(s) or all pools if no pool is specified. .El . .Ss Maintenance .Bl -tag -width Ds .It Xr zpool-prefetch 8 Prefetches specific types of pool data. .It Xr zpool-scrub 8 Begins a scrub or resumes a paused scrub. .It Xr zpool-checkpoint 8 Checkpoints the current state of .Ar pool , which can be later restored by .Nm zpool Cm import Fl -rewind-to-checkpoint . 
.It Xr zpool-trim 8 Initiates an immediate on-demand TRIM operation for all of the free space in a pool. This operation informs the underlying storage devices of all blocks in the pool which are no longer allocated and allows thinly provisioned devices to reclaim the space. .It Xr zpool-sync 8 This command forces all in-core dirty data to be written to the primary pool storage and not the ZIL. It will also update administrative information including quota reporting. Without arguments, .Nm zpool Cm sync will sync all pools on the system. Otherwise, it will sync only the specified pool(s). .It Xr zpool-upgrade 8 Manage the on-disk format version of storage pools. .It Xr zpool-wait 8 Waits until all background activity of the given types has ceased in the given pool. .El . .Ss Fault Resolution .Bl -tag -width Ds .It Xo .Xr zpool-offline 8 Ns / Ns Xr zpool-online 8 .Xc Takes the specified physical device offline or brings it online. .It Xr zpool-resilver 8 Starts a resilver. If an existing resilver is already running it will be restarted from the beginning. .It Xr zpool-reopen 8 Reopen all the vdevs associated with the pool. .It Xr zpool-clear 8 Clears device errors in a pool. .El . .Ss Import & Export .Bl -tag -width Ds .It Xr zpool-import 8 Make disks containing ZFS storage pools available for use on the system. .It Xr zpool-export 8 Exports the given pools from the system. .It Xr zpool-reguid 8 Generates a new unique identifier for the pool. .El . .Sh EXIT STATUS The following exit values are returned: .Bl -tag -compact -offset 4n -width "a" .It Sy 0 Successful completion. .It Sy 1 An error occurred. .It Sy 2 Invalid command line options were specified. .El . .Sh EXAMPLES .\" Examples 1, 2, 3, 4, 12, 13 are shared with zpool-create.8. .\" Examples 6, 14 are shared with zpool-add.8. .\" Examples 7, 16 are shared with zpool-list.8. .\" Examples 8 are shared with zpool-destroy.8. .\" Examples 9 are shared with zpool-export.8. .\" Examples 10 are shared with zpool-import.8. .\" Examples 11 are shared with zpool-upgrade.8. .\" Examples 15 are shared with zpool-remove.8. .\" Examples 17 are shared with zpool-status.8. .\" Examples 14, 17 are also shared with zpool-iostat.8. .\" Make sure to update them omnidirectionally .Ss Example 1 : No Creating a RAID-Z Storage Pool The following command creates a pool with a single raidz root vdev that consists of six disks: .Dl # Nm zpool Cm create Ar tank Sy raidz Pa sda sdb sdc sdd sde sdf . .Ss Example 2 : No Creating a Mirrored Storage Pool The following command creates a pool with two mirrors, where each mirror contains two disks: .Dl # Nm zpool Cm create Ar tank Sy mirror Pa sda sdb Sy mirror Pa sdc sdd . .Ss Example 3 : No Creating a ZFS Storage Pool by Using Partitions The following command creates a non-redundant pool using two disk partitions: .Dl # Nm zpool Cm create Ar tank Pa sda1 sdb2 . .Ss Example 4 : No Creating a ZFS Storage Pool by Using Files The following command creates a non-redundant pool using files. While not recommended, a pool based on files can be useful for experimental purposes. .Dl # Nm zpool Cm create Ar tank Pa /path/to/file/a /path/to/file/b . .Ss Example 5 : No Making a non-mirrored ZFS Storage Pool mirrored The following command converts an existing single device .Ar sda into a mirror by attaching a second device to it, .Ar sdb . .Dl # Nm zpool Cm attach Ar tank Pa sda sdb . 
.Ss Example 6 : No Adding a Mirror to a ZFS Storage Pool The following command adds two mirrored disks to the pool .Ar tank , assuming the pool is already made up of two-way mirrors. The additional space is immediately available to any datasets within the pool. .Dl # Nm zpool Cm add Ar tank Sy mirror Pa sda sdb . .Ss Example 7 : No Listing Available ZFS Storage Pools The following command lists all available pools on the system. In this case, the pool .Ar zion is faulted due to a missing device. The results from this command are similar to the following: .Bd -literal -compact -offset Ds .No # Nm zpool Cm list NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT rpool 19.9G 8.43G 11.4G - 33% 42% 1.00x ONLINE - tank 61.5G 20.0G 41.5G - 48% 32% 1.00x ONLINE - zion - - - - - - - FAULTED - .Ed . .Ss Example 8 : No Destroying a ZFS Storage Pool The following command destroys the pool .Ar tank and any datasets contained within: .Dl # Nm zpool Cm destroy Fl f Ar tank . .Ss Example 9 : No Exporting a ZFS Storage Pool The following command exports the devices in pool .Ar tank so that they can be relocated or later imported: .Dl # Nm zpool Cm export Ar tank . .Ss Example 10 : No Importing a ZFS Storage Pool The following command displays available pools, and then imports the pool .Ar tank for use on the system. The results from this command are similar to the following: .Bd -literal -compact -offset Ds .No # Nm zpool Cm import pool: tank id: 15451357997522795478 state: ONLINE action: The pool can be imported using its name or numeric identifier. config: tank ONLINE mirror ONLINE sda ONLINE sdb ONLINE .No # Nm zpool Cm import Ar tank .Ed . .Ss Example 11 : No Upgrading All ZFS Storage Pools to the Current Version The following command upgrades all ZFS Storage pools to the current version of the software: .Bd -literal -compact -offset Ds .No # Nm zpool Cm upgrade Fl a This system is currently running ZFS version 2. .Ed . .Ss Example 12 : No Managing Hot Spares The following command creates a new pool with an available hot spare: .Dl # Nm zpool Cm create Ar tank Sy mirror Pa sda sdb Sy spare Pa sdc .Pp If one of the disks were to fail, the pool would be reduced to the degraded state. The failed device can be replaced using the following command: .Dl # Nm zpool Cm replace Ar tank Pa sda sdd .Pp Once the data has been resilvered, the spare is automatically removed and is made available for use should another device fail. The hot spare can be permanently removed from the pool using the following command: .Dl # Nm zpool Cm remove Ar tank Pa sdc . .Ss Example 13 : No Creating a ZFS Pool with Mirrored Separate Intent Logs The following command creates a ZFS storage pool consisting of two, two-way mirrors and mirrored log devices: .Dl # Nm zpool Cm create Ar pool Sy mirror Pa sda sdb Sy mirror Pa sdc sdd Sy log mirror Pa sde sdf . .Ss Example 14 : No Adding Cache Devices to a ZFS Pool The following command adds two disks for use as cache devices to a ZFS storage pool: .Dl # Nm zpool Cm add Ar pool Sy cache Pa sdc sdd .Pp Once added, the cache devices gradually fill with content from main memory. Depending on the size of your cache devices, it could take over an hour for them to fill. Capacity and reads can be monitored using the .Cm iostat subcommand as follows: .Dl # Nm zpool Cm iostat Fl v Ar pool 5 . .Ss Example 15 : No Removing a Mirrored top-level (Log or Data) Device The following commands remove the mirrored log device .Sy mirror-2 and mirrored top-level data device .Sy mirror-1 . 
.Pp Given this configuration: .Bd -literal -compact -offset Ds pool: tank state: ONLINE scrub: none requested config: NAME STATE READ WRITE CKSUM tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 sda ONLINE 0 0 0 sdb ONLINE 0 0 0 mirror-1 ONLINE 0 0 0 sdc ONLINE 0 0 0 sdd ONLINE 0 0 0 logs mirror-2 ONLINE 0 0 0 sde ONLINE 0 0 0 sdf ONLINE 0 0 0 .Ed .Pp The command to remove the mirrored log .Ar mirror-2 No is : .Dl # Nm zpool Cm remove Ar tank mirror-2 .Pp The command to remove the mirrored data .Ar mirror-1 No is : .Dl # Nm zpool Cm remove Ar tank mirror-1 . .Ss Example 16 : No Displaying expanded space on a device The following command displays the detailed information for the pool .Ar data . This pool is comprised of a single raidz vdev where one of its devices increased its capacity by 10 GiB. In this example, the pool will not be able to utilize this extra capacity until all the devices under the raidz vdev have been expanded. .Bd -literal -compact -offset Ds .No # Nm zpool Cm list Fl v Ar data NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT data 23.9G 14.6G 9.30G - 48% 61% 1.00x ONLINE - raidz1 23.9G 14.6G 9.30G - 48% sda - - - - - sdb - - - 10G - sdc - - - - - .Ed . .Ss Example 17 : No Adding output columns Additional columns can be added to the .Nm zpool Cm status No and Nm zpool Cm iostat No output with Fl c . .Bd -literal -compact -offset Ds .No # Nm zpool Cm status Fl c Pa vendor , Ns Pa model , Ns Pa size NAME STATE READ WRITE CKSUM vendor model size tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 U1 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U10 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U11 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U12 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U13 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U14 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T .No # Nm zpool Cm iostat Fl vc Pa size capacity operations bandwidth pool alloc free read write read write size ---------- ----- ----- ----- ----- ----- ----- ---- rpool 14.6G 54.9G 4 55 250K 2.69M sda1 14.6G 54.9G 4 55 250K 2.69M 70G ---------- ----- ----- ----- ----- ----- ----- ---- .Ed . .Sh ENVIRONMENT VARIABLES .Bl -tag -compact -width "ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE" .It Sy ZFS_ABORT Cause .Nm to dump core on exit for the purposes of running .Sy ::findleaks . .It Sy ZFS_COLOR Use ANSI color in .Nm zpool Cm status and .Nm zpool Cm iostat output. .It Sy ZPOOL_AUTO_POWER_ON_SLOT Automatically attempt to turn on the drives enclosure slot power to a drive when running the .Nm zpool Cm online or .Nm zpool Cm clear commands. This has the same effect as passing the .Fl -power option to those commands. .It Sy ZPOOL_POWER_ON_SLOT_TIMEOUT_MS The maximum time in milliseconds to wait for a slot power sysfs value to return the correct value after writing it. For example, after writing "on" to the sysfs enclosure slot power_control file, it can take some time for the enclosure to power down the slot and return "on" if you read back the 'power_control' value. Defaults to 30 seconds (30000ms) if not set. .It Sy ZPOOL_IMPORT_PATH The search path for devices or files to use with the pool. This is a colon-separated list of directories in which .Nm looks for device nodes and files. Similar to the .Fl d option in .Nm zpool import . .It Sy ZPOOL_IMPORT_UDEV_TIMEOUT_MS The maximum time in milliseconds that .Nm zpool import will wait for an expected device to be available. .It Sy ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE If set, suppress warning about non-native vdev ashift in .Nm zpool Cm status . 
The value is not used, only the presence or absence of the variable matters. .It Sy ZPOOL_VDEV_NAME_GUID Cause .Nm subcommands to output vdev guids by default. This behavior is identical to the .Nm zpool Cm status Fl g command line option. .It Sy ZPOOL_VDEV_NAME_FOLLOW_LINKS Cause .Nm subcommands to follow links for vdev names by default. This behavior is identical to the .Nm zpool Cm status Fl L command line option. .It Sy ZPOOL_VDEV_NAME_PATH Cause .Nm subcommands to output full vdev path names by default. This behavior is identical to the .Nm zpool Cm status Fl P command line option. .It Sy ZFS_VDEV_DEVID_OPT_OUT Older OpenZFS implementations had issues when attempting to display pool config vdev names if a .Sy devid NVP value is present in the pool's config. .Pp For example, a pool that originated on illumos platform would have a .Sy devid value in the config and .Nm zpool Cm status would fail when listing the config. This would also be true for future Linux-based pools. .Pp A pool can be stripped of any .Sy devid values on import or prevented from adding them on .Nm zpool Cm create or .Nm zpool Cm add by setting .Sy ZFS_VDEV_DEVID_OPT_OUT . .Pp .It Sy ZPOOL_SCRIPTS_AS_ROOT Allow a privileged user to run .Nm zpool Cm status Ns / Ns Cm iostat Fl c . Normally, only unprivileged users are allowed to run .Fl c . .It Sy ZPOOL_SCRIPTS_PATH The search path for scripts when running .Nm zpool Cm status Ns / Ns Cm iostat Fl c . This is a colon-separated list of directories and overrides the default .Pa ~/.zpool.d and .Pa /etc/zfs/zpool.d search paths. .It Sy ZPOOL_SCRIPTS_ENABLED Allow a user to run .Nm zpool Cm status Ns / Ns Cm iostat Fl c . If .Sy ZPOOL_SCRIPTS_ENABLED is not set, it is assumed that the user is allowed to run .Nm zpool Cm status Ns / Ns Cm iostat Fl c . .\" Shared with zfs.8 .It Sy ZFS_MODULE_TIMEOUT Time, in seconds, to wait for .Pa /dev/zfs to appear. Defaults to .Sy 10 , max .Sy 600 Pq 10 minutes . If .Pf < Sy 0 , wait forever; if .Sy 0 , don't wait. .El . .Sh INTERFACE STABILITY .Sy Evolving . .Sh SEE ALSO .Xr zfs 4 , .Xr zpool-features 7 , .Xr zpoolconcepts 7 , .Xr zpoolprops 7 , .Xr zed 8 , .Xr zfs 8 , .Xr zpool-add 8 , .Xr zpool-attach 8 , .Xr zpool-checkpoint 8 , .Xr zpool-clear 8 , .Xr zpool-create 8 , +.Xr zpool-ddtprune 8 , .Xr zpool-destroy 8 , .Xr zpool-detach 8 , .Xr zpool-events 8 , .Xr zpool-export 8 , .Xr zpool-get 8 , .Xr zpool-history 8 , .Xr zpool-import 8 , .Xr zpool-initialize 8 , .Xr zpool-iostat 8 , .Xr zpool-labelclear 8 , .Xr zpool-list 8 , .Xr zpool-offline 8 , .Xr zpool-online 8 , .Xr zpool-prefetch 8 , .Xr zpool-reguid 8 , .Xr zpool-remove 8 , .Xr zpool-reopen 8 , .Xr zpool-replace 8 , .Xr zpool-resilver 8 , .Xr zpool-scrub 8 , .Xr zpool-set 8 , .Xr zpool-split 8 , .Xr zpool-status 8 , .Xr zpool-sync 8 , .Xr zpool-trim 8 , .Xr zpool-upgrade 8 , .Xr zpool-wait 8 diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 11fd10fb769d..0e12e7e49828 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -1,2408 +1,2776 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2022 by Pawel Jakub Dawidek * Copyright (c) 2019, 2023, Klara Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * # DDT: Deduplication tables * * The dedup subsystem provides block-level deduplication. When enabled, blocks * to be written will have the dedup (D) bit set, which causes them to be * tracked in a "dedup table", or DDT. If a block has been seen before (exists * in the DDT), instead of being written, it will instead be made to reference * the existing on-disk data, and a refcount bumped in the DDT instead. * * ## Dedup tables and entries * * Conceptually, a DDT is a dictionary or map. Each entry has a "key" * (ddt_key_t) made up a block's checksum and certian properties, and a "value" * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth * time and refcount. Together these are enough to track references to a * specific block, to build a valid block pointer to reference that block (for * freeing, scrubbing, etc), and to fill a new block pointer with the missing * pieces to make it seem like it was written. * * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[]. * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk * object data formats, each with their own implementations) and "classes" * (ddt_class_t, instance of a storage type object, for entries with a specific * characteristic). An entry (key) will only ever exist on one of these objects * at any given time, but may be moved from one to another if their type or * class changes. * * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block * is to be written, before DVAs have been allocated, ddt_lookup() is called to * see if the block has been seen before. If its not found, the write proceeds * as normal, and after it succeeds, a new entry is created. If it is found, we * fill the BP with the DVAs from the entry, increment the refcount and cause * the write IO to return immediately. * * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup * block for the same content/checksum. The slot is selected based on the * zp_copies parameter the block is written with, that is, the number of DVAs * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for * now-removed "dedupditto" feature. These are no longer written, and will be * freed if encountered on old pools. * * If the "fast_dedup" feature is enabled, new dedup tables will be created * with the "flat phys" option. In this mode, there is only one ddt_phys_t * slot. If a write is issued for an entry that exists, but has fewer DVAs, * then only as many new DVAs are allocated and written to make up the * shortfall. The existing entry is then extended (ddt_phys_extend()) with the * new DVAs. * * ## Lifetime of an entry * * A DDT can be enormous, and typically is not held in memory all at once. 
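 *
 * (For a sense of scale, a sketch with assumed numbers: 10 TiB of unique
 * 128 KiB blocks is roughly 80 million entries, and at a couple of
 * hundred bytes of on-disk ZAP space per entry that is already tens of
 * GiB of table.)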
* Instead, the changes to an entry are tracked in memory, and written down to * disk at the end of each txg. * * A "live" in-memory entry (ddt_entry_t) is a node on the live tree * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is * required for IO, ddt_lookup() is called. If an entry already exists on * ddt_tree, it is returned. Otherwise, a new one is created, and the * type/class objects for the DDT are searched for that key. If its found, its * value is copied into the live entry. If not, an empty entry is created. * * The live entry will be modified during the txg, usually by modifying the * refcount, but sometimes by adding or updating DVAs. At the end of the txg * (during spa_sync()), type and class are recalculated for entry (see * ddt_sync_entry()), and the entry is written to the appropriate storage * object and (if necessary), removed from an old one. ddt_tree is cleared and * the next txg can start. * * ## Dedup quota * * A maximum size for all DDTs on the pool can be set with the * dedup_table_quota property. This is determined in ddt_over_quota() and * enforced during ddt_lookup(). If the pool is at or over its quota limit, * ddt_lookup() will only return entries for existing blocks, as updates are * still possible. New entries will not be created; instead, ddt_lookup() will * return NULL. In response, the DDT write stage (zio_ddt_write()) will remove * the D bit on the block and reissue the IO as a regular write. The block will * not be deduplicated. * * Note that this is based on the on-disk size of the dedup store. Reclaiming * this space after deleting entries relies on the ZAP "shrinking" behaviour, * without which, no space would be recovered and the DDT would continue to be * considered "over quota". See zap_shrink_enabled. * + * ## Dedup table pruning + * + * As a complement to the dedup quota feature, ddtprune allows removal of older + * non-duplicate entries to make room for newer duplicate entries. The amount + * to prune can be based on a target percentage of the unique entries or based + * on the age (i.e., prune unique entry older than N days). + * * ## Dedup log * * Historically, all entries modified on a txg were written back to dedup * storage objects at the end of every txg. This could cause significant * overheads, as each entry only takes up a tiny portion of a ZAP leaf node, * and so required reading the whole node, updating the entry, and writing it * back. On busy pools, this could add serious IO and memory overheads. * * To address this, the dedup log was added. If the "fast_dedup" feature is * enabled, at the end of each txg, modified entries will be copied to an * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the * same block is requested again, the in-memory object will be checked first, * and if its there, the entry inflated back onto the live tree without going * to storage. The on-disk log is only read at pool import time, to reload the * in-memory log. * * Each txg, some amount of the in-memory log will be flushed out to a DDT * storage object (ie ZAP) as normal. OpenZFS will try hard to flush enough to * keep up with the rate of change on dedup entries, but not so much that it * would impact overall throughput, and not using too much memory. See the * zfs_dedup_log_* tuneables in zfs(4) for more details. * * ## Repair IO * * If a read on a dedup block fails, but there are other copies of the block in * the other ddt_phys_t slots, reads will be issued for those instead * (zio_ddt_read_start()). 
If one of those succeeds, the read is returned to * the caller, and a copy is stashed on the entry's dde_repair_abd. * * During the end-of-txg sync, any entries with a dde_repair_abd get a * "rewrite" write issued for the original block pointer, with the data read * from the alternate block. If the block is actually damaged, this will invoke * the pool's "self-healing" mechanism, and repair the block. * * If the "fast_dedup" feature is enabled, the "flat phys" option will be in * use, so there is only ever one ddt_phys_t slot. The repair process will * still happen in this case, though it is unlikely to succeed as there will * usually be no other equivalent blocks to fall back on (though there might * be, if this was an early version of a dedup'd block that has since been * extended). * * Note that this repair mechanism is in addition to and separate from the * regular OpenZFS scrub and self-healing mechanisms. * * ## Scanning (scrub/resilver) * * If dedup is active, the scrub machinery will walk the dedup table first, and * scrub all blocks with refcnt > 1 first. After that it will move on to the * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them. * In this way, heavily deduplicated blocks are only scrubbed once. See the * commentary on dsl_scan_ddt() for more details. * * Walking the DDT is done via ddt_walk(). The current position is stored in a * ddt_bookmark_t, which represents a stable position in the storage object. * This bookmark is stored by the scan machinery, and must reference the same * position on the object even if the object changes, the pool is exported, or * OpenZFS is upgraded. * * If the "fast_dedup" feature is enabled and the table has a log, the scan * cannot begin until entries on the log are flushed, as the on-disk log has no * concept of a "stable position". Instead, the log flushing process will enter * a more aggressive mode, to flush out as much as is necesary as soon as * possible, in order to begin the scan as soon as possible. * * ## Interaction with block cloning * * If block cloning and dedup are both enabled on a pool, BRT will look for the * dedup bit on an incoming block pointer. If set, it will call into the DDT * (ddt_addref()) to add a reference to the block, instead of adding a * reference to the BRT. See brt_pending_apply(). */ /* * These are the only checksums valid for dedup. They must match the list * from dedup_table in zfs_prop.c */ #define DDT_CHECKSUM_VALID(c) \ (c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \ c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \ c == ZIO_CHECKSUM_BLAKE3) static kmem_cache_t *ddt_cache; static kmem_cache_t *ddt_entry_flat_cache; static kmem_cache_t *ddt_entry_trad_cache; #define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE) #define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE) #define DDT_ENTRY_SIZE(ddt) \ _DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE) /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. */ int zfs_dedup_prefetch = 0; /* * If the dedup class cannot satisfy a DDT allocation, treat as over quota * for this many TXGs. */ uint_t dedup_class_wait_txgs = 5; +/* + * How many DDT prune entries to add to the DDT sync AVL tree. + * Note these addtional entries have a memory footprint of a + * ddt_entry_t (216 bytes). 
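+ *
+ * As a rough worked example: at the default of 50000 entries per txg
+ * below, that is 50000 * 216 bytes, i.e. on the order of 10 MiB of
+ * extra memory while those prune entries sit on the tree.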
+ */ +static uint32_t zfs_ddt_prunes_per_txg = 50000; + +/* + * For testing, synthesize aged DDT entries + * (in global scope for ztest) + */ +boolean_t ddt_prune_artificial_age = B_FALSE; +boolean_t ddt_dump_prune_histogram = B_FALSE; /* * Don't do more than this many incremental flush passes per txg. */ uint_t zfs_dedup_log_flush_passes_max = 8; /* * Minimum time to flush per txg. */ uint_t zfs_dedup_log_flush_min_time_ms = 1000; /* * Minimum entries to flush per txg. */ uint_t zfs_dedup_log_flush_entries_min = 1000; /* * Number of txgs to average flow rates across. */ uint_t zfs_dedup_log_flush_flow_rate_txgs = 10; static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { &ddt_zap_ops, }; static const char *const ddt_class_name[DDT_CLASSES] = { "ditto", "duplicate", "unique", }; /* * DDT feature flags automatically enabled for each on-disk version. Note that * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled. */ static const uint64_t ddt_version_flags[] = { [DDT_VERSION_LEGACY] = 0, [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG, }; -/* Dummy version to signal that configure is still necessary */ -#define DDT_VERSION_UNCONFIGURED (UINT64_MAX) - -#ifdef _KERNEL /* per-DDT kstats */ typedef struct { /* total lookups and whether they returned new or existing entries */ kstat_named_t dds_lookup; kstat_named_t dds_lookup_new; kstat_named_t dds_lookup_existing; /* entries found on live tree, and if we had to wait for load */ kstat_named_t dds_lookup_live_hit; kstat_named_t dds_lookup_live_wait; kstat_named_t dds_lookup_live_miss; /* entries found on log trees */ kstat_named_t dds_lookup_log_hit; kstat_named_t dds_lookup_log_active_hit; kstat_named_t dds_lookup_log_flushing_hit; kstat_named_t dds_lookup_log_miss; /* entries found on store objects */ kstat_named_t dds_lookup_stored_hit; kstat_named_t dds_lookup_stored_miss; /* number of entries on log trees */ kstat_named_t dds_log_active_entries; kstat_named_t dds_log_flushing_entries; /* avg updated/flushed entries per txg */ kstat_named_t dds_log_ingest_rate; kstat_named_t dds_log_flush_rate; kstat_named_t dds_log_flush_time_rate; } ddt_kstats_t; static const ddt_kstats_t ddt_kstats_template = { { "lookup", KSTAT_DATA_UINT64 }, { "lookup_new", KSTAT_DATA_UINT64 }, { "lookup_existing", KSTAT_DATA_UINT64 }, { "lookup_live_hit", KSTAT_DATA_UINT64 }, { "lookup_live_wait", KSTAT_DATA_UINT64 }, { "lookup_live_miss", KSTAT_DATA_UINT64 }, { "lookup_log_hit", KSTAT_DATA_UINT64 }, { "lookup_log_active_hit", KSTAT_DATA_UINT64 }, { "lookup_log_flushing_hit", KSTAT_DATA_UINT64 }, { "lookup_log_miss", KSTAT_DATA_UINT64 }, { "lookup_stored_hit", KSTAT_DATA_UINT64 }, { "lookup_stored_miss", KSTAT_DATA_UINT64 }, { "log_active_entries", KSTAT_DATA_UINT64 }, { "log_flushing_entries", KSTAT_DATA_UINT64 }, { "log_ingest_rate", KSTAT_DATA_UINT32 }, { "log_flush_rate", KSTAT_DATA_UINT32 }, { "log_flush_time_rate", KSTAT_DATA_UINT32 }, }; +#ifdef _KERNEL #define _DDT_KSTAT_STAT(ddt, stat) \ &((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64 #define DDT_KSTAT_BUMP(ddt, stat) \ do { atomic_inc_64(_DDT_KSTAT_STAT(ddt, stat)); } while (0) #define DDT_KSTAT_ADD(ddt, stat, val) \ do { atomic_add_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) #define DDT_KSTAT_SUB(ddt, stat, val) \ do { atomic_sub_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) #define DDT_KSTAT_SET(ddt, stat, val) \ do { atomic_store_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) #define DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0) #else #define 
DDT_KSTAT_BUMP(ddt, stat) do {} while (0) #define DDT_KSTAT_ADD(ddt, stat, val) do {} while (0) #define DDT_KSTAT_SUB(ddt, stat, val) do {} while (0) #define DDT_KSTAT_SET(ddt, stat, val) do {} while (0) #define DDT_KSTAT_ZERO(ddt, stat) do {} while (0) #endif /* _KERNEL */ + static void ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; objset_t *os = ddt->ddt_os; uint64_t *objectp = &ddt->ddt_object[type][class]; boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP; char name[DDT_NAMELEN]; ASSERT3U(ddt->ddt_dir_object, >, 0); ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, ==, 0); VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); ASSERT3U(*objectp, !=, 0); ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, objectp, tx)); VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), &ddt->ddt_histogram[type][class], tx)); } static void ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; objset_t *os = ddt->ddt_os; uint64_t *objectp = &ddt->ddt_object[type][class]; uint64_t count; char name[DDT_NAMELEN]; ASSERT3U(ddt->ddt_dir_object, >, 0); ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, !=, 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); VERIFY0(ddt_object_count(ddt, type, class, &count)); VERIFY0(count); VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx)); VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); *objectp = 0; } static int ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; dmu_object_info_t doi; uint64_t count; char name[DDT_NAMELEN]; int error; if (ddt->ddt_dir_object == 0) { /* * If we're configured but the containing dir doesn't exist * yet, then this object can't possibly exist either. */ ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); return (SET_ERROR(ENOENT)); } ddt_object_name(ddt, type, class, name); error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); if (error != 0) return (error); error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), &ddt->ddt_histogram[type][class]); if (error != 0) return (error); /* * Seed the cached statistics. */ error = ddt_object_info(ddt, type, class, &doi); if (error) return (error); error = ddt_object_count(ddt, type, class, &count); if (error) return (error); ddo->ddo_count = count; ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; return (0); } static void ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; dmu_object_info_t doi; uint64_t count; char name[DDT_NAMELEN]; ddt_object_name(ddt, type, class, name); VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), &ddt->ddt_histogram[type][class], tx)); /* * Cache DDT statistics; this is the only time they'll change. 
*/ VERIFY0(ddt_object_info(ddt, type, class, &doi)); VERIFY0(ddt_object_count(ddt, type, class, &count)); ddo->ddo_count = count; ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; } static boolean_t ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { return (!!ddt->ddt_object[type][class]); } static int ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_entry_t *dde) { if (!ddt_object_exists(ddt, type, class)) return (SET_ERROR(ENOENT)); return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, DDT_PHYS_SIZE(ddt))); } static int ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class, const ddt_key_t *ddk) { if (!ddt_object_exists(ddt, type, class)) return (SET_ERROR(ENOENT)); return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os, ddt->ddt_object[type][class], ddk)); } static void ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class, const ddt_key_t *ddk) { if (!ddt_object_exists(ddt, type, class)) return; ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, ddt->ddt_object[type][class], ddk); } static void ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { if (!ddt_object_exists(ddt, type, class)) return; ddt_ops[type]->ddt_op_prefetch_all(ddt->ddt_os, ddt->ddt_object[type][class]); } static int ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, ddt->ddt_object[type][class], &ddlwe->ddlwe_key, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx)); } static int ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class, const ddt_key_t *ddk, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, ddt->ddt_object[type][class], ddk, tx)); } int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t *walk, ddt_lightweight_entry_t *ddlwe) { ASSERT(ddt_object_exists(ddt, type, class)); int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os, ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); if (error == 0) { ddlwe->ddlwe_type = type; ddlwe->ddlwe_class = class; return (0); } return (error); } int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t *count) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, ddt->ddt_object[type][class], count)); } int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_object_info_t *doi) { if (!ddt_object_exists(ddt, type, class)) return (SET_ERROR(ENOENT)); return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], doi)); } void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, char *name) { (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT, zio_checksum_table[ddt->ddt_checksum].ci_name, ddt_ops[type]->ddt_op_name, ddt_class_name[class]); } void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp, uint64_t txg) { ASSERT3U(txg, !=, 0); ASSERT3U(v, <, DDT_PHYS_NONE); uint64_t phys_birth; const dva_t *dvap; if (v == DDT_PHYS_FLAT) { phys_birth = ddp->ddp_flat.ddp_phys_birth; dvap = ddp->ddp_flat.ddp_dva; } else { phys_birth = ddp->ddp_trad[v].ddp_phys_birth; dvap = ddp->ddp_trad[v].ddp_dva; } for (int d = 0; d < SPA_DVAS_PER_BP; d++) bp->blk_dva[d] = dvap[d]; BP_SET_BIRTH(bp, txg, 
phys_birth); } /* * The bp created via this function may be used for repairs and scrub, but it * will be missing the salt / IV required to do a full decrypting read. */ void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp) { BP_ZERO(bp); if (ddp != NULL) ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v)); bp->blk_cksum = ddk->ddk_cksum; BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk)); BP_SET_FILL(bp, 1); BP_SET_CHECKSUM(bp, checksum); BP_SET_TYPE(bp, DMU_OT_DEDUP); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 1); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); } void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) { ddk->ddk_cksum = bp->blk_cksum; ddk->ddk_prop = 0; ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp)); DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp)); } void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp) { ASSERT3U(v, <, DDT_PHYS_NONE); int bp_ndvas = BP_GET_NDVAS(bp); int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ? SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; dva_t *dvas = (v == DDT_PHYS_FLAT) ? ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; int s = 0, d = 0; while (s < bp_ndvas && d < ddp_max_dvas) { if (DVA_IS_VALID(&dvas[d])) { d++; continue; } dvas[d] = bp->blk_dva[s]; s++; d++; } /* * If the caller offered us more DVAs than we can fit, something has * gone wrong in their accounting. zio_ddt_write() should never ask for * more than we need. */ ASSERT3U(s, ==, bp_ndvas); if (BP_IS_ENCRYPTED(bp)) dvas[2] = bp->blk_dva[2]; if (ddt_phys_birth(ddp, v) == 0) { if (v == DDT_PHYS_FLAT) ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp); else ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp); } } void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); if (v == DDT_PHYS_FLAT) dst->ddp_flat = src->ddp_flat; else dst->ddp_trad[v] = src->ddp_trad[v]; } void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); if (v == DDT_PHYS_FLAT) memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE); else memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX); } +static uint64_t +ddt_class_start(void) +{ + uint64_t start = gethrestime_sec(); + + if (ddt_prune_artificial_age) { + /* + * debug aide -- simulate a wider distribution + * so we don't have to wait for an aged DDT + * to test prune. 
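+ *
+ * (The full range below is 1 << 21 seconds, roughly 24 days, so
+ * the synthesized start times are spread over the last few weeks
+ * rather than all landing at "now".)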
+ */ + int range = 1 << 21; + int percent = random_in_range(100); + if (percent < 50) { + range = range >> 4; + } else if (percent > 75) { + range /= 2; + } + start -= random_in_range(range); + } + + return (start); +} + void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); if (v == DDT_PHYS_FLAT) ddp->ddp_flat.ddp_refcnt++; else ddp->ddp_trad[v].ddp_refcnt++; } uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); uint64_t *refcntp; if (v == DDT_PHYS_FLAT) refcntp = &ddp->ddp_flat.ddp_refcnt; else refcntp = &ddp->ddp_trad[v].ddp_refcnt; ASSERT3U(*refcntp, >, 0); (*refcntp)--; return (*refcntp); } static void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp, ddt_phys_variant_t v, uint64_t txg) { blkptr_t blk; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); /* * We clear the dedup bit so that zio_free() will actually free the * space, rather than just decrementing the refcount in the DDT. */ BP_SET_DEDUP(&blk, 0); ddt_phys_clear(ddp, v); zio_free(ddt->ddt_spa, txg, &blk); } uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); if (v == DDT_PHYS_FLAT) return (ddp->ddp_flat.ddp_phys_birth); else return (ddp->ddp_trad[v].ddp_phys_birth); } int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, boolean_t encrypted) { ASSERT3U(v, <, DDT_PHYS_NONE); const dva_t *dvas = (v == DDT_PHYS_FLAT) ? ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; return (DVA_IS_VALID(&dvas[0]) + DVA_IS_VALID(&dvas[1]) + DVA_IS_VALID(&dvas[2]) * !encrypted); } ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) { if (dde == NULL) return (DDT_PHYS_NONE); const ddt_univ_phys_t *ddp = dde->dde_phys; if (ddt->ddt_flags & DDT_FLAG_FLAT) { if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) && BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) { return (DDT_PHYS_FLAT); } } else /* traditional phys */ { for (int p = 0; p < DDT_PHYS_MAX; p++) { if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_trad[p].ddp_dva[0]) && BP_GET_BIRTH(bp) == ddp->ddp_trad[p].ddp_phys_birth) { return (p); } } } return (DDT_PHYS_NONE); } uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); if (v == DDT_PHYS_FLAT) return (ddp->ddp_flat.ddp_refcnt); else return (ddp->ddp_trad[v].ddp_refcnt); } uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp) { uint64_t refcnt = 0; if (ddt->ddt_flags & DDT_FLAG_FLAT) refcnt = ddp->ddp_flat.ddp_refcnt; else for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++) refcnt += ddp->ddp_trad[v].ddp_refcnt; return (refcnt); } ddt_t * ddt_select(spa_t *spa, const blkptr_t *bp) { ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp))); return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); } void ddt_enter(ddt_t *ddt) { mutex_enter(&ddt->ddt_lock); } void ddt_exit(ddt_t *ddt) { mutex_exit(&ddt->ddt_lock); } void ddt_init(void) { ddt_cache = kmem_cache_create("ddt_cache", sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache", DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_log_init(); } void ddt_fini(void) { ddt_log_fini(); kmem_cache_destroy(ddt_entry_trad_cache); kmem_cache_destroy(ddt_entry_flat_cache); kmem_cache_destroy(ddt_cache); } static 
ddt_entry_t * ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk) { ddt_entry_t *dde; if (ddt->ddt_flags & DDT_FLAG_FLAT) { dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP); memset(dde, 0, DDT_ENTRY_FLAT_SIZE); } else { dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP); memset(dde, 0, DDT_ENTRY_TRAD_SIZE); } cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; return (dde); } void ddt_alloc_entry_io(ddt_entry_t *dde) { if (dde->dde_io != NULL) return; dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP); } static void ddt_free(const ddt_t *ddt, ddt_entry_t *dde) { if (dde->dde_io != NULL) { for (int p = 0; p < DDT_NPHYS(ddt); p++) ASSERT3P(dde->dde_io->dde_lead_zio[p], ==, NULL); if (dde->dde_io->dde_repair_abd != NULL) abd_free(dde->dde_io->dde_repair_abd); kmem_free(dde->dde_io, sizeof (ddt_entry_io_t)); } cv_destroy(&dde->dde_cv); kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? ddt_entry_flat_cache : ddt_entry_trad_cache, dde); } void ddt_remove(ddt_t *ddt, ddt_entry_t *dde) { ASSERT(MUTEX_HELD(&ddt->ddt_lock)); /* Entry is still in the log, so charge the entry back to it */ if (dde->dde_flags & DDE_FLAG_LOGGED) { ddt_lightweight_entry_t ddlwe; DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); } avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); } static boolean_t ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc) { if (mc != NULL && metaslab_class_get_space(mc) > 0) { /* Over quota if allocating outside of this special class */ if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg + dedup_class_wait_txgs) { /* Waiting for some deferred frees to be processed */ return (B_TRUE); } /* * We're considered over quota when we hit 85% full, or for * larger drives, when there is less than 8GB free. */ uint64_t allocated = metaslab_class_get_alloc(mc); uint64_t capacity = metaslab_class_get_space(mc); uint64_t limit = MAX(capacity * 85 / 100, (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0); return (allocated >= limit); } return (B_FALSE); } /* * Check if the DDT is over its quota. This can be due to a few conditions: * 1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize * exceeds this limit * * 2. 'dedup_table_quota' property is set to automatic and * a. the dedup or special allocation class could not satisfy a DDT * allocation in a recent transaction * b. the dedup or special allocation class has exceeded its 85% limit */ static boolean_t ddt_over_quota(spa_t *spa) { if (spa->spa_dedup_table_quota == 0) return (B_FALSE); if (spa->spa_dedup_table_quota != UINT64_MAX) return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota); /* * For automatic quota, table size is limited by dedup or special class */ if (ddt_special_over_quota(spa, spa_dedup_class(spa))) return (B_TRUE); else if (spa_special_has_ddt(spa) && ddt_special_over_quota(spa, spa_special_class(spa))) return (B_TRUE); return (B_FALSE); } void ddt_prefetch_all(spa_t *spa) { /* * Load all DDT entries for each type/class combination. This is * indended to perform a prefetch on all such blocks. For the same * reason that ddt_prefetch isn't locked, this is also not locked. 
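 *
 * A sketch of the intended use (hypothetical caller, eg something about
 * to walk the whole table):
 *
 *	ddt_prefetch_all(spa);
 *
 * then issue ddt_walk()/ddt_lookup() calls as usual; the prefetch only
 * warms the ARC, so there is nothing to wait on or release afterwards.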
*/ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (!ddt) continue; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ddt_object_prefetch_all(ddt, type, class); } } } } static int ddt_configure(ddt_t *ddt, boolean_t new); +/* + * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them + * to the ones in the entry. If they're different, then the passed-in BP is + * from a previous generation of this entry (ie was previously pruned) and we + * have to act like the entry doesn't exist at all. + * + * This should only happen during a lookup to free the block (zio_ddt_free()). + * + * XXX this is similar in spirit to ddt_phys_select(), maybe can combine + * -- robn, 2024-02-09 + */ +static boolean_t +ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde) +{ + /* If the BP has no DVAs, then this entry is good */ + uint_t ndvas = BP_GET_NDVAS(bp); + if (ndvas == 0) + return (B_TRUE); + + /* + * Only checking the phys for the copies. For flat, there's only one; + * for trad it'll be the one that has the matching set of DVAs. + */ + const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? + dde->dde_phys->ddp_flat.ddp_dva : + dde->dde_phys->ddp_trad[ndvas].ddp_dva; + + /* + * Compare entry DVAs with the BP. They should all be there, but + * there's not really anything we can do if its only partial anyway, + * that's an error somewhere else, maybe long ago. + */ + uint_t d; + for (d = 0; d < ndvas; d++) + if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d])) + return (B_FALSE); + ASSERT3U(d, ==, ndvas); + + return (B_TRUE); +} + ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp) { spa_t *spa = ddt->ddt_spa; ddt_key_t search; ddt_entry_t *dde; ddt_type_t type; ddt_class_t class; avl_index_t where; int error; ASSERT(MUTEX_HELD(&ddt->ddt_lock)); if (ddt->ddt_version == DDT_VERSION_UNCONFIGURED) { /* * This is the first use of this DDT since the pool was * created; finish getting it ready for use. */ VERIFY0(ddt_configure(ddt, B_TRUE)); ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); } DDT_KSTAT_BUMP(ddt, dds_lookup); ddt_key_fill(&search, bp); /* Find an existing live entry */ dde = avl_find(&ddt->ddt_tree, &search, &where); if (dde != NULL) { /* If we went over quota, act like we didn't find it */ if (dde->dde_flags & DDE_FLAG_OVERQUOTA) return (NULL); /* If it's already loaded, we can just return it. */ DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit); - if (dde->dde_flags & DDE_FLAG_LOADED) - return (dde); + if (dde->dde_flags & DDE_FLAG_LOADED) { + if (ddt_entry_lookup_is_valid(ddt, bp, dde)) + return (dde); + return (NULL); + } /* Someone else is loading it, wait for it. */ dde->dde_waiters++; DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait); while (!(dde->dde_flags & DDE_FLAG_LOADED)) cv_wait(&dde->dde_cv, &ddt->ddt_lock); dde->dde_waiters--; /* Loaded but over quota, forget we were ever here */ if (dde->dde_flags & DDE_FLAG_OVERQUOTA) { if (dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); } return (NULL); } DDT_KSTAT_BUMP(ddt, dds_lookup_existing); - return (dde); + + /* Make sure the loaded entry matches the BP */ + if (ddt_entry_lookup_is_valid(ddt, bp, dde)) + return (dde); + return (NULL); } else DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss); /* Time to make a new entry. 
*/ dde = ddt_alloc(ddt, &search); /* Record the time this class was created (used by ddt prune) */ if (ddt->ddt_flags & DDT_FLAG_FLAT) - dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec(); + dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start(); avl_insert(&ddt->ddt_tree, dde, where); /* If its in the log tree, we can "load" it from there */ if (ddt->ddt_flags & DDT_FLAG_LOG) { ddt_lightweight_entry_t ddlwe; - boolean_t found = B_FALSE; - - if (ddt_log_take_key(ddt, ddt->ddt_log_active, - &search, &ddlwe)) { - DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); - found = B_TRUE; - } else if (ddt_log_take_key(ddt, ddt->ddt_log_flushing, - &search, &ddlwe)) { - DDT_KSTAT_BUMP(ddt, dds_lookup_log_flushing_hit); - found = B_TRUE; - } - - if (found) { - dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; + if (ddt_log_find_key(ddt, &search, &ddlwe)) { + /* + * See if we have the key first, and if so, set up + * the entry. + */ dde->dde_type = ddlwe.ddlwe_type; dde->dde_class = ddlwe.ddlwe_class; memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, DDT_PHYS_SIZE(ddt)); + /* Whatever we found isn't valid for this BP, eject */ + if (!ddt_entry_lookup_is_valid(ddt, bp, dde)) { + avl_remove(&ddt->ddt_tree, dde); + ddt_free(ddt, dde); + return (NULL); + } + + /* Remove it and count it */ + if (ddt_log_remove_key(ddt, + ddt->ddt_log_active, &search)) { + DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); + } else { + VERIFY(ddt_log_remove_key(ddt, + ddt->ddt_log_flushing, &search)); + DDT_KSTAT_BUMP(ddt, + dds_lookup_log_flushing_hit); + } + + dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit); DDT_KSTAT_BUMP(ddt, dds_lookup_existing); return (dde); } DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss); } /* * ddt_tree is now stable, so unlock and let everyone else keep moving. * Anyone landing on this entry will find it without DDE_FLAG_LOADED, * and go to sleep waiting for it above. */ ddt_exit(ddt); /* Search all store objects for the entry. */ error = ENOENT; for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { error = ddt_object_lookup(ddt, type, class, dde); if (error != ENOENT) { ASSERT0(error); break; } } if (error != ENOENT) break; } ddt_enter(ddt); ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED)); dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + boolean_t valid = B_TRUE; + if (dde->dde_type == DDT_TYPES && dde->dde_class == DDT_CLASSES && ddt_over_quota(spa)) { /* Over quota. If no one is waiting, clean up right now. */ if (dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); return (NULL); } /* Flag cleanup required */ dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { + /* + * If what we loaded is no good for this BP and there's no one + * waiting for it, we can just remove it and get out. If its no + * good but there are waiters, we have to leave it, because we + * don't know what they want. If its not needed we'll end up + * taking an entry log/sync, but it can only happen if more + * than one previous version of this block is being deleted at + * the same time. This is extremely unlikely to happen and not + * worth the effort to deal with without taking an entry + * update. 
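+ *
+ * (Concretely: that needs two frees of different pruned generations
+ * of the same key to land close enough together that one thread is
+ * parked as a waiter while another finishes the load.)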
+ */ + valid = ddt_entry_lookup_is_valid(ddt, bp, dde); + if (!valid && dde->dde_waiters == 0) { + avl_remove(&ddt->ddt_tree, dde); + ddt_free(ddt, dde); + return (NULL); + } + DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit); DDT_KSTAT_BUMP(ddt, dds_lookup_existing); /* * The histograms only track inactive (stored or logged) blocks. * We've just put an entry onto the live list, so we need to * remove its counts. When its synced back, it'll be re-added * to the right one. * * We only do this when we successfully found it in the store. * error == ENOENT means this is a new entry, and so its already * not counted. */ ddt_histogram_t *ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; ddt_lightweight_entry_t ddlwe; DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); ddt_histogram_sub_entry(ddt, ddh, &ddlwe); } else { DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss); DDT_KSTAT_BUMP(ddt, dds_lookup_new); } /* Entry loaded, everyone can proceed now */ dde->dde_flags |= DDE_FLAG_LOADED; cv_broadcast(&dde->dde_cv); - return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde); + if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid) + return (NULL); + + return (dde); } void ddt_prefetch(spa_t *spa, const blkptr_t *bp) { ddt_t *ddt; ddt_key_t ddk; if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) return; /* * We only remove the DDT once all tables are empty and only * prefetch dedup blocks when there are entries in the DDT. * Thus no locking is required as the DDT can't disappear on us. */ ddt = ddt_select(spa, bp); ddt_key_fill(&ddk, bp); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ddt_object_prefetch(ddt, type, class, &ddk); } } } /* * ddt_key_t comparison. Any struct wanting to make use of this function must * have the key as the first element. Casts it to N uint64_ts, and checks until * we find there's a difference. This is intended to match how ddt_zap.c drives * the ZAPs (first uint64_t as the key prehash), which will minimise the number * of ZAP blocks touched when flushing logged entries from an AVL walk. This is * not an invariant for this function though, should you wish to change it. 
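 *
 * (With the current ddt_key_t layout, a four-word checksum followed by
 * the packed ddk_prop word, that is at most five 64-bit compares, and
 * the first checksum word decides the order in almost all cases.)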
*/ int ddt_key_compare(const void *x1, const void *x2) { const uint64_t *k1 = (const uint64_t *)x1; const uint64_t *k2 = (const uint64_t *)x2; int cmp; for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++) if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0)) return (cmp); return (0); } /* Create the containing dir for this DDT and bump the feature count */ static void ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx) { ASSERT3U(ddt->ddt_dir_object, ==, 0); ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); char name[DDT_NAMELEN]; snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, zio_checksum_table[ddt->ddt_checksum].ci_name); ddt->ddt_dir_object = zap_create_link(ddt->ddt_os, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx); VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION, sizeof (uint64_t), 1, &ddt->ddt_version, tx)); VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS, sizeof (uint64_t), 1, &ddt->ddt_flags, tx)); spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); } /* Destroy the containing dir and deactivate the feature */ static void ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) { ASSERT3U(ddt->ddt_dir_object, !=, 0); ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT); ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); char name[DDT_NAMELEN]; snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, zio_checksum_table[ddt->ddt_checksum].ci_name); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ASSERT(!ddt_object_exists(ddt, type, class)); } } ddt_log_destroy(ddt, tx); uint64_t count; ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION)); ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS)); ASSERT3U(count, ==, 2); VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx)); ddt->ddt_dir_object = 0; spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); } /* * Determine, flags and on-disk layout from what's already stored. If there's * nothing stored, then if new is false, returns ENOENT, and if true, selects * based on pool config. */ static int ddt_configure(ddt_t *ddt, boolean_t new) { spa_t *spa = ddt->ddt_spa; char name[DDT_NAMELEN]; int error; ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE); boolean_t fdt_enabled = spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP); boolean_t fdt_active = spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP); /* * First, look for the global DDT stats object. If its not there, then * there's never been a DDT written before ever, and we know we're * starting from scratch. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, &spa->spa_ddt_stat_object); if (error != 0) { if (error != ENOENT) return (error); goto not_found; } if (fdt_active) { /* * Now look for a DDT directory. If it exists, then it has * everything we need. 
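 *
 * (That is, the dir's DDT_DIR_VERSION and DDT_DIR_FLAGS entries are
 * taken as-is; only if the dir is missing do we fall back to probing
 * for legacy objects in the MOS root below, and failing that to the
 * not_found defaults.)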
*/ snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, zio_checksum_table[ddt->ddt_checksum].ci_name); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &ddt->ddt_dir_object); if (error == 0) { ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os); error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION, sizeof (uint64_t), 1, &ddt->ddt_version); if (error != 0) return (error); error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS, sizeof (uint64_t), 1, &ddt->ddt_flags); if (error != 0) return (error); if (ddt->ddt_version != DDT_VERSION_FDT) { zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " "unknown version %llu", spa_name(spa), name, (u_longlong_t)ddt->ddt_version); return (SET_ERROR(EINVAL)); } if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) { zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " "version=%llu unknown flags %llx", spa_name(spa), name, (u_longlong_t)ddt->ddt_flags, (u_longlong_t)ddt->ddt_version); return (SET_ERROR(EINVAL)); } return (0); } if (error != ENOENT) return (error); } /* Any object in the root indicates a traditional setup. */ for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ddt_object_name(ddt, type, class, name); uint64_t obj; error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &obj); if (error == ENOENT) continue; if (error != 0) return (error); ddt->ddt_version = DDT_VERSION_LEGACY; ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; return (0); } } not_found: if (!new) return (SET_ERROR(ENOENT)); /* Nothing on disk, so set up for the best version we can */ if (fdt_enabled) { ddt->ddt_version = DDT_VERSION_FDT; ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; ddt->ddt_dir_object = 0; /* create on first use */ } else { ddt->ddt_version = DDT_VERSION_LEGACY; ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; } return (0); } static void ddt_table_alloc_kstats(ddt_t *ddt) { -#ifdef _KERNEL char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa)); char *name = kmem_asprintf("ddt_stats_%s", zio_checksum_table[ddt->ddt_checksum].ci_name); ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED, sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (ddt->ddt_ksp != NULL) { ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP); memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t)); ddt->ddt_ksp->ks_data = dds; kstat_install(ddt->ddt_ksp); } kmem_strfree(name); kmem_strfree(mod); -#else - (void) ddt; -#endif /* _KERNEL */ } static ddt_t * ddt_table_alloc(spa_t *spa, enum zio_checksum c) { ddt_t *ddt; ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); memset(ddt, 0, sizeof (ddt_t)); mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&ddt->ddt_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); avl_create(&ddt->ddt_repair_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; ddt->ddt_version = DDT_VERSION_UNCONFIGURED; ddt_log_alloc(ddt); ddt_table_alloc_kstats(ddt); return (ddt); } static void ddt_table_free(ddt_t *ddt) { -#ifdef _KERNEL if (ddt->ddt_ksp != NULL) { kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t)); ddt->ddt_ksp->ks_data = NULL; kstat_delete(ddt->ddt_ksp); } -#endif /* _KERNEL */ ddt_log_free(ddt); 
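	/*
	 * By this point every live and repair entry must already have been
	 * synced out or freed; all that is left is to tear down the (now
	 * empty) trees, the lock, and the ddt_t itself.
	 */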
ASSERT0(avl_numnodes(&ddt->ddt_tree)); ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); avl_destroy(&ddt->ddt_tree); avl_destroy(&ddt->ddt_repair_tree); mutex_destroy(&ddt->ddt_lock); kmem_cache_free(ddt_cache, ddt); } void ddt_create(spa_t *spa) { spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { if (DDT_CHECKSUM_VALID(c)) spa->spa_ddt[c] = ddt_table_alloc(spa, c); } } int ddt_load(spa_t *spa) { int error; ddt_create(spa); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, &spa->spa_ddt_stat_object); if (error) return (error == ENOENT ? 0 : error); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { if (!DDT_CHECKSUM_VALID(c)) continue; ddt_t *ddt = spa->spa_ddt[c]; error = ddt_configure(ddt, B_FALSE); if (error == ENOENT) continue; if (error != 0) return (error); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { error = ddt_object_load(ddt, type, class); if (error != 0 && error != ENOENT) return (error); } } error = ddt_log_load(ddt); if (error != 0 && error != ENOENT) return (error); DDT_KSTAT_SET(ddt, dds_log_active_entries, avl_numnodes(&ddt->ddt_log_active->ddl_tree)); DDT_KSTAT_SET(ddt, dds_log_flushing_entries, avl_numnodes(&ddt->ddt_log_flushing->ddl_tree)); /* * Seed the cached histograms. */ memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); } spa->spa_dedup_dspace = ~0ULL; spa->spa_dedup_dsize = ~0ULL; return (0); } void ddt_unload(spa_t *spa) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { if (spa->spa_ddt[c]) { ddt_table_free(spa->spa_ddt[c]); spa->spa_ddt[c] = NULL; } } } boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp) { ddt_t *ddt; ddt_key_t ddk; if (!BP_GET_DEDUP(bp)) return (B_FALSE); if (max_class == DDT_CLASS_UNIQUE) return (B_TRUE); ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; ddt_key_fill(&ddk, bp); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class <= max_class; class++) { if (ddt_object_contains(ddt, type, class, &ddk) == 0) return (B_TRUE); } } return (B_FALSE); } ddt_entry_t * ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) { ddt_key_t ddk; ddt_entry_t *dde; ddt_key_fill(&ddk, bp); dde = ddt_alloc(ddt, &ddk); ddt_alloc_entry_io(dde); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { /* * We can only do repair if there are multiple copies * of the block. For anything in the UNIQUE class, * there's definitely only one copy, so don't even try. 
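 *
 * (See the "Repair IO" section of the comment at the top of this file:
 * the copies found here are the ones ddt_repair_entry() will rewrite
 * using the good data stashed in dde_repair_abd.)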
*/ if (class != DDT_CLASS_UNIQUE && ddt_object_lookup(ddt, type, class, dde) == 0) return (dde); } } memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt)); return (dde); } void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) { avl_index_t where; ddt_enter(ddt); if (dde->dde_io->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_insert(&ddt->ddt_repair_tree, dde, where); else ddt_free(ddt, dde); ddt_exit(ddt); } static void ddt_repair_entry_done(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *rdde = zio->io_private; ddt_free(ddt, rdde); } static void ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) { ddt_key_t *ddk = &dde->dde_key; ddt_key_t *rddk = &rdde->dde_key; zio_t *zio; blkptr_t blk; zio = zio_null(rio, rio->io_spa, NULL, ddt_repair_entry_done, rdde, rio->io_flags); for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_univ_phys_t *ddp = dde->dde_phys; ddt_univ_phys_t *rddp = rdde->dde_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_birth = ddt_phys_birth(ddp, v); const dva_t *dvas, *rdvas; if (ddt->ddt_flags & DDT_FLAG_FLAT) { dvas = ddp->ddp_flat.ddp_dva; rdvas = rddp->ddp_flat.ddp_dva; } else { dvas = ddp->ddp_trad[p].ddp_dva; rdvas = rddp->ddp_trad[p].ddp_dva; } if (phys_birth == 0 || phys_birth != ddt_phys_birth(rddp, v) || memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP)) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); } zio_nowait(zio); } static void ddt_repair_table(ddt_t *ddt, zio_t *rio) { spa_t *spa = ddt->ddt_spa; ddt_entry_t *dde, *rdde_next, *rdde; avl_tree_t *t = &ddt->ddt_repair_tree; blkptr_t blk; if (spa_sync_pass(spa) > 1) return; ddt_enter(ddt); for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { rdde_next = AVL_NEXT(t, rdde); avl_remove(&ddt->ddt_repair_tree, rdde); ddt_exit(ddt); ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, DDT_PHYS_NONE, &blk); dde = ddt_repair_start(ddt, &blk); ddt_repair_entry(ddt, dde, rdde, rio); ddt_repair_done(ddt, dde); ddt_enter(ddt); } ddt_exit(ddt); } static void ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx) { /* * Count all the entries stored for each type/class, and updates the * stats within (ddt_object_sync()). If there's no entries for the * type/class, the whole object is removed. If all objects for the DDT * are removed, its containing dir is removed, effectively resetting * the entire DDT to an empty slate. */ uint64_t count = 0; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { uint64_t add, tcount = 0; for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) { ddt_object_sync(ddt, type, class, tx); VERIFY0(ddt_object_count(ddt, type, class, &add)); tcount += add; } } for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (tcount == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } count += tcount; } if (ddt->ddt_flags & DDT_FLAG_LOG) { /* Include logged entries in the total count */ count += avl_numnodes(&ddt->ddt_log_active->ddl_tree); count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); } if (count == 0) { /* * No entries left on the DDT, so reset the version for next * time. This allows us to handle the feature being changed * since the DDT was originally created. 
New entries should get * whatever the feature currently demands. */ if (ddt->ddt_version == DDT_VERSION_FDT) ddt_destroy_dir(ddt, tx); ddt->ddt_version = DDT_VERSION_UNCONFIGURED; ddt->ddt_flags = 0; } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); ddt->ddt_spa->spa_dedup_dspace = ~0ULL; ddt->ddt_spa->spa_dedup_dsize = ~0ULL; } static void ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; /* * Compute the target class, so we can decide whether or not to inform * the scrub traversal (below). Note that we don't store this in the * entry, as it might change multiple times before finally being * committed (if we're logging). Instead, we recompute it in * ddt_sync_entry(). */ uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys); ddt_class_t nclass = (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; /* * If the class changes, the order that we scan this bp changes. If it * decreases, we could miss it, so scan it right now. (This covers both * class changing while we are doing ddt_walk(), and when we are * traversing.) * * We also do this when the refcnt goes to zero, because that change is * only in the log so far; the blocks on disk won't be freed until * the log is flushed, and the refcnt might increase before that. If it * does, then we could miss it in the same way. */ if (refcnt == 0 || nclass < ddlwe->ddlwe_class) dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt, ddlwe, tx); } static void ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx) { ddt_key_t *ddk = &ddlwe->ddlwe_key; ddt_type_t ntype = DDT_TYPE_DEFAULT; uint64_t refcnt = 0; /* * Compute the total refcnt. Along the way, issue frees for any DVAs * we no longer want. */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); if (ddt_phys_birth(ddp, v) == 0) { - ASSERT3U(phys_refcnt, ==, 0); + ASSERT0(phys_refcnt); continue; } if (DDT_PHYS_IS_DITTO(ddt, p)) { /* * We don't want to keep any obsolete slots (eg ditto), * regardless of their refcount, but we don't want to * leak them either. So, free them. */ ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); continue; } if (phys_refcnt == 0) /* No remaining references, free it! */ ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); refcnt += phys_refcnt; } /* Select the best class for the entry. */ ddt_class_t nclass = (refcnt > 1) ? 
DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; /* * If an existing entry changed type or class, or its refcount reached * zero, delete it from the DDT object */ if (otype != DDT_TYPES && (otype != ntype || oclass != nclass || refcnt == 0)) { VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx)); ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT); } /* * Add or update the entry */ if (refcnt != 0) { ddt_histogram_t *ddh = &ddt->ddt_histogram[ntype][nclass]; ddt_histogram_add_entry(ddt, ddh, ddlwe); if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx)); } } /* Calculate an exponential weighted moving average, lower limited to zero */ static inline int32_t _ewma(int32_t val, int32_t prev, uint32_t weight) { ASSERT3U(val, >=, 0); ASSERT3U(prev, >=, 0); const int32_t new = MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1)); ASSERT3U(new, >=, 0); return (new); } /* Returns true if done for this txg */ static boolean_t ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx) { if (ddt->ddt_flush_pass == 0) { if (spa_sync_pass(ddt->ddt_spa) == 1) { /* First run this txg, get set up */ ddt->ddt_flush_start = gethrtime(); ddt->ddt_flush_count = 0; /* * How many entries we need to flush. We want to at * least match the ingest rate. */ ddt->ddt_flush_min = MAX( ddt->ddt_log_ingest_rate, zfs_dedup_log_flush_entries_min); /* * If we've been asked to flush everything in a hurry, * try to dump as much as possible on this txg. In * this case we're only limited by time, not amount. */ if (ddt->ddt_flush_force_txg > 0) ddt->ddt_flush_min = MAX(ddt->ddt_flush_min, avl_numnodes( &ddt->ddt_log_flushing->ddl_tree)); } else { /* We already decided we're done for this txg */ return (B_FALSE); } } else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) { /* * We already did some flushing on this pass, skip it. This * happens when dsl_process_async_destroys() runs during a scan * (on pass 1) and does an additional ddt_sync() to update * freed blocks. */ return (B_FALSE); } if (spa_sync_pass(ddt->ddt_spa) > MAX(zfs_dedup_log_flush_passes_max, 1)) { /* Too many passes this txg, defer until next. */ ddt->ddt_flush_pass = 0; return (B_TRUE); } if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { /* Nothing to flush, done for this txg. */ ddt->ddt_flush_pass = 0; return (B_TRUE); } uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ? MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout); uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start; if (elapsed_time >= target_time) { /* Too long since we started, done for this txg. */ ddt->ddt_flush_pass = 0; return (B_TRUE); } ddt->ddt_flush_pass++; ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass); /* * Estimate how much time we'll need to flush the remaining entries * based on how long it normally takes. */ uint32_t want_time; if (ddt->ddt_flush_pass == 1) { /* First pass, use the average time/entries */ if (ddt->ddt_log_flush_rate == 0) /* Zero rate, just assume the whole time */ want_time = target_time; else want_time = ddt->ddt_flush_min * ddt->ddt_log_flush_time_rate / ddt->ddt_log_flush_rate; } else { /* Later pass, calculate from this txg so far */ want_time = ddt->ddt_flush_min * elapsed_time / ddt->ddt_flush_count; } /* Figure out how much time we have left */ uint32_t remain_time = target_time - elapsed_time; /* Smear the remaining entries over the remaining passes. 
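	 * As an illustrative example (numbers invented): with
	 * zfs_dedup_log_flush_passes_max = 16, ddt_flush_min = 1000 and this
	 * being pass 1, the target is 1000 / (16 + 1 - 1) = 62 entries for
	 * this pass; later passes recompute it from whatever remains.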
*/ uint32_t nentries = ddt->ddt_flush_min / (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass); if (want_time > remain_time) { /* * We're behind; try to catch up a bit by doubling the amount * this pass. If we're behind that means we're in a later * pass and likely have most of the remaining time to * ourselves. If we're in the last couple of passes, then * doubling might just take us over the timeout, but probably * not be much, and it stops us falling behind. If we're * in the middle passes, there'll be more to do, but it * might just help us catch up a bit and we'll recalculate on * the next pass anyway. */ nentries = MIN(ddt->ddt_flush_min, nentries*2); } ddt_lightweight_entry_t ddlwe; uint32_t count = 0; while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { ddt_sync_flush_entry(ddt, &ddlwe, ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); /* End this pass if we've synced as much as we need to. */ if (++count >= nentries) break; } ddt->ddt_flush_count += count; ddt->ddt_flush_min -= count; if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { /* We emptied it, so truncate on-disk */ DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries); ddt_log_truncate(ddt, tx); /* No more passes needed this txg */ ddt->ddt_flush_pass = 0; } else { /* More to do next time, save checkpoint */ DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count); ddt_log_checkpoint(ddt, &ddlwe, tx); } ddt_sync_update_stats(ddt, tx); return (ddt->ddt_flush_pass == 0); } static inline void ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg) { /* * If we're not forcing flush, and not being asked to start, then * there's nothing more to do. */ if (txg == 0) { /* Update requested, are we currently forcing flush? */ if (ddt->ddt_flush_force_txg == 0) return; txg = ddt->ddt_flush_force_txg; } /* * If either of the logs have entries unflushed entries before * the wanted txg, set the force txg, otherwise clear it. */ if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) && ddt->ddt_log_active->ddl_first_txg <= txg) || (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && ddt->ddt_log_flushing->ddl_first_txg <= txg)) { ddt->ddt_flush_force_txg = txg; return; } /* * Nothing to flush behind the given txg, so we can clear force flush * state. */ ddt->ddt_flush_force_txg = 0; } static void ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) { ASSERT(avl_is_empty(&ddt->ddt_tree)); /* Don't do any flushing when the pool is ready to shut down */ if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa)) return; /* Try to flush some. */ if (!ddt_sync_flush_log_incremental(ddt, tx)) /* More to do next time */ return; /* No more flushing this txg, so we can do end-of-txg housekeeping */ if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) { /* * No more to flush, and the active list has stuff, so * try to swap the logs for next time. */ if (ddt_log_swap(ddt, tx)) { DDT_KSTAT_ZERO(ddt, dds_log_active_entries); DDT_KSTAT_SET(ddt, dds_log_flushing_entries, avl_numnodes(&ddt->ddt_log_flushing->ddl_tree)); } } /* If force flush is no longer necessary, turn it off. */ ddt_flush_force_update_txg(ddt, 0); /* * Update flush rate. This is an exponential weighted moving average of * the number of entries flushed over recent txgs. */ ddt->ddt_log_flush_rate = _ewma( ddt->ddt_flush_count, ddt->ddt_log_flush_rate, zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate); /* * Update flush time rate. 
This is an exponential weighted moving * average of the total time taken to flush over recent txgs. */ ddt->ddt_log_flush_time_rate = _ewma( ddt->ddt_log_flush_time_rate, ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))), zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_flush_time_rate, ddt->ddt_log_flush_time_rate); } static void ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) { uint64_t count = avl_numnodes(&ddt->ddt_tree); if (count > 0) { ddt_log_update_t dlu = {0}; ddt_log_begin(ddt, count, tx, &dlu); ddt_entry_t *dde; void *cookie = NULL; ddt_lightweight_entry_t ddlwe; while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { ASSERT(dde->dde_flags & DDE_FLAG_LOADED); DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); ddt_log_entry(ddt, &ddlwe, &dlu); ddt_sync_scan_entry(ddt, &ddlwe, tx); ddt_free(ddt, dde); } ddt_log_commit(ddt, &dlu); DDT_KSTAT_SET(ddt, dds_log_active_entries, avl_numnodes(&ddt->ddt_log_active->ddl_tree)); /* * Sync the stats for the store objects. Even though we haven't * modified anything on those objects, they're no longer the * source of truth for entries that are now in the log, and we * need the on-disk counts to reflect that, otherwise we'll * miscount later when importing. */ for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) ddt_object_sync(ddt, type, class, tx); } } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); ddt->ddt_spa->spa_dedup_dspace = ~0ULL; ddt->ddt_spa->spa_dedup_dsize = ~0ULL; } if (spa_sync_pass(ddt->ddt_spa) == 1) { /* * Update ingest rate. This is an exponential weighted moving * average of the number of entries changed over recent txgs. * The ramp-up cost shouldn't matter too much because the * flusher will be trying to take at least the minimum anyway. 
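	 * As a rough illustration (values invented): with a previous rate of
	 * 1000 entries/txg, a txg that ingests 2000 entries and a weight of
	 * 10 txgs, _ewma() gives 1000 + (2000 - 1000) / 10 = 1100.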
*/ ddt->ddt_log_ingest_rate = _ewma( count, ddt->ddt_log_ingest_rate, zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_ingest_rate, ddt->ddt_log_ingest_rate); } } static void ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx) { if (avl_numnodes(&ddt->ddt_tree) == 0) return; ddt_entry_t *dde; void *cookie = NULL; while ((dde = avl_destroy_nodes( &ddt->ddt_tree, &cookie)) != NULL) { ASSERT(dde->dde_flags & DDE_FLAG_LOADED); ddt_lightweight_entry_t ddlwe; DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); ddt_sync_flush_entry(ddt, &ddlwe, dde->dde_type, dde->dde_class, tx); ddt_sync_scan_entry(ddt, &ddlwe, tx); ddt_free(ddt, dde); } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); ddt->ddt_spa->spa_dedup_dspace = ~0ULL; ddt->ddt_spa->spa_dedup_dsize = ~0ULL; ddt_sync_update_stats(ddt, tx); } static void ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; if (ddt->ddt_version == UINT64_MAX) return; if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) { ASSERT0(avl_numnodes(&ddt->ddt_tree)); return; } if (spa->spa_ddt_stat_object == 0) { spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, tx); } if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) ddt_create_dir(ddt, tx); if (ddt->ddt_flags & DDT_FLAG_LOG) ddt_sync_table_log(ddt, tx); else ddt_sync_table_flush(ddt, tx); } void ddt_sync(spa_t *spa, uint64_t txg) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dmu_tx_t *tx; zio_t *rio; ASSERT3U(spa_syncing_txg(spa), ==, txg); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); /* * This function may cause an immediate scan of ddt blocks (see * the comment above dsl_scan_ddt() for details). We set the * scan's root zio here so that we can wait for any scan IOs in * addition to the regular ddt IOs. 
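	 * The root is detached again below, after zio_wait() returns and the
	 * DDT and scan child IOs for this txg have completed.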
*/ ASSERT3P(scn->scn_zio_root, ==, NULL); scn->scn_zio_root = rio; for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) continue; ddt_sync_table(ddt, tx); if (ddt->ddt_flags & DDT_FLAG_LOG) ddt_sync_flush_log(ddt, tx); ddt_repair_table(ddt, rio); } (void) zio_wait(rio); scn->scn_zio_root = NULL; dmu_tx_commit(tx); } void ddt_walk_init(spa_t *spa, uint64_t txg) { if (txg == 0) txg = spa_syncing_txg(spa); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG)) continue; ddt_enter(ddt); ddt_flush_force_update_txg(ddt, txg); ddt_exit(ddt); } } boolean_t ddt_walk_ready(spa_t *spa) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG)) continue; if (ddt->ddt_flush_force_txg > 0) return (B_FALSE); } return (B_TRUE); } -int -ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) +static int +ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe, + uint64_t flags, boolean_t wait) { do { do { do { ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; if (ddt == NULL) continue; - if (ddt->ddt_flush_force_txg > 0) + if (flags != 0 && + (ddt->ddt_flags & flags) != flags) + continue; + + if (wait && ddt->ddt_flush_force_txg > 0) return (EAGAIN); int error = ENOENT; if (ddt_object_exists(ddt, ddb->ddb_type, ddb->ddb_class)) { error = ddt_object_walk(ddt, ddb->ddb_type, ddb->ddb_class, &ddb->ddb_cursor, ddlwe); } if (error == 0) return (0); if (error != ENOENT) return (error); ddb->ddb_cursor = 0; } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); ddb->ddb_checksum = 0; } while (++ddb->ddb_type < DDT_TYPES); ddb->ddb_type = 0; } while (++ddb->ddb_class < DDT_CLASSES); return (SET_ERROR(ENOENT)); } +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) +{ + return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE)); +} + /* * This function is used by Block Cloning (brt.c) to increase reference * counter for the DDT entry if the block is already in DDT. * * Return false if the block, despite having the D bit set, is not present - * in the DDT. Currently this is not possible but might be in the future. - * See the comment below. + * in the DDT. This is possible when the DDT has been pruned by an admin + * or by the DDT quota mechanism. */ boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp) { ddt_t *ddt; ddt_entry_t *dde; boolean_t result; spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); ddt = ddt_select(spa, bp); ddt_enter(ddt); dde = ddt_lookup(ddt, bp); /* Can be NULL if the entry for this block was pruned. */ if (dde == NULL) { ddt_exit(ddt); spa_config_exit(spa, SCL_ZIO, FTAG); return (B_FALSE); } if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) { /* * This entry was either synced to a store object (dde_type is * real) or was logged. It must be properly on disk at this * point, so we can just bump its refcount. */ int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); - /* - * This entry already existed (dde_type is real), so it must - * have refcnt >0 at the start of this txg. We are called from - * brt_pending_apply(), before frees are issued, so the refcnt - * can't be lowered yet. Therefore, it must be >0. 
We assert - * this because if the order of BRT and DDT interactions were - * ever to change and the refcnt was ever zero here, then - * likely further action is required to fill out the DDT entry, - * and this is a place that is likely to be missed in testing. - */ - ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0); - ddt_phys_addref(dde->dde_phys, v); result = B_TRUE; } else { /* - * At the time of implementating this if the block has the - * DEDUP flag set it must exist in the DEDUP table, but - * there are many advocates that want ability to remove - * entries from DDT with refcnt=1. If this will happen, - * we may have a block with the DEDUP set, but which doesn't - * have a corresponding entry in the DDT. Be ready. + * If the block has the DEDUP flag set it still might not + * exist in the DEDUP table due to DDT pruning of entries + * where refcnt=1. */ ddt_remove(ddt, dde); result = B_FALSE; } ddt_exit(ddt); spa_config_exit(spa, SCL_ZIO, FTAG); return (result); } +typedef struct ddt_prune_entry { + ddt_t *dpe_ddt; + ddt_key_t dpe_key; + list_node_t dpe_node; + ddt_univ_phys_t dpe_phys[]; +} ddt_prune_entry_t; + +typedef struct ddt_prune_info { + spa_t *dpi_spa; + uint64_t dpi_txg_syncs; + uint64_t dpi_pruned; + list_t dpi_candidates; +} ddt_prune_info_t; + +/* + * Add prune candidates for ddt_sync during spa_sync + */ +static void +prune_candidates_sync(void *arg, dmu_tx_t *tx) +{ + (void) tx; + ddt_prune_info_t *dpi = arg; + ddt_prune_entry_t *dpe; + + spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER); + + /* Process the prune candidates collected so far */ + while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) { + blkptr_t blk; + ddt_t *ddt = dpe->dpe_ddt; + + ddt_enter(ddt); + + /* + * If it's on the live list, then it was loaded for update + * this txg and is no longer stale; skip it. + */ + if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) { + ddt_exit(ddt); + kmem_free(dpe, sizeof (*dpe)); + continue; + } + + ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key, + dpe->dpe_phys, DDT_PHYS_FLAT, &blk); + + ddt_entry_t *dde = ddt_lookup(ddt, &blk); + if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + /* + * Zero the physical, so we don't try to free DVAs + * at flush nor try to reuse this entry. + */ + ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT); + + dpi->dpi_pruned++; + } + + ddt_exit(ddt); + kmem_free(dpe, sizeof (*dpe)); + } + + spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG); + dpi->dpi_txg_syncs++; +} + +/* + * Prune candidates are collected in open context and processed + * in sync context as part of ddt_sync_table(). + */ +static void +ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk, + const ddt_univ_phys_t *ddp) +{ + ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); + + size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE; + ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP); + + dpe->dpe_ddt = ddt; + dpe->dpe_key = *ddk; + memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE); + list_insert_head(list, dpe); +} + +/* + * Interate over all the entries in the DDT unique class. 
+ * The walk will perform one of the following operations: + * (a) build a histogram than can be used when pruning + * (b) prune entries older than the cutoff + * + * Also called by zdb(8) to dump the age histogram + */ +void +ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram) +{ + ddt_bookmark_t ddb = { + .ddb_class = DDT_CLASS_UNIQUE, + .ddb_type = 0, + .ddb_checksum = 0, + .ddb_cursor = 0 + }; + ddt_lightweight_entry_t ddlwe = {0}; + int error; + int total = 0, valid = 0; + int candidates = 0; + uint64_t now = gethrestime_sec(); + ddt_prune_info_t dpi; + boolean_t pruning = (cutoff != 0); + + if (pruning) { + dpi.dpi_txg_syncs = 0; + dpi.dpi_pruned = 0; + dpi.dpi_spa = spa; + list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t), + offsetof(ddt_prune_entry_t, dpe_node)); + } + + if (histogram != NULL) + memset(histogram, 0, sizeof (ddt_age_histo_t)); + + while ((error = + ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) { + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + VERIFY(ddt); + + if (spa_shutting_down(spa) || issig()) + break; + total++; + + ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); + ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1); + + uint64_t class_start = + ddlwe.ddlwe_phys.ddp_flat.ddp_class_start; + + /* + * If this entry is on the log, then the stored entry is stale + * and we should skip it. + */ + if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL)) + continue; + + /* prune older entries */ + if (pruning && class_start < cutoff) { + if (candidates++ >= zfs_ddt_prunes_per_txg) { + /* sync prune candidates in batches */ + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, prune_candidates_sync, + &dpi, 0, ZFS_SPACE_CHECK_NONE)); + candidates = 1; + } + ddt_prune_entry(&dpi.dpi_candidates, ddt, + &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys); + } + + /* build a histogram */ + if (histogram != NULL) { + uint64_t age = MAX(1, (now - class_start) / 3600); + int bin = MIN(highbit64(age) - 1, HIST_BINS - 1); + histogram->dah_entries++; + histogram->dah_age_histo[bin]++; + } + + valid++; + } + + if (pruning && valid > 0) { + if (!list_is_empty(&dpi.dpi_candidates)) { + /* sync out final batch of prune candidates */ + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + prune_candidates_sync, &dpi, 0, + ZFS_SPACE_CHECK_NONE)); + } + list_destroy(&dpi.dpi_candidates); + + zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs", + (u_longlong_t)dpi.dpi_pruned, + (int)((dpi.dpi_pruned * 100) / valid), + (u_longlong_t)dpi.dpi_txg_syncs); + } +} + +static uint64_t +ddt_total_entries(spa_t *spa) +{ + ddt_object_t ddo; + ddt_get_dedup_object_stats(spa, &ddo); + + return (ddo.ddo_count); +} + +int +ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, + uint64_t amount) +{ + uint64_t cutoff; + uint64_t start_time = gethrtime(); + + if (spa->spa_active_ddt_prune) + return (SET_ERROR(EALREADY)); + if (ddt_total_entries(spa) == 0) + return (0); + + spa->spa_active_ddt_prune = B_TRUE; + + zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount, + unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? 
"%" : "seconds old or older"); + + if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) { + ddt_age_histo_t histogram; + uint64_t oldest = 0; + + /* Make a pass over DDT to build a histogram */ + ddt_prune_walk(spa, 0, &histogram); + + int target = (histogram.dah_entries * amount) / 100; + + /* + * Figure out our cutoff date + * (i.e., which bins to prune from) + */ + for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) { + if (histogram.dah_age_histo[i] != 0) { + /* less than this bucket remaining */ + if (target < histogram.dah_age_histo[i]) { + oldest = MAX(1, (1< 0 && !spa_shutting_down(spa) && !issig()) { + /* Traverse DDT to prune entries older that our cuttoff */ + ddt_prune_walk(spa, cutoff, NULL); + } + + zfs_dbgmsg("%s: prune completed in %llu ms", + spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); + + spa->spa_active_ddt_prune = B_FALSE; + return (0); +} + ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW, "Max number of incremental dedup log flush passes per transaction"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW, "Min time to spend on incremental dedup log flush each transaction"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW, "Min number of log entries to flush each transaction"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW, "Number of txgs to average flow rates across"); diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c index a367d0cd02f8..3aa07dc25b91 100644 --- a/module/zfs/ddt_log.c +++ b/module/zfs/ddt_log.c @@ -1,764 +1,778 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2023, Klara Inc. */ #include #include #include #include #include #include #include #include #include #include /* * No more than this many txgs before swapping logs. */ uint_t zfs_dedup_log_txg_max = 8; /* * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory. 
*/ uint64_t zfs_dedup_log_mem_max = 0; uint_t zfs_dedup_log_mem_max_percent = 1; static kmem_cache_t *ddt_log_entry_flat_cache; static kmem_cache_t *ddt_log_entry_trad_cache; #define DDT_LOG_ENTRY_FLAT_SIZE \ (sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE) #define DDT_LOG_ENTRY_TRAD_SIZE \ (sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE) #define DDT_LOG_ENTRY_SIZE(ddt) \ _DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE) void ddt_log_init(void) { ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache", DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache", DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); /* * Max memory for log AVL entries. At least 1M, because we need * something (that's ~3800 entries per tree). They can say 100% if they * want; it just means they're at the mercy of the the txg flush limit. */ if (zfs_dedup_log_mem_max == 0) { zfs_dedup_log_mem_max_percent = MIN(zfs_dedup_log_mem_max_percent, 100); zfs_dedup_log_mem_max = (physmem * PAGESIZE) * zfs_dedup_log_mem_max_percent / 100; } zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024); } void ddt_log_fini(void) { kmem_cache_destroy(ddt_log_entry_trad_cache); kmem_cache_destroy(ddt_log_entry_flat_cache); } static void ddt_log_name(ddt_t *ddt, char *name, uint_t n) { snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG, zio_checksum_table[ddt->ddt_checksum].ci_name, n); } static void ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx) { dmu_buf_t *db; VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data; DLH_SET_VERSION(hdr, 1); DLH_SET_FLAGS(hdr, ddl->ddl_flags); hdr->dlh_length = ddl->ddl_length; hdr->dlh_first_txg = ddl->ddl_first_txg; hdr->dlh_checkpoint = ddl->ddl_checkpoint; dmu_buf_rele(db, FTAG); } static void ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) { ASSERT3U(ddt->ddt_dir_object, >, 0); ASSERT3U(ddl->ddl_object, ==, 0); char name[DDT_NAMELEN]; ddt_log_name(ddt, name, n); ddl->ddl_object = dmu_object_alloc(ddt->ddt_os, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx); VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, &ddl->ddl_object, tx)); ddl->ddl_length = 0; ddl->ddl_first_txg = tx->tx_txg; ddt_log_update_header(ddt, ddl, tx); } static void ddt_log_create(ddt_t *ddt, dmu_tx_t *tx) { ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx); ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx); } static void ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) { ASSERT3U(ddt->ddt_dir_object, >, 0); if (ddl->ddl_object == 0) return; ASSERT0(ddl->ddl_length); char name[DDT_NAMELEN]; ddt_log_name(ddt, name, n); VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx)); VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx)); ddl->ddl_object = 0; } void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx) { ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx); ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx); } static void ddt_log_update_stats(ddt_t *ddt) { /* * Log object stats. We count the number of live entries in the log * tree, even if there are more than on disk, and even if the same * entry is on both append and flush trees, because that's more what * the user expects to see. 
This does mean the on-disk size is not * really correlated with the number of entries, but I don't think * that's reasonable to expect anyway. */ dmu_object_info_t doi; uint64_t nblocks; dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); nblocks = doi.doi_physical_blocks_512; dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); nblocks += doi.doi_physical_blocks_512; ddt_object_t *ddo = &ddt->ddt_log_stats; ddo->ddo_count = avl_numnodes(&ddt->ddt_log_active->ddl_tree) + avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt); ddo->ddo_dspace = nblocks << 9; } void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu) { ASSERT3U(nentries, >, 0); ASSERT3P(dlu->dlu_dbp, ==, NULL); if (ddt->ddt_log_active->ddl_object == 0) ddt_log_create(ddt, tx); /* * We want to store as many entries as we can in a block, but never * split an entry across block boundaries. */ size_t reclen = P2ALIGN_TYPED( sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) + DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t); ASSERT3U(reclen, <=, UINT16_MAX); dlu->dlu_reclen = reclen; VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG, &dlu->dlu_dn)); dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP); uint64_t nblocks = howmany(nentries, dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen); uint64_t offset = ddt->ddt_log_active->ddl_length; uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz; VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length, B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp, DMU_READ_NO_PREFETCH)); dlu->dlu_tx = tx; dlu->dlu_block = dlu->dlu_offset = 0; } static ddt_log_entry_t * ddt_log_alloc_entry(ddt_t *ddt) { ddt_log_entry_t *ddle; if (ddt->ddt_flags & DDT_FLAG_FLAT) { ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP); memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE); } else { ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP); memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE); } return (ddle); } static void ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) { /* Create the log tree entry from a live or stored entry */ avl_index_t where; ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where); if (ddle == NULL) { ddle = ddt_log_alloc_entry(ddt); ddle->ddle_key = ddlwe->ddlwe_key; avl_insert(&ddl->ddl_tree, ddle, where); } ddle->ddle_type = ddlwe->ddlwe_type; ddle->ddle_class = ddlwe->ddlwe_class; memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); } void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu) { ASSERT3U(dlu->dlu_dbp, !=, NULL); ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe); ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe); /* Get our block */ ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block]; /* * If this would take us past the end of the block, finish it and * move to the next one. */ if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) { ASSERT3U(dlu->dlu_offset, >, 0); dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE); dlu->dlu_block++; dlu->dlu_offset = 0; ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); db = dlu->dlu_dbp[dlu->dlu_block]; } /* * If this is the first time touching the block, inform the DMU that * we will fill it, and zero it out. 
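	 * Zeroing matters: unused space at the end of a block reads back as
	 * DLR_INVALID records, which is how ddt_log_load_one() detects the
	 * end of valid records in a partially-filled block.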
*/ if (dlu->dlu_offset == 0) { dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE); memset(db->db_data, 0, db->db_size); } /* Create the log record directly in the buffer */ ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset); DLR_SET_TYPE(dlr, DLR_ENTRY); DLR_SET_RECLEN(dlr, dlu->dlu_reclen); DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type); DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class); ddt_log_record_entry_t *dlre = (ddt_log_record_entry_t *)&dlr->dlr_payload; dlre->dlre_key = ddlwe->ddlwe_key; memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); /* Advance offset for next record. */ dlu->dlu_offset += dlu->dlu_reclen; } void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu) { ASSERT3U(dlu->dlu_dbp, !=, NULL); ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp); ASSERT3U(dlu->dlu_offset, >, 0); /* * Close out the last block. Whatever we haven't used will be zeroed, * which matches DLR_INVALID, so we can detect this during load. */ dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE); dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG); ddt->ddt_log_active->ddl_length += dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz; dnode_rele(dlu->dlu_dn, FTAG); ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx); memset(dlu, 0, sizeof (ddt_log_update_t)); ddt_log_update_stats(ddt); } boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) { ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); if (ddle == NULL) return (B_FALSE); DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); avl_remove(&ddl->ddl_tree, ddle); kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); return (B_TRUE); } boolean_t -ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk, - ddt_lightweight_entry_t *ddlwe) +ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) { ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL); if (ddle == NULL) return (B_FALSE); - DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); - - ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + ddt_lightweight_entry_t ddlwe; + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); avl_remove(&ddl->ddl_tree, ddle); kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); return (B_TRUE); } +boolean_t +ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = + avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL); + if (!ddle) + ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL); + if (!ddle) + return (B_FALSE); + if (ddlwe) + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + return (B_TRUE); +} + void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { ddt_log_t *ddl = ddt->ddt_log_flushing; ASSERT3U(ddl->ddl_object, !=, 0); #ifdef ZFS_DEBUG /* * There should not be any entries on the log tree before the given * checkpoint. Assert that this is the case. 
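	 * (Everything at or before the checkpoint key was already removed
	 * from the in-memory tree by ddt_log_take_first() as it was flushed,
	 * so the first remaining entry must sort strictly after it.)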
*/ ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); if (ddle != NULL) VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key), >, 0); #endif ddl->ddl_flags |= DDL_FLAG_CHECKPOINT; ddl->ddl_checkpoint = ddlwe->ddlwe_key; ddt_log_update_header(ddt, ddl, tx); ddt_log_update_stats(ddt); } void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx) { ddt_log_t *ddl = ddt->ddt_log_flushing; if (ddl->ddl_object == 0) return; ASSERT(avl_is_empty(&ddl->ddl_tree)); /* Eject the entire object */ dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx); ddl->ddl_length = 0; ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT; memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t)); ddt_log_update_header(ddt, ddl, tx); ddt_log_update_stats(ddt); } boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx) { /* Swap the logs. The old flushing one must be empty */ VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)); /* * If there are still blocks on the flushing log, truncate it first. * This can happen if there were entries on the flushing log that were * removed in memory via ddt_lookup(); their vestigal remains are * on disk. */ if (ddt->ddt_log_flushing->ddl_length > 0) ddt_log_truncate(ddt, tx); /* * Swap policy. We swap the logs (and so begin flushing) when the * active tree grows too large, or when we haven't swapped it in * some amount of time, or if something has requested the logs be * flushed ASAP (see ddt_walk_init()). */ /* * The log tree is too large if the memory usage of its entries is over * half of the memory limit. This effectively gives each log tree half * the available memory. */ const boolean_t too_large = (avl_numnodes(&ddt->ddt_log_active->ddl_tree) * DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1); const boolean_t too_old = tx->tx_txg >= (ddt->ddt_log_active->ddl_first_txg + MAX(1, zfs_dedup_log_txg_max)); const boolean_t force = ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg; if (!(too_large || too_old || force)) return (B_FALSE); ddt_log_t *swap = ddt->ddt_log_active; ddt->ddt_log_active = ddt->ddt_log_flushing; ddt->ddt_log_flushing = swap; ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING); ddt->ddt_log_active->ddl_flags &= ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT); ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING)); ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; ddt->ddt_log_active->ddl_first_txg = tx->tx_txg; ddt_log_update_header(ddt, ddt->ddt_log_active, tx); ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx); ddt_log_update_stats(ddt); return (B_TRUE); } static inline void ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr, const ddt_key_t *checkpoint) { ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY); ddt_log_record_entry_t *dlre = (ddt_log_record_entry_t *)dlr->dlr_payload; if (checkpoint != NULL && ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) { /* Skip pre-checkpoint entries; they're already flushed. */ return; } ddt_lightweight_entry_t ddlwe; ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr); ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr); ddlwe.ddlwe_key = dlre->dlre_key; memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt)); ddt_log_update_entry(ddt, ddl, &ddlwe); } static void ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) { void *cookie = NULL; ddt_log_entry_t *ddle; IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); while ((ddle = avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? 
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); } ASSERT(avl_is_empty(&ddl->ddl_tree)); } static int ddt_log_load_one(ddt_t *ddt, uint_t n) { ASSERT3U(n, <, 2); ddt_log_t *ddl = &ddt->ddt_log[n]; char name[DDT_NAMELEN]; ddt_log_name(ddt, name, n); uint64_t obj; int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, &obj); if (err == ENOENT) return (0); if (err != 0) return (err); dnode_t *dn; err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn); if (err != 0) return (err); ddt_log_header_t hdr; dmu_buf_t *db; err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) { dnode_rele(dn, FTAG); return (err); } memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t)); dmu_buf_rele(db, FTAG); if (DLH_GET_VERSION(&hdr) != 1) { dnode_rele(dn, FTAG); zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s " "unknown version=%llu", spa_name(ddt->ddt_spa), name, (u_longlong_t)DLH_GET_VERSION(&hdr)); return (SET_ERROR(EINVAL)); } ddt_key_t *checkpoint = NULL; if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) { /* * If the log has a checkpoint, then we can ignore any entries * that have already been flushed. */ ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING); checkpoint = &hdr.dlh_checkpoint; } if (hdr.dlh_length > 0) { dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length, ZIO_PRIORITY_SYNC_READ); for (uint64_t offset = 0; offset < hdr.dlh_length; offset += dn->dn_datablksz) { err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db, DMU_READ_PREFETCH); if (err != 0) { dnode_rele(dn, FTAG); ddt_log_empty(ddt, ddl); return (err); } uint64_t boffset = 0; while (boffset < db->db_size) { ddt_log_record_t *dlr = (ddt_log_record_t *)(db->db_data + boffset); /* Partially-filled block, skip the rest */ if (DLR_GET_TYPE(dlr) == DLR_INVALID) break; switch (DLR_GET_TYPE(dlr)) { case DLR_ENTRY: ddt_log_load_entry(ddt, ddl, dlr, checkpoint); break; default: dmu_buf_rele(db, FTAG); dnode_rele(dn, FTAG); ddt_log_empty(ddt, ddl); return (SET_ERROR(EINVAL)); } boffset += DLR_GET_RECLEN(dlr); } dmu_buf_rele(db, FTAG); } } dnode_rele(dn, FTAG); ddl->ddl_object = obj; ddl->ddl_flags = DLH_GET_FLAGS(&hdr); ddl->ddl_length = hdr.dlh_length; ddl->ddl_first_txg = hdr.dlh_first_txg; if (ddl->ddl_flags & DDL_FLAG_FLUSHING) ddt->ddt_log_flushing = ddl; else ddt->ddt_log_active = ddl; return (0); } int ddt_log_load(ddt_t *ddt) { int err; if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) { /* * The DDT is going to be freed again in a moment, so there's * no point loading the log; it'll just slow down import. */ return (0); } ASSERT0(ddt->ddt_log[0].ddl_object); ASSERT0(ddt->ddt_log[1].ddl_object); if (ddt->ddt_dir_object == 0) { /* * If we're configured but the containing dir doesn't exist * yet, then the log object can't possibly exist either. */ ASSERT3U(ddt->ddt_version, !=, UINT64_MAX); return (SET_ERROR(ENOENT)); } if ((err = ddt_log_load_one(ddt, 0)) != 0) return (err); if ((err = ddt_log_load_one(ddt, 1)) != 0) return (err); VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing); VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING)); VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT)); VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING); /* * We have two finalisation tasks: * * - rebuild the histogram. We do this at the end rather than while * we're loading so we don't need to uncount and recount entries that * appear multiple times in the log. * * - remove entries from the flushing tree that are on both trees. 
This * happens when ddt_lookup() rehydrates an entry from the flushing * tree, as ddt_log_take_key() removes the entry from the in-memory * tree but doesn't remove it from disk. */ /* * We don't technically need a config lock here, since there shouldn't * be pool config changes during DDT load. dva_get_dsize_sync() via * ddt_stat_generate() is expecting it though, and it won't hurt * anything, so we take it. */ spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER); avl_tree_t *al = &ddt->ddt_log_active->ddl_tree; avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree; ddt_log_entry_t *ae = avl_first(al); ddt_log_entry_t *fe = avl_first(fl); while (ae != NULL || fe != NULL) { ddt_log_entry_t *ddle; if (ae == NULL) { /* active exhausted, take flushing */ ddle = fe; fe = AVL_NEXT(fl, fe); } else if (fe == NULL) { /* flushing exuhausted, take active */ ddle = ae; ae = AVL_NEXT(al, ae); } else { /* compare active and flushing */ int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key); if (c < 0) { /* active behind, take and advance */ ddle = ae; ae = AVL_NEXT(al, ae); } else if (c > 0) { /* flushing behind, take and advance */ ddle = fe; fe = AVL_NEXT(fl, fe); } else { /* match. remove from flushing, take active */ ddle = fe; fe = AVL_NEXT(fl, fe); avl_remove(fl, ddle); ddle = ae; ae = AVL_NEXT(al, ae); } } ddt_lightweight_entry_t ddlwe; DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); } spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG); ddt_log_update_stats(ddt); return (0); } void ddt_log_alloc(ddt_t *ddt) { ASSERT3P(ddt->ddt_log_active, ==, NULL); ASSERT3P(ddt->ddt_log_flushing, ==, NULL); avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare, sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare, sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); ddt->ddt_log_active = &ddt->ddt_log[0]; ddt->ddt_log_flushing = &ddt->ddt_log[1]; ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; } void ddt_log_free(ddt_t *ddt) { ddt_log_empty(ddt, &ddt->ddt_log[0]); ddt_log_empty(ddt, &ddt->ddt_log[1]); avl_destroy(&ddt->ddt_log[0].ddl_tree); avl_destroy(&ddt->ddt_log[1].ddl_tree); } ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW, "Max transactions before starting to flush dedup logs"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD, "Max memory for dedup logs"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD, "Max memory for dedup logs, as % of total memory"); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 7ce2d919610f..55bf9b683f1a 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1,8092 +1,8142 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019, 2021, 2023, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright 2024 Oxide Computer Company */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * We want newer versions of libzfs and libzfs_core to run against * existing zfs kernel modules (i.e. a deferred reboot after an update). * Therefore the ioctl numbers cannot change from release to release. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. 
If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * zfs_ioc_key_t *nvl_keys * The list of expected/allowable innvl input keys. This list is used * to validate the nvlist input to the ioctl. * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. 
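 *
 * (Illustrative examples, not an exhaustive list: property-retrieval
 * style ioctls use the outnvl purely to return state, while bulk
 * operations such as snapshot or destroy report per-name errors in it
 * and therefore smush the output.)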
* * IOCTL Interface Errors * * The following ioctl input errors can be returned: * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include #include #include kmutex_t zfsdev_state_lock; static zfsdev_state_t zfsdev_state_listhead; /* * Limit maximum nvlist size. We don't want users passing in insane values * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. */ uint64_t zfs_max_nvlist_src_size = 0; /* * When logging the output nvlist of an ioctl in the on-disk history, limit * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS. * This applies primarily to zfs_ioc_channel_program(). */ static uint64_t zfs_history_output_max = 1024 * 1024; uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. */ #define DATA_TYPE_ANY DATA_TYPE_UNKNOWN typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; const zfs_ioc_key_t *zvec_nvl_keys; size_t zvec_nvl_key_count; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_PERM_PROJECTOBJQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Return non-zero if the spa version is less than requested version. 
*/ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl, (void) cr; if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; /* * First do a quick check for root in the global zone, which * is allowed to do all write_perms. This ensures that zfs_ioc_* * will get to handle nonexistent datasets. 
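 *
 * The effective check order, summarized from the code below and from
 * zfs_secpolicy_write_perms_ds() above (illustrative outline only):
 *
 *	root in global zone?		-> allow immediately
 *	zfs_dozonecheck_ds()		-> zone visibility / 'zoned' checks
 *	secpolicy_zfs(cr) == 0?		-> allow (privileged caller)
 *	dsl_deleg_access_impl()		-> delegated ("zfs allow") permissions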
*/ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) return (0); error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, const char *strval, cred_t *cr) { #ifdef HAVE_MLSLABEL char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs_t; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static const char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, B_TRUE, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); #else return (SET_ERROR(ENOTSUP)); #endif /* HAVE_MLSLABEL */ } static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { const char *strval; /* * Check permissions for special properties. */ switch (prop) { default: break; case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. 
contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ (void) innvl; return (zfs_dozonecheck(zc->zc_name, cr)); } static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; dsl_pool_t *dp; dsl_dataset_t *ds; const char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. */ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strlcpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). 
Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). */ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvlist_t *snaps; int error = 0; nvpair_t *pair; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = (char *)nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each bookmark in the nvlist. 
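 *
 * Each nvpair name in innvl is a full bookmark name containing a '#'
 * (e.g. "pool/fs#bookmark"); the code below temporarily truncates the
 * name at the '#' so the BOOKMARK permission can be checked against the
 * containing filesystem.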
*/ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = (char *)nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = (char *)nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; const char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl; if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; int error; if (secpolicy_sys_config(cr, B_FALSE) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. 
*/ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl; return (secpolicy_zinject(cr)); } static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA || zc->zc_objset_type == ZFS_PROP_USEROBJUSED || zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED || zc->zc_objset_type == ZFS_PROP_GROUPQUOTA || zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED || zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) { if (groupmember(zc->zc_guid, cr)) return (0); } /* else is for project quota/used */ } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair; nvlist_t *holds; int error; holds = fnvlist_lookup_nvlist(innvl, "holds"); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. 
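 *
 * Summarizing the checks below: the DIFF delegation alone short-circuits
 * to success; otherwise the caller needs SNAPSHOT permission and, when an
 * innvl of holds is supplied, the HOLD, RELEASE and destroy (MOUNT +
 * DESTROY) permissions as well.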
*/ int error; if (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (innvl != NULL) { if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); } return (error); } static int zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_LOAD_KEY, cr)); } static int zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CHANGE_KEY, cr)); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = vmem_alloc(size, KM_SLEEP); if (ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag) != 0) { vmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { vmem_free(packed, size); return (error); } vmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { error = SET_ERROR(ENOMEM); } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } int getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) { int error = 0; if (dmu_objset_type(os) != DMU_OST_ZFS) { return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); /* bump s_active only when non-zero to prevent umount race */ error = zfs_vfs_ref(zfvp); mutex_exit(&os->os_user_ptr_lock); return (error); } int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, zfvp); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_sb will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all inode ops from running. 
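 *
 * Typical call pattern (sketch based on the callers later in this file;
 * 'newversion' is a placeholder value):
 *
 *	zfsvfs_t *zfsvfs;
 *	int err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE);
 *	if (err == 0) {
 *		err = zfs_set_version(zfsvfs, newversion);
 *		zfsvfs_rele(zfsvfs, FTAG);
 *	}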
*/ static int zfsvfs_hold(const char *name, const void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, B_FALSE, zfvp); if (error == 0) { if (writer) ZFS_TEARDOWN_ENTER_WRITE(*zfvp, tag); else ZFS_TEARDOWN_ENTER_READ(*zfvp, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ ZFS_TEARDOWN_EXIT(*zfvp, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, const void *tag) { ZFS_TEARDOWN_EXIT(zfsvfs, tag); if (zfs_vfs_held(zfsvfs)) { zfs_vfs_rele(zfsvfs); } else { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; dsl_crypto_params_t *dcp = NULL; const char *spa_name = zc->zc_name; boolean_t unload_wkey = B_TRUE; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; const char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS, &hidden_args); error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, rootprops, hidden_args, &dcp); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) spa_name = tname; } error = spa_create(zc->zc_name, config, props, zplprops, dcp); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(spa_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) { (void) spa_destroy(spa_name); unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */ } pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); dsl_crypto_params_free(dcp, unload_wkey && !!error); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = 
put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; error = spa_all_configs(&zc->zc_cookie, &configs); if (error) return (error); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_flags == POOL_SCRUB_PAUSE) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); else if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } /* * inputs: * poolname name of the pool * scan_type scan func (pool_scan_func_t) * scan_command scrub pause/resume flag (pool_scrub_cmd_t) */ static const zfs_ioc_key_t zfs_keys_pool_scrub[] = { {"scan_type", DATA_TYPE_UINT64, 0}, {"scan_command", DATA_TYPE_UINT64, 0}, }; static int zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; uint64_t scan_type, scan_cmd; if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, "scan_command", &scan_cmd) != 0) return (SET_ERROR(EINVAL)); if (scan_cmd >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); if (scan_cmd == POOL_SCRUB_PAUSE) { error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); } else if (scan_type == POOL_SCAN_NONE) { error = spa_scan_stop(spa); } else { error = spa_scan(spa, scan_type); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; 
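	/*
	 * zc_cookie holds the SPA version being requested; the checks below
	 * reject versions that are unsupported or lower than the pool's
	 * current version before spa_upgrade() is called.
	 */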
int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = vmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); vmem_free(hist_buf, size); return (error); } /* * inputs: * zc_nvlist_src nvlist optionally containing ZPOOL_REGUID_GUID * zc_nvlist_src_size size of the nvlist */ static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { uint64_t *guidp = NULL; nvlist_t *props = NULL; spa_t *spa; uint64_t guid; int error; if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props); if (error != 0) return (error); error = nvlist_lookup_uint64(props, ZPOOL_REGUID_GUID, &guid); if (error == 0) guidp = &guid; else if (error == ENOENT) guidp = NULL; else goto out; } error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa, guidp); spa_close(spa, FTAG); } out: if (props != NULL) nvlist_free(props); return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { error = spa_vdev_add(spa, config, zc->zc_flags); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_guid guid of vdev to remove * zc_cookie cancel removal */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = 
spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); if (zc->zc_cookie != 0) { error = spa_vdev_remove_cancel(spa); } else { error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL && zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_REMOVED: error = vdev_remove_wanted(spa, zc->zc_guid); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config; int replacing = zc->zc_cookie; int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, rebuild); nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; const char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; const char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (!zc->zc_simple && zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we 
aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading without owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) { nvlist_free(nv); return (error); } VERIFY0(error); } if (error == 0) error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. */ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os))) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. 
*/ if (zc->zc_nvlist_dst != 0 && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && zfs_dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. */ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_src iteration range nvlist * zc_nvlist_src_size size of iteration range nvlist * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { int error; objset_t *os, *ossnap; dsl_dataset_t *ds; uint64_t min_txg = 0, max_txg = 0; if (zc->zc_nvlist_src_size != 0) { nvlist_t *props = NULL; error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props); if (error != 0) return (error); (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG, &min_txg); (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG, &max_txg); nvlist_free(props); } error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? SET_ERROR(ESRCH) : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. 
*/ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } while (error == 0) { if (issig()) { error = SET_ERROR(EINTR); break; } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == ENOENT) { error = SET_ERROR(ESRCH); break; } else if (error != 0) { break; } error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj, FTAG, &ds); if (error != 0) break; if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) || (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) { dsl_dataset_rele(ds, FTAG); /* undo snapshot name append */ *(strchr(zc->zc_name, '@') + 1) = '\0'; /* skip snapshot */ continue; } if (zc->zc_simple) { dsl_dataset_fast_stat(ds, &zc->zc_objset_stats); dsl_dataset_rele(ds, FTAG); break; } if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } dsl_dataset_rele(ds, FTAG); break; } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *dash, *domain; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. * * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). 
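 *
 * Return-value convention used by zfs_prop_set_special() and interpreted
 * by zfs_set_prop_nvlist() (summary of the code below):
 *
 *	 0	property fully handled here
 *	>0	error, propagated to the caller / errlist
 *	-1	not special, or handled but the generic dsl_prop path must
 *		still record the value in the nvlist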
*/ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; int err = -1; if (prop == ZPROP_USERPROP) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* all special properties are numeric except for keylocation */ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { strval = fnvpair_value_string(pair); } else { intval = fnvpair_value_uint64(pair); } switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_KEYLOCATION: err = dsl_crypto_can_set_keylocation(dsname, strval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_COMPRESSION: err = dsl_dataset_set_compression(dsname, source, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_VOLTHREADING: err = zvol_set_volthreading(dsname, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_SNAPDEV: case ZFS_PROP_VOLMODE: err = zvol_set_common(dsname, prop, source, intval); break; case ZFS_PROP_READONLY: err = zvol_set_ro(dsname, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dsname, sizeof (zc->zc_name)); (void) zfs_ioc_userspace_upgrade(zc); (void) zfs_ioc_id_quota_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } static boolean_t zfs_is_namespace_prop(zfs_prop_t prop) { switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_SETUID: case ZFS_PROP_READONLY: case ZFS_PROP_XATTR: case ZFS_PROP_NBMAND: return (B_TRUE); default: return (B_FALSE); } } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. 
If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; int err; uint64_t intval; const char *strval; boolean_t should_update_mount_cache = B_FALSE; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && source == ZPROP_SRC_INHERITED) { /* inherited properties are expected to be booleans */ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) err = SET_ERROR(EINVAL); } else if (err == 0 && prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(ZFS_ERR_BADPROP); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { if (source == ZPROP_SRC_INHERITED) err = -1; /* does not need special handling */ else err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } if (zfs_is_namespace_prop(prop)) should_update_mount_cache = B_TRUE; } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (nvlist_empty(genericnvl)) goto out; /* * Try to set them all in one batch. */ err = dsl_props_set(dsname, source, genericnvl); if (err == 0) goto out; /* * If batching fails, we still want to set as many properties as we * can, so try setting them individually. 
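 *
 * (Overall flow of zfs_set_prop_nvlist(), summarized: validate each pair,
 * apply "special" properties via zfs_prop_set_special(), queue the rest
 * in genericnvl, retry order-dependent failures once via retrynvl, try a
 * single batched dsl_props_set(), then fall back to per-property
 * dsl_prop_set_string()/dsl_prop_set_int()/dsl_prop_inherit() calls here.)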
*/ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { err = dsl_prop_inherit(dsname, propname, source); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } out: if (should_update_mount_cache) zfs_ioctl_update_mount_cache(dsname); nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(nvlist_t *nvl) { nvpair_t *pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (SET_ERROR(E2BIG)); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? 
ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; if (!received) { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ dummy = fnvlist_alloc(); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: err = SET_ERROR(EINVAL); goto errout; } pair = nvlist_next_nvpair(dummy, NULL); if (pair == NULL) { err = SET_ERROR(EINVAL); } else { err = zfs_prop_set_special(zc->zc_name, source, pair); if (err == -1) /* property is not "special", needs handling */ err = dsl_prop_inherit(zc->zc_name, zc->zc_value, source); } errout: nvlist_free(dummy); return (err); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. 
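 * ("configfile" here refers to the cachefile property,
 * ZPOOL_PROP_CACHEFILE, which is what the code below actually checks;
 * this lets the cachefile be updated even on a faulted pool that cannot
 * be opened with spa_open().)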
*/ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } /* * innvl: { * "get_props_names": [ "prop1", "prop2", ..., "propN" ] * } */ static const zfs_ioc_key_t zfs_keys_get_props[] = { { ZPOOL_GET_PROPS_NAMES, DATA_TYPE_STRING_ARRAY, ZK_OPTIONAL }, }; static int zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *nvp = outnvl; spa_t *spa; char **props = NULL; unsigned int n_props = 0; int error; if (nvlist_lookup_string_array(innvl, ZPOOL_GET_PROPS_NAMES, &props, &n_props) != 0) { props = NULL; } if ((error = spa_open(pool, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) != NULL) { error = spa_prop_get(spa, &nvp); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, &nvp); } mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, &nvp); spa_close(spa, FTAG); } return (error); } /* * innvl: { * "vdevprops_set_vdev" -> guid * "vdevprops_set_props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_vdev_set_props[] = { {ZPOOL_VDEV_PROPS_SET_VDEV, DATA_TYPE_UINT64, 0}, {ZPOOL_VDEV_PROPS_SET_PROPS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_vdev_set_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; vdev_t *vd; uint64_t vdev_guid; /* Early validation */ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (outnvl == NULL) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); ASSERT(spa_writeable(spa)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_set(vd, innvl, outnvl); spa_close(spa, FTAG); return (error); } /* * innvl: { * "vdevprops_get_vdev" -> guid * (optional) "vdevprops_get_props" -> { propname -> propid } * } * * outnvl: propname -> value */ static const zfs_ioc_key_t zfs_keys_vdev_get_props[] = { {ZPOOL_VDEV_PROPS_GET_VDEV, DATA_TYPE_UINT64, 0}, {ZPOOL_VDEV_PROPS_GET_PROPS, DATA_TYPE_NVLIST, ZK_OPTIONAL} }; static int zfs_ioc_vdev_get_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; vdev_t *vd; uint64_t vdev_guid; /* Early validation */ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (outnvl == NULL) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_get(vd, innvl, outnvl); spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name 
name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if (zfs_deleg_verify_nvlist(fsaclnv) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; int error; ASSERT(zplprops != NULL); /* parent dataset must be a filesystem */ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Pull out creator prop choices, if any. 
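 *
 * For example, 'zfs create -o normalization=formD -o utf8only=on ...'
 * arrives here with those pairs in createprops; they are consumed
 * below and then removed so no dsl property is created for them.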
*/ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. */ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[ZFS_MAX_DATASET_NAME_LEN]; spa_t *spa; uint64_t spa_vers; int error; zfs_get_parent(dataset, parentname, sizeof (parentname)); if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. 
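 *
 * For example, when creating "tank/a/b", the parent "tank/a" is held
 * here so that any zplprops the creator did not specify (normalization,
 * utf8only, casesensitivity) are inherited from it.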
*/ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_create[] = { {"type", DATA_TYPE_INT32, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; nvlist_t *hidden_args = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; dsl_crypto_params_t *dcp = NULL; type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize(fsname, volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops, hidden_args, &dcp); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct); nvlist_free(zct.zct_zplprops); dsl_crypto_params_free(dcp, !!error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) { spa_t *spa; int error2; /* * Volumes will return EBUSY and cannot be destroyed * until all asynchronous minor handling (e.g. from * setting the volmode property) has completed. 
Wait for * the spa_zvol_taskq to drain then retry. */ error2 = dsl_destroy_head(fsname); while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) { error2 = spa_open(fsname, &spa, FTAG); if (error2 == 0) { taskq_wait(spa->spa_zvol_taskq); spa_close(spa, FTAG); } error2 = dsl_destroy_head(fsname); } } } return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outputs: * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_clone[] = { {"origin", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; const char *origin_name; origin_name = fnvlist_lookup_string(innvl, "origin"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } return (error); } static const zfs_ioc_key_t zfs_keys_remap[] = { /* no nvl keys */ }; static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { /* This IOCTL is no longer supported. */ (void) fsname, (void) innvl, (void) outnvl; return (0); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_snapshot[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if ((error = zfs_check_userprops(props)) != 0) return (error); snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* * Check for permission to set the properties on the fs. */ if (!nvlist_empty(props)) { *cp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_USERPROP, CRED()); *cp = '@'; if (error != 0) return (error); } /* This must be the only snap of this fs. 
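 *
 * For example, snaps = { "tank/fs@a", "tank/fs@b" } is rejected with
 * EXDEV, while { "tank/fs@a", "tank/other@a" } is accepted.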
*/ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ static const zfs_ioc_key_t zfs_keys_log_history[] = { {"message", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { (void) unused, (void) outnvl; const char *message; char *poolname; spa_t *spa; int error; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); if (poolname == NULL) return (SET_ERROR(EINVAL)); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); kmem_strfree(poolname); if (error != 0) return (error); message = fnvlist_lookup_string(innvl, "message"); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * This ioctl is used to set the bootenv configuration on the current * pool. This configuration is stored in the second padding area of the label, * and it is used by the bootloader(s) to store the bootloader and/or system * specific data. * The data is stored as nvlist data stream, and is protected by * an embedded checksum. * The version can have two possible values: * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING. * VB_NVLIST: nvlist with arbitrary pairs. */ static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { {"version", DATA_TYPE_UINT64, 0}, {"", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST}, }; static int zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int error; spa_t *spa; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { /* no nvl keys */ }; static int zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ void zfs_unmount_snap(const char *snapname) { if (strchr(snapname, '@') == NULL) return; (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE); } static int zfs_unmount_snap_cb(const char *snapname, void *arg) { (void) arg; zfs_unmount_snap(snapname); return (0); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. 
This routine will do that unmount * if necessary. */ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; spa_t *spa; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); zfs_unmount_snap(nvpair_name(pair)); if (spa_open(name, &spa, FTAG) == 0) { zvol_remove_minors(spa, name, B_TRUE); spa_close(spa, FTAG); } } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. The bookmark names are of the form #. * All bookmarks and snapshots must be in the same pool. * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail. * * innvl: { * new_bookmark1 -> existing_snapshot, * new_bookmark2 -> existing_bookmark, * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_bookmark[] = { {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, }; static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) poolname; return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, }; static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl is not used. * * outnvl: { * property 1, property 2, ... 
* } * */ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { /* no nvl keys */ }; static int zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmname; bmname = strchr(bookmark, '#'); if (bmname == NULL) return (SET_ERROR(EINVAL)); bmname++; (void) strlcpy(fsname, bookmark, sizeof (fsname)); *(strchr(fsname, '#')) = '\0'; return (dsl_get_bookmark_props(fsname, bmname, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, }; static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } static const zfs_ioc_key_t zfs_keys_channel_program[] = { {"program", DATA_TYPE_STRING, 0}, {"arg", DATA_TYPE_ANY, 0}, {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { const char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { instrlimit = ZCP_DEFAULT_INSTRLIMIT; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) return (SET_ERROR(EINVAL)); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) return (SET_ERROR(EINVAL)); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { /* no nvl keys */ }; static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; return (spa_checkpoint(poolname)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { /* no nvl keys */ }; static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; return (spa_checkpoint_discard(poolname)); } /* * Loads specific types of data for the given pool * * innvl: { * "prefetch_type" -> int32_t * } * * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_prefetch[] = { {ZPOOL_PREFETCH_TYPE, DATA_TYPE_INT32, 0}, }; static int zfs_ioc_pool_prefetch(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int error; spa_t *spa; int32_t type; /* * 
Currently, only ZPOOL_PREFETCH_DDT is supported */ if (nvlist_lookup_int32(innvl, ZPOOL_PREFETCH_TYPE, &type) != 0 || type != ZPOOL_PREFETCH_DDT) { return (EINVAL); } error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); hrtime_t start_time = gethrtime(); ddt_prefetch_all(spa); zfs_dbgmsg("pool '%s': loaded ddt into ARC in %llu ms", spa->spa_name, (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of dataset to destroy * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; int err; err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); if (ost == DMU_OST_ZFS) zfs_unmount_snap(zc->zc_name); if (strchr(zc->zc_name, '@')) { err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); } else { err = dsl_destroy_head(zc->zc_name); if (err == EEXIST) { /* * It is possible that the given DS may have * hidden child (%recv) datasets - "leftovers" * resulting from the previously interrupted * 'zfs receive'. * * 6 extra bytes for /%recv */ char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6]; if (snprintf(namebuf, sizeof (namebuf), "%s/%s", zc->zc_name, recv_clone_name) >= sizeof (namebuf)) return (SET_ERROR(EINVAL)); /* * Try to remove the hidden child (%recv) and after * that try to remove the target dataset. * If the hidden child (%recv) does not exist * the original error (EEXIST) will be returned */ err = dsl_destroy_head(namebuf); if (err == 0) err = dsl_destroy_head(zc->zc_name); else if (err == ENOENT) err = SET_ERROR(EEXIST); } } return (err); } /* * innvl: { * "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64) * "initialize_vdevs": { -> guids to initialize (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * } * * outnvl: { * "initialize_vdevs": { -> initialization errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. 
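 *
 * A hypothetical userland sketch (illustrative only, not part of this
 * change) of building the innvl above with libnvpair; "vdev_guid" is
 * assumed to have been looked up from the pool config beforehand:
 *
 *	nvlist_t *innvl = fnvlist_alloc();
 *	nvlist_t *vdevs = fnvlist_alloc();
 *	fnvlist_add_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
 *	    POOL_INITIALIZE_START);
 *	fnvlist_add_uint64(vdevs, "/dev/sda1", vdev_guid);
 *	fnvlist_add_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, vdevs);
 *	fnvlist_free(vdevs);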
*/ static const zfs_ioc_key_t zfs_keys_pool_initialize[] = { {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, &cmd_type) != 0) { return (SET_ERROR(EINVAL)); } if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_START || cmd_type == POOL_INITIALIZE_SUSPEND || cmd_type == POOL_INITIALIZE_UNINIT)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, &vdev_guids) != 0) { return (SET_ERROR(EINVAL)); } for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) { fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, vdev_errlist); } fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } /* * innvl: { * "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64) * "trim_vdevs": { -> guids to TRIM (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * "trim_rate" -> Target TRIM rate in bytes/sec. * "trim_secure" -> Set to request a secure TRIM. * } * * outnvl: { * "trim_vdevs": { -> TRIM errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. 
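 *
 * A hypothetical sketch (illustrative only) of an innvl requesting a
 * secure TRIM capped at 100 MB/s; "vdev_guids" is assumed to be an
 * nvlist of vdev-path -> guid pairs built as for initialize above:
 *
 *	nvlist_t *innvl = fnvlist_alloc();
 *	fnvlist_add_uint64(innvl, ZPOOL_TRIM_COMMAND, POOL_TRIM_START);
 *	fnvlist_add_nvlist(innvl, ZPOOL_TRIM_VDEVS, vdev_guids);
 *	fnvlist_add_uint64(innvl, ZPOOL_TRIM_RATE, 100 * 1024 * 1024);
 *	fnvlist_add_boolean_value(innvl, ZPOOL_TRIM_SECURE, B_TRUE);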
*/ static const zfs_ioc_key_t zfs_keys_pool_trim[] = { {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0}, {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL}, {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0) return (SET_ERROR(EINVAL)); if (!(cmd_type == POOL_TRIM_CANCEL || cmd_type == POOL_TRIM_START || cmd_type == POOL_TRIM_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0) return (SET_ERROR(EINVAL)); for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } /* Optional, defaults to maximum rate when not provided */ uint64_t rate; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0) rate = 0; /* Optional, defaults to standard TRIM when not provided */ boolean_t secure; if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE, &secure) != 0) { secure = B_FALSE; } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type, rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist); fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } +#define DDT_PRUNE_UNIT "ddt_prune_unit" +#define DDT_PRUNE_AMOUNT "ddt_prune_amount" + +/* + * innvl: { + * "ddt_prune_unit" -> uint32_t + * "ddt_prune_amount" -> uint64_t + * } + * + * outnvl: "waited" -> boolean_t + */ +static const zfs_ioc_key_t zfs_keys_ddt_prune[] = { + {DDT_PRUNE_UNIT, DATA_TYPE_INT32, 0}, + {DDT_PRUNE_AMOUNT, DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_ddt_prune(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int32_t unit; + uint64_t amount; + + if (nvlist_lookup_int32(innvl, DDT_PRUNE_UNIT, &unit) != 0 || + nvlist_lookup_uint64(innvl, DDT_PRUNE_AMOUNT, &amount) != 0) { + return (EINVAL); + } + + spa_t *spa; + int error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + error = ddt_prune_unique_entries(spa, (zpool_ddt_prune_unit_t)unit, + amount); + + spa_close(spa, FTAG); + + return (error); +} + /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a tag is provided, it identifies a particular instance of an activity to * wait for. Currently, this is only valid for use with 'initialize', because * that is the only activity for which there can be multiple instances running * concurrently. In the case of 'initialize', the tag corresponds to the guid of * the vdev on which to wait. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. 
* * innvl: { * "wait_activity" -> int32_t * (optional) "wait_tag" -> uint64_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_pool_wait[] = { {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; uint64_t tag; boolean_t waited; int error; if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0) return (EINVAL); if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0) error = spa_wait_tag(name, activity, tag, &waited); else error = spa_wait(name, activity, &waited); if (error == 0) fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited); return (error); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_fs_wait[] = { {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, }; static int zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; boolean_t waited = B_FALSE; int error; dsl_pool_t *dp; dsl_dir_t *dd; dsl_dataset_t *ds; if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) return (SET_ERROR(EINVAL)); if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) return (SET_ERROR(EINVAL)); if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) return (error); if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { dsl_pool_rele(dp, FTAG); return (error); } dd = ds->ds_dir; mutex_enter(&dd->dd_activity_lock); dd->dd_activity_waiters++; /* * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t * aren't evicted while we're waiting. Normally this is prevented by * holding the pool, but we can't do that while we're waiting since * that would prevent TXGs from syncing out. Some of the functionality * of long-holds (e.g. preventing deletion) is unnecessary for this * case, since we would cancel the waiters before proceeding with a * deletion. An alternative mechanism for keeping the dataset around * could be developed but this is simpler. 
*/ dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, FTAG); error = dsl_dir_wait(dd, ds, activity, &waited); dsl_dataset_long_rele(ds, FTAG); dd->dd_activity_waiters--; if (dd->dd_activity_waiters == 0) cv_signal(&dd->dd_activity_cv); mutex_exit(&dd->dd_activity_lock); dsl_dataset_rele(ds, FTAG); if (error == 0) fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); return (error); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot * * outnvl: "target" -> name of most recent snapshot * } */ static const zfs_ioc_key_t zfs_keys_rollback[] = { {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, }; static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; zvol_state_handle_t *zv; const char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); if (target != NULL) { const char *cp = strchr(target, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); } if (getzfsvfs(fsname, &zfsvfs) == 0) { dsl_dataset_t *ds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, target, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? error : resume_err; } zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(fsname)) != NULL) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); } else { error = dsl_dataset_rollback(fsname, target, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char *fullname; fullname = kmem_asprintf("%s@%s", fsname, snapname); zfs_unmount_snap(fullname); kmem_strfree(fullname); return (0); } /* * * snapname is the snapshot to redact. 
* innvl: { * "bookname" -> (string) * shortname of the redaction bookmark to generate * "snapnv" -> (nvlist, values ignored) * snapshots to redact snapname with respect to * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_redact[] = { {"bookname", DATA_TYPE_STRING, 0}, {"snapnv", DATA_TYPE_NVLIST, 0}, }; static int zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; nvlist_t *redactnvl = NULL; const char *redactbook = NULL; if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) return (SET_ERROR(EINVAL)); if (fnvlist_num_pairs(redactnvl) == 0) return (SET_ERROR(ENXIO)); if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) return (SET_ERROR(EINVAL)); return (dmu_redact_snap(snapname, redactnvl, redactbook)); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; boolean_t recursive = zc->zc_cookie & 1; boolean_t nounmount = !!(zc->zc_cookie & 2); char *at; int err; /* "zfs rename" from and to ...%recv datasets should both fail */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (ost == DMU_OST_ZFS && !nounmount) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? 
DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval, compval; int err; if (prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; const char *uiq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA]; const char *giq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA]; const char *pq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA]; const char *piq_prefix = zfs_userquota_prop_prefixes[\ ZFS_PROP_PROJECTOBJQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, uiq_prefix, strlen(uiq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USEROBJQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else if (strncmp(propname, giq_prefix, strlen(giq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPOBJQUOTA; } else if (strncmp(propname, pq_prefix, strlen(pq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTQUOTA; } else if (strncmp(propname, piq_prefix, strlen(piq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA; } else { /* {USER|GROUP|PROJECT}USED are read-only */ return (SET_ERROR(EINVAL)); } if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. 
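 *
 * For example, compression=zstd-19 encodes both the algorithm and the
 * level in intval; ZIO_COMPRESS_ALGO() strips the level so only the
 * algorithm is compared in the feature and version checks below.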
*/ if (nvpair_value_uint64(pair, &intval) == 0) { compval = ZIO_COMPRESS_ALGO(intval); if (compval >= ZIO_COMPRESS_GZIP_1 && compval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (compval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (compval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } if (compval == ZIO_COMPRESS_ZSTD) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_DNODESIZE: /* Dnode sizes above 512 need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval != ZFS_DNSIZE_LEGACY) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SPECIAL_SMALL_BLOCKS: /* * This property could require the allocation classes * feature to be active for setting, however we allow * it so that tests of settable properties succeed. * The CLI will issue a warning in this case. */ break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; int err; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { /* check prop value is enabled in features */ feature = zio_checksum_to_feature( intval & ZIO_CHECKSUM_MASK); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; } default: break; } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. 
* * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name)); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strlcpy(zc->zc_value, nvpair_name(pair), sizeof (zc->zc_value)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { const char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, &valstr1) == 0); VERIFY(nvpair_value_string(p2, &valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. 
We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. * * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, /* * Setting ZFS_PROP_SHARESMB requires the objset type to be * known, which is not possible prior to receipt of raw sends. */ ZFS_PROP_SHARESMB, 0 }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); VERIFY(nvlist_remove_nvpair(props, nvp) == 0); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; if (poolname != NULL) kmem_strfree(poolname); } #ifdef ZFS_DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * nvlist 'errors' is always allocated. It will contain descriptions of * encountered errors, if any. It's the callers responsibility to free. */ static int zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, boolean_t heal, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { dmu_recv_cookie_t drc; int error = 0; int props_error = 0; offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; nvlist_t *inherited_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; boolean_t tofs_was_redacted; zfs_file_t *input_fp; *read_bytes = 0; *errflags = 0; *errors = fnvlist_alloc(); off = 0; if ((input_fp = zfs_file_get(input_fd)) == NULL) return (SET_ERROR(EBADF)); noff = off = zfs_file_off(input_fp); error = dmu_recv_begin(tofs, tosnap, begin_record, force, heal, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) goto out; tofs_was_redacted = dsl_get_redacted(drc.drc_ds); /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (recvprops != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, * so stash away the existing ones. */ if (dsl_prop_get_received(tofs, &origrecvd) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). 
* * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. */ if (!first_recvd_props) props_reduce(recvprops, origrecvd); if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0) (void) nvlist_merge(*errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origrecvd, first_recvd_props ? NULL : recvprops) != 0) *errflags |= ZPROP_ERR_NOCLEAR; } else { *errflags |= ZPROP_ERR_NOCLEAR; } } /* * Stash away existing properties so we can restore them on error unless * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which * case "origrecvd" will take care of that. */ if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) { objset_t *os; if (dmu_objset_hold(tofs, FTAG, &os) == 0) { if (dsl_prop_get_all(os, &origprops) != 0) { *errflags |= ZPROP_ERR_NOCLEAR; } dmu_objset_rele(os, FTAG); } else { *errflags |= ZPROP_ERR_NOCLEAR; } } if (recvprops != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { recv_delayprops = extract_delay_props(recvprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recvprops, *errors); } } if (localprops != NULL) { nvlist_t *oprops = fnvlist_alloc(); nvlist_t *xprops = fnvlist_alloc(); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) { /* -x property */ const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); if (prop != ZPROP_USERPROP) { if (!zfs_prop_inheritable(prop)) continue; } else if (!zfs_prop_user(name)) continue; fnvlist_add_boolean(xprops, name); } else { /* -o property=value */ fnvlist_add_nvpair(oprops, nvp); } } local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); inherited_delayprops = extract_delay_props(xprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); nvlist_free(oprops); nvlist_free(xprops); } error = dmu_recv_stream(&drc, &off); if (error == 0) { zfsvfs_t *zfsvfs = NULL; zvol_state_handle_t *zv = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int end_err; boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( begin_record->drr_u.drr_begin. drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); /* * If the dataset was not redacted, but we received a * redacted stream onto it, we need to unmount the * dataset. Otherwise, resume the filesystem. */ if (error == 0 && !drc.drc_newfs && stream_is_redacted && !tofs_was_redacted) { error = zfs_end_fs(zfsvfs, ds); } else if (error == 0) { error = zfs_resume_fs(zfsvfs, ds); } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(tofs)) != NULL) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. 
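 *
 * For example, a received refquota is applied here, after every
 * snapshot in the stream has been written, so an older snapshot that
 * exceeds the new refquota cannot fail the receive with EDQUOT.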
*/ if (recv_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recv_delayprops, *errors); } if (local_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } if (inherited_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inherited_delayprops, *errors); } } /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. */ if (recv_delayprops != NULL) { ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0); nvlist_free(recv_delayprops); } if (local_delayprops != NULL) { ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } if (inherited_delayprops != NULL) { ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0); nvlist_free(inherited_delayprops); } *read_bytes = off - noff; #ifdef ZFS_DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif /* * On error, restore the original props. */ if (error != 0 && recvprops != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, recvprops, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ *errflags |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origrecvd == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explicitly if we're restoring local properties cleared in the * first new-style receive. */ if (origrecvd != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origrecvd, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ *errflags |= ZPROP_ERR_NORESTORE; } } if (error != 0 && localprops != NULL && !drc.drc_newfs && !first_recvd_props) { nvlist_t *setprops; nvlist_t *inheritprops; nvpair_t *nvp; if (origprops == NULL) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; goto out; } /* Restore original props */ setprops = fnvlist_alloc(); inheritprops = fnvlist_alloc(); nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); const char *source; nvlist_t *attrs; if (!nvlist_exists(origprops, name)) { /* * Property was not present or was explicitly * inherited before the receive, restore this. 
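 *
 * For example, if compression was at its default before a
 * 'zfs recv -o compression=off', there is no original value to
 * restore; adding it to inheritprops simply reverts the override.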
*/ fnvlist_add_boolean(inheritprops, name); continue; } attrs = fnvlist_lookup_nvlist(origprops, name); source = fnvlist_lookup_string(attrs, ZPROP_SOURCE); /* Skip received properties */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) continue; if (strcmp(source, tofs) == 0) { /* Property was locally set */ fnvlist_add_nvlist(setprops, name, attrs); } else { /* Property was implicitly inherited */ fnvlist_add_boolean(inheritprops, name); } } if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; nvlist_free(setprops); nvlist_free(inheritprops); } out: zfs_file_put(input_fp); nvlist_free(origrecvd); nvlist_free(origprops); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of containing filesystem (unused) * zc_nvlist_src{_size} nvlist of properties to apply * zc_nvlist_conf{_size} nvlist of properties to exclude * (DATA_TYPE_BOOLEAN) and override (everything else) * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * * outputs: * zc_cookie number of bytes read * zc_obj zprop_errflags_t * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_recv(zfs_cmd_t *zc) { dmu_replay_record_t begin_record; nvlist_t *errors = NULL; nvlist_t *recvdprops = NULL; nvlist_t *localprops = NULL; const char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; int error = 0; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%') != NULL) { return (SET_ERROR(EINVAL)); } (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &recvdprops)) != 0) { goto out; } if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &localprops)) != 0) { goto out; } if (zc->zc_string[0]) origin = zc->zc_string; begin_record.drr_type = DRR_BEGIN; begin_record.drr_payloadlen = 0; begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && errors != NULL && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. 
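 * The properties themselves have already been applied by
 * zfs_ioc_recv_impl() at this point; only the per-property error
 * report is lost.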
*/ error = SET_ERROR(EINVAL); } out: nvlist_free(errors); nvlist_free(recvdprops); nvlist_free(localprops); return (error); } /* * innvl: { * "snapname" -> full name of the snapshot to create * (optional) "props" -> received properties to set (nvlist) * (optional) "localprops" -> override and exclude properties (nvlist) * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) * (optional) "heal" -> use send stream to heal data corruption * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused * (optional) "hidden_args" -> { "wkeydata" -> value } * } * * outnvl: { * "read_bytes" -> number of bytes read * "error_flags" -> zprop_errflags_t * "errors" -> error for each unapplied received property (nvlist) * } */ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"snapname", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"localprops", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"origin", DATA_TYPE_STRING, ZK_OPTIONAL}, {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"heal", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { dmu_replay_record_t *begin_record; uint_t begin_record_size; nvlist_t *errors = NULL; nvlist_t *recvprops = NULL; nvlist_t *localprops = NULL; nvlist_t *hidden_args = NULL; const char *snapname; const char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; boolean_t heal; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; int input_fd = -1; int error; snapname = fnvlist_lookup_string(innvl, "snapname"); if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || strchr(snapname, '%') != NULL) { return (SET_ERROR(EINVAL)); } (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; error = nvlist_lookup_string(innvl, "origin", &origin); if (error && error != ENOENT) return (error); error = nvlist_lookup_byte_array(innvl, "begin_record", (uchar_t **)&begin_record, &begin_record_size); if (error != 0 || begin_record_size != sizeof (*begin_record)) return (SET_ERROR(EINVAL)); input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); heal = nvlist_exists(innvl, "heal"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) goto out; error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) goto out; error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) goto out; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, hidden_args, force, heal, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); out: nvlist_free(errors); nvlist_free(recvprops); 
nvlist_free(localprops); nvlist_free(hidden_args); return (error); } /* * When stack space is limited, we write replication stream data to the target * on a separate taskq thread, to make sure there's enough stack space. */ #ifndef HAVE_LARGE_STACKS #define USE_SEND_TASKQ 1 #endif typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; int dbi_len; int dbi_err; } dump_bytes_io_t; static void dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; zfs_file_t *fp; caddr_t buf; fp = dbi->dbi_fp; buf = dbi->dbi_buf; dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } typedef struct dump_bytes_arg { zfs_file_t *dba_fp; #ifdef USE_SEND_TASKQ taskq_t *dba_tq; taskq_ent_t dba_tqent; #endif } dump_bytes_arg_t; static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { dump_bytes_arg_t *dba = (dump_bytes_arg_t *)arg; dump_bytes_io_t dbi; dbi.dbi_fp = dba->dba_fp; dbi.dbi_buf = buf; dbi.dbi_len = len; #ifdef USE_SEND_TASKQ taskq_dispatch_ent(dba->dba_tq, dump_bytes_cb, &dbi, TQ_SLEEP, &dba->dba_tqent); taskq_wait(dba->dba_tq); #else dump_bytes_cb(&dbi); #endif return (dbi.dbi_err); } static int dump_bytes_init(dump_bytes_arg_t *dba, int fd, dmu_send_outparams_t *out) { zfs_file_t *fp = zfs_file_get(fd); if (fp == NULL) return (SET_ERROR(EBADF)); dba->dba_fp = fp; #ifdef USE_SEND_TASKQ dba->dba_tq = taskq_create("z_send", 1, defclsyspri, 0, 0, 0); taskq_init_ent(&dba->dba_tqent); #endif memset(out, 0, sizeof (dmu_send_outparams_t)); out->dso_outfunc = dump_bytes; out->dso_arg = dba; out->dso_dryrun = B_FALSE; return (0); } static void dump_bytes_fini(dump_bytes_arg_t *dba) { zfs_file_put(dba->dba_fp); #ifdef USE_SEND_TASKQ taskq_destroy(dba->dba_tq); #endif } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set * * NOTE: This is no longer the preferred interface, any new functionality * should be added to zfs_ioc_send_new() instead. 
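 *
 * For reference, zfs_ioc_send() below decodes zc_flags as:
 *   0x01 embedok, 0x02 large_block_ok, 0x04 compressok,
 *   0x08 rawok, 0x10 savedok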
*/ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); boolean_t rawok = (zc->zc_flags & 0x8); boolean_t savedok = (zc->zc_flags & 0x10); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate_fast(tosnap, fromsnap, NULL, compressok || rawok, savedok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { dump_bytes_arg_t dba; dmu_send_outparams_t out; error = dump_bytes_init(&dba, zc->zc_cookie, &out); if (error) return (error); off = zfs_file_off(dba.dba_fp); error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); dump_bytes_fini(&dba); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far * zc_objset_type logical size of data traversed by send thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendstatus_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dss_outfd == zc->zc_cookie && zfs_proc_is_caller(dsp->dss_proc)) break; } if (dsp != NULL) { zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, 0, 0); /* This is the closest thing we have to atomic_read_64. 
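 * A compare-and-swap of 0 with 0 returns the current value without
 * modifying it (the store only happens when the value is already 0,
 * and storing 0 over 0 changes nothing), so this is effectively an
 * atomic 64-bit read.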
*/ zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); } else { error = SET_ERROR(ENOENT); } mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &zc->zc_nvlist_dst_size); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); /* * If multihost is enabled, resuming I/O is unsafe as another * host may have imported the pool. Check for remote activity. */ if (spa_multihost(spa) && spa_suspended(spa) && spa_mmp_remote_host_activity(spa)) { spa_close(spa, FTAG); return (SET_ERROR(EREMOTEIO)); } spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { error = SET_ERROR(ENODEV); (void) spa_vdev_state_exit(spa, NULL, error); spa_close(spa, FTAG); return (error); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, spa_suspended(spa) ? NULL : spa->spa_root_vdev, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } /* * Reopen all the vdevs associated with the pool. * * innvl: { * "scrub_restart" -> when true and scrub is running, allow to restart * scrub as the side effect of the reopen (boolean). 
* } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; spa_t *spa; int error; boolean_t rc, scrub_restart = B_TRUE; if (innvl) { error = nvlist_lookup_boolean_value(innvl, "scrub_restart", &rc); if (error == 0) scrub_restart = rc; } error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If the scrub_restart flag is B_FALSE and a scrub is already * in progress then set spa_scrub_reopen flag to B_TRUE so that * we don't restart the scrub as a side effect of the reopen. * Otherwise, let vdev_open() decided if a resilver is required. */ spa->spa_scrub_reopen = (!scrub_restart && dsl_scan_scrubbing(spa->spa_dsl_pool)); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds, *ods; char origin[ZFS_MAX_DATASET_NAME_LEN]; char *cp; int error; zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || strchr(zc->zc_name, '%')) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); if (error != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ods, origin); dsl_dataset_rele(ods, FTAG); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(origin, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(origin, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group|project}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. 
"S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = vmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size); } vmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ dsl_dataset_t *ds, *newds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, B_TRUE, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } if (error == 0) { mutex_enter(&zfsvfs->z_os->os_upgrade_lock); if (zfsvfs->z_os->os_upgrade_id == 0) { /* clear potential error code and retry */ zfsvfs->z_os->os_upgrade_status = 0; mutex_exit(&zfsvfs->z_os->os_upgrade_lock); dsl_pool_config_enter( dmu_objset_pool(zfsvfs->z_os), FTAG); dmu_objset_userspace_upgrade(zfsvfs->z_os); dsl_pool_config_exit( dmu_objset_pool(zfsvfs->z_os), FTAG); } else { mutex_exit(&zfsvfs->z_os->os_upgrade_lock); } taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq, zfsvfs->z_os->os_upgrade_id); error = zfsvfs->z_os->os_upgrade_status; } zfs_vfs_rele(zfsvfs); } else { objset_t *os; /* XXX kind of reading contents without owning */ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_userspace_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); if (dmu_objset_userobjspace_upgradable(os) || dmu_objset_projectquota_upgradable(os)) { mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 
0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_id_quota_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; } else { dsl_pool_rele(dmu_objset_pool(os), FTAG); } dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } static int zfs_ioc_share(zfs_cmd_t *zc) { return (SET_ERROR(ENOSYS)); } /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; minor_t minor; zfs_file_t *fp = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (fp == NULL) return (SET_ERROR(EBADF)); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); int error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strlcpy(zc->zc_value, snap_name, sizeof (zc->zc_value)); kmem_strfree(snap_name); kmem_strfree(hold_name); zfs_onexit_fd_rele(fp); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { zfs_file_t *fp; offset_t off; int error; if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) return (SET_ERROR(EBADF)); off = zfs_file_off(fp); error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); zfs_file_put(fp); return (error); } static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { return (SET_ERROR(ENOTSUP)); } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_hold[] = { {"holds", DATA_TYPE_NVLIST, 0}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, }; static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { (void) pool; nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; zfs_file_t *fp = NULL; holds = fnvlist_lookup_nvlist(args, "holds"); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { const char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { fp = zfs_onexit_fd_hold(cleanup_fd, &minor); if (fp == NULL) return (SET_ERROR(EBADF)); } error = dsl_dataset_user_hold(holds, minor, errlist); if (fp != NULL) { ASSERT3U(minor, !=, 0); zfs_onexit_fd_rele(fp); } return (SET_ERROR(error)); } /* * innvl is not used. 
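 * (Typically reached from userspace via lzc_get_holds() in
 * libzfs_core, though nothing here depends on that.)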
* * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... * } */ static const zfs_ioc_key_t zfs_keys_get_holds[] = { /* no nvl keys */ }; static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { (void) args; return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_release[] = { {"...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, }; static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { (void) pool; return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_guid flags (ZEVENT_NONBLOCK) * zc_cleanup_fd zevent file descriptor * * outputs: * zc_nvlist_dst next nvlist event * zc_cookie dropped events since last get */ static int zfs_ioc_events_next(zfs_cmd_t *zc) { zfs_zevent_t *ze; nvlist_t *event = NULL; minor_t minor; uint64_t dropped = 0; int error; zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (fp == NULL) return (SET_ERROR(EBADF)); do { error = zfs_zevent_next(ze, &event, &zc->zc_nvlist_dst_size, &dropped); if (event != NULL) { zc->zc_cookie = dropped; error = put_nvlist(zc, event); nvlist_free(event); } if (zc->zc_guid & ZEVENT_NONBLOCK) break; if ((error == 0) || (error != ENOENT)) break; error = zfs_zevent_wait(ze); if (error != 0) break; } while (1); zfs_zevent_fd_rele(fp); return (error); } /* * outputs: * zc_cookie cleared events count */ static int zfs_ioc_events_clear(zfs_cmd_t *zc) { uint_t count; zfs_zevent_drain_all(&count); zc->zc_cookie = count; return (0); } /* * inputs: * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END * zc_cleanup zevent file descriptor */ static int zfs_ioc_events_seek(zfs_cmd_t *zc) { zfs_zevent_t *ze; minor_t minor; int error; zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (fp == NULL) return (SET_ERROR(EBADF)); error = zfs_zevent_seek(ze, zc->zc_guid); zfs_zevent_fd_rele(fp); return (error); } /* * inputs: * zc_name name of later filesystem or snapshot * zc_value full name of old snapshot or bookmark * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (strchr(zc->zc_value, '#') != NULL) { zfs_bookmark_phys_t bmp; error = dsl_bookmark_lookup(dp, zc->zc_value, new, &bmp); if (error == 0) { error = dsl_dataset_space_written_bookmark(&bmp, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); } } else { dsl_dataset_t *old; error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error == 0) { error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); } } dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static const zfs_ioc_key_t zfs_keys_space_snaps[] = { {"firstsnap", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t 
*new, *old; const char *firstsnap; uint64_t used, comp, uncomp; firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "savedok" -> (value ignored) * presence indicates we should send a partially received snapshot * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "redactbook" -> (string) * if present, use this bookmark's redaction list to generate a redacted * send stream * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"fd", DATA_TYPE_INT32, 0}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int error; offset_t off; const char *fromname = NULL; int fd; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; const char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); dump_bytes_arg_t dba; dmu_send_outparams_t out; error = dump_bytes_init(&dba, fd, &out); if (error) return (error); off = zfs_file_off(dba.dba_fp); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); 
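	/*
	 * Whether or not dmu_send() succeeded, dump_bytes_fini() must run
	 * to drop the file descriptor hold (and, when USE_SEND_TASKQ is
	 * defined, destroy the taskq) set up by dump_bytes_init().
	 */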
dump_bytes_fini(&dba); return (error); } static int send_space_sum(objset_t *os, void *buf, int len, void *arg) { (void) os, (void) buf; uint64_t *size = arg; *size += len; return (0); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "fd" -> file descriptor to use as a cookie for progress * tracking (int32) * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static const zfs_ioc_key_t zfs_keys_send_space[] = { {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; int error; const char *fromname = NULL; const char *redactlist_book = NULL; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t space = 0; boolean_t full_estimate = B_FALSE; uint64_t resumeobj = 0; uint64_t resumeoff = 0; uint64_t resume_bytes = 0; int32_t fd = -1; zfs_bookmark_phys_t zbm = {0}; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } (void) nvlist_lookup_int32(innvl, "fd", &fd); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook", &redactlist_book) == 0); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); if (altbook) { full_estimate = B_TRUE; } else if (from) { if (strchr(fromname, '#')) { error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); /* * dsl_bookmark_lookup() will fail with EXDEV if * the from-bookmark and tosnap are at the same txg. * However, it's valid to do a send (and therefore, * a send estimate) from and to the same time point, * if the bookmark is redacted (the incremental send * can change what's redacted on the target). In * this case, dsl_bookmark_lookup() fills in zbm * but returns EXDEV. Ignore this error. 
*/ if (error == EXDEV && zbm.zbm_redaction_obj != 0 && zbm.zbm_guid == dsl_dataset_phys(tosnap)->ds_guid) error = 0; if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & ZBM_FLAG_HAS_FBN)) { full_estimate = B_TRUE; } } else if (strchr(fromname, '@')) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { full_estimate = B_TRUE; dsl_dataset_rele(fromsnap, FTAG); } } else { /* * from is not properly formatted as a snapshot or * bookmark */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } } if (full_estimate) { dmu_send_outparams_t out = {0}; offset_t off = 0; out.dso_outfunc = send_space_sum; out.dso_arg = &space; out.dso_dryrun = B_TRUE; /* * We have to release these holds so dmu_send can take them. It * will do all the error checking we need. */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactlist_book, fd, &off, &out); } else { error = dmu_send_estimate_fast(tosnap, fromsnap, (from && strchr(fromname, '#') != NULL ? &zbm : NULL), compressok || rawok, savedok, &space); space -= resume_bytes; if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } fnvlist_add_uint64(outnvl, "space", space); return (error); } /* * Sync the currently open TXG to disk for the specified pool. * This is somewhat similar to 'zfs_sync()'. * For cases that do not result in error this ioctl will wait for * the currently open TXG to commit before returning back to the caller. * * innvl: { * "force" -> when true, force uberblock update even if there is no dirty data. * In addition this will cause the vdev configuration to be written * out including updating the zpool cache file. (boolean_t) * } * * onvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_sync[] = { {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, }; static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { (void) onvl; int err; boolean_t rc, force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); if (innvl) { err = nvlist_lookup_boolean_value(innvl, "force", &rc); if (err == 0) force = rc; } if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } txg_wait_synced(spa_get_dsl(spa), 0); spa_close(spa, FTAG); return (0); } /* * Load a user's wrapping key into the kernel. 
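 * (The raw key bytes are passed under "hidden_args"/ZPOOL_HIDDEN_ARGS
 * rather than as an ordinary innvl key, so they can be kept out of
 * logged command history.)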
* innvl: { * "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * (optional) "noop" -> (value ignored) * presence indicated key should only be verified, not loaded * } */ static const zfs_ioc_key_t zfs_keys_load_key[] = { {"hidden_args", DATA_TYPE_NVLIST, 0}, {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int ret; dsl_crypto_params_t *dcp = NULL; nvlist_t *hidden_args; boolean_t noop = nvlist_exists(innvl, "noop"); if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = SET_ERROR(EINVAL); goto error; } hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS); ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_load_wkey(dsname, dcp, noop); if (ret != 0) goto error; dsl_crypto_params_free(dcp, noop); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } /* * Unload a user's wrapping key from the kernel. * Both innvl and outnvl are unused. */ static const zfs_ioc_key_t zfs_keys_unload_key[] = { /* no nvl keys */ }; static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; int ret = 0; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto out; } ret = spa_keystore_unload_wkey(dsname); if (ret != 0) goto out; out: return (ret); } /* * Changes a user's wrapping key used to decrypt a dataset. The keyformat, * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified * here to change how the key is derived in userspace. * * innvl: { * "hidden_args" (optional) -> { "wkeydata" -> value } * raw uint8_t array of new encryption wrapping key data (32 bytes) * "props" (optional) -> { prop -> value } * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_change_key[] = { {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int ret; uint64_t cmd = DCP_CMD_NONE; dsl_crypto_params_t *dcp = NULL; nvlist_t *args = NULL, *hidden_args = NULL; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto error; } (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd); (void) nvlist_lookup_nvlist(innvl, "props", &args); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_change_key(dsname, dcp); if (ret != 0) goto error; dsl_crypto_params_free(dcp, B_FALSE); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the 
beginning of this file for details on * each argument to this function. */ void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; vec->zvec_nvl_keys = nvl_keys; vec->zvec_nvl_key_count = num_keys; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_space, 
ARRAY_SIZE(zfs_keys_send_space)); zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); zfs_ioctl_register("remap", ZFS_IOC_REMAP, zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark)); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS, zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props, ARRAY_SIZE(zfs_keys_get_bookmark_props)); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_bookmarks, ARRAY_SIZE(zfs_keys_destroy_bookmarks)); zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, zfs_ioc_recv_new, zfs_secpolicy_recv, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new)); zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, zfs_ioc_load_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key)); zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, zfs_ioc_unload_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key)); zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, zfs_ioc_change_key, zfs_secpolicy_change_key, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_change_key, ARRAY_SIZE(zfs_keys_change_key)); zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, zfs_ioc_pool_sync, 
zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_channel_program, ARRAY_SIZE(zfs_keys_channel_program)); zfs_ioctl_register("redact", ZFS_IOC_REDACT, zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact)); zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); zfs_ioctl_register("zpool_discard_checkpoint", ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_discard_checkpoint, ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); zfs_ioctl_register("zpool_prefetch", ZFS_IOC_POOL_PREFETCH, zfs_ioc_pool_prefetch, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_prefetch, ARRAY_SIZE(zfs_keys_pool_prefetch)); zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM, zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim)); zfs_ioctl_register("wait", ZFS_IOC_WAIT, zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait)); zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS, zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); zfs_ioctl_register("zpool_vdev_get_props", ZFS_IOC_VDEV_GET_PROPS, zfs_ioc_vdev_get_props, zfs_secpolicy_read, POOL_NAME, POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_vdev_get_props, ARRAY_SIZE(zfs_keys_vdev_get_props)); zfs_ioctl_register("zpool_vdev_set_props", ZFS_IOC_VDEV_SET_PROPS, zfs_ioc_vdev_set_props, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props)); zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_NONE, B_TRUE, B_TRUE, zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); zfs_ioctl_register("get_props", ZFS_IOC_POOL_GET_PROPS, 
zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_get_props, ARRAY_SIZE(zfs_keys_get_props)); + zfs_ioctl_register("zpool_ddt_prune", ZFS_IOC_DDT_PRUNE, + zfs_ioc_ddt_prune, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_ddt_prune, ARRAY_SIZE(zfs_keys_ddt_prune)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. 
*/ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, 
zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_init_os(); } /* * Verify that for non-legacy ioctls the input nvlist * pairs match against the expected input. * * Possible errors are: * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair */ static int zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) { const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; boolean_t required_keys_found = B_FALSE; /* * examine each input pair */ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); data_type_t type = nvpair_type(pair); boolean_t identified = B_FALSE; /* * check pair against the documented names and type */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { /* if not a wild card name, check for an exact match */ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && strcmp(nvl_keys[k].zkey_name, name) != 0) continue; identified = B_TRUE; if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && nvl_keys[k].zkey_type != type) { return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); } if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; required_keys_found = B_TRUE; break; } /* allow an 'optional' key, everything else is invalid */ if (!identified && (strcmp(name, "optional") != 0 || type != DATA_TYPE_NVLIST)) { return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); } } /* verify that all required keys were found */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { /* at least one non-optional key is expected here */ if (!required_keys_found) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); continue; } if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); } return (0); } static int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME || type == ENTITY_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } int zfsdev_getminor(zfs_file_t *fp, minor_t *minorp) { zfsdev_state_t *zs, *fpd; ASSERT(!MUTEX_HELD(&zfsdev_state_lock)); fpd = zfs_file_private(fp); if (fpd == NULL) return (SET_ERROR(EBADF)); mutex_enter(&zfsdev_state_lock); for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) continue; if (fpd == zs) { *minorp = fpd->zs_minor; mutex_exit(&zfsdev_state_lock); return (0); } } mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EBADF)); } void * zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) { zfsdev_state_t *zs; for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == minor) { membar_consumer(); switch (which) { case ZST_ONEXIT: return (zs->zs_onexit); case ZST_ZEVENT: return (zs->zs_zevent); case ZST_ALL: return (zs); } } } return (NULL); } /* * Find 
a free minor number. The zfsdev_state_list is expected to * be short since it is only a list of currently open file handles. */ static minor_t zfsdev_minor_alloc(void) { static minor_t last_minor = 0; minor_t m; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (zfsdev_get_state(m, ZST_ALL) == NULL) { last_minor = m; return (m); } } return (0); } int zfsdev_state_init(void *priv) { zfsdev_state_t *zs, *zsprev = NULL; minor_t minor; boolean_t newzs = B_FALSE; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); minor = zfsdev_minor_alloc(); if (minor == 0) return (SET_ERROR(ENXIO)); for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) break; zsprev = zs; } if (!zs) { zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); newzs = B_TRUE; } zfsdev_private_set_state(priv, zs); zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); /* * In order to provide for lock-free concurrent read access * to the minor list in zfsdev_get_state(), new entries * must be completely written before linking them into the * list whereas existing entries are already linked; the last * operation must be updating zs_minor (from -1 to the new * value). */ if (newzs) { zs->zs_minor = minor; membar_producer(); zsprev->zs_next = zs; } else { membar_producer(); zs->zs_minor = minor; } return (0); } void zfsdev_state_destroy(void *priv) { zfsdev_state_t *zs = zfsdev_private_get_state(priv); ASSERT(zs != NULL); ASSERT3S(zs->zs_minor, >, 0); /* * The last reference to this zfsdev file descriptor is being dropped. * We don't have to worry about lookup grabbing this state object, and * zfsdev_state_init() will not try to reuse this object until it is * invalidated by setting zs_minor to -1. Invalidation must be done * last, with a memory barrier to ensure ordering. This lets us avoid * taking the global zfsdev state lock around destruction. */ zfs_onexit_destroy(zs->zs_onexit); zfs_zevent_destroy(zs->zs_zevent); zs->zs_onexit = NULL; zs->zs_zevent = NULL; membar_producer(); zs->zs_minor = -1; } long zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) { int error, cmd; const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; uint64_t max_nvlist_src_size; size_t saved_poolname_len = 0; nvlist_t *innvl = NULL; fstrans_cookie_t cookie; hrtime_t start_time = gethrtime(); cmd = vecnum; error = 0; if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); vec = &zfs_ioc_vec[vecnum]; /* * The registered ioctl list may be sparse, verify that either * a normal or legacy handler are registered. */ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); zc->zc_iflags = flag & FKIOCTL; max_nvlist_src_size = zfs_max_nvlist_src_size_os(); if (zc->zc_nvlist_src_size > max_nvlist_src_size) { /* * Make sure the user doesn't pass in an insane value for * zc_nvlist_src_size. We have to check, since we will end * up allocating that much memory inside of get_nvlist(). This * prevents a nefarious user from allocating tons of kernel * memory. * * Also, we return EINVAL instead of ENOMEM here. The reason * being that returning ENOMEM from an ioctl() has a special * connotation; that the user's size value is too small and * needs to be expanded to hold the nvlist. See * zcmd_expand_dst_nvlist() for details. 
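 * (The limit is obtained from zfs_max_nvlist_src_size_os(); the related
 * zfs_max_nvlist_src_size module parameter is declared at the bottom of
 * this file.)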
*/ error = SET_ERROR(EINVAL); /* User's size too big */ } else if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case ENTITY_NAME: if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { error = SET_ERROR(EINVAL); } else { error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); } break; case NO_NAME: break; } /* * Ensure that all input pairs are valid before we pass them down * to the lower layers. * * The vectored functions can use fnvlist_lookup_{type} for any * required pairs since zfs_check_input_nvpairs() confirmed that * they exist and are of the correct type. */ if (error == 0 && vec->zvec_func != NULL) { error = zfs_check_input_nvpairs(innvl, vec); if (error != 0) goto out; } if (error == 0) { cookie = spl_fstrans_mark(); error = vec->zvec_secpolicy(zc, innvl, CRED()); spl_fstrans_unmark(cookie); } if (error != 0) goto out; /* legacy ioctls can modify zc_name */ /* * Can't use kmem_strdup() as we might truncate the string and * kmem_strfree() would then free with incorrect size. */ saved_poolname_len = strlen(zc->zc_name) + 1; saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP); strlcpy(saved_poolname, zc->zc_name, saved_poolname_len); saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); cookie = spl_fstrans_mark(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); spl_fstrans_unmark(cookie); /* * Some commands can partially execute, modify state, and still * return an error. In these cases, attempt to record what * was modified. 
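 * (ZFS_IOC_CHANNEL_PROGRAM is the case handled explicitly below: its
 * history entry is still written on failure, unless the error is EINVAL.)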
*/ if ((error == 0 || (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { size_t out_size = fnvlist_size(outnvl); if (out_size > zfs_history_output_max) { fnvlist_add_int64(lognv, ZPOOL_HIST_OUTPUT_SIZE, out_size); } else { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } fnvlist_add_int64(lognv, ZPOOL_HIST_ELAPSED_NS, gethrtime() - start_time); (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { cookie = spl_fstrans_mark(); error = vec->zvec_legacy_func(zc); spl_fstrans_unmark(cookie); } out: nvlist_free(innvl); if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) kmem_strfree(s); (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname)); } if (saved_poolname != NULL) kmem_free(saved_poolname, saved_poolname_len); return (error); } int zfs_kmod_init(void) { int error; if ((error = zvol_init()) != 0) return (error); spa_init(SPA_MODE_READ | SPA_MODE_WRITE); zfs_init(); zfs_ioctl_init(); mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); zfsdev_state_listhead.zs_minor = -1; if ((error = zfsdev_attach()) != 0) goto out; tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); return (0); out: zfs_fini(); spa_fini(); zvol_fini(); return (error); } void zfs_kmod_fini(void) { zfsdev_state_t *zs, *zsnext = NULL; zfsdev_detach(); mutex_destroy(&zfsdev_state_lock); for (zs = &zfsdev_state_listhead; zs != NULL; zs = zsnext) { zsnext = zs->zs_next; if (zs->zs_onexit) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); if (zs != &zfsdev_state_listhead) kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); } ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, U64, ZMOD_RW, "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, U64, ZMOD_RW, "Maximum size in bytes of ZFS ioctl output that will be logged"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a841e0a79107..e4ccd144f091 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1,5559 +1,5569 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */
/* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, 2023, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. */
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include
/* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *const zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. */ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; static int zio_deadman_log_all = B_FALSE;
/* * ========================================================================== * I/O kmem caches * ========================================================================== */ static kmem_cache_t *zio_cache; static kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif
/* Mark IOs as "slow" if they take longer than 30 seconds */ static uint_t zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul
/* * The following actions directly affect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. * * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable * compression (including of metadata). In practice, we don't have this * many sync passes, so this has no effect. * * The original intent was that disabling compression would help the sync * passes to converge. However, in practice disabling compression increases * the average number of sync passes, because when we turn compression off, a * lot of blocks' sizes will change and thus we have to re-allocate (not * overwrite) them. It also increases the number of 128KB allocations (e.g. * for indirect blocks and spacemaps) because these will not be compressed.
* The 128K allocations are especially detrimental to performance on highly * fragmented systems, which may have very few free segments of this size, * and may need to load new metaslabs to satisfy 128K allocations. */ /* defer frees starting in this pass */ uint_t zfs_sync_pass_deferred_free = 2; /* don't compress starting in this pass */ static uint_t zfs_sync_pass_dont_compress = 8; /* rewrite new bps starting in this pass */ static uint_t zfs_sync_pass_rewrite = 2; /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) /* * Enable smaller cores by excluding metadata * allocations as well. */ int zio_exclude_metadata = 0; static int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG static const int zio_buf_debug_limit = 16384; #else static const int zio_buf_debug_limit = 0; #endif static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t align, cflags, data_cflags; char name[32]; /* * Create cache for each half-power of 2 size, starting from * SPA_MINBLOCKSIZE. It should give us memory space efficiency * of ~7/8, sufficient for transient allocations mostly using * these caches. */ size_t p2 = size; while (!ISP2(p2)) p2 &= p2 - 1; if (!IS_P2ALIGNED(size, p2 / 2)) continue; #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; #endif if (IS_P2ALIGNED(size, PAGESIZE)) align = PAGESIZE; else align = 1 << (highbit64(size ^ (size - 1)) - 1); cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; data_cflags = KMC_NODEBUG; if (abd_size_alloc_linear(size)) { cflags |= KMC_RECLAIMABLE; data_cflags |= KMC_RECLAIMABLE; } if (cflags == data_cflags) { /* * Resulting kmem caches would be identical. * Save memory by creating only one. 
*/ (void) snprintf(name, sizeof (name), "zio_buf_comb_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); zio_data_buf_cache[c] = zio_buf_cache[c]; continue; } (void) snprintf(name, sizeof (name), "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, data_cflags); } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } zio_inject_init(); lz4_init(); } void zio_fini(void) { size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; #if defined(ZFS_DEBUG) && !defined(_KERNEL) for (size_t i = 0; i < n; i++) { if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i]) (void) printf("zio_fini: [%d] %llu != %llu\n", (int)((i + 1) << SPA_MINBLOCKSHIFT), (long long unsigned)zio_buf_cache_allocs[i], (long long unsigned)zio_buf_cache_frees[i]); } #endif /* * The same kmem cache can show up multiple times in both zio_buf_cache * and zio_data_buf_cache. Do a wasteful but trivially correct scan to * sort it out. */ for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_buf_cache[j]) zio_buf_cache[j] = NULL; if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_data_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { VERIFY3P(zio_buf_cache[i], ==, NULL); VERIFY3P(zio_data_buf_cache[i], ==, NULL); } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); lz4_fini(); } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ #if defined(ZFS_DEBUG) && defined(_KERNEL) #define ZFS_ZIO_BUF_CANARY 1 #endif #ifdef ZFS_ZIO_BUF_CANARY static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; /* * Use empty space after the buffer to detect overflows. * * Since zio_init() creates kmem caches only for certain set of buffer sizes, * allocations of different sizes may have some unused space after the data. * Filling part of that space with a known pattern on allocation and checking * it on free should allow us to detect some buffer overflows. 
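 * (For example, a request whose size is not an exact cache size is served
 * from the next larger cache; zio_buf_put_canary() fills the space just
 * past the requested size with zio_buf_canary, and zio_buf_check_canary()
 * panics on free if that pattern has been overwritten.)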
*/ static void zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) { size_t off = P2ROUNDUP(size, sizeof (ulong_t)); ulong_t *canary = p + off / sizeof (ulong_t); size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && cache[c] == cache[c + 1]) asize = (c + 2) << SPA_MINBLOCKSHIFT; for (; off < asize; canary++, off += sizeof (ulong_t)) *canary = zio_buf_canary; } static void zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) { size_t off = P2ROUNDUP(size, sizeof (ulong_t)); ulong_t *canary = p + off / sizeof (ulong_t); size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && cache[c] == cache[c + 1]) asize = (c + 2) << SPA_MINBLOCKSHIFT; for (; off < asize; canary++, off += sizeof (ulong_t)) { if (unlikely(*canary != zio_buf_canary)) { PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx", p, size, (canary - p) * sizeof (ulong_t), *canary, zio_buf_canary); } } } #endif /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE); #ifdef ZFS_ZIO_BUF_CANARY zio_buf_put_canary(p, size, zio_buf_cache, c); #endif return (p); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE); #ifdef ZFS_ZIO_BUF_CANARY zio_buf_put_canary(p, size, zio_data_buf_cache, c); #endif return (p); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif #ifdef ZFS_ZIO_BUF_CANARY zio_buf_check_canary(buf, size, zio_buf_cache, c); #endif kmem_cache_free(zio_buf_cache[c], buf); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #ifdef ZFS_ZIO_BUF_CANARY zio_buf_check_canary(buf, size, zio_data_buf_cache, c); #endif kmem_cache_free(zio_data_buf_cache[c], buf); } static void zio_abd_free(void *abd, size_t size) { (void) size; abd_free((abd_t *)abd); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks, decompression, and decryption * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, data, zio->io_size, size, &zio->io_prop.zp_complevel); if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } static void zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) { int ret; void *tmp; blkptr_t *bp = zio->io_bp; spa_t *spa = zio->io_spa; uint64_t dsobj = zio->io_bookmark.zb_objset; uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t ot = BP_GET_TYPE(bp); uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(size, !=, 0); if (zio->io_error != 0) return; /* * Verify the cksum of MACs stored in an indirect bp. It will always * be possible to verify this since it does not require an encryption * key. 
*/ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { zio_crypt_decode_mac_bp(bp, mac); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* * We haven't decompressed the data yet, but * zio_crypt_do_indirect_mac_checksum() requires * decompressed data to be able to parse out the MACs * from the indirect block. We decompress it now and * throw away the result after we are finished. */ abd_t *abd = abd_alloc_linear(lsize, B_TRUE); ret = zio_decompress_data(BP_GET_COMPRESS(bp), zio->io_abd, abd, zio->io_size, lsize, &zio->io_prop.zp_complevel); if (ret != 0) { abd_free(abd); ret = SET_ERROR(EIO); goto error; } ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, abd, lsize, BP_SHOULD_BYTESWAP(bp), mac); abd_free(abd); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); } abd_copy(data, zio->io_abd, size); if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } if (ret != 0) goto error; return; } /* * If this is an authenticated block, just check the MAC. It would be * nice to separate this out into its own flag, but when this was done, * we had run out of bits in what is now zio_flag_t. Future cleanup * could make this a flag bit. */ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); } else { zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, mac); if (zio_injection_enabled && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } } abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; } zio_crypt_decode_params_bp(bp, salt, iv); if (ot == DMU_OT_INTENT_LOG) { tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmp, mac); abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, mac); } ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, zio->io_abd, &no_crypt); if (no_crypt) abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; error: /* assert that the key was found unless this was speculative */ ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); /* * If there was a decryption / authentication error return EIO as * the io_error. If this was not a speculative zio, create an ereport. */ if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark, BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; ASSERT(MUTEX_HELD(&pio->io_lock)); *zl = (*zl == NULL) ? 
list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); /* Parent should not have READY stage if child doesn't have it. */ IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && (cio->io_child_type != ZIO_CHILD_VDEV), (pio->io_pipeline & ZIO_STAGE_READY) == 0); zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } void zio_add_child_first(zio_t *pio, zio_t *cio) { /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); /* Parent should not have READY stage if child doesn't have it. */ IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && (cio->io_child_type != ZIO_CHILD_VDEV), (pio->io_pipeline & ZIO_STAGE_READY) == 0); zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; ASSERT(list_is_empty(&cio->io_parent_list)); list_insert_head(&cio->io_parent_list, zl); mutex_enter(&pio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); mutex_exit(&pio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } __attribute__((always_inline)) static inline void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute 
|= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * If we can tell the caller to execute this parent next, do * so. We do this if the parent's zio type matches the child's * type, or if it's a zio_null() with no done callback, and so * has no actual work to do. Otherwise dispatch the parent zio * in its own taskq. * * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch * overhead, and has no recursion penalty. Note that one * read from disk typically causes at least 3 zio's: a * zio_null(), the logical zio_read(), and then a physical * zio. When the physical ZIO completes, we are able to call * zio_done() on all 3 of these zio's from one invocation of * zio_execute() by returning the parent back to * zio_execute(). Since the parent isn't executed until this * thread returns back to zio_execute(), the caller should do * so promptly. * * In other cases, dispatching the parent prevents * overflowing the stack when we have deeply nested * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. */ if (next_to_executep != NULL && *next_to_executep == NULL && (pio->io_type == zio->io_type || (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); } } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, zio_flag_t flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); memset(zio, 0, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, 
MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) { zio->io_bp_copy = *bp; zio->io_bp = &zio->io_bp_copy; /* so caller can free */ } else { zio->io_bp = (blkptr_t *)bp; } zio->io_bp_orig = *bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_allocator = ZIO_ALLOCATOR_NONE; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) || (pipeline & ZIO_STAGE_READY) == 0; zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child_first(pio, zio); } taskq_init_ent(&zio->io_tqent); return (zio); } void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } /* * ZIO intended to be between others. Provides synchronization at READY * and DONE pipeline stages and calls the respective callbacks. */ zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } /* * ZIO intended to be a root of a tree. Unlike null ZIO does not have a * READY pipeline stage (is ready on creation), so it should not be used * as child of any ZIO that may need waiting for grandchildren READY stage * (any other ZIO type). */ zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; zio = zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE); return (zio); } static int zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, enum blk_verify_flag blk_verify, const char *fmt, ...) 
{ va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("bad blkptr at %px: " "DVA[0]=%#llx/%#llx " "DVA[1]=%#llx/%#llx " "DVA[2]=%#llx/%#llx " "prop=%#llx " "pad=%#llx,%#llx " "phys_birth=%#llx " "birth=%#llx " "fill=%#llx " "cksum=%#llx/%#llx/%#llx/%#llx", bp, (long long)bp->blk_dva[0].dva_word[0], (long long)bp->blk_dva[0].dva_word[1], (long long)bp->blk_dva[1].dva_word[0], (long long)bp->blk_dva[1].dva_word[1], (long long)bp->blk_dva[2].dva_word[0], (long long)bp->blk_dva[2].dva_word[1], (long long)bp->blk_prop, (long long)bp->blk_pad[0], (long long)bp->blk_pad[1], (long long)BP_GET_PHYSICAL_BIRTH(bp), (long long)BP_GET_LOGICAL_BIRTH(bp), (long long)bp->blk_fill, (long long)bp->blk_cksum.zc_word[0], (long long)bp->blk_cksum.zc_word[1], (long long)bp->blk_cksum.zc_word[2], (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: zfs_dbgmsg("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_ONLY: break; } return (1); } /* * Verify the block pointer fields contain reasonable values. This means * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * * If everything checks out B_TRUE is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. * * Values for blk_verify_flag: * BLK_VERIFY_ONLY: evaluate the block * BLK_VERIFY_LOG: evaluate the block and log problems * BLK_VERIFY_HALT: call zfs_panic_recover on error * * Values for blk_config_flag: * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be * obtained for reader * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better * performance */ boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { int errors = 0; if (unlikely(!DMU_OT_IS_VALID(BP_GET_TYPE(bp)))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (unlikely(BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (unlikely(BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (unlikely(BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } if (unlikely(BPE_GET_PSIZE(bp) > BPE_PAYLOAD_SIZE)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BPE_GET_PSIZE(bp)); } return (errors == 0); } if (unlikely(BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (unlikely(BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } /* * Do not verify individual DVAs if the config is not trusted. This * will be done once the zio is executed in vdev_mirror_map_alloc. 
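 * (Similarly, BLK_CONFIG_SKIP callers return early in the switch below,
 * since the per-DVA checks require SCL_VDEV.)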
*/ if (unlikely(!spa->spa_trust_config)) return (errors == 0); switch (blk_config) { case BLK_CONFIG_HELD: ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); break; case BLK_CONFIG_NEEDED: spa_config_enter(spa, SCL_VDEV, bp, RW_READER); break; case BLK_CONFIG_SKIP: return (errors == 0); default: panic("invalid blk_config %u", blk_config); } /* * Pool-specific checks. * * Note: it would be nice to verify that the logical birth * and physical birth are not too large. However, * spa_freeze() allows the birth time of log blocks (and * dmu_sync()-ed blocks that are in the log) to be arbitrarily * large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { const dva_t *dva = &bp->blk_dva[i]; uint64_t vdevid = DVA_GET_VDEV(dva); if (unlikely(vdevid >= spa->spa_root_vdev->vdev_children)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (unlikely(vd == NULL)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (unlikely(vd->vdev_ops == &vdev_hole_ops)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. */ continue; } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (unlikely(offset + asize > vd->vdev_asize)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } if (blk_config == BLK_CONFIG_NEEDED) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); } boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) { (void) bp; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) return (B_FALSE); vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) return (B_FALSE); if (vd->vdev_ops == &vdev_hole_ops) return (B_FALSE); if (vd->vdev_ops == &vdev_missing_ops) { return (B_FALSE); } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) return (B_FALSE); return (B_TRUE); } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_children_ready = children_ready; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). Encrypted * dedup blocks need data as well so we also disable dedup in this * case. */ if (data == NULL && (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite, boolean_t brtwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); ASSERT(!brtwrite || !nopwrite); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_brtwrite = brtwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. * * Note that we only defer frees after zfs_sync_pass_deferred_free * when the log space map feature is disabled. 
[see relevant comment * in spa_sync_iterate_to_convergence()] */ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) || brt_maybe_exists(spa, bp)) { metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); } } /* * To improve performance, this function may return NULL if we were able * to do the free immediately. This avoids the cost of creating a zio * (and linking it to the parent, etc). */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_flag_t flags) { ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); if (BP_IS_EMBEDDED(bp)) return (NULL); metaslab_check_free(spa, bp); arc_freed(spa, bp); dsl_scan_freed(spa, bp); if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || brt_maybe_exists(spa, bp)) { /* * GANG, DEDUP and BRT blocks can induce a read (for the gang * block header, the DDT or the BRT), so issue them * asynchronously so that this thread is not tied up. */ enum zio_stage stage = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage)); } else { metaslab_free(spa, bp, txg, B_FALSE); return (NULL); } } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ? BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. 
*/ ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, enum trim_flag trim_flags) { zio_t *zio; ASSERT0(vd->vdev_children); ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift)); ASSERT3U(size, !=, 0); zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done, private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE); zio->io_trim_flags = trim_flags; return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; /* * vdev child I/Os do not propagate their error to the parent. * Therefore, for correct operation the caller *must* check for * and handle the error in the child i/o's done callback. * The only exceptions are i/os that we don't care about * (OPTIONAL or REPAIR). */ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. 
This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { ASSERT0(vd->vdev_children); offset += VDEV_LABEL_START_SIZE; } flags |= ZIO_VDEV_CHILD_FLAGS(pio); /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } /* * Send a flush command to the given vdev. Unlike most zio creation functions, * the flush zios are issued immediately. You can wait on pio to pause until * the flushes complete. */ void zio_flush(zio_t *pio, vdev_t *vd) { const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY; if (vd->vdev_nowritecache) return; if (vd->vdev_children == 0) { zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0, NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE)); } else { for (uint64_t c = 0; c < vd->vdev_children; c++) zio_flush(pio, vd->vdev_child[c]); } } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3U(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. 
*/ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * Round provided allocation size up to a value that can be allocated * by at least some vdev(s) in the pool with minimum or no additional * padding and without extra space usage on others */ static uint64_t zio_roundup_alloc_size(spa_t *spa, uint64_t size) { if (size > spa->spa_min_alloc) return (roundup(size, spa->spa_gcd_alloc)); return (spa->spa_min_alloc); } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || BP_HAS_INDIRECT_MAC_CKSUM(bp)) && zio->io_child_type == ZIO_CHILD_LOGICAL) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decrypt); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (zio); } static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zp->zp_brtwrite) return (zio); ASSERT(!BP_GET_DEDUP(zio->io_bp_override)); if (BP_IS_EMBEDDED(bp)) return (zio); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (zio); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. 
*/ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (zio); } static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; uint32_t pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (NULL); } if (!IO_IS_ALLOCATING(zio)) return (zio); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) || MIN(zp->zp_copies, spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { abd_t *cabd = NULL; if (abd_cmp_zero(zio->io_abd, lsize) == 0) psize = 0; else if (compress == ZIO_COMPRESS_EMPTY) psize = lsize; else psize = zio_compress_data(compress, zio->io_abd, &cabd, lsize, zp->zp_complevel); if (psize == 0) { compress = ZIO_COMPRESS_OFF; } else if (psize >= lsize) { compress = ZIO_COMPRESS_OFF; if (cabd != NULL) abd_free(cabd); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { void *cbuf = abd_borrow_buf_copy(cabd, lsize); encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); abd_return_buf(cabd, cbuf, lsize); abd_free(cabd); BP_SET_LOGICAL_BIRTH(bp, zio->io_txg); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (zio); } else { /* * Round compressed size up to the minimum allocation * size of the smallest-ashift device, and zero the * tail. This ensures that the compressed size of the * BP (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. 
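* For example (illustrative numbers only): with spa_min_alloc and * spa_gcd_alloc both 512 bytes, a 3000-byte compressed buffer is * zero-padded out to roundup(3000, 512) == 3072 bytes, and 3072 is the * psize charged to the BP.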
*/ size_t rounded = (size_t)zio_roundup_alloc_size(spa, psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; abd_free(cabd); psize = lsize; } else { abd_zero_off(cabd, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cabd, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && zp->zp_type == DMU_OT_DNODE) { /* * The DMU actually relies on the zio layer's compression * to free metadnode blocks that have had all contained * dnodes freed. As a result, even when doing a raw * receive, we must check whether the block can be compressed * to a hole. */ if (abd_cmp_zero(zio->io_abd, lsize) == 0) { psize = 0; compress = ZIO_COMPRESS_OFF; } else { psize = lsize; } } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) { /* * If we are raw receiving an encrypted dataset we should not * take this codepath because it will change the on-disk block * and decryption will fail. */ size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize), lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); abd_copy_off(cdata, zio->io_abd, 0, 0, psize); psize = rounded; zio_push_transform(zio, cdata, psize, rounded, NULL); } } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. 
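* (The cutover point is the zfs_sync_pass_rewrite tunable checked just * below, much as zfs_sync_pass_dont_compress bounded compression earlier * in this function.)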
*/ if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!zp->zp_encrypt || DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (zio); } static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); return (zio); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available or cut the line otherwise. 
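* (Here q + 1 selects the _HIGH variant of the current taskq type, e.g. * ZIO_TASKQ_ISSUE becomes ZIO_TASKQ_ISSUE_HIGH; when no such taskq was * configured, the zio is dispatched to the front of the regular one * instead.)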
*/ if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) { if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; else cutinline = B_TRUE; } ASSERT3U(q, <, ZIO_TASKQ_TYPES); spa_taskq_dispatch(spa, t, q, zio_execute, zio, cutinline); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { spa_t *spa = zio->io_spa; taskq_t *tq = taskq_of_curthread(); for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (tqs->stqs_taskq[i] == tq) return (B_TRUE); } } return (B_FALSE); } static zio_t * zio_issue_async(zio_t *zio) { ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio)); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } void zio_interrupt(void *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; clock_t expire_at_tick = ddi_get_lbolt() + NSEC_TO_TICK(diff); DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); if (NSEC_TO_TICK(diff) == 0) { /* Our delay is less than a jiffy - just spin */ zfs_sleep_until(zio->io_target_timestamp); zio_interrupt(zio); } else { /* * Use taskq_dispatch_delay() in the place of * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just * finish the zio without a delay. */ zio_interrupt(zio); } } } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } static void zio_deadman_impl(zio_t *pio, int ziodepth) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; vdev_t *vd = pio->io_vd; if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) { vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL; zbookmark_phys_t *zb = &pio->io_bookmark; uint64_t delta = gethrtime() - pio->io_timestamp; uint64_t failmode = spa_get_deadman_failmode(pio->io_spa); zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu " "delta=%llu queued=%llu io=%llu " "path=%s " "last=%llu type=%d " "priority=%d flags=0x%llx stage=0x%x " "pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu " "level=%llu blkid=%llu " "offset=%llu size=%llu " "error=%d", ziodepth, pio, pio->io_timestamp, (u_longlong_t)delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ? 
vq->vq_io_complete_ts : 0, pio->io_type, pio->io_priority, (u_longlong_t)pio->io_flags, pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { zio_interrupt(pio); } } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_deadman_impl(cio, ziodepth + 1); } mutex_exit(&pio->io_lock); } /* * Log the critical information describing this zio and all of its children * using the zfs_dbgmsg() interface then post deadman event for the ZED. */ void zio_deadman(zio_t *pio, const char *tag) { spa_t *spa = pio->io_spa; char *name = spa_name(spa); if (!zfs_deadman_enabled || spa_suspended(spa)) return; zio_deadman_impl(pio, 0); switch (spa_get_deadman_failmode(spa)) { case ZIO_FAILURE_MODE_WAIT: zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_CONTINUE: zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_PANIC: fm_panic("%s determined I/O to pool '%s' is hung.", tag, name); break; } } /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for * dependent child I/Os; (3) the I/O issues, so we're waiting * for an I/O completion interrupt; (4) the I/O is delegated by * vdev-level caching or aggregation; (5) the I/O is deferred * due to vdev-level queueing; (6) the I/O is handed off to * another thread. In all cases, the pipeline stops whenever * there's no CPU work; it never burns a thread in cv_wait_io(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; /* * zio_execute() is a wrapper around the static function * __zio_execute() so that we can force __zio_execute() to be * inlined. This reduces stack overhead which is important * because __zio_execute() is called recursively in several zio * code paths. zio_execute() itself cannot be inlined because * it is externally visible. */ void zio_execute(void *zio) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); __zio_execute(zio); spl_fstrans_unmark(cookie); } /* * Used to determine if in the current context the stack is sized large * enough to allow zio_execute() to be called recursively. A minimum * stack size of 16K is required to avoid needing to re-dispatch the zio. */ static boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) dsl_pool_t *dp = spa_get_dsl(zio->io_spa); /* Executing in txg_sync_thread() context. */ if (dp && curthread == dp->dp_tx.tx_sync_thread) return (B_TRUE); /* Pool initialization outside of zio_taskq context. 
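* Returning B_TRUE in either case tells __zio_execute() to hand the zio * off to a taskq rather than recursing on the current, possibly small, * stack.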
*/ if (dp && spa_is_initializing(dp->dp_spa) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); #else (void) zio; #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); } __attribute__((always_inline)) static inline void __zio_execute(zio_t *zio) { ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* * If the current context doesn't have large enough stacks * the zio must be issued asynchronously to prevent overflow. */ if (zio_execute_stack_check(zio)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; /* * The zio pipeline stage returns the next zio to execute * (typically the same as this one), or NULL if we should * stop. */ zio = zio_pipeline[highbit64(stage) - 1](zio); if (zio == NULL) return; } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { /* * Some routines, like zio_free_sync(), may return a NULL zio * to avoid the performance overhead of creating and then destroying * an unneeded zio. For the callers' simplicity, we accept a NULL * zio and ignore it. */ if (zio == NULL) return (0); long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); if (zio->io_type == ZIO_TYPE_WRITE) { spa_select_allocator(zio); } __zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) { error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, ddi_get_lbolt() + timeout); if (zfs_deadman_enabled && error == -1 && gethrtime() - zio->io_queued_timestamp > spa_deadman_ziotime(zio->io_spa)) { mutex_exit(&zio->io_lock); timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); zio_deadman(zio, FTAG); mutex_enter(&zio->io_lock); } } mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { /* * See comment in zio_wait(). */ if (zio == NULL) return; ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. 
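* (There is one godfather zio per CPU; CPU_SEQID_UNSTABLE below simply * picks one so that unrelated async zios are spread across them rather * than all chaining to a single parent.)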
*/ spa_t *spa = zio->io_spa; pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE]; zio_add_child(pio, zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); if (zio->io_type == ZIO_TYPE_WRITE) { spa_select_allocator(zio); } __zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(void *arg) { zio_t *pio = arg; zio_t *cio, *cio_next, *gio; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); mutex_enter(&pio->io_lock); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) || (pio->io_pipeline & ZIO_STAGE_READY) == 0; pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE); zio_link_t *zl = NULL; while ((gio = zio_walk_parents(pio, &zl)) != NULL) { for (int w = 0; w < ZIO_WAIT_TYPES; w++) { gio->io_children[pio->io_child_type][w] += !pio->io_state[w]; } } for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zl = NULL; for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); } mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on it. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); if (reason != ZIO_SUSPEND_MMP) { cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " "I/O failure and has been suspended.\n", spa_name(spa)); } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = reason; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. 
*/ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated.
This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_free(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_free(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. 
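* (Clearing the vdev I/O stages below turns the header rewrite into a * no-op as far as the disk is concerned.)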
*/ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio)); if (zio == NULL) { zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)); } return (zio); } static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(list_is_empty(&zio->io_child_list)); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_free(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
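* The callback invoked below comes from the zio_gang_issue_func[] table * above, selected by the gang leader's io_type: read, rewrite, free, or * claim.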
*/ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (zio); } static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } static void zio_gang_inherit_allocator(zio_t *pio, zio_t *cio) { cio->io_allocator = pio->io_allocator; } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; zio_t *gio __maybe_unused = zio->io_gang_leader; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { /* * The io_abd field will be NULL for a zio with no data. The io_flags * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) abd_free(zio->io_abd); } static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* * If one copy was requested, store 2 copies of the GBH, so that we * can still traverse all the data (e.g. to free or scrub) even if a * block is damaged. Note that we can't store 3 copies of the GBH in * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt. */ int gbh_copies = copies; if (gbh_copies == 1) { gbh_copies = MIN(2, spa_max_replication(spa)); } ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator]. 
mca_alloc_slots, pio)); /* * The logical zio has already placed a reservation for * 'copies' allocation slots but gang blocks may require * additional copies. These additional copies * (i.e. gbh_copies - copies) are guaranteed to succeed * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * If we failed to allocate the gang block header then * we remove any additional allocation reservations that * we placed here. The original reservation will * be removed when the logical I/O goes to the ready * stage. */ metaslab_class_throttle_unreserve(mc, gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; return (pio); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; memset(gbh, 0, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); zio_gang_inherit_allocator(pio, zio); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_complevel = gio->io_prop.zp_complevel; zp.zp_type = zp.zp_storage_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp, zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); zio_gang_inherit_allocator(zio, cio); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * Gang children won't throttle but we should * account for their work, so reserve an allocation * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio_nowait(zio); return (pio); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. 
Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_IS_HOLE(bp)); ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (zio); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop); /* * If we're overwriting a block that is currently on an * indirect vdev, then ignore the nopwrite request and * allow a new block to be allocated on a concrete vdev. */ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) { vdev_t *tvd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(&bp_orig->blk_dva[d])); if (tvd->vdev_ops == &vdev_indirect_ops) { spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); return (zio); } } spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (zio); } /* * ========================================================================== * Block Reference Table * ========================================================================== */ static zio_t * zio_brt_free(zio_t *zio) { blkptr_t *bp; bp = zio->io_bp; if (BP_GET_LEVEL(bp) > 0 || BP_IS_METADATA(bp) || !brt_maybe_exists(zio->io_spa, bp)) { return (zio); } if (!brt_entry_decref(zio->io_spa, bp)) { /* * This isn't the last reference, so we cannot free * the data yet. 
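* Switching to the interlock pipeline below skips the DVA free stages, * so the shared block stays allocated and only its BRT reference count * has been decremented.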
*/ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } return (zio); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_t *ddt; ddt_entry_t *dde = zio->io_private; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddt = ddt_select(zio->io_spa, bp); if (zio->io_error == 0) { ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); /* this phys variant doesn't need repair */ ddt_phys_clear(dde->dde_phys, v); } if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL) dde->dde_io->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp); ddt_univ_phys_t *ddp = dde->dde_phys; blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (v_self == DDT_PHYS_NONE) return (zio); /* issue I/O for the other copies */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); if (ddt_phys_birth(ddp, v) == 0 || v == v_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, v, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (zio); } static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } if (dde->dde_io->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (zio); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these * cases we can compare the io_abd directly. This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). 
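* The first loop below compares against the data held by any in-flight * lead zio for this entry; the second loop falls back to re-reading the * copy that is already on disk.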
*/ for (int p = 0; p < DDT_NPHYS(ddt); p++) { if (DDT_PHYS_IS_DITTO(ddt, p)) continue; if (dde->dde_io == NULL) continue; zio_t *lio = dde->dde_io->dde_lead_zio[p]; if (lio == NULL) continue; if (do_raw) return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v); if (phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) return (B_TRUE); ddt_exit(ddt); tmpabd = abd_alloc_for_io(psize, B_TRUE); error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_done(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; zio_link_t *zl = NULL; ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); ddt_univ_phys_t *ddp = dde->dde_phys; ddt_enter(ddt); /* we're the lead, so once we're done there's no one else outstanding */ if (dde->dde_io->dde_lead_zio[p] == zio) dde->dde_io->dde_lead_zio[p] = NULL; ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys; if (zio->io_error != 0) { /* * The write failed, so we're about to abort the entire IO * chain. We need to revert the entry back to what it was at * the last time it was successfully extended. */ ddt_phys_copy(ddp, orig, v); ddt_phys_clear(orig, v); ddt_exit(ddt); return; } /* * We've successfully added new DVAs to the entry. Clear the saved * state or, if there's still outstanding IO, remember it so we can * revert to a known good state if that IO fails. */ if (dde->dde_io->dde_lead_zio[p] == NULL) ddt_phys_clear(orig, v); else ddt_phys_copy(orig, ddp, v); /* * Add references for all dedup writes that were waiting on the * physical one, skipping any other physical writes that are waiting. 
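* (Physical children are marked with ZIO_FLAG_DDT_CHILD, so the loop * below takes a reference only on behalf of the logical dedup writes * among the parents.)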
*/ zio_t *pio; zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) { if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) ddt_phys_addref(ddp, v); } ddt_exit(ddt); } static void zio_ddt_child_write_ready(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; zio_link_t *zl = NULL; ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); if (zio->io_error != 0) return; ddt_enter(ddt); ddt_phys_extend(dde->dde_phys, v, zio->io_bp); zio_t *pio; zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) { if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg); } ddt_exit(ddt); } static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp); if (dde == NULL) { /* DDT size is over its quota so no new entries */ zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); if (zio->io_bp_override == NULL) zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); } ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); ddt_univ_phys_t *ddp = dde->dde_phys; /* * In the common cases, at this point we have a regular BP with no * allocated DVAs, and the corresponding DDT entry for its checksum. * Our goal is to fill the BP with enough DVAs to satisfy its copies= * requirement. * * One of three things needs to happen to fulfill this: * * - if the DDT entry has enough DVAs to satisfy the BP, we just copy * them out of the entry and return; * * - if the DDT entry has no DVAs (i.e. it's brand new), then we have to * issue the write as normal so that DVAs can be allocated and the * data land on disk. We then copy the DVAs into the DDT entry on * return. * * - if the DDT entry has some DVAs, but too few, we have to issue the * write, adjusted to allocate fewer copies. When it returns, we * add the new DVAs to the DDT entry, and update the BP to have the * full amount it originally requested. * * In all cases, if there's already a writing IO in flight, we need to * defer the action until after the write is done. If our action is to * write, we need to adjust our request for additional DVAs to match * what will be in the DDT entry after it completes. In this way every * IO can be guaranteed to receive enough DVAs simply by joining the * end of the chain and letting the sequence play out. */ /* * Number of DVAs in the DDT entry.
If the BP is encrypted we ignore * the third one as normal. */ int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp)); IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0); /* Number of DVAs requested by the IO. */ uint8_t need_dvas = zp->zp_copies; /* * What we do next depends on whether or not there's IO outstanding that * will update this entry. */ if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) { /* * No IO outstanding, so we only need to worry about ourselves. */ /* * Override BPs bring their own DVAs and their own problems. */ if (zio->io_bp_override) { /* * For a brand-new entry, all the work has been done * for us, and we can just fill it out from the provided * block and leave. */ if (have_dvas == 0) { ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_extend(ddp, v, bp); ddt_phys_addref(ddp, v); ddt_exit(ddt); return (zio); } /* * If we already have this entry, then we want to treat * it like a regular write. To do this we just wipe * them out and proceed like a regular write. * * Even if there are some DVAs in the entry, we still * have to clear them out. We can't use them to fill * out the dedup entry, as they are all referenced * together by a bp already on disk, and will be freed * as a group. */ BP_ZERO_DVAS(bp); BP_SET_BIRTH(bp, 0, 0); } /* * If there are enough DVAs in the entry to service our request, * then we can just use them as-is. */ if (have_dvas >= need_dvas) { ddt_bp_fill(ddp, v, bp, txg); ddt_phys_addref(ddp, v); ddt_exit(ddt); return (zio); } /* * Otherwise, we have to issue IO to fill the entry up to the * amount we need. */ need_dvas -= have_dvas; } else { /* * There's a write in-flight. If there's already enough DVAs on * the entry, then either there were already enough to start * with, or the in-flight IO is between READY and DONE, and so * has extended the entry with new DVAs. Either way, we don't * need to do anything, we can just slot in behind it. */ if (zio->io_bp_override) { /* * If there's a write out, then we're soon going to * have our own copies of this block, so clear out the * override block and treat it as a regular dedup * write. See comment above. */ BP_ZERO_DVAS(bp); BP_SET_BIRTH(bp, 0, 0); } if (have_dvas >= need_dvas) { /* * A minor point: there might already be enough * committed DVAs in the entry to service our request, * but we don't know which are completed and which are * allocated but not yet written. In this case, should * the IO for the new DVAs fail, we will be on the end * of the IO chain and will also receive an error, even * though our request could have been serviced. * * This is an extremely rare case, as it requires the * original block to be copied with a request for a * larger number of DVAs, then copied again requesting * the same (or already fulfilled) number of DVAs while * the first request is active, and then that first * request errors. In return, the logic required to * catch and handle it is complex. For now, I'm just * not going to bother with it. */ /* * We always fill the bp here as we may have arrived * after the in-flight write has passed READY, and so * missed out. */ ddt_bp_fill(ddp, v, bp, txg); zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); ddt_exit(ddt); return (zio); } /* * There's not enough in the entry yet, so we need to look at * the write in-flight and see how many DVAs it will have once * it completes.
* * The in-flight write has potentially had its copies request * reduced (if we're filling out an existing entry), so we need * to reach in and get the original write to find out what it is * expecting. * * Note that the parent of the lead zio will always have the * highest zp_copies of any zio in the chain, because ones that * can be serviced without additional IO are always added to * the back of the chain. */ zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl); ASSERT(pio); uint8_t parent_dvas = pio->io_prop.zp_copies; if (parent_dvas >= need_dvas) { zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); ddt_exit(ddt); return (zio); } /* * Still not enough, so we will need to issue to get the * shortfall. */ need_dvas -= parent_dvas; } /* * We need to write. We will create a new write with the copies * property adjusted to match the number of DVAs we need to need to * grow the DDT entry by to satisfy the request. */ zio_prop_t czp = *zp; czp.zp_copies = need_dvas; zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); /* * We are the new lead zio, because our parent has the highest * zp_copies that has been requested for this entry so far. */ ddt_alloc_entry_io(dde); if (dde->dde_io->dde_lead_zio[p] == NULL) { /* * First time out, take a copy of the stable entry to revert * to if there's an error (see zio_ddt_child_write_done()) */ ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v); } else { /* * Make the existing chain our child, because it cannot * complete until we have. */ zio_add_child(cio, dde->dde_io->dde_lead_zio[p]); } dde->dde_io->dde_lead_zio[p] = cio; ddt_exit(ddt); zio_nowait(cio); return (zio); } static ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde = NULL; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp); if (dde) { ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); if (v != DDT_PHYS_NONE) ddt_phys_decref(dde->dde_phys, v); } ddt_exit(ddt); + /* + * When no entry was found, it must have been pruned, + * so we can free it now instead of decrementing the + * refcount in the DDT. + */ + if (!dde) { + BP_SET_DEDUP(bp, 0); + zio->io_pipeline |= ZIO_STAGE_DVA_FREE; + } + return (zio); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock)); zio = avl_first(&spa->spa_allocs[allocator].spaa_tree); if (zio == NULL) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. 
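 *
 * As a minimal model of this reserve-or-stay-queued behaviour (the ex_*
 * names below are hypothetical, not the real metaslab interfaces):
 *
 *	typedef struct { uint64_t ex_used, ex_max; } ex_throttle_t;
 *
 *	static boolean_t
 *	ex_throttle_reserve(ex_throttle_t *t, uint64_t slots)
 *	{
 *		if (t->ex_used + slots > t->ex_max)
 *			return (B_FALSE);	// over the limit: stay queued
 *		t->ex_used += slots;
 *		return (B_TRUE);		// reserved: issue the allocation
 *	}
 *
 * The unreserve side simply subtracts the slots again and calls
 * zio_allocate_dispatch() so the next queued zio can try.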
*/ ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, allocator, zio, 0)) { return (NULL); } avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); } static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; metaslab_class_t *mc; /* locate an appropriate allocation class */ mc = spa_preferred_class(spa, zio); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (zio); } ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(ZIO_HAS_ALLOCATOR(zio)); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); int allocator = zio->io_allocator; zio->io_metaslab_class = mc; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); nio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_allocs[allocator].spaa_lock); return (nio); } static void zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); zio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_allocs[allocator].spaa_lock); if (zio == NULL) return; ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); ASSERT0(zio->io_error); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; /* * if not already chosen, locate an appropriate allocation class */ mc = zio->io_metaslab_class; if (mc == NULL) { mc = spa_preferred_class(spa, zio); zio->io_metaslab_class = mc; } /* * Try allocating the block in the usual metaslab class. * If that's full, allocate it in the normal class. * If that's full, allocate as a gang block, * and if all are full, the allocation fails (which shouldn't happen). * * Note that we do not fall back on embedded slog (ZIL) space, to * preserve unfragmented slog space, which is critical for decent * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ ASSERT(ZIO_HAS_ALLOCATOR(zio)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); /* * Fallback to normal class when an alloc class is full */ if (error == ENOSPC && mc != spa_normal_class(spa)) { /* * When the dedup or special class is spilling into the normal * class, there can still be significant space available due * to deferred frees that are in-flight. We track the txg when * this occurred and back off adding new DDT entries for a few * txgs to allow the free blocks to be processed. 
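 *
 * A rough sketch of such a gate (the ex_* helper and the back-off length
 * are illustrative only, not the actual quota logic):
 *
 *	static boolean_t
 *	ex_allow_new_ddt_entry(uint64_t class_full_txg, uint64_t cur_txg)
 *	{
 *		const uint64_t ex_backoff_txgs = 8;	// made-up value
 *		return (class_full_txg == 0 ||
 *		    cur_txg > class_full_txg + ex_backoff_txgs);
 *	}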
*/ if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) && mc == spa_special_class(spa))) && spa->spa_dedup_class_full_txg != zio->io_txg) { spa->spa_dedup_class_full_txg = zio->io_txg; zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, " "%llu allocated of %llu", spa_name(spa), (int)zio->io_txg, mc == spa_dedup_class(spa) ? "dedup" : "special", (int)zio->io_size, (u_longlong_t)metaslab_class_get_alloc(mc), (u_longlong_t)metaslab_class_get_space(mc)); } /* * If throttling, transfer reservation over to normal class. * The io_allocator slot can remain the same even though we * are switching classes. */ if (mc->mc_alloc_throttle_enabled && (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { metaslab_class_throttle_unreserve(mc, zio->io_prop.zp_copies, zio->io_allocator, zio); zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; VERIFY(metaslab_class_throttle_reserve( spa_normal_class(spa), zio->io_prop.zp_copies, zio->io_allocator, zio, flags | METASLAB_MUST_RESERVE)); } zio->io_metaslab_class = mc = spa_normal_class(spa); if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying normal class: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); } if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying ganging: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } return (zio_write_gang_block(zio, mc)); } if (error != 0) { if (error != ENOSPC || (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " "size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } zio->io_error = error; } return (zio); } static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (zio); } static zio_t * zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (zio); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) { metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp), B_TRUE); } if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); /* * Block pointer fields are useful to metaslabs for stats and debugging. * Fill in the obvious ones before calling into metaslab_alloc(). 
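 *
 * The calls below then try the log class first, falling back to the
 * embedded log class and finally the normal class. The pattern is just a
 * first-fit walk over candidate classes; roughly (hypothetical ex_*
 * names, details elided):
 *
 *	static int
 *	ex_alloc_first_fit(int (*ex_try)(int class_id, void *arg), void *arg,
 *	    const int *classes, int nclasses)
 *	{
 *		int error = ENOSPC;
 *		for (int i = 0; i < nclasses && error != 0; i++)
 *			error = ex_try(classes[i], arg);
 *		return (error);
 *	}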
*/ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_PSIZE(new_bp, size); BP_SET_LEVEL(new_bp, 0); /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ int flags = METASLAB_ZIL; int allocator = (uint_t)cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0); if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } if (error != 0) { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); /* * encrypted blocks will require an IV and salt. We generate * these now since we will not be rewriting the bp at * rewrite time. */ if (os->os_encrypted) { uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t salt[ZIO_DATA_SALT_LEN]; BP_SET_CRYPT(new_bp, B_TRUE); VERIFY0(spa_crypt_get_salt(spa, dmu_objset_id(os), salt)); VERIFY0(zio_crypt_generate_iv(iv)); zio_crypt_encode_params_bp(new_bp, salt, iv); } } else { zfs_dbgmsg("%s: zil block allocation failure: " "size %llu, error %d", spa_name(spa), (u_longlong_t)size, error); } return (error); } /* * ========================================================================== * Read and write to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; zio->io_delay = 0; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (NULL); } ASSERT3P(zio->io_logical, !=, zio); if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(spa->spa_trust_config); /* * Note: the code can handle other kinds of writes, * but we don't expect them. */ if (zio->io_vd->vdev_noalloc) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. 
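 *
 * align here is always a power of two (1ULL << vdev_ashift), so the
 * round-up below is plain bit arithmetic; e.g. for ashift=12 a 5000-byte
 * write becomes an 8192-byte one, with the tail zero-filled. Schematically:
 *
 *	static uint64_t
 *	ex_p2roundup(uint64_t size, uint64_t align)
 *	{
 *		return ((size + align - 1) & ~(align - 1));
 *	}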
*/ uint64_t asize = P2ROUNDUP(zio->io_size, align); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering. * * There are a few ways that we can end up creating these spurious * resilver i/os: * * 1. A resilver i/o will be issued if any DVA in the BP has a * dirty DTL. The mirror code will issue resilver writes to * each DVA, including the one(s) that are not on vdevs with dirty * DTLs. * * 2. With nested replication, which happens when we have a * "replacing" or "spare" vdev that's a child of a mirror or raidz. * For example, given mirror(replacing(A+B), C), it's likely that * only A is out of date (it's the new device). In this case, we'll * read from C, then use the data to resilver A+B -- but we don't * actually want to resilver B, just A. The top-level mirror has no * way to know this, so instead we just discard unnecessary repairs * as we work our way down the vdev tree. * * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. * The same logic applies to any form of nested replication: ditto * + mirror, RAID-Z + replacing, etc. * * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. * * Leaf DTL_PARTIAL can be empty when a legitimate write comes from * a dRAID spare vdev. For example, when a dRAID spare is first * used, its spare blocks need to be written to but the leaf vdev's * of such blocks can have empty DTL_PARTIAL. * * There seemed no clean way to allow such writes while bypassing * spurious ones. At this point, just avoid all bypassing for dRAID * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } /* * Select the next best leaf I/O to process. Distributed spares are * excluded since they dispatch the I/O directly to a leaf vdev after * applying the dRAID mapping. */ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) { /* * "no-op" injections return success, but do no actual * work. Just skip the remaining vdev stages. 
*/ zio_vdev_io_bypass(zio); zio_interrupt(zio); return (NULL); } if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (NULL); } zio->io_delay = gethrtime(); } vd->vdev_ops->vdev_op_io_start(zio); return (NULL); } static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FLUSH || zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { if (zio->io_type != ZIO_TYPE_FLUSH) vdev_queue_io_done(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error && zio->io_type != ZIO_TYPE_FLUSH && zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error && vd->vdev_remove_wanted == B_FALSE) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); } /* * This function is used to change the priority of an existing zio that is * currently in-flight. This is used by the arc to upgrade priority in the * event that a demand read is made for a block that is currently queued * as a scrub or async read IO. Otherwise, the high priority read request * would end up having to wait for the lower priority IO. */ void zio_change_priority(zio_t *pio, zio_priority_t priority) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { vdev_queue_change_io_priority(pio, priority); } else { pio->io_priority = priority; } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_change_priority(cio, priority); } mutex_exit(&pio->io_lock); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. */ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = abd; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_abd_free; } static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. 
* * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (NULL); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting " "cant_write=TRUE due to write failure with ENXIO", zio); vd->vdev_cant_write = B_TRUE; } /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent * boolean flag so that we don't bother with it in the future. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Encrypt and store encryption parameters * ========================================================================== */ /* * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for * managing the storage of encryption parameters and passing them to the * lower-level encryption functions. 
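 *
 * The parameters are small: an 8-byte salt, a 12-byte IV and a 16-byte
 * MAC, all packed into otherwise-unused blkptr words. A simplified model
 * of the packing (the ex_* struct is hypothetical; see
 * zio_crypt_encode_params_bp() and zio_crypt_encode_mac_bp() for the
 * real layout):
 *
 *	typedef struct {
 *		uint64_t ex_spare[4];	// stands in for unused bp words
 *	} ex_bp_t;
 *
 *	static void
 *	ex_encode_params(ex_bp_t *bp, const uint8_t salt[8],
 *	    const uint8_t iv[12])
 *	{
 *		memcpy(&bp->ex_spare[0], salt, 8);
 *		memcpy(&bp->ex_spare[1], iv, 8);
 *		memcpy((uint8_t *)&bp->ex_spare[2], iv + 8, 4);
 *	}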
*/ static zio_t * zio_encrypt(zio_t *zio) { zio_prop_t *zp = &zio->io_prop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_GET_PSIZE(bp); uint64_t dsobj = zio->io_bookmark.zb_objset; dmu_object_type_t ot = BP_GET_TYPE(bp); void *enc_buf = NULL; abd_t *eabd = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* the root zio already encrypted the data */ if (zio->io_child_type == ZIO_CHILD_GANG) return (zio); /* only ZIL blocks are re-encrypted on rewrite */ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) return (zio); if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { BP_SET_CRYPT(bp, B_FALSE); return (zio); } /* if we are doing raw encryption set the provided encryption params */ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { ASSERT0(BP_GET_LEVEL(bp)); BP_SET_CRYPT(bp, B_TRUE); BP_SET_BYTEORDER(bp, zp->zp_byteorder); if (ot != DMU_OT_OBJSET) zio_crypt_encode_mac_bp(bp, zp->zp_mac); /* dnode blocks must be written out in the provided byteorder */ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && ot == DMU_OT_DNODE) { void *bswap_buf = zio_buf_alloc(psize); abd_t *babd = abd_get_from_buf(bswap_buf, psize); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); abd_copy_to_buf(bswap_buf, zio->io_abd, psize); dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, psize); abd_take_ownership_of_buf(babd, B_TRUE); zio_push_transform(zio, babd, psize, psize, NULL); } if (DMU_OT_IS_ENCRYPTED(ot)) zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); return (zio); } /* indirect blocks only maintain a cksum of the lower level MACs */ if (BP_GET_LEVEL(bp) > 0) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Objset blocks are a special case since they have 2 256-bit MACs * embedded within them. */ if (ot == DMU_OT_OBJSET) { ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); return (zio); } /* unencrypted object types are only authenticated with a MAC */ if (!DMU_OT_IS_ENCRYPTED(ot)) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Later passes of sync-to-convergence may decide to rewrite data * in place to avoid more disk reallocations. This presents a problem * for encryption because this constitutes rewriting the new data with * the same encryption key and IV. However, this only applies to blocks * in the MOS (particularly the spacemaps) and we do not encrypt the * MOS. We assert that the zio is allocating or an intent log write * to enforce this. */ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); ASSERT3U(psize, !=, 0); enc_buf = zio_buf_alloc(psize); eabd = abd_get_from_buf(enc_buf, psize); abd_take_ownership_of_buf(eabd, B_TRUE); /* * For an explanation of what encryption parameters are stored * where, see the block comment in zio_crypt.c. */ if (ot == DMU_OT_INTENT_LOG) { zio_crypt_decode_params_bp(bp, salt, iv); } else { BP_SET_CRYPT(bp, B_TRUE); } /* Perform the encryption. 
This should not fail */ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); /* encode encryption metadata into the bp */ if (ot == DMU_OT_INTENT_LOG) { /* * ZIL blocks store the MAC in the embedded checksum, so the * transform must always be applied. */ zio_crypt_encode_mac_zil(enc_buf, mac); zio_push_transform(zio, eabd, psize, psize, NULL); } else { BP_SET_CRYPT(bp, B_TRUE); zio_crypt_encode_params_bp(bp, salt, iv); zio_crypt_encode_mac_bp(bp, mac); if (no_crypt) { ASSERT3U(ot, ==, DMU_OT_DNODE); abd_free(eabd); } else { zio_push_transform(zio, eabd, psize, psize, NULL); } } return (zio); } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (zio); } static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_checksum_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, &info); } } return (zio); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? 
e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } #ifdef ZFS_DEBUG if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; #endif if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ metaslab_class_throttle_unreserve( zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio); zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (bp != NULL && BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (zio); } /* * Update the allocation throttle accounting. */ static void zio_dva_throttle_done(zio_t *zio) { zio_t *lio __maybe_unused = zio->io_logical; zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT(vd != NULL); ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); /* * Parents of gang children can have two flavors -- ones that * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) * and ones that allocated the constituent blocks. The allocation * throttle needs to know the allocating parent zio so we must find * it here. */ if (pio->io_child_type == ZIO_CHILD_GANG) { /* * If our parent is a rewrite gang child then our grandparent * would have been the one that performed the allocation. 
*/ if (pio->io_flags & ZIO_FLAG_IO_REWRITE) pio = zio_unique_parent(pio); flags |= METASLAB_GANG_CHILD; } ASSERT(IO_IS_ALLOCATING(pio)); ASSERT(ZIO_HAS_ALLOCATOR(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); ASSERT(zio->io_metaslab_class != NULL); mutex_enter(&pio->io_lock); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } static zio_t * zio_done(zio_t *zio) { /* * Always attempt to keep stack usage minimal here since * we can be called recursively up to 19 levels deep. */ const uint64_t psize = zio->io_size; zio_t *pio, *pio_next; zio_link_t *zl = NULL; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { return (NULL); } /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { ASSERT(zio->io_metaslab_class != NULL); ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } /* * If the allocation throttle is enabled, verify that * we have decremented the refcounts for every I/O that was throttled. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); ASSERT(ZIO_HAS_ALLOCATOR(zio)); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class-> mc_allocator[zio->io_allocator].mca_alloc_slots, zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. 
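 *
 * (The reports are drained here, before zio_pop_transforms() below
 * discards the transformed buffer.) The loop is a plain pop-from-head
 * drain of a singly linked list; schematically (hypothetical ex_* names):
 *
 *	while ((r = *headp) != NULL) {
 *		*headp = r->ex_next;
 *		r->ex_next = NULL;
 *		ex_finish(r);
 *	}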
*/ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); abd_t *adata = zio->io_abd; if (adata != NULL && asize != psize) { adata = abd_alloc(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, adata); zfs_ereport_free_checksum(zcr); if (adata != NULL && asize != psize) abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev is slow, exceeding * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. */ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { /* * We want to only increment our slow IO counters if * the IO is valid (i.e. not if the drive is removed). * * zfs_ereport_post() will also do these checks, but * it can also ratelimit and have other failures, so we * need to increment the slow_io counters independent * of it. */ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_slow_ios++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); } } } if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { mutex_enter(&zio->io_vd->vdev_stat_lock); if (zio->io_type == ZIO_TYPE_READ) zio->io_vd->vdev_stat.vs_read_errors++; else if (zio->io_type == ZIO_TYPE_WRITE) zio->io_vd->vdev_stat.vs_write_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); } } if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark, BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } if (zio->io_error && zio == zio->io_logical) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). 
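 *
 * There are two flavours: ZIO_REEXECUTE_NOW retries immediately, while
 * ZIO_REEXECUTE_SUSPEND parks the tree via zio_suspend() until conditions
 * improve. The first case below boils down to (illustrative only):
 *
 *	if (allocating && !can_fail)
 *		mode = (error == ENOSPC) ? SUSPEND : NOW;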
*/ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); /* * This is a rare code path, so we don't * bother with "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; /* * This is a rare code path, so we don't bother with * "next_to_execute". 
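 *
 * ("next_to_execute" is the mechanism used at the bottom of zio_done():
 * rather than recursing into a parent that just became runnable, the
 * parent is handed back to zio_execute()'s driving loop, roughly
 *
 *	while (zio != NULL)
 *		zio = stage(zio);
 *
 * which keeps stack depth bounded no matter how deep the zio tree is.)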
*/ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ spa_taskq_dispatch(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, zio_reexecute, zio, B_FALSE); } return (NULL); } ASSERT(list_is_empty(&zio->io_child_list)); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. */ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * We are done executing this zio. We may want to execute a parent * next. See the comment in zio_notify_parent(). */ zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (next_to_execute); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_write_bp_init, zio_free_bp_init, zio_issue_async, zio_write_compress, zio_encrypt, zio_checksum_generate, zio_nop_write, zio_brt_free, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). 
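 * For example, assuming the common 128K indirect block size (1024
 * 128-byte block pointers per indirect block), a level-1 bookmark with
 * zb_blkid = 3 has a span of 1024 and so an L0 equivalent of
 * 3 * 1024 = 3072, the first of the L0 blkids it covers.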
* zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. */ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT0(last_block->zb_level); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. 
The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to make sure that this code still works afterwards. */ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } /* * This function is similar to zbookmark_subtree_completed(), but returns true * if subtree_root is equal or ahead of last_block, i.e. still to be done. */ boolean_t zbookmark_subtree_tbd(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { ASSERT0(last_block->zb_level); if (dnp == NULL) return (B_FALSE); return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root, last_block) >= 0); } EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, "Prioritize requeued I/O"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, UINT, ZMOD_RW, "Defer frees starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW, "Don't compress starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW, "Rewrite new bps starting in this pass"); ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, "Throttle block allocations in the ZIO pipeline"); ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs");
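
/*
 * Illustrative footnote on zbookmark_subtree_completed()/_tbd() above
 * (a hypothetical 1-D model, not part of the implementation): collapsing
 * bookmarks to their L0 equivalents, a subtree rooted at blkid B with
 * span S covers L0 ids [B*S, (B+1)*S), so it has been fully visited once
 * the traversal cursor has reached the first id of the next sibling:
 *
 *	static boolean_t
 *	ex_subtree_completed(uint64_t B, uint64_t S, uint64_t cursor)
 *	{
 *		return (cursor >= (B + 1) * S);
 *	}
 *
 * which is what incrementing mod_zb.zb_blkid before calling
 * zbookmark_compare() achieves.
 */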