diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 1417ab5f23e8..088ef3c05957 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1,7023 +1,7023 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. * Copyright (c) 2015, 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdb.h" #define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ zio_compress_table[(idx)].ci_name : "UNKNOWN") #define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ zio_checksum_table[(idx)].ci_name : "UNKNOWN") #define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ DMU_OT_ZAP_OTHER : \ (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \ DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) static char * zdb_ot_name(dmu_object_type_t type) { if (type < DMU_OT_NUMTYPES) return (dmu_ot[type].ot_name); else if ((type & DMU_OT_NEWTYPE) && ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS)) return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name); else return ("UNKNOWN"); } extern int reference_tracking_enable; extern int zfs_recover; extern uint64_t zfs_arc_max, zfs_arc_meta_limit; extern int zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern int zfs_reconstruct_indirect_combinations_max; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); uint64_t *zopt_object = NULL; static unsigned zopt_objects = 0; uint64_t max_inflight = 1000; static int leaked_objects = 0; static range_tree_t *mos_refd_objs; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, boolean_t); static void mos_obj_refd(uint64_t); static void mos_obj_refd_multiple(uint64_t); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void usage(void) { (void) fprintf(stderr, "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p ...]] " "[-I ]\n" "\t\t[-o =]... [-t ] [-U ] [-x ]\n" "\t\t[ [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] \n" "\t\t[ ...]\n" "\t%s [-v] \n" "\t%s -C [-A] [-U ]\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" "\t%s -O \n" "\t%s -R [-A] [-e [-V] [-p ...]] [-U ]\n" "\t\t ::[:]\n" "\t%s -E [-A] word0:word1:...:word15\n" "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); (void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n"); (void) fprintf(stderr, " If object numbers are specified, only " "those objects are dumped\n\n"); (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b block statistics\n"); (void) fprintf(stderr, " -c checksum all metadata (twice for " "all data) blocks\n"); (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); (void) fprintf(stderr, " -d dataset(s)\n"); (void) fprintf(stderr, " -D dedup statistics\n"); (void) fprintf(stderr, " -E decode and display block from an " "embedded block pointer\n"); (void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -i intent logs\n"); (void) fprintf(stderr, " -l read label contents\n"); (void) fprintf(stderr, " -k examine the checkpointed state " "of the pool\n"); (void) fprintf(stderr, " -L disable leak tracking (do not " "load spacemaps)\n"); (void) fprintf(stderr, " -m metaslabs\n"); (void) fprintf(stderr, " -M metaslab groups\n"); (void) fprintf(stderr, " -O perform object lookups by path\n"); (void) fprintf(stderr, " -R read and display block from a " "device\n"); (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v verbose (applies to all " "others)\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A ignore assertions (-A), enable " "panic recovery (-AA) or both (-AAA)\n"); (void) fprintf(stderr, " -e pool is exported/destroyed/" "has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -F attempt automatic rewind within " "safe range of transaction groups\n"); (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " "exiting\n"); (void) fprintf(stderr, " -I -- " "specify the maximum number of\n " "checksumming I/Os [default is 200]\n"); (void) fprintf(stderr, " -o = set global " "variable to an unsigned 32-bit integer\n"); (void) fprintf(stderr, " -p -- use one or more with " "-e to specify path to vdev dir\n"); (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -q don't print label contents\n"); (void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n"); (void) fprintf(stderr, " -u uberblock\n"); (void) fprintf(stderr, " -U -- use alternate " "cachefile\n"); (void) fprintf(stderr, " -V do verbatim import\n"); (void) fprintf(stderr, " -x -- " "dump all read blocks into specified directory\n"); (void) fprintf(stderr, " -X attempt extreme rewind (does not " "work with dataset)\n"); (void) fprintf(stderr, " -Y attempt all reconstruction " "combinations for split blocks\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); exit(1); } static void dump_debug_buffer(void) { if (dump_opt['G']) { (void) printf("\n"); (void) fflush(stdout); zfs_dbgmsg_print("zdb"); } } /* * Called for usage errors that are discovered after a call to spa_open(), * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. */ static void fatal(const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) fprintf(stderr, "%s: ", cmdname); (void) vfprintf(stderr, fmt, ap); va_end(ap); (void) fprintf(stderr, "\n"); dump_debug_buffer(); exit(1); } /* ARGSUSED */ static void dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) { nvlist_t *nv; size_t nvsize = *(uint64_t *)data; char *packed = umem_alloc(nvsize, UMEM_NOFAIL); VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); umem_free(packed, nvsize); dump_nvlist(nv, 8); nvlist_free(nv); } /* ARGSUSED */ static void dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) { spa_history_phys_t *shp = data; if (shp == NULL) return; (void) printf("\t\tpool_create_len = %llu\n", (u_longlong_t)shp->sh_pool_create_len); (void) printf("\t\tphys_max_off = %llu\n", (u_longlong_t)shp->sh_phys_max_off); (void) printf("\t\tbof = %llu\n", (u_longlong_t)shp->sh_bof); (void) printf("\t\teof = %llu\n", (u_longlong_t)shp->sh_eof); (void) printf("\t\trecords_lost = %llu\n", (u_longlong_t)shp->sh_records_lost); } static void zdb_nicenum(uint64_t num, char *buf, size_t buflen) { if (dump_opt['P']) (void) snprintf(buf, buflen, "%llu", (longlong_t)num); else nicenum(num, buf, sizeof (buf)); } static const char histo_stars[] = "****************************************"; static const uint64_t histo_width = sizeof (histo_stars) - 1; static void dump_histogram(const uint64_t *histo, int size, int offset) { int i; int minidx = size - 1; int maxidx = 0; uint64_t max = 0; for (i = 0; i < size; i++) { if (histo[i] > max) max = histo[i]; if (histo[i] > 0 && i > maxidx) maxidx = i; if (histo[i] > 0 && i < minidx) minidx = i; } if (max < histo_width) max = histo_width; for (i = minidx; i <= maxidx; i++) { (void) printf("\t\t\t%3u: %6llu %s\n", i + offset, (u_longlong_t)histo[i], &histo_stars[(max - histo[i]) * histo_width / max]); } } static void dump_zap_stats(objset_t *os, uint64_t object) { int error; zap_stats_t zs; error = zap_get_stats(os, object, &zs); if (error) return; if (zs.zs_ptrtbl_len == 0) { ASSERT(zs.zs_num_blocks == 1); (void) printf("\tmicrozap: %llu bytes, %llu entries\n", (u_longlong_t)zs.zs_blocksize, (u_longlong_t)zs.zs_num_entries); return; } (void) printf("\tFat ZAP stats:\n"); (void) printf("\t\tPointer table:\n"); (void) printf("\t\t\t%llu elements\n", (u_longlong_t)zs.zs_ptrtbl_len); (void) printf("\t\t\tzt_blk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_blk); (void) printf("\t\t\tzt_numblks: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_numblks); (void) printf("\t\t\tzt_shift: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_shift); (void) printf("\t\t\tzt_blks_copied: %llu\n", (u_longlong_t)zs.zs_ptrtbl_blks_copied); (void) printf("\t\t\tzt_nextblk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_nextblk); (void) printf("\t\tZAP entries: %llu\n", (u_longlong_t)zs.zs_num_entries); (void) printf("\t\tLeaf blocks: %llu\n", (u_longlong_t)zs.zs_num_leafs); (void) printf("\t\tTotal blocks: %llu\n", (u_longlong_t)zs.zs_num_blocks); (void) printf("\t\tzap_block_type: 0x%llx\n", (u_longlong_t)zs.zs_block_type); (void) printf("\t\tzap_magic: 0x%llx\n", (u_longlong_t)zs.zs_magic); (void) printf("\t\tzap_salt: 0x%llx\n", (u_longlong_t)zs.zs_salt); (void) printf("\t\tLeafs with 2^n pointers:\n"); dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks with n*5 entries:\n"); dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks n/10 full:\n"); dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tEntries with n chunks:\n"); dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBuckets with n entries:\n"); dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); } /*ARGSUSED*/ static void dump_none(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) { (void) printf("\tUNKNOWN OBJECT TYPE\n"); } /*ARGSUSED*/ static void dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { uint64_t *arr; uint64_t oursize; if (dump_opt['d'] < 6) return; if (data == NULL) { dmu_object_info_t doi; VERIFY0(dmu_object_info(os, object, &doi)); size = doi.doi_max_offset; /* * We cap the size at 1 mebibyte here to prevent * allocation failures and nigh-infinite printing if the * object is extremely large. */ oursize = MIN(size, 1 << 20); arr = kmem_alloc(oursize, KM_SLEEP); int err = dmu_read(os, object, 0, oursize, arr, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(arr, oursize); return; } } else { /* * Even though the allocation is already done in this code path, * we still cap the size to prevent excessive printing. */ oursize = MIN(size, 1 << 20); arr = data; } if (size == 0) { (void) printf("\t\t[]\n"); return; } (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]); for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) { if (i % 4 != 0) (void) printf(", %0llx", (u_longlong_t)arr[i]); else (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]); } if (oursize != size) (void) printf(", ... "); (void) printf("]\n"); if (data == NULL) kmem_free(arr, oursize); } /*ARGSUSED*/ static void dump_zap(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; void *prop; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } prop = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); (void) zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, prop); if (attr.za_integer_length == 1) { (void) printf("%s", (char *)prop); } else { for (i = 0; i < attr.za_num_integers; i++) { switch (attr.za_integer_length) { case 2: (void) printf("%u ", ((uint16_t *)prop)[i]); break; case 4: (void) printf("%u ", ((uint32_t *)prop)[i]); break; case 8: (void) printf("%lld ", (u_longlong_t)((int64_t *)prop)[i]); break; } } } (void) printf("\n"); umem_free(prop, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } static void dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) { bpobj_phys_t *bpop = data; uint64_t i; char bytes[32], comp[32], uncomp[32]; /* make sure the output won't get truncated */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (bpop == NULL) return; zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); (void) printf("\t\tnum_blkptrs = %llu\n", (u_longlong_t)bpop->bpo_num_blkptrs); (void) printf("\t\tbytes = %s\n", bytes); if (size >= BPOBJ_SIZE_V1) { (void) printf("\t\tcomp = %s\n", comp); (void) printf("\t\tuncomp = %s\n", uncomp); } if (size >= BPOBJ_SIZE_V2) { (void) printf("\t\tsubobjs = %llu\n", (u_longlong_t)bpop->bpo_subobjs); (void) printf("\t\tnum_subobjs = %llu\n", (u_longlong_t)bpop->bpo_num_subobjs); } if (size >= sizeof (*bpop)) { (void) printf("\t\tnum_freed = %llu\n", (u_longlong_t)bpop->bpo_num_freed); } if (dump_opt['d'] < 5) return; for (i = 0; i < bpop->bpo_num_blkptrs; i++) { char blkbuf[BP_SPRINTF_LEN]; blkptr_t bp; int err = dmu_read(os, object, i * sizeof (bp), sizeof (bp), &bp, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); break; } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp, BP_GET_FREE(&bp)); (void) printf("\t%s\n", blkbuf); } } /* ARGSUSED */ static void dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) { dmu_object_info_t doi; int64_t i; VERIFY0(dmu_object_info(os, object, &doi)); uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(subobjs, doi.doi_max_offset); return; } int64_t last_nonzero = -1; for (i = 0; i < doi.doi_max_offset / 8; i++) { if (subobjs[i] != 0) last_nonzero = i; } for (i = 0; i <= last_nonzero; i++) { (void) printf("\t%llu\n", (u_longlong_t)subobjs[i]); } kmem_free(subobjs, doi.doi_max_offset); } /*ARGSUSED*/ static void dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) { dump_zap_stats(os, object); /* contents are printed elsewhere, properly decoded */ } /*ARGSUSED*/ static void dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } (void) printf(" %llx : [%d:%d:%d]\n", (u_longlong_t)attr.za_first_integer, (int)ATTR_LENGTH(attr.za_first_integer), (int)ATTR_BSWAP(attr.za_first_integer), (int)ATTR_NUM(attr.za_first_integer)); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; uint16_t *layout_attrs; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = [", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } VERIFY(attr.za_integer_length == 2); layout_attrs = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); VERIFY(zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, layout_attrs) == 0); for (i = 0; i != attr.za_num_integers; i++) (void) printf(" %d ", (int)layout_attrs[i]); (void) printf("]\n"); umem_free(layout_attrs, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; const char *typenames[] = { /* 0 */ "not specified", /* 1 */ "FIFO", /* 2 */ "Character Device", /* 3 */ "3 (invalid)", /* 4 */ "Directory", /* 5 */ "5 (invalid)", /* 6 */ "Block Device", /* 7 */ "7 (invalid)", /* 8 */ "Regular File", /* 9 */ "9 (invalid)", /* 10 */ "Symbolic Link", /* 11 */ "11 (invalid)", /* 12 */ "Socket", /* 13 */ "Door", /* 14 */ "Event Port", /* 15 */ "15 (invalid)", }; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = %lld (type: %s)\n", attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); } zap_cursor_fini(&zc); } static int get_dtl_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_ops->vdev_op_leaf) { space_map_t *sm = vd->vdev_dtl_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) return (1); return (0); } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_dtl_refcount(vd->vdev_child[c]); return (refcount); } static int get_metaslab_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd) { for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { space_map_t *sm = vd->vdev_ms[m]->ms_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) refcount++; } } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_metaslab_refcount(vd->vdev_child[c]); return (refcount); } static int get_obsolete_refcount(vdev_t *vd) { uint64_t obsolete_sm_object; int refcount = 0; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (vd->vdev_top == vd && obsolete_sm_object != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, obsolete_sm_object, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { refcount++; } } else { ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); ASSERT3U(obsolete_sm_object, ==, 0); } for (unsigned c = 0; c < vd->vdev_children; c++) { refcount += get_obsolete_refcount(vd->vdev_child[c]); } return (refcount); } static int get_prev_obsolete_spacemap_refcount(spa_t *spa) { uint64_t prev_obj = spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; if (prev_obj != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { return (1); } } return (0); } static int get_checkpoint_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && zap_contains(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) refcount++; for (uint64_t c = 0; c < vd->vdev_children; c++) refcount += get_checkpoint_refcount(vd->vdev_child[c]); return (refcount); } static int get_log_spacemap_refcount(spa_t *spa) { return (avl_numnodes(&spa->spa_sm_logs_by_txg)); } static int verify_spacemap_refcounts(spa_t *spa) { uint64_t expected_refcount = 0; uint64_t actual_refcount; (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], &expected_refcount); actual_refcount = get_dtl_refcount(spa->spa_root_vdev); actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); actual_refcount += get_prev_obsolete_spacemap_refcount(spa); actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); actual_refcount += get_log_spacemap_refcount(spa); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " "actual %lld\n", (longlong_t)expected_refcount, (longlong_t)actual_refcount); return (2); } return (0); } static void dump_spacemap(objset_t *os, space_map_t *sm) { const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", "INVALID", "INVALID", "INVALID", "INVALID" }; if (sm == NULL) return; (void) printf("space map object %llu:\n", (longlong_t)sm->sm_object); (void) printf(" smp_length = 0x%llx\n", (longlong_t)sm->sm_phys->smp_length); (void) printf(" smp_alloc = 0x%llx\n", (longlong_t)sm->sm_phys->smp_alloc); if (dump_opt['d'] < 6 && dump_opt['m'] < 4) return; /* * Print out the freelist entries in both encoded and decoded form. */ uint8_t mapshift = sm->sm_shift; int64_t alloc = 0; uint64_t word, entry_id = 0; for (uint64_t offset = 0; offset < space_map_length(sm); offset += sizeof (word)) { VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (word), &word, DMU_READ_PREFETCH)); if (sm_entry_is_debug(word)) { (void) printf("\t [%6llu] %s: txg %llu pass %llu\n", (u_longlong_t)entry_id, ddata[SM_DEBUG_ACTION_DECODE(word)], (u_longlong_t)SM_DEBUG_TXG_DECODE(word), (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word)); entry_id++; continue; } uint8_t words; char entry_type; uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; if (sm_entry_is_single_word(word)) { entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM_OFFSET_DECODE(word) << mapshift) + sm->sm_start; entry_run = SM_RUN_DECODE(word) << mapshift; words = 1; } else { /* it is a two-word entry so we read another word */ ASSERT(sm_entry_is_double_word(word)); uint64_t extra_word; offset += sizeof (extra_word); VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (extra_word), &extra_word, DMU_READ_PREFETCH)); ASSERT3U(offset, <=, space_map_length(sm)); entry_run = SM2_RUN_DECODE(word) << mapshift; entry_vdev = SM2_VDEV_DECODE(word); entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM2_OFFSET_DECODE(extra_word) << mapshift) + sm->sm_start; words = 2; } (void) printf("\t [%6llu] %c range:" " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", (u_longlong_t)entry_id, entry_type, (u_longlong_t)entry_off, (u_longlong_t)(entry_off + entry_run), (u_longlong_t)entry_run, (u_longlong_t)entry_vdev, words); if (entry_type == 'A') alloc += entry_run; else alloc -= entry_run; entry_id++; } if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%lld) INCONSISTENT " "with space map summary (%lld)\n", (longlong_t)space_map_allocated(sm), (longlong_t)alloc); } } static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; range_tree_t *rt = msp->ms_allocatable; avl_tree_t *t = &msp->ms_allocatable_by_size; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); - zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf)); + zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", avl_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; space_map_t *sm = msp->ms_sm; char freebuf[32]; zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, sizeof (freebuf)); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, (u_longlong_t)space_map_object(sm), freebuf); if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); range_tree_stat_verify(msp->ms_allocatable); dump_metaslab_stats(msp); metaslab_unload(msp); mutex_exit(&msp->ms_lock); } if (dump_opt['m'] > 1 && sm != NULL && spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { /* * The space map histogram represents free space in chunks * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). */ (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", (u_longlong_t)msp->ms_fragmentation); dump_histogram(sm->sm_phys->smp_histogram, SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", (u_longlong_t)metaslab_unflushed_txg(msp)); } } static void print_vdev_metaslab_header(vdev_t *vd) { vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *bias_str = ""; if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { bias_str = VDEV_ALLOC_BIAS_LOG; } else if (alloc_bias == VDEV_BIAS_SPECIAL) { bias_str = VDEV_ALLOC_BIAS_SPECIAL; } else if (alloc_bias == VDEV_BIAS_DEDUP) { bias_str = VDEV_ALLOC_BIAS_DEDUP; } uint64_t ms_flush_data_obj = 0; if (vd->vdev_top_zap != 0) { int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &ms_flush_data_obj); if (error != ENOENT) { ASSERT0(error); } } (void) printf("\tvdev %10llu %s", (u_longlong_t)vd->vdev_id, bias_str); if (ms_flush_data_obj != 0) { (void) printf(" ms_unflushed_phys object %llu", (u_longlong_t)ms_flush_data_obj); } (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %12s\n", "---------------", "-------------------", "---------------", "------------"); } static void dump_metaslab_groups(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; metaslab_class_t *mc = spa_normal_class(spa); uint64_t fragmentation; metaslab_class_histogram_verify(mc); for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || mg->mg_class != mc) continue; metaslab_group_histogram_verify(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" "fragmentation", (u_longlong_t)tvd->vdev_id, (u_longlong_t)tvd->vdev_ms_count); if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { (void) printf("%3s\n", "-"); } else { (void) printf("%3llu%%\n", (u_longlong_t)mg->mg_fragmentation); } dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } (void) printf("\tpool %s\tfragmentation", spa_name(spa)); fragmentation = metaslab_class_fragmentation(mc); if (fragmentation == ZFS_FRAG_INVALID) (void) printf("\t%3s\n", "-"); else (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void print_vdev_indirect(vdev_t *vd) { vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib = vd->vdev_indirect_births; if (vim == NULL) { ASSERT3P(vib, ==, NULL); return; } ASSERT3U(vdev_indirect_mapping_object(vim), ==, vic->vic_mapping_object); ASSERT3U(vdev_indirect_births_object(vib), ==, vic->vic_births_object); (void) printf("indirect births obj %llu:\n", (longlong_t)vic->vic_births_object); (void) printf(" vib_count = %llu\n", (longlong_t)vdev_indirect_births_count(vib)); for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { vdev_indirect_birth_entry_phys_t *cur_vibe = &vib->vib_entries[i]; (void) printf("\toffset %llx -> txg %llu\n", (longlong_t)cur_vibe->vibe_offset, (longlong_t)cur_vibe->vibe_phys_birth_txg); } (void) printf("\n"); (void) printf("indirect mapping obj %llu:\n", (longlong_t)vic->vic_mapping_object); (void) printf(" vim_max_offset = 0x%llx\n", (longlong_t)vdev_indirect_mapping_max_offset(vim)); (void) printf(" vim_bytes_mapped = 0x%llx\n", (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); (void) printf(" vim_count = %llu\n", (longlong_t)vdev_indirect_mapping_num_entries(vim)); if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) return; uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; (void) printf("\t<%llx:%llx:%llx> -> " "<%llx:%llx:%llx> (%x obsolete)\n", (longlong_t)vd->vdev_id, (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), counts[i]); } (void) printf("\n"); uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; (void) printf("obsolete space map object %llu:\n", (u_longlong_t)obsolete_sm_object); ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, obsolete_sm_object); dump_spacemap(mos, vd->vdev_obsolete_sm); (void) printf("\n"); } } static void dump_metaslabs(spa_t *spa) { vdev_t *vd, *rvd = spa->spa_root_vdev; uint64_t m, c = 0, children = rvd->vdev_children; (void) printf("\nMetaslabs:\n"); if (!dump_opt['d'] && zopt_objects > 0) { c = zopt_object[0]; if (c >= children) (void) fatal("bad vdev id: %llu", (u_longlong_t)c); if (zopt_objects > 1) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); for (m = 1; m < zopt_objects; m++) { if (zopt_object[m] < vd->vdev_ms_count) dump_metaslab( vd->vdev_ms[zopt_object[m]]); else (void) fprintf(stderr, "bad metaslab " "number %llu\n", (u_longlong_t)zopt_object[m]); } (void) printf("\n"); return; } children = c + 1; } for (; c < children; c++) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); print_vdev_indirect(vd); for (m = 0; m < vd->vdev_ms_count; m++) dump_metaslab(vd->vdev_ms[m]); (void) printf("\n"); } } static void dump_log_spacemaps(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; (void) printf("\nLog Space Maps in Pool:\n"); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { space_map_t *sm = NULL; VERIFY0(space_map_open(&sm, spa_meta_objset(spa), sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); (void) printf("Log Spacemap object %llu txg %llu\n", (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); dump_spacemap(spa->spa_meta_objset, sm); space_map_close(sm); } (void) printf("\n"); } static void dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) { const ddt_phys_t *ddp = dde->dde_phys; const ddt_key_t *ddk = &dde->dde_key; const char *types[4] = { "ditto", "single", "double", "triple" }; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; int p; for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu %s %s\n", (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, types[p], blkbuf); } } static void dump_dedup_ratio(const ddt_stat_t *dds) { double rL, rP, rD, D, dedup, compress, copies; if (dds->dds_blocks == 0) return; rL = (double)dds->dds_ref_lsize; rP = (double)dds->dds_ref_psize; rD = (double)dds->dds_ref_dsize; D = (double)dds->dds_dsize; dedup = rD / D; compress = rL / rP; copies = rD / rP; (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " "dedup * compress / copies = %.2f\n\n", dedup, compress, copies, dedup * compress / copies); } static void dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) { char name[DDT_NAMELEN]; ddt_entry_t dde; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; int error; error = ddt_object_info(ddt, type, class, &doi); if (error == ENOENT) return; ASSERT(error == 0); error = ddt_object_count(ddt, type, class, &count); ASSERT(error == 0); if (count == 0) return; dspace = doi.doi_physical_blocks_512 << 9; mspace = doi.doi_fill_count * doi.doi_data_block_size; ddt_object_name(ddt, type, class, name); (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", name, (u_longlong_t)count, (u_longlong_t)(dspace / count), (u_longlong_t)(mspace / count)); if (dump_opt['D'] < 3) return; zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); if (dump_opt['D'] < 4) return; if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) return; (void) printf("%s contents:\n\n", name); while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) dump_dde(ddt, &dde, walk); ASSERT3U(error, ==, ENOENT); (void) printf("\n"); } static void dump_all_ddts(spa_t *spa) { ddt_histogram_t ddh_total; ddt_stat_t dds_total; bzero(&ddh_total, sizeof (ddh_total)); bzero(&dds_total, sizeof (dds_total)); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; for (enum ddt_type type = 0; type < DDT_TYPES; type++) { for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { dump_ddt(ddt, type, class); } } } ddt_get_dedup_stats(spa, &dds_total); if (dds_total.dds_blocks == 0) { (void) printf("All DDTs are empty\n"); return; } (void) printf("\n"); if (dump_opt['D'] > 1) { (void) printf("DDT histogram (aggregated over all DDTs):\n"); ddt_get_dedup_histogram(spa, &ddh_total); zpool_dump_ddt(&dds_total, &ddh_total); } dump_dedup_ratio(&dds_total); } static void dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { char *prefix = arg; (void) printf("%s [%llu,%llu) length %llu\n", prefix, (u_longlong_t)start, (u_longlong_t)(start + size), (u_longlong_t)(size)); } static void dump_dtl(vdev_t *vd, int indent) { spa_t *spa = vd->vdev_spa; boolean_t required; const char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" }; char prefix[256]; spa_vdev_state_enter(spa, SCL_NONE); required = vdev_dtl_required(vd); (void) spa_vdev_state_exit(spa, NULL, 0); if (indent == 0) (void) printf("\nDirty time logs:\n\n"); (void) printf("\t%*s%s [%s]\n", indent, "", vd->vdev_path ? vd->vdev_path : vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), required ? "DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { range_tree_t *rt = vd->vdev_dtl[t]; if (range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); range_tree_walk(rt, dump_dtl_seg, prefix); if (dump_opt['d'] > 5 && vd->vdev_children == 0) dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } for (unsigned c = 0; c < vd->vdev_children; c++) dump_dtl(vd->vdev_child[c], indent + 4); } static void dump_history(spa_t *spa) { nvlist_t **events = NULL; char *buf; uint64_t resid, len, off = 0; uint_t num = 0; int error; time_t tsec; struct tm t; char tbuf[30]; char internalstr[MAXPATHLEN]; if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) { (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n", __func__); return; } do { len = SPA_OLD_MAXBLOCKSIZE; if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { (void) fprintf(stderr, "Unable to read history: " "error %d\n", error); free(buf); return; } if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) break; off -= resid; } while (len != 0); (void) printf("\nHistory:\n"); for (unsigned i = 0; i < num; i++) { uint64_t time, txg, ievent; char *cmd, *intstr; boolean_t printed = B_FALSE; if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, &time) != 0) goto next; if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, &cmd) != 0) { if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_INT_EVENT, &ievent) != 0) goto next; verify(nvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG, &txg) == 0); verify(nvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR, &intstr) == 0); if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) goto next; (void) snprintf(internalstr, sizeof (internalstr), "[internal %s txg:%lld] %s", zfs_history_event_names[ievent], (longlong_t)txg, intstr); cmd = internalstr; } tsec = time; (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); (void) printf("%s %s\n", tbuf, cmd); printed = B_TRUE; next: if (dump_opt['h'] > 1) { if (!printed) (void) printf("unrecognized record:\n"); dump_nvlist(events[i], 2); } } free(buf); } /*ARGSUSED*/ static void dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) { } static uint64_t blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { if (dnp == NULL) { ASSERT(zb->zb_level < 0); if (zb->zb_object == 0) return (zb->zb_blkid); return (zb->zb_blkid * BP_GET_LSIZE(bp)); } ASSERT(zb->zb_level >= 0); return ((zb->zb_blkid << (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } static void snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, boolean_t bp_freed) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; int i; if (dump_opt['b'] >= 6) { snprintf_blkptr(blkbuf, buflen, bp); if (bp_freed) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); } return; } if (BP_IS_EMBEDDED(bp)) { (void) sprintf(blkbuf, "EMBEDDED et=%u %llxL/%llxP B=%llu", (int)BPE_GET_ETYPE(bp), (u_longlong_t)BPE_GET_LSIZE(bp), (u_longlong_t)BPE_GET_PSIZE(bp), (u_longlong_t)bp->blk_birth); return; } blkbuf[0] = '\0'; for (i = 0; i < ndvas; i++) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); if (BP_IS_HOLE(bp)) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL B=%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)bp->blk_birth); } else { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); if (bp_freed) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); } } static void print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; int l; if (!BP_IS_EMBEDDED(bp)) { ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); } (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { (void) printf("L%llx", (u_longlong_t)zb->zb_level); } else { (void) printf(" "); } } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); (void) printf("%s\n", blkbuf); } static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_phys_t *zb) { int err = 0; if (bp->blk_birth == 0) return (0); print_indirect(bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; ASSERT(!BP_IS_REDACTED(bp)); err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); ASSERT(buf->b_data); /* recursively visit blocks below this */ cbp = buf->b_data; for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = visit_indirect(spa, dnp, cbp, &czb); if (err) break; fill += BP_GET_FILL(cbp); } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); arc_buf_destroy(buf, &buf); } return (err); } /*ARGSUSED*/ static void dump_indirect(dnode_t *dn) { dnode_phys_t *dnp = dn->dn_phys; int j; zbookmark_phys_t czb; (void) printf("Indirect blocks:\n"); SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), dn->dn_object, dnp->dn_nlevels - 1, 0); for (j = 0; j < dnp->dn_nblkptr; j++) { czb.zb_blkid = j; (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, &dnp->dn_blkptr[j], &czb); } (void) printf("\n"); } /*ARGSUSED*/ static void dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dir_phys_t *dd = data; time_t crtime; char nice[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ); if (dd == NULL) return; ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); crtime = dd->dd_creation_time; (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\thead_dataset_obj = %llu\n", (u_longlong_t)dd->dd_head_dataset_obj); (void) printf("\t\tparent_dir_obj = %llu\n", (u_longlong_t)dd->dd_parent_obj); (void) printf("\t\torigin_obj = %llu\n", (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", (u_longlong_t)dd->dd_child_dir_zapobj); zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); (void) printf("\t\tused_bytes = %s\n", nice); zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); (void) printf("\t\tcompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); (void) printf("\t\tuncompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); (void) printf("\t\tquota = %s\n", nice); zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); (void) printf("\t\tdeleg_zapobj = %llu\n", (u_longlong_t)dd->dd_deleg_zapobj); (void) printf("\t\tflags = %llx\n", (u_longlong_t)dd->dd_flags); #define DO(which) \ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ sizeof (nice)); \ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) DO(HEAD); DO(SNAP); DO(CHILD); DO(CHILD_RSRV); DO(REFRSRV); #undef DO (void) printf("\t\tclones = %llu\n", (u_longlong_t)dd->dd_clones); } /*ARGSUSED*/ static void dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dataset_phys_t *ds = data; time_t crtime; char used[32], compressed[32], uncompressed[32], unique[32]; char blkbuf[BP_SPRINTF_LEN]; /* make sure nicenum has enough space */ CTASSERT(sizeof (used) >= NN_NUMBUF_SZ); CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ); CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ); if (ds == NULL) return; ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, sizeof (uncompressed)); zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); (void) printf("\t\tprev_snap_obj = %llu\n", (u_longlong_t)ds->ds_prev_snap_obj); (void) printf("\t\tprev_snap_txg = %llu\n", (u_longlong_t)ds->ds_prev_snap_txg); (void) printf("\t\tnext_snap_obj = %llu\n", (u_longlong_t)ds->ds_next_snap_obj); (void) printf("\t\tsnapnames_zapobj = %llu\n", (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); (void) printf("\t\tuserrefs_obj = %llu\n", (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); (void) printf("\t\tdeadlist_obj = %llu\n", (u_longlong_t)ds->ds_deadlist_obj); (void) printf("\t\tused_bytes = %s\n", used); (void) printf("\t\tcompressed_bytes = %s\n", compressed); (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); (void) printf("\t\tunique = %s\n", unique); (void) printf("\t\tfsid_guid = %llu\n", (u_longlong_t)ds->ds_fsid_guid); (void) printf("\t\tguid = %llu\n", (u_longlong_t)ds->ds_guid); (void) printf("\t\tflags = %llx\n", (u_longlong_t)ds->ds_flags); (void) printf("\t\tnext_clones_obj = %llu\n", (u_longlong_t)ds->ds_next_clones_obj); (void) printf("\t\tprops_obj = %llu\n", (u_longlong_t)ds->ds_props_obj); (void) printf("\t\tbp = %s\n", blkbuf); } /* ARGSUSED */ static int dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; if (bp->blk_birth != 0) { snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } return (0); } static void dump_bptree(objset_t *os, uint64_t obj, const char *name) { char bytes[32]; bptree_phys_t *bt; dmu_buf_t *db; /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); bt = db->db_data; zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); dmu_buf_rele(db, FTAG); if (dump_opt['d'] < 5) return; (void) printf("\n"); (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); } /* ARGSUSED */ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); (void) printf("\t%s\n", blkbuf); return (0); } static void dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) { char bytes[32]; char comp[32]; char uncomp[32]; uint64_t i; /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); if (bpo->bpo_havefreed) { (void) printf(" %*s: object %llu, %llu local " "blkptrs, %llu freed, %llu subobjs in object %llu, " "%s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_freed, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); } else { (void) printf(" %*s: object %llu, %llu local " "blkptrs, %llu subobjs in object %llu, " "%s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); } for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } dump_full_bpobj(&subbpo, "subobj", indent + 1); bpobj_close(&subbpo); } } else { if (bpo->bpo_havefreed) { (void) printf(" %*s: object %llu, %llu blkptrs, " "%llu freed, %s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_freed, bytes); } else { (void) printf(" %*s: object %llu, %llu blkptrs, " "%s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes); } } if (dump_opt['d'] < 5) return; if (indent == 0) { (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); (void) printf("\n"); } } static int dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact, boolean_t print_list) { int err = 0; zfs_bookmark_phys_t prop; objset_t *mos = dp->dp_spa->spa_meta_objset; err = dsl_bookmark_lookup(dp, name, NULL, &prop); if (err != 0) { return (err); } (void) printf("\t#%s: ", strchr(name, '#') + 1); (void) printf("{guid: %llx creation_txg: %llu creation_time: " "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid, (u_longlong_t)prop.zbm_creation_txg, (u_longlong_t)prop.zbm_creation_time, (u_longlong_t)prop.zbm_redaction_obj); IMPLY(print_list, print_redact); if (!print_redact || prop.zbm_redaction_obj == 0) return (0); redaction_list_t *rl; VERIFY0(dsl_redaction_list_hold_obj(dp, prop.zbm_redaction_obj, FTAG, &rl)); redaction_list_phys_t *rlp = rl->rl_phys; (void) printf("\tRedacted:\n\t\tProgress: "); if (rlp->rlp_last_object != UINT64_MAX || rlp->rlp_last_blkid != UINT64_MAX) { (void) printf("%llu %llu (incomplete)\n", (u_longlong_t)rlp->rlp_last_object, (u_longlong_t)rlp->rlp_last_blkid); } else { (void) printf("complete\n"); } (void) printf("\t\tSnapshots: ["); for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) { if (i > 0) (void) printf(", "); (void) printf("%0llu", (u_longlong_t)rlp->rlp_snaps[i]); } (void) printf("]\n\t\tLength: %llu\n", (u_longlong_t)rlp->rlp_num_entries); if (!print_list) { dsl_redaction_list_rele(rl, FTAG); return (0); } if (rlp->rlp_num_entries == 0) { dsl_redaction_list_rele(rl, FTAG); (void) printf("\t\tRedaction List: []\n\n"); return (0); } redact_block_phys_t *rbp_buf; uint64_t size; dmu_object_info_t doi; VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi)); size = doi.doi_max_offset; rbp_buf = kmem_alloc(size, KM_SLEEP); err = dmu_read(mos, prop.zbm_redaction_obj, 0, size, rbp_buf, 0); if (err != 0) { dsl_redaction_list_rele(rl, FTAG); kmem_free(rbp_buf, size); return (err); } (void) printf("\t\tRedaction List: [{object: %llx, offset: " "%llx, blksz: %x, count: %llx}", (u_longlong_t)rbp_buf[0].rbp_object, (u_longlong_t)rbp_buf[0].rbp_blkid, (uint_t)(redact_block_get_size(&rbp_buf[0])), (u_longlong_t)redact_block_get_count(&rbp_buf[0])); for (size_t i = 1; i < rlp->rlp_num_entries; i++) { (void) printf(",\n\t\t{object: %llx, offset: %llx, " "blksz: %x, count: %llx}", (u_longlong_t)rbp_buf[i].rbp_object, (u_longlong_t)rbp_buf[i].rbp_blkid, (uint_t)(redact_block_get_size(&rbp_buf[i])), (u_longlong_t)redact_block_get_count(&rbp_buf[i])); } dsl_redaction_list_rele(rl, FTAG); kmem_free(rbp_buf, size); (void) printf("]\n\n"); return (0); } static void dump_bookmarks(objset_t *os, int verbosity) { zap_cursor_t zc; zap_attribute_t attr; dsl_dataset_t *ds = dmu_objset_ds(os); dsl_pool_t *dp = spa_get_dsl(os->os_spa); objset_t *mos = os->os_spa->spa_meta_objset; if (verbosity < 4) return; dsl_pool_config_enter(dp, FTAG); for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { char osname[ZFS_MAX_DATASET_NAME_LEN]; char buf[ZFS_MAX_DATASET_NAME_LEN]; dmu_objset_name(os, osname); VERIFY3S(0, <=, snprintf(buf, sizeof (buf), "%s#%s", osname, attr.za_name)); (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6); } zap_cursor_fini(&zc); dsl_pool_config_exit(dp, FTAG); } static void bpobj_count_refd(bpobj_t *bpo) { mos_obj_refd(bpo->bpo_object); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { mos_obj_refd(bpo->bpo_phys->bpo_subobjs); for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } bpobj_count_refd(&subbpo); bpobj_close(&subbpo); } } } static int dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle) { spa_t *spa = arg; uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dle->dle_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dle->dle_bpobj); return (0); } static int dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle) { ASSERT(arg == NULL); if (dump_opt['d'] >= 5) { char buf[128]; (void) snprintf(buf, sizeof (buf), "mintxg %llu -> obj %llu", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); dump_full_bpobj(&dle->dle_bpobj, buf, 0); } else { (void) printf("mintxg %llu -> obj %llu\n", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); } return (0); } static void dump_blkptr_list(dsl_deadlist_t *dl, char *name) { char bytes[32]; char comp[32]; char uncomp[32]; char entries[32]; spa_t *spa = dmu_objset_spa(dl->dl_os); uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dl->dl_oldfmt) { if (dl->dl_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dl->dl_bpobj); } else { mos_obj_refd(dl->dl_object); dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa); } /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; if (dl->dl_oldfmt) { dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); return; } zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries)); (void) printf("\n %s: %s (%s/%s comp), %s entries\n", name, bytes, comp, uncomp, entries); if (dump_opt['d'] < 4) return; (void) printf("\n"); dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL); } static int verify_dd_livelist(objset_t *os) { uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp; dsl_pool_t *dp = spa_get_dsl(os->os_spa); dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; ASSERT(!dmu_objset_is_snapshot(os)); if (!dsl_deadlist_is_open(&dd->dd_livelist)) return (0); dsl_pool_config_enter(dp, FTAG); dsl_deadlist_space(&dd->dd_livelist, &ll_used, &ll_comp, &ll_uncomp); dsl_dataset_t *origin_ds; ASSERT(dsl_pool_config_held(dp)); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds)); VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset, &used, &comp, &uncomp)); dsl_dataset_rele(origin_ds, FTAG); dsl_pool_config_exit(dp, FTAG); /* * It's possible that the dataset's uncomp space is larger than the * livelist's because livelists do not track embedded block pointers */ if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) { char nice_used[32], nice_comp[32], nice_uncomp[32]; (void) printf("Discrepancy in space accounting:\n"); zdb_nicenum(used, nice_used, sizeof (nice_used)); zdb_nicenum(comp, nice_comp, sizeof (nice_comp)); zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp)); (void) printf("dir: used %s, comp %s, uncomp %s\n", nice_used, nice_comp, nice_uncomp); zdb_nicenum(ll_used, nice_used, sizeof (nice_used)); zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp)); zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp)); (void) printf("livelist: used %s, comp %s, uncomp %s\n", nice_used, nice_comp, nice_uncomp); return (1); } return (0); } static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; static objset_t *sa_os = NULL; static sa_attr_type_t *sa_attr_table = NULL; static int open_objset(const char *path, void *tag, objset_t **osp) { int err; uint64_t sa_attrs = 0; uint64_t version = 0; VERIFY3P(sa_os, ==, NULL); /* * We can't own an objset if it's redacted. Therefore, we do this * dance: hold the objset, then acquire a long hold on its dataset, then * release the pool (which is held as part of holding the objset). */ err = dmu_objset_hold(path, tag, osp); if (err != 0) { (void) fprintf(stderr, "failed to hold dataset '%s': %s\n", path, strerror(err)); return (err); } dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); dsl_pool_rele(dmu_objset_pool(*osp), tag); if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &version); if (version >= ZPL_VERSION_SA) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_attrs); } err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, &sa_attr_table); if (err != 0) { (void) fprintf(stderr, "sa_setup failed: %s\n", strerror(err)); dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); dsl_dataset_rele(dmu_objset_ds(*osp), tag); *osp = NULL; } } sa_os = *osp; return (0); } static void close_objset(objset_t *os, void *tag) { VERIFY3P(os, ==, sa_os); if (os->os_sa != NULL) sa_tear_down(os); dsl_dataset_long_rele(dmu_objset_ds(os), tag); dsl_dataset_rele(dmu_objset_ds(os), tag); sa_attr_table = NULL; sa_os = NULL; } static void fuid_table_destroy(void) { if (fuid_table_loaded) { zfs_fuid_table_destroy(&idx_tree, &domain_tree); fuid_table_loaded = B_FALSE; } } /* * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. * For CIFS files with FUID the fuid is printed in hex followed by * the domain-rid string. */ static void print_idstr(uint64_t id, const char *id_type) { if (FUID_INDEX(id)) { char *domain; domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); (void) printf("\t%s %llx [%s-%d]\n", id_type, (u_longlong_t)id, domain, (int)FUID_RID(id)); } else { (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); } } static void dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) { uint32_t uid_idx, gid_idx; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); /* Load domain table, if not already loaded */ if (!fuid_table_loaded && (uid_idx || gid_idx)) { uint64_t fuid_obj; /* first find the fuid object. It lives in the master node */ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj) == 0); zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; } print_idstr(uid, "uid"); print_idstr(gid, "gid"); } static void dump_znode_sa_xattr(sa_handle_t *hdl) { nvlist_t *sa_xattr; nvpair_t *elem = NULL; int sa_xattr_size = 0; int sa_xattr_entries = 0; int error; char *sa_xattr_packed; error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size); if (error || sa_xattr_size == 0) return; sa_xattr_packed = malloc(sa_xattr_size); if (sa_xattr_packed == NULL) return; error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR], sa_xattr_packed, sa_xattr_size); if (error) { free(sa_xattr_packed); return; } error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0); if (error) { free(sa_xattr_packed); return; } while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) sa_xattr_entries++; (void) printf("\tSA xattrs: %d bytes, %d entries\n\n", sa_xattr_size, sa_xattr_entries); while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) { uchar_t *value; uint_t cnt, idx; (void) printf("\t\t%s = ", nvpair_name(elem)); nvpair_value_byte_array(elem, &value, &cnt); for (idx = 0; idx < cnt; ++idx) { if (isprint(value[idx])) (void) putchar(value[idx]); else (void) printf("\\%3.3o", value[idx]); } (void) putchar('\n'); } nvlist_free(sa_xattr); free(sa_xattr_packed); } /*ARGSUSED*/ static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) { char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ sa_handle_t *hdl; uint64_t xattr, rdev, gen; uint64_t uid, gid, mode, fsize, parent, links; uint64_t pflags; uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; time_t z_crtime, z_atime, z_mtime, z_ctime; sa_bulk_attr_t bulk[12]; int idx = 0; int error; VERIFY3P(os, ==, sa_os); if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { (void) printf("Failed to get handle for SA znode\n"); return; } SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, &fsize, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, acctm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, modtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, crtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, chgtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, &pflags, 8); if (sa_bulk_lookup(hdl, bulk, idx)) { (void) sa_handle_destroy(hdl); return; } z_crtime = (time_t)crtm[0]; z_atime = (time_t)acctm[0]; z_mtime = (time_t)modtm[0]; z_ctime = (time_t)chgtm[0]; if (dump_opt['d'] > 4) { error = zfs_obj_to_path(os, object, path, sizeof (path)); if (error == ESTALE) { (void) snprintf(path, sizeof (path), "on delete queue"); } else if (error != 0) { leaked_objects++; (void) snprintf(path, sizeof (path), "path not found, possibly leaked"); } (void) printf("\tpath %s\n", path); } dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); (void) printf("\tctime %s", ctime(&z_ctime)); (void) printf("\tcrtime %s", ctime(&z_crtime)); (void) printf("\tgen %llu\n", (u_longlong_t)gen); (void) printf("\tmode %llo\n", (u_longlong_t)mode); (void) printf("\tsize %llu\n", (u_longlong_t)fsize); (void) printf("\tparent %llu\n", (u_longlong_t)parent); (void) printf("\tlinks %llu\n", (u_longlong_t)links); (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) { uint64_t projid; if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid, sizeof (uint64_t)) == 0) (void) printf("\tprojid %llu\n", (u_longlong_t)projid); } if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, sizeof (uint64_t)) == 0) (void) printf("\txattr %llu\n", (u_longlong_t)xattr); if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, sizeof (uint64_t)) == 0) (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); dump_znode_sa_xattr(hdl); sa_handle_destroy(hdl); } /*ARGSUSED*/ static void dump_acl(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) { } static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_none, /* unallocated */ dump_zap, /* object directory */ dump_uint64, /* object array */ dump_none, /* packed nvlist */ dump_packed_nvlist, /* packed nvlist size */ dump_none, /* bpobj */ dump_bpobj, /* bpobj header */ dump_none, /* SPA space map header */ dump_none, /* SPA space map */ dump_none, /* ZIL intent log */ dump_dnode, /* DMU dnode */ dump_dmu_objset, /* DMU objset */ dump_dsl_dir, /* DSL directory */ dump_zap, /* DSL directory child map */ dump_zap, /* DSL dataset snap map */ dump_zap, /* DSL props */ dump_dsl_dataset, /* DSL dataset */ dump_znode, /* ZFS znode */ dump_acl, /* ZFS V0 ACL */ dump_uint8, /* ZFS plain file */ dump_zpldir, /* ZFS directory */ dump_zap, /* ZFS master node */ dump_zap, /* ZFS delete queue */ dump_uint8, /* zvol object */ dump_zap, /* zvol prop */ dump_uint8, /* other uint8[] */ dump_uint64, /* other uint64[] */ dump_zap, /* other ZAP */ dump_zap, /* persistent error log */ dump_uint8, /* SPA history */ dump_history_offsets, /* SPA history offsets */ dump_zap, /* Pool properties */ dump_zap, /* DSL permissions */ dump_acl, /* ZFS ACL */ dump_uint8, /* ZFS SYSACL */ dump_none, /* FUID nvlist */ dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ dump_zap, /* ZFS user/group/project used */ dump_zap, /* ZFS user/group/project quota */ dump_zap, /* snapshot refcount tags */ dump_ddt_zap, /* DDT ZAP object */ dump_zap, /* DDT statistics */ dump_znode, /* SA object */ dump_zap, /* SA Master Node */ dump_sa_attrs, /* SA attribute registration */ dump_sa_layouts, /* SA attribute layouts */ dump_zap, /* DSL scrub translations */ dump_none, /* fake dedup BP */ dump_zap, /* deadlist */ dump_none, /* deadlist hdr */ dump_zap, /* dsl clones */ dump_bpobj_subobjs, /* bpobj subobjs */ dump_unknown, /* Unknown type, must be last */ }; static void dump_object(objset_t *os, uint64_t object, int verbosity, boolean_t *print_header, uint64_t *dnode_slots_used) { dmu_buf_t *db = NULL; dmu_object_info_t doi; dnode_t *dn; boolean_t dnode_held = B_FALSE; void *bonus = NULL; size_t bsize = 0; char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; char bonus_size[32]; char aux[50]; int error; /* make sure nicenum has enough space */ CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ); CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ); CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); if (*print_header) { (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", "lsize", "%full", "type"); *print_header = 0; } if (object == 0) { dn = DMU_META_DNODE(os); dmu_object_info_from_dnode(dn, &doi); } else { /* * Encrypted datasets will have sensitive bonus buffers * encrypted. Therefore we cannot hold the bonus buffer and * must hold the dnode itself instead. */ error = dmu_object_info(os, object, &doi); if (error) fatal("dmu_object_info() failed, errno %u", error); if (os->os_encrypted && DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) { error = dnode_hold(os, object, FTAG, &dn); if (error) fatal("dnode_hold() failed, errno %u", error); dnode_held = B_TRUE; } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); bonus = db->db_data; bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } } if (dnode_slots_used) *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / doi.doi_max_offset); aux[0] = '\0'; if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); } if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); } (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", (u_longlong_t)object, doi.doi_indirection, iblk, dblk, asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", "", "", "", "", "", "", bonus_size, "bonus", zdb_ot_name(doi.doi_bonus_type)); } if (verbosity >= 4) { (void) printf("\tdnode flags: %s%s%s%s\n", (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? "USED_BYTES " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? "USERUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? "USEROBJUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? "SPILL_BLKPTR" : ""); (void) printf("\tdnode maxblkid: %llu\n", (longlong_t)dn->dn_phys->dn_maxblkid); if (!dnode_held) { object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, bonus, bsize); } else { (void) printf("\t\t(bonus encrypted)\n"); } if (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type)) { object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); } else { (void) printf("\t\t(object encrypted)\n"); } *print_header = B_TRUE; } if (verbosity >= 5) dump_indirect(dn); if (verbosity >= 5) { /* * Report the list of segments that comprise the object. */ uint64_t start = 0; uint64_t end; uint64_t blkfill = 1; int minlvl = 1; if (dn->dn_type == DMU_OT_DNODE) { minlvl = 0; blkfill = DNODES_PER_BLOCK; } for (;;) { char segsize[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ); error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) break; end = start; error = dnode_next_offset(dn, DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); zdb_nicenum(end - start, segsize, sizeof (segsize)); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start, (u_longlong_t)end, segsize); if (error) break; start = end; } } if (db != NULL) dmu_buf_rele(db, FTAG); if (dnode_held) dnode_rele(dn, FTAG); } static void count_dir_mos_objects(dsl_dir_t *dd) { mos_obj_refd(dd->dd_object); mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_clones); /* * The dd_crypto_obj can be referenced by multiple dsl_dir's. * Ignore the references after the first one. */ mos_obj_refd_multiple(dd->dd_crypto_obj); } static void count_ds_mos_objects(dsl_dataset_t *ds) { mos_obj_refd(ds->ds_object); mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); mos_obj_refd(ds->ds_bookmarks_obj); if (!dsl_dataset_is_snapshot(ds)) { count_dir_mos_objects(ds->ds_dir); } } static const char *objset_types[DMU_OST_NUMTYPES] = { "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; static void dump_objset(objset_t *os) { dmu_objset_stats_t dds; uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[32]; char blkbuf[BP_SPRINTF_LEN + 20]; char osname[ZFS_MAX_DATASET_NAME_LEN]; const char *type = "UNKNOWN"; int verbosity = dump_opt['d']; boolean_t print_header; unsigned i; int error; uint64_t total_slots_used = 0; uint64_t max_slot_used = 0; uint64_t dnode_slots; /* make sure nicenum has enough space */ CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); print_header = B_TRUE; if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; usedobjs = BP_GET_FILL(os->os_rootbp); refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); if (verbosity >= 4) { (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); (void) snprintf_blkptr(blkbuf + strlen(blkbuf), sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } dmu_objset_name(os, osname); (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " "%s, %llu objects%s%s\n", osname, type, (u_longlong_t)dmu_objset_id(os), (u_longlong_t)dds.dds_creation_txg, numbuf, (u_longlong_t)usedobjs, blkbuf, (dds.dds_inconsistent) ? " (inconsistent)" : ""); if (zopt_objects != 0) { for (i = 0; i < zopt_objects; i++) { dump_object(os, zopt_object[i], verbosity, &print_header, NULL); } (void) printf("\n"); return; } if (dump_opt['i'] != 0 || verbosity >= 2) dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) { dsl_dataset_t *ds = dmu_objset_ds(os); dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && !dmu_objset_is_snapshot(os)) { dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); if (verify_dd_livelist(os) != 0) fatal("livelist is incorrect"); } if (dsl_dataset_remap_deadlist_exists(ds)) { (void) printf("ds_remap_deadlist:\n"); dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); } count_ds_mos_objects(ds); } if (dmu_objset_ds(os) != NULL) dump_bookmarks(os, verbosity); if (verbosity < 2) return; if (BP_IS_HOLE(os->os_rootbp)) return; dump_object(os, 0, verbosity, &print_header, NULL); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, NULL); dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, NULL); } if (DMU_PROJECTUSED_DNODE(os) != NULL && DMU_PROJECTUSED_DNODE(os)->dn_type != 0) dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, &print_header, NULL); object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { dump_object(os, object, verbosity, &print_header, &dnode_slots); object_count++; total_slots_used += dnode_slots; max_slot_used = object + dnode_slots - 1; } (void) printf("\n"); (void) printf(" Dnode slots:\n"); (void) printf("\tTotal used: %10llu\n", (u_longlong_t)total_slots_used); (void) printf("\tMax used: %10llu\n", (u_longlong_t)max_slot_used); (void) printf("\tPercent empty: %10lf\n", (double)(max_slot_used - total_slots_used)*100 / (double)max_slot_used); (void) printf("\n"); if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); } ASSERT3U(object_count, ==, usedobjs); if (leaked_objects != 0) { (void) printf("%d potentially leaked objects detected\n", leaked_objects); leaked_objects = 0; } } static void dump_uberblock(uberblock_t *ub, const char *header, const char *footer) { time_t timestamp = ub->ub_timestamp; (void) printf("%s", header ? header : ""); (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); (void) printf("\tmmp_magic = %016llx\n", (u_longlong_t)ub->ub_mmp_magic); if (MMP_VALID(ub)) { (void) printf("\tmmp_delay = %0llu\n", (u_longlong_t)ub->ub_mmp_delay); if (MMP_SEQ_VALID(ub)) (void) printf("\tmmp_seq = %u\n", (unsigned int) MMP_SEQ(ub)); if (MMP_FAIL_INT_VALID(ub)) (void) printf("\tmmp_fail = %u\n", (unsigned int) MMP_FAIL_INT(ub)); if (MMP_INTERVAL_VALID(ub)) (void) printf("\tmmp_write = %u\n", (unsigned int) MMP_INTERVAL(ub)); /* After MMP_* to make summarize_uberblock_mmp cleaner */ (void) printf("\tmmp_valid = %x\n", (unsigned int) ub->ub_mmp_config & 0xFF); } if (dump_opt['u'] >= 4) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf("\tcheckpoint_txg = %llu\n", (u_longlong_t)ub->ub_checkpoint_txg); (void) printf("%s", footer ? footer : ""); } static void dump_config(spa_t *spa) { dmu_buf_t *db; size_t nvsize = 0; int error = 0; error = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object, FTAG, &db); if (error == 0) { nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); dump_packed_nvlist(spa->spa_meta_objset, spa->spa_config_object, (void *)&nvsize, 1); } else { (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", (u_longlong_t)spa->spa_config_object, error); } } static void dump_cachefile(const char *cachefile) { int fd; struct stat64 statbuf; char *buf; nvlist_t *config; if ((fd = open64(cachefile, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", cachefile, strerror(errno)); exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", cachefile, strerror(errno)); exit(1); } if ((buf = malloc(statbuf.st_size)) == NULL) { (void) fprintf(stderr, "failed to allocate %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) fprintf(stderr, "failed to read %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { (void) fprintf(stderr, "failed to unpack nvlist\n"); exit(1); } free(buf); dump_nvlist(config, 0); nvlist_free(config); } /* * ZFS label nvlist stats */ typedef struct zdb_nvl_stats { int zns_list_count; int zns_leaf_count; size_t zns_leaf_largest; size_t zns_leaf_total; nvlist_t *zns_string; nvlist_t *zns_uint64; nvlist_t *zns_boolean; } zdb_nvl_stats_t; static void collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats) { nvlist_t *list, **array; nvpair_t *nvp = NULL; char *name; uint_t i, items; stats->zns_list_count++; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { name = nvpair_name(nvp); switch (nvpair_type(nvp)) { case DATA_TYPE_STRING: fnvlist_add_string(stats->zns_string, name, fnvpair_value_string(nvp)); break; case DATA_TYPE_UINT64: fnvlist_add_uint64(stats->zns_uint64, name, fnvpair_value_uint64(nvp)); break; case DATA_TYPE_BOOLEAN: fnvlist_add_boolean(stats->zns_boolean, name); break; case DATA_TYPE_NVLIST: if (nvpair_value_nvlist(nvp, &list) == 0) collect_nvlist_stats(list, stats); break; case DATA_TYPE_NVLIST_ARRAY: if (nvpair_value_nvlist_array(nvp, &array, &items) != 0) break; for (i = 0; i < items; i++) { collect_nvlist_stats(array[i], stats); /* collect stats on leaf vdev */ if (strcmp(name, "children") == 0) { size_t size; (void) nvlist_size(array[i], &size, NV_ENCODE_XDR); stats->zns_leaf_total += size; if (size > stats->zns_leaf_largest) stats->zns_leaf_largest = size; stats->zns_leaf_count++; } } break; default: (void) printf("skip type %d!\n", (int)nvpair_type(nvp)); } } } static void dump_nvlist_stats(nvlist_t *nvl, size_t cap) { zdb_nvl_stats_t stats = { 0 }; size_t size, sum = 0, total; size_t noise; /* requires nvlist with non-unique names for stat collection */ VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0)); VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0)); VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0)); VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR)); (void) printf("\n\nZFS Label NVList Config Stats:\n"); VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR)); (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n", (int)total, (int)(cap - total), 100.0 * total / cap); collect_nvlist_stats(nvl, &stats); VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:", (int)fnvlist_num_pairs(stats.zns_uint64), (int)size, 100.0 * size / total); VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:", (int)fnvlist_num_pairs(stats.zns_string), (int)size, 100.0 * size / total); VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:", (int)fnvlist_num_pairs(stats.zns_boolean), (int)size, 100.0 * size / total); size = total - sum; /* treat remainder as nvlist overhead */ (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:", stats.zns_list_count, (int)size, 100.0 * size / total); if (stats.zns_leaf_count > 0) { size_t average = stats.zns_leaf_total / stats.zns_leaf_count; (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:", stats.zns_leaf_count, (int)average); (void) printf("%24d bytes largest\n", (int)stats.zns_leaf_largest); if (dump_opt['l'] >= 3 && average > 0) (void) printf(" space for %d additional leaf vdevs\n", (int)((cap - total) / average)); } (void) printf("\n"); nvlist_free(stats.zns_string); nvlist_free(stats.zns_uint64); nvlist_free(stats.zns_boolean); } typedef struct cksum_record { zio_cksum_t cksum; boolean_t labels[VDEV_LABELS]; avl_node_t link; } cksum_record_t; static int cksum_record_compare(const void *x1, const void *x2) { const cksum_record_t *l = (cksum_record_t *)x1; const cksum_record_t *r = (cksum_record_t *)x2; int arraysize = ARRAY_SIZE(l->cksum.zc_word); int difference; for (int i = 0; i < arraysize; i++) { difference = AVL_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); if (difference) break; } return (difference); } static cksum_record_t * cksum_record_alloc(zio_cksum_t *cksum, int l) { cksum_record_t *rec; rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); rec->cksum = *cksum; rec->labels[l] = B_TRUE; return (rec); } static cksum_record_t * cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum) { cksum_record_t lookup = { .cksum = *cksum }; avl_index_t where; return (avl_find(tree, &lookup, &where)); } static cksum_record_t * cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l) { cksum_record_t *rec; rec = cksum_record_lookup(tree, cksum); if (rec) { rec->labels[l] = B_TRUE; } else { rec = cksum_record_alloc(cksum, l); avl_add(tree, rec); } return (rec); } static int first_label(cksum_record_t *rec) { for (int i = 0; i < VDEV_LABELS; i++) if (rec->labels[i]) return (i); return (-1); } static void print_label_numbers(char *prefix, cksum_record_t *rec) { printf("%s", prefix); for (int i = 0; i < VDEV_LABELS; i++) if (rec->labels[i] == B_TRUE) printf("%d ", i); printf("\n"); } #define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) typedef struct zdb_label { vdev_label_t label; nvlist_t *config_nv; cksum_record_t *config; cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; boolean_t header_printed; boolean_t read_failed; } zdb_label_t; static void print_label_header(zdb_label_t *label, int l) { if (dump_opt['q']) return; if (label->header_printed == B_TRUE) return; (void) printf("------------------------------------\n"); (void) printf("LABEL %d\n", l); (void) printf("------------------------------------\n"); label->header_printed = B_TRUE; } static void dump_config_from_label(zdb_label_t *label, size_t buflen, int l) { if (dump_opt['q']) return; if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) return; print_label_header(label, l); dump_nvlist(label->config_nv, 4); print_label_numbers(" labels = ", label->config); if (dump_opt['l'] >= 2) dump_nvlist_stats(label->config_nv, buflen); } #define ZDB_MAX_UB_HEADER_SIZE 32 static void dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) { vdev_t vd; char header[ZDB_MAX_UB_HEADER_SIZE]; vd.vdev_ashift = ashift; vd.vdev_top = &vd; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); uberblock_t *ub = (void *)((char *)&label->label + uoff); cksum_record_t *rec = label->uberblocks[i]; if (rec == NULL) { if (dump_opt['u'] >= 2) { print_label_header(label, label_num); (void) printf(" Uberblock[%d] invalid\n", i); } continue; } if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) continue; if ((dump_opt['u'] < 4) && (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) continue; print_label_header(label, label_num); (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, " Uberblock[%d]\n", i); dump_uberblock(ub, header, ""); print_label_numbers(" labels = ", rec); } } static char curpath[PATH_MAX]; /* * Iterate through the path components, recursively passing * current one's obj and remaining path until we find the obj * for the last one. */ static int dump_path_impl(objset_t *os, uint64_t obj, char *name) { int err; boolean_t header = B_TRUE; uint64_t child_obj; char *s; dmu_buf_t *db; dmu_object_info_t doi; if ((s = strchr(name, '/')) != NULL) *s = '\0'; err = zap_lookup(os, obj, name, 8, 1, &child_obj); (void) strlcat(curpath, name, sizeof (curpath)); if (err != 0) { (void) fprintf(stderr, "failed to lookup %s: %s\n", curpath, strerror(err)); return (err); } child_obj = ZFS_DIRENT_OBJ(child_obj); err = sa_buf_hold(os, child_obj, FTAG, &db); if (err != 0) { (void) fprintf(stderr, "failed to get SA dbuf for obj %llu: %s\n", (u_longlong_t)child_obj, strerror(err)); return (EINVAL); } dmu_object_info_from_db(db, &doi); sa_buf_rele(db, FTAG); if (doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) { (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", doi.doi_bonus_type, (u_longlong_t)child_obj); return (EINVAL); } if (dump_opt['v'] > 6) { (void) printf("obj=%llu %s type=%d bonustype=%d\n", (u_longlong_t)child_obj, curpath, doi.doi_type, doi.doi_bonus_type); } (void) strlcat(curpath, "/", sizeof (curpath)); switch (doi.doi_type) { case DMU_OT_DIRECTORY_CONTENTS: if (s != NULL && *(s + 1) != '\0') return (dump_path_impl(os, child_obj, s + 1)); /*FALLTHROUGH*/ case DMU_OT_PLAIN_FILE_CONTENTS: dump_object(os, child_obj, dump_opt['v'], &header, NULL); return (0); default: (void) fprintf(stderr, "object %llu has non-file/directory " "type %d\n", (u_longlong_t)obj, doi.doi_type); break; } return (EINVAL); } /* * Dump the blocks for the object specified by path inside the dataset. */ static int dump_path(char *ds, char *path) { int err; objset_t *os; uint64_t root_obj; err = open_objset(ds, FTAG, &os); if (err != 0) return (err); err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); if (err != 0) { (void) fprintf(stderr, "can't lookup root znode: %s\n", strerror(err)); close_objset(os, FTAG); return (EINVAL); } (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); err = dump_path_impl(os, root_obj, path); close_objset(os, FTAG); return (err); } static int dump_label(const char *dev) { char path[MAXPATHLEN]; zdb_label_t labels[VDEV_LABELS]; uint64_t psize, ashift; struct stat64 statbuf; boolean_t config_found = B_FALSE; boolean_t error = B_FALSE; avl_tree_t config_tree; avl_tree_t uberblock_tree; void *node, *cookie; int fd; bzero(labels, sizeof (labels)); /* * Check if we were given absolute path and use it as is. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. */ (void) strlcpy(path, dev, sizeof (path)); if (dev[0] != '/' && stat64(path, &statbuf) != 0) { int error; error = zfs_resolve_shortname(dev, path, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(path)) { if (zfs_append_partition(path, MAXPATHLEN) == -1) error = ENOENT; } if (error || (stat64(path, &statbuf) != 0)) { (void) printf("failed to find device %s, try " "specifying absolute path instead\n", dev); return (1); } } if ((fd = open64(path, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", path, strerror(errno)); exit(1); } if (fstat64_blk(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", path, strerror(errno)); (void) close(fd); exit(1); } if (S_ISBLK(statbuf.st_mode) && ioctl(fd, BLKFLSBUF) != 0) (void) printf("failed to invalidate cache '%s' : %s\n", path, strerror(errno)); avl_create(&config_tree, cksum_record_compare, sizeof (cksum_record_t), offsetof(cksum_record_t, link)); avl_create(&uberblock_tree, cksum_record_compare, sizeof (cksum_record_t), offsetof(cksum_record_t, link)); psize = statbuf.st_size; psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); ashift = SPA_MINBLOCKSHIFT; /* * 1. Read the label from disk * 2. Unpack the configuration and insert in config tree. * 3. Traverse all uberblocks and insert in uberblock tree. */ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; char *buf = label->label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); nvlist_t *config; cksum_record_t *rec; zio_cksum_t cksum; vdev_t vd; if (pread64(fd, &label->label, sizeof (label->label), vdev_label_offset(psize, l, 0)) != sizeof (label->label)) { if (!dump_opt['q']) (void) printf("failed to read label %d\n", l); label->read_failed = B_TRUE; error = B_TRUE; continue; } label->read_failed = B_FALSE; if (nvlist_unpack(buf, buflen, &config, 0) == 0) { nvlist_t *vdev_tree = NULL; size_t size; if ((nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) ashift = SPA_MINBLOCKSHIFT; if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) size = buflen; fletcher_4_native_varsize(buf, size, &cksum); rec = cksum_record_insert(&config_tree, &cksum, l); label->config = rec; label->config_nv = config; config_found = B_TRUE; } else { error = B_TRUE; } vd.vdev_ashift = ashift; vd.vdev_top = &vd; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); uberblock_t *ub = (void *)((char *)label + uoff); if (uberblock_verify(ub)) continue; fletcher_4_native_varsize(ub, sizeof (*ub), &cksum); rec = cksum_record_insert(&uberblock_tree, &cksum, l); label->uberblocks[i] = rec; } } /* * Dump the label and uberblocks. */ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); if (label->read_failed == B_TRUE) continue; if (label->config_nv) { dump_config_from_label(label, buflen, l); } else { if (!dump_opt['q']) (void) printf("failed to unpack label %d\n", l); } if (dump_opt['u']) dump_label_uberblocks(label, ashift, l); nvlist_free(label->config_nv); } cookie = NULL; while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); cookie = NULL; while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); avl_destroy(&config_tree); avl_destroy(&uberblock_tree); (void) close(fd); return (config_found == B_FALSE ? 2 : (error == B_TRUE ? 1 : 0)); } static uint64_t dataset_feature_count[SPA_FEATURES]; static uint64_t global_feature_count[SPA_FEATURES]; static uint64_t remap_deadlist_count = 0; /*ARGSUSED*/ static int dump_one_objset(const char *dsname, void *arg) { int error; objset_t *os; spa_feature_t f; error = open_objset(dsname, FTAG, &os); if (error != 0) return (0); for (f = 0; f < SPA_FEATURES; f++) { if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) continue; ASSERT(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); dataset_feature_count[f]++; } if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { remap_deadlist_count++; } for (dsl_bookmark_node_t *dbn = avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); if (dbn->dbn_phys.zbm_redaction_obj != 0) global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++; if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; } if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && !dmu_objset_is_snapshot(os)) { global_feature_count[SPA_FEATURE_LIVELIST]++; } dump_objset(os); close_objset(os, FTAG); fuid_table_destroy(); return (0); } /* * Block statistics. */ #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { uint64_t zb_asize; uint64_t zb_lsize; uint64_t zb_psize; uint64_t zb_count; uint64_t zb_gangs; uint64_t zb_ditto_samevdev; uint64_t zb_ditto_same_ms; uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; /* * Extended object types to report deferred frees and dedup auto-ditto blocks. */ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static const char *zdb_ot_extname[] = { "deferred free", "dedup ditto", "other", "Total", }; #define ZB_TOTAL DN_MAX_LEVELS typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] [BPE_PAYLOAD_SIZE + 1]; uint64_t zcb_start; hrtime_t zcb_lastprint; uint64_t zcb_totalasize; uint64_t zcb_errors[256]; int zcb_readfails; int zcb_haderrors; spa_t *zcb_spa; uint32_t **zcb_vd_obsolete_counts; } zdb_cb_t; /* test if two DVA offsets from same vdev are within the same metaslab */ static boolean_t same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) { vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t ms_shift = vd->vdev_ms_shift; return ((off1 >> ms_shift) == (off2 >> ms_shift)); } static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { uint64_t refcnt = 0; int i; ASSERT(type < ZDB_OT_TOTAL); if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; int equal; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_count++; /* * The histogram is only big enough to record blocks up to * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, * "other", bucket. */ unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); zb->zb_psize_histogram[idx]++; zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) { zb->zb_ditto_samevdev++; if (same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[1]))) zb->zb_ditto_same_ms++; } break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal != 0) { zb->zb_ditto_samevdev++; if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[1]))) zb->zb_ditto_same_ms++; else if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[2]))) zb->zb_ditto_same_ms++; else if (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[1]), DVA_GET_OFFSET(&bp->blk_dva[1]), DVA_GET_OFFSET(&bp->blk_dva[2]))) zb->zb_ditto_same_ms++; } break; } } spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); if (BP_IS_EMBEDDED(bp)) { zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] [BPE_GET_PSIZE(bp)]++; return; } if (dump_opt['L']) return; if (BP_GET_DEDUP(bp)) { ddt_t *ddt; ddt_entry_t *dde; ddt = ddt_select(zcb->zcb_spa, bp); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_FALSE); if (dde == NULL) { refcnt = 0; } else { ddt_phys_t *ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); refcnt = ddp->ddp_refcnt; if (ddt_phys_total_refcnt(dde) == 0) ddt_remove(ddt, dde); } ddt_exit(ddt); } VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } static void zdb_blkptr_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; int ioerr = zio->io_error; zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_ios--; cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { char blkbuf[BP_SPRINTF_LEN]; zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); else blkbuf[0] = '\0'; (void) printf("zdb_blkptr_cb: " "Got error %d reading " "<%llu, %llu, %lld, %llx> %s -- skipping\n", ioerr, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, blkbuf); } mutex_exit(&spa->spa_scrub_lock); } static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; dmu_object_type_t type; boolean_t is_metadata; if (zb->zb_level == ZB_DNODE_LEVEL) return (0); if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("objset %llu object %llu " "level %lld offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (u_longlong_t)blkid2offset(dnp, bp, zb), blkbuf); } if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return (0); type = BP_GET_TYPE(bp); zdb_count_block(zcb, zilog, bp, (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_ios > max_inflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_ios++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; /* only call gethrtime() every 100 blocks */ static int iters; if (++iters > 100) iters = 0; else return (0); if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); char buf[10]; uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; int kb_per_sec = 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); int sec_remaining = (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; /* make sure nicenum has enough space */ CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ); zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "\r%5s completed (%4dMB/s) " "estimated time remaining: %uhr %02umin %02usec ", buf, kb_per_sec / 1024, sec_remaining / 60 / 60, sec_remaining / 60 % 60, sec_remaining % 60); zcb->zcb_lastprint = now; } return (0); } static void zdb_leak(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); } static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg); typedef struct unflushed_iter_cb_arg { spa_t *uic_spa; uint64_t uic_txg; void *uic_arg; zdb_log_sm_cb_t uic_cb; } unflushed_iter_cb_arg_t; static int iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) { unflushed_iter_cb_arg_t *uic = arg; return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); } static void iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { space_map_t *sm = NULL; VERIFY0(space_map_open(&sm, spa_meta_objset(spa), sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); unflushed_iter_cb_arg_t uic = { .uic_spa = spa, .uic_txg = sls->sls_txg, .uic_arg = arg, .uic_cb = cb }; VERIFY0(space_map_iterate(sm, space_map_length(sm), iterate_through_spacemap_logs_cb, &uic)); space_map_close(sm); } spa_config_exit(spa, SCL_CONFIG, FTAG); } /* ARGSUSED */ static int load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { spa_vdev_removal_t *svr = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; /* skip vdevs we don't care about */ if (sme->sme_vdev != svr->svr_vdev_id) return (0); vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT(vim != NULL); if (offset >= vdev_indirect_mapping_max_offset(vim)) return (0); if (sme->sme_type == SM_ALLOC) range_tree_add(svr->svr_allocd_segs, offset, size); else range_tree_remove(svr->svr_allocd_segs, offset, size); return (0); } /* ARGSUSED */ static void claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { /* * This callback was called through a remap from * a device being removed. Therefore, the vdev that * this callback is applied to is a concrete * vdev. */ ASSERT(vdev_is_concrete(vd)); VERIFY0(metaslab_claim_impl(vd, offset, size, spa_min_claim_txg(vd->vdev_spa))); } static void claim_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; vdev_indirect_ops.vdev_op_remap(vd, offset, size, claim_segment_impl_cb, NULL); } /* * After accounting for all allocated blocks that are directly referenced, * we might have missed a reference to a block from a partially complete * (and thus unused) indirect mapping object. We perform a secondary pass * through the metaslabs we have already mapped and claim the destination * blocks. */ static void zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) { if (dump_opt['L']) return; if (spa->spa_vdev_removal == NULL) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT0(range_tree_space(svr->svr_allocd_segs)); range_tree_t *allocs = range_tree_create(NULL, NULL); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) break; ASSERT0(range_tree_space(allocs)); if (msp->ms_sm != NULL) VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); } range_tree_destroy(allocs); iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); /* * Clear everything past what has been synced, * because we have not allocated mappings for * it yet. */ range_tree_clear(svr->svr_allocd_segs, vdev_indirect_mapping_max_offset(vim), vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* ARGSUSED */ static int increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { zdb_cb_t *zcb = arg; spa_t *spa = zcb->zcb_spa; vdev_t *vd; const dva_t *dva = &bp->blk_dva[0]; ASSERT(!bp_freed); ASSERT(!dump_opt['L']); ASSERT3U(BP_GET_NDVAS(bp), ==, 1); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); vdev_indirect_mapping_increment_obsolete_count( vd->vdev_indirect_mapping, DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), zcb->zcb_vd_obsolete_counts[vd->vdev_id]); return (0); } static uint32_t * zdb_load_obsolete_counts(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint64_t obsolete_sm_object; uint32_t *counts; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL); counts = vdev_indirect_mapping_load_obsolete_counts(vim); if (vd->vdev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, vd->vdev_obsolete_sm); } if (scip->scip_vdev == vd->vdev_id && scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, prev_obsolete_sm); space_map_close(prev_obsolete_sm); } return (counts); } static void zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) { ddt_bookmark_t ddb; ddt_entry_t dde; int error; int p; ASSERT(!dump_opt['L']); bzero(&ddb, sizeof (ddb)); while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { blkptr_t blk; ddt_phys_t *ddp = dde.dde_phys; if (ddb.ddb_class == DDT_CLASS_UNIQUE) return; ASSERT(ddt_phys_total_refcnt(&dde) > 1); for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddb.ddb_checksum, &dde.dde_key, ddp, &blk); if (p == DDT_PHYS_DITTO) { zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); } else { zcb->zcb_dedup_asize += BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); zcb->zcb_dedup_blocks++; } } ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; ddt_enter(ddt); VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); ddt_exit(ddt); } ASSERT(error == ENOENT); } typedef struct checkpoint_sm_exclude_entry_arg { vdev_t *cseea_vd; uint64_t cseea_checkpoint_size; } checkpoint_sm_exclude_entry_arg_t; static int checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) { checkpoint_sm_exclude_entry_arg_t *cseea = arg; vdev_t *vd = cseea->cseea_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); /* * Since the vdev_checkpoint_sm exists in the vdev level * and the ms_sm space maps exist in the metaslab level, * an entry in the checkpoint space map could theoretically * cross the boundaries of the metaslab that it belongs. * * In reality, because of the way that we populate and * manipulate the checkpoint's space maps currently, * there shouldn't be any entries that cross metaslabs. * Hence the assertion below. * * That said, there is no fundamental requirement that * the checkpoint's space map entries should not cross * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * By removing the entry from the allocated segments we * also verify that the entry is there to begin with. */ mutex_enter(&ms->ms_lock); range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); cseea->cseea_checkpoint_size += sme->sme_run; return (0); } static void zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) { spa_t *spa = vd->vdev_spa; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; /* * If there is no vdev_top_zap, we are in a pool whose * version predates the pool checkpoint feature. */ if (vd->vdev_top_zap == 0) return; /* * If there is no reference of the vdev_checkpoint_sm in * the vdev_top_zap, then one of the following scenarios * is true: * * 1] There is no checkpoint * 2] There is a checkpoint, but no checkpointed blocks * have been freed yet * 3] The current vdev is indirect * * In these cases we return immediately. */ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) return; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); checkpoint_sm_exclude_entry_arg_t cseea; cseea.cseea_vd = vd; cseea.cseea_checkpoint_size = 0; VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); VERIFY0(space_map_iterate(checkpoint_sm, space_map_length(checkpoint_sm), checkpoint_sm_exclude_entry_cb, &cseea)); space_map_close(checkpoint_sm); zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; } static void zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) { ASSERT(!dump_opt['L']); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); } } static int count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { int64_t *ualloc_space = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = vdev_lookup_top(spa, vdev_id); if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (sme->sme_type == SM_ALLOC) *ualloc_space += sme->sme_run; else *ualloc_space -= sme->sme_run; return (0); } static int64_t get_unflushed_alloc_space(spa_t *spa) { if (dump_opt['L']) return (0); int64_t ualloc_space = 0; iterate_through_spacemap_logs(spa, count_unflushed_space_cb, &ualloc_space); return (ualloc_space); } static int load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { maptype_t *uic_maptype = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = vdev_lookup_top(spa, vdev_id); /* skip indirect vdevs */ if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (*uic_maptype == sme->sme_type) range_tree_add(ms->ms_allocatable, offset, size); else range_tree_remove(ms->ms_allocatable, offset, size); return (0); } static void load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype) { iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); } static void load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; ASSERT3U(i, ==, vd->vdev_id); if (vd->vdev_ops == &vdev_indirect_ops) continue; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; (void) fprintf(stderr, "\rloading concrete vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)msp->ms_id, (longlong_t)vd->vdev_ms_count); mutex_enter(&msp->ms_lock); range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, msp->ms_allocatable, maptype)); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } } load_unflushed_to_ms_allocatables(spa, maptype); } /* * vm_idxp is an in-out parameter which (for indirect vdevs) is the * index in vim_entries that has the first entry in this metaslab. * On return, it will be set to the first entry after this metaslab. */ static void load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, uint64_t *vim_idxp) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; mutex_enter(&msp->ms_lock); range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); (*vim_idxp)++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[*vim_idxp]; uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); ASSERT3U(ent_offset, >=, msp->ms_start); if (ent_offset >= msp->ms_start + msp->ms_size) break; /* * Mappings do not cross metaslab boundaries, * because we create them by walking the metaslabs. */ ASSERT3U(ent_offset + ent_len, <=, msp->ms_start + msp->ms_size); range_tree_add(msp->ms_allocatable, ent_offset, ent_len); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } static void zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) { ASSERT(!dump_opt['L']); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; ASSERT3U(c, ==, vd->vdev_id); if (vd->vdev_ops != &vdev_indirect_ops) continue; /* * Note: we don't check for mapping leaks on * removing vdevs because their ms_allocatable's * are used to look for leaks in allocated space. */ zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); /* * Normally, indirect vdevs don't have any * metaslabs. We want to set them up for * zio_claim(). */ VERIFY0(vdev_metaslab_init(vd, 0)); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t vim_idx = 0; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { (void) fprintf(stderr, "\rloading indirect vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vd->vdev_ms[m]->ms_id, (longlong_t)vd->vdev_ms_count); load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], &vim_idx); } ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); } } static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; if (dump_opt['L']) return; dsl_pool_t *dp = spa->spa_dsl_pool; vdev_t *rvd = spa->spa_root_vdev; /* * We are going to be changing the meaning of the metaslab's * ms_allocatable. Ensure that the allocator doesn't try to * use the tree. */ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; zcb->zcb_vd_obsolete_counts = umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), UMEM_NOFAIL); /* * For leak detection, we overload the ms_allocatable trees * to contain allocated segments instead of free segments. * As a result, we can't use the normal metaslab_load/unload * interfaces. */ zdb_leak_init_prepare_indirect_vdevs(spa, zcb); load_concrete_ms_allocatable_trees(spa, SM_ALLOC); /* * On load_concrete_ms_allocatable_trees() we loaded all the * allocated entries from the ms_sm to the ms_allocatable for * each metaslab. If the pool has a checkpoint or is in the * middle of discarding a checkpoint, some of these blocks * may have been freed but their ms_sm may not have been * updated because they are referenced by the checkpoint. In * order to avoid false-positives during leak-detection, we * go through the vdev's checkpoint space map and exclude all * its entries from their relevant ms_allocatable. * * We also aggregate the space held by the checkpoint and add * it to zcb_checkpoint_size. * * Note that at this point we are also verifying that all the * entries on the checkpoint_sm are marked as allocated in * the ms_sm of their relevant metaslab. * [see comment in checkpoint_sm_exclude_entry_cb()] */ zdb_leak_init_exclude_checkpoint(spa, zcb); ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); /* for cleaner progress output */ (void) fprintf(stderr, "\n"); if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); zdb_ddt_leak_init(spa, zcb); spa_config_exit(spa, SCL_CONFIG, FTAG); } static boolean_t zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) { boolean_t leaks = B_FALSE; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t total_leaked = 0; boolean_t are_precise = B_FALSE; ASSERT(vim != NULL); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; uint64_t obsolete_bytes = 0; uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; /* * This is not very efficient but it's easy to * verify correctness. */ for (uint64_t inner_offset = 0; inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); inner_offset += 1 << vd->vdev_ashift) { if (range_tree_contains(msp->ms_allocatable, offset + inner_offset, 1 << vd->vdev_ashift)) { obsolete_bytes += 1 << vd->vdev_ashift; } } int64_t bytes_leaked = obsolete_bytes - zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) { (void) printf("obsolete indirect mapping count " "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (u_longlong_t)bytes_leaked); } total_leaked += ABS(bytes_leaked); } VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (!are_precise && total_leaked > 0) { int pct_leaked = total_leaked * 100 / vdev_indirect_mapping_bytes_mapped(vim); (void) printf("cannot verify obsolete indirect mapping " "counts of vdev %llu because precise feature was not " "enabled when it was removed: %d%% (%llx bytes) of mapping" "unreferenced\n", (u_longlong_t)vd->vdev_id, pct_leaked, (u_longlong_t)total_leaked); } else if (total_leaked > 0) { (void) printf("obsolete indirect mapping count mismatch " "for vdev %llu -- %llx total bytes mismatched\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)total_leaked); leaks |= B_TRUE; } vdev_indirect_mapping_free_obsolete_counts(vim, zcb->zcb_vd_obsolete_counts[vd->vdev_id]); zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; return (leaks); } static boolean_t zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) { if (dump_opt['L']) return (B_FALSE); boolean_t leaks = B_FALSE; vdev_t *rvd = spa->spa_root_vdev; for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; ASSERTV(metaslab_group_t *mg = vd->vdev_mg); if (zcb->zcb_vd_obsolete_counts[c] != NULL) { leaks |= zdb_check_for_obsolete_leaks(vd, zcb); } for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; ASSERT3P(mg, ==, msp->ms_group); /* * ms_allocatable has been overloaded * to contain allocated segments. Now that * we finished traversing all blocks, any * block that remains in the ms_allocatable * represents an allocated block that we * did not claim during the traversal. * Claimed blocks would have been removed * from the ms_allocatable. For indirect * vdevs, space remaining in the tree * represents parts of the mapping that are * not referenced, which is not a bug. */ if (vd->vdev_ops == &vdev_indirect_ops) { range_tree_vacate(msp->ms_allocatable, NULL, NULL); } else { range_tree_vacate(msp->ms_allocatable, zdb_leak, vd); } if (msp->ms_loaded) { msp->ms_loaded = B_FALSE; } } } umem_free(zcb->zcb_vd_obsolete_counts, rvd->vdev_children * sizeof (uint32_t *)); zcb->zcb_vd_obsolete_counts = NULL; return (leaks); } /* ARGSUSED */ static int count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zdb_cb_t *zcb = arg; if (dump_opt['b'] >= 5) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("[%s] %s\n", "deferred free", blkbuf); } zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); return (0); } /* * Iterate over livelists which have been destroyed by the user but * are still present in the MOS, waiting to be freed */ typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) { objset_t *mos = spa->spa_meta_objset; uint64_t zap_obj; int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); if (err == ENOENT) return; ASSERT0(err); zap_cursor_t zc; zap_attribute_t attr; dsl_deadlist_t ll; /* NULL out os prior to dsl_deadlist_open in case it's garbage */ ll.dl_os = NULL; for (zap_cursor_init(&zc, mos, zap_obj); zap_cursor_retrieve(&zc, &attr) == 0; (void) zap_cursor_advance(&zc)) { dsl_deadlist_open(&ll, mos, attr.za_first_integer); func(&ll, arg); dsl_deadlist_close(&ll); } zap_cursor_fini(&zc); } static int bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (count_block_cb(arg, bp, tx)); } static int livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) { zdb_cb_t *zbc = args; bplist_t blks; bplist_create(&blks); /* determine which blocks have been alloc'd but not freed */ VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); /* count those blocks */ (void) bplist_iterate(&blks, count_block_cb, zbc, NULL); bplist_destroy(&blks); return (0); } static void livelist_count_blocks(dsl_deadlist_t *ll, void *arg) { dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); } /* * Count the blocks in the livelists that have been destroyed by the user * but haven't yet been freed. */ static void deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) { iterate_deleted_livelists(spa, livelist_count_blocks, zbc); } static void dump_livelist_cb(dsl_deadlist_t *ll, void *arg) { ASSERT3P(arg, ==, NULL); global_feature_count[SPA_FEATURE_LIVELIST]++; dump_blkptr_list(ll, "Deleted Livelist"); } /* * Print out, register object references to, and increment feature counts for * livelists that have been destroyed by the user but haven't yet been freed. */ static void deleted_livelists_dump_mos(spa_t *spa) { uint64_t zap_obj; objset_t *mos = spa->spa_meta_objset; int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); if (err == ENOENT) return; mos_obj_refd(zap_obj); iterate_deleted_livelists(spa, dump_livelist_cb, NULL); } static int dump_block_stats(spa_t *spa) { zdb_cb_t zcb; zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT | TRAVERSE_HARD; boolean_t leaks = B_FALSE; int e, c, err; bp_embedded_type_t i; bzero(&zcb, sizeof (zcb)); (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", dump_opt['c'] ? "checksums " : "", (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", !dump_opt['L'] ? "nothing leaked " : ""); /* * When leak detection is enabled we load all space maps as SM_ALLOC * maps, then traverse the pool claiming each block we discover. If * the pool is perfectly consistent, the segment trees will be empty * when we're done. Anything left over is a leak; any block we can't * claim (because it's not part of any space map) is a double * allocation, reference to a freed block, or an unclaimed log block. * * When leak detection is disabled (-L option) we still traverse the * pool claiming each block we discover, but we skip opening any space * maps. */ bzero(&zcb, sizeof (zdb_cb_t)); zdb_leak_init(spa, &zcb); /* * If there's a deferred-free bplist, process that first. */ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, bpobj_count_block_cb, &zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, bpobj_count_block_cb, &zcb, NULL); } zdb_claim_removing(spa, &zcb); if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, &zcb, NULL)); } deleted_livelists_count_blocks(spa, &zcb); if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); /* * If we've traversed the data blocks then we need to wait for those * I/Os to complete. We leverage "The Godfather" zio to wait on * all async I/Os to complete. */ if (dump_opt['c']) { for (c = 0; c < max_ncpus; c++) { (void) zio_wait(spa->spa_async_zio_root[c]); spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } } /* * Done after zio_wait() since zcb_haderrors is modified in * zdb_blkptr_done() */ zcb.zcb_haderrors |= err; if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); for (e = 0; e < 256; e++) { if (zcb.zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb.zcb_errors[e]); } } } /* * Report any leaked segments. */ leaks |= zdb_leak_fini(spa, &zcb); tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); total_found = tzb->zb_asize - zcb.zcb_dedup_asize + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; if (total_found == total_alloc && !dump_opt['L']) { (void) printf("\n\tNo leaks (block sum matches space" " maps exactly)\n"); } else if (!dump_opt['L']) { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", (u_longlong_t)total_found, (u_longlong_t)total_alloc, (dump_opt['L']) ? "unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); leaks = B_TRUE; } if (tzb->zb_count == 0) return (2); (void) printf("\n"); (void) printf("\t%-16s %14llu\n", "bp count:", (u_longlong_t)tzb->zb_count); (void) printf("\t%-16s %14llu\n", "ganged count:", (longlong_t)tzb->zb_gangs); (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp physical:", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp allocated:", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize, (u_longlong_t)zcb.zcb_dedup_blocks, (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); if (spa_special_class(spa)->mc_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_special_class(spa)); uint64_t space = metaslab_class_get_space( spa_special_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Special class", (u_longlong_t)alloc, 100.0 * alloc / space); } if (spa_dedup_class(spa)->mc_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_dedup_class(spa)); uint64_t space = metaslab_class_get_space( spa_dedup_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Dedup class", (u_longlong_t)alloc, 100.0 * alloc / space); } for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb.zcb_embedded_blocks[i] == 0) continue; (void) printf("\n"); (void) printf("\tadditional, non-pointer bps of type %u: " "%10llu\n", i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); if (dump_opt['b'] >= 3) { (void) printf("\t number of (compressed) bytes: " "number of bps\n"); dump_histogram(zcb.zcb_embedded_histogram[i], sizeof (zcb.zcb_embedded_histogram[i]) / sizeof (zcb.zcb_embedded_histogram[i][0]), 0); } } if (tzb->zb_ditto_samevdev != 0) { (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); } if (tzb->zb_ditto_same_ms != 0) { (void) printf("\tDittoed blocks in same metaslab: %llu\n", (longlong_t)tzb->zb_ditto_same_ms); } for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; if (vim == NULL) { continue; } char mem[32]; zdb_nicenum(vdev_indirect_mapping_num_entries(vim), mem, vdev_indirect_mapping_size(vim)); (void) printf("\tindirect vdev id %llu has %llu segments " "(%s in memory)\n", (longlong_t)vd->vdev_id, (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); } if (dump_opt['b'] >= 2) { int l, t, level; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); for (t = 0; t <= ZDB_OT_TOTAL; t++) { char csize[32], lsize[32], psize[32], asize[32]; char avg[32], gang[32]; const char *typename; /* make sure nicenum has enough space */ CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ); CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ); if (t < DMU_OT_NUMTYPES) typename = dmu_ot[t].ot_name; else typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" "\t%5s\t%5s\t%6s\t%s\n", "-", "-", "-", "-", "-", "-", "-", typename); continue; } for (l = ZB_TOTAL - 1; l >= -1; l--) { level = (l == -1 ? ZB_TOTAL : l); zb = &zcb.zcb_type[level][t]; if (zb->zb_asize == 0) continue; if (dump_opt['b'] < 3 && level != ZB_TOTAL) continue; if (level == 0 && zb->zb_asize == zcb.zcb_type[ZB_TOTAL][t].zb_asize) continue; zdb_nicenum(zb->zb_count, csize, sizeof (csize)); zdb_nicenum(zb->zb_lsize, lsize, sizeof (lsize)); zdb_nicenum(zb->zb_psize, psize, sizeof (psize)); zdb_nicenum(zb->zb_asize, asize, sizeof (asize)); zdb_nicenum(zb->zb_asize / zb->zb_count, avg, sizeof (avg)); zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)zb->zb_lsize / zb->zb_psize, 100.0 * zb->zb_asize / tzb->zb_asize); if (level == ZB_TOTAL) (void) printf("%s\n", typename); else (void) printf(" L%d %s\n", level, typename); if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { (void) printf("\t number of ganged " "blocks: %s\n", gang); } if (dump_opt['b'] >= 4) { (void) printf("psize " "(in 512-byte sectors): " "number of blocks\n"); dump_histogram(zb->zb_psize_histogram, PSIZE_HISTO_SIZE, 0); } } } } (void) printf("\n"); if (leaks) return (2); if (zcb.zcb_haderrors) return (3); return (0); } typedef struct zdb_ddt_entry { ddt_key_t zdde_key; uint64_t zdde_ref_blocks; uint64_t zdde_ref_lsize; uint64_t zdde_ref_psize; uint64_t zdde_ref_dsize; avl_node_t zdde_node; } zdb_ddt_entry_t; /* ARGSUSED */ static int zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { avl_tree_t *t = arg; avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { (void) printf("traversing objset %llu, %llu objects, " "%lu blocks so far\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)BP_GET_FILL(bp), avl_numnodes(t)); } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); zdde = avl_find(t, &zdde_search, &where); if (zdde == NULL) { zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); zdde->zdde_key = zdde_search.zdde_key; avl_insert(t, zdde, where); } zdde->zdde_ref_blocks += 1; zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); zdde->zdde_ref_psize += BP_GET_PSIZE(bp); zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); return (0); } static void dump_simulated_ddt(spa_t *spa) { avl_tree_t t; void *cookie = NULL; zdb_ddt_entry_t *zdde; ddt_histogram_t ddh_total; ddt_stat_t dds_total; bzero(&ddh_total, sizeof (ddh_total)); bzero(&dds_total, sizeof (dds_total)); avl_create(&t, ddt_entry_compare, sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t); spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { ddt_stat_t dds; uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; dds.dds_psize = zdde->zdde_ref_psize / refcnt; dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; dds.dds_ref_blocks = zdde->zdde_ref_blocks; dds.dds_ref_lsize = zdde->zdde_ref_lsize; dds.dds_ref_psize = zdde->zdde_ref_psize; dds.dds_ref_dsize = zdde->zdde_ref_dsize; ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], &dds, 0); umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); ddt_histogram_stat(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); zpool_dump_ddt(&dds_total, &ddh_total); dump_dedup_ratio(&dds_total); } static int verify_device_removal_feature_counts(spa_t *spa) { uint64_t dr_feature_refcount = 0; uint64_t oc_feature_refcount = 0; uint64_t indirect_vdev_count = 0; uint64_t precise_vdev_count = 0; uint64_t obsolete_counts_object_count = 0; uint64_t obsolete_sm_count = 0; uint64_t obsolete_counts_count = 0; uint64_t scip_count = 0; uint64_t obsolete_bpobj_count = 0; int ret = 0; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; if (scip->scip_next_mapping_object != 0) { vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); (void) printf("Condensing indirect vdev %llu: new mapping " "object %llu, prev obsolete sm %llu\n", (u_longlong_t)scip->scip_vdev, (u_longlong_t)scip->scip_next_mapping_object, (u_longlong_t)scip->scip_prev_obsolete_sm_object); if (scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); (void) printf("\n"); space_map_close(prev_obsolete_sm); } scip_count += 2; } for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (vic->vic_mapping_object != 0) { ASSERT(vd->vdev_ops == &vdev_indirect_ops || vd->vdev_removing); indirect_vdev_count++; if (vd->vdev_indirect_mapping->vim_havecounts) { obsolete_counts_count++; } } boolean_t are_precise; VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (are_precise) { ASSERT(vic->vic_mapping_object != 0); precise_vdev_count++; } uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vic->vic_mapping_object != 0); obsolete_sm_count++; } } (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], &dr_feature_refcount); (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], &oc_feature_refcount); if (dr_feature_refcount != indirect_vdev_count) { ret = 1; (void) printf("Number of indirect vdevs (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)indirect_vdev_count, (u_longlong_t)dr_feature_refcount); } else { (void) printf("Verified device_removal feature refcount " \ "of %llu is correct\n", (u_longlong_t)dr_feature_refcount); } if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ) == 0) { obsolete_bpobj_count++; } obsolete_counts_object_count = precise_vdev_count; obsolete_counts_object_count += obsolete_sm_count; obsolete_counts_object_count += obsolete_counts_count; obsolete_counts_object_count += scip_count; obsolete_counts_object_count += obsolete_bpobj_count; obsolete_counts_object_count += remap_deadlist_count; if (oc_feature_refcount != obsolete_counts_object_count) { ret = 1; (void) printf("Number of obsolete counts objects (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)obsolete_counts_object_count, (u_longlong_t)oc_feature_refcount); (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " "ob:%llu rd:%llu\n", (u_longlong_t)precise_vdev_count, (u_longlong_t)obsolete_sm_count, (u_longlong_t)obsolete_counts_count, (u_longlong_t)scip_count, (u_longlong_t)obsolete_bpobj_count, (u_longlong_t)remap_deadlist_count); } else { (void) printf("Verified indirect_refcount feature refcount " \ "of %llu is correct\n", (u_longlong_t)oc_feature_refcount); } return (ret); } static void zdb_set_skip_mmp(char *target) { spa_t *spa; /* * Disable the activity check to allow examination of * active pools. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL) { spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; } mutex_exit(&spa_namespace_lock); } #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" /* * Import the checkpointed state of the pool specified by the target * parameter as readonly. The function also accepts a pool config * as an optional parameter, else it attempts to infer the config by * the name of the target pool. * * Note that the checkpointed state's pool name will be the name of * the original pool with the above suffix appened to it. In addition, * if the target is not a pool name (e.g. a path to a dataset) then * the new_path parameter is populated with the updated path to * reflect the fact that we are looking into the checkpointed state. * * The function returns a newly-allocated copy of the name of the * pool containing the checkpointed state. When this copy is no * longer needed it should be freed with free(3C). Same thing * applies to the new_path parameter if allocated. */ static char * import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) { int error = 0; char *poolname, *bogus_name = NULL; /* If the target is not a pool, the extract the pool name */ char *path_start = strchr(target, '/'); if (path_start != NULL) { size_t poolname_len = path_start - target; poolname = strndup(target, poolname_len); } else { poolname = target; } if (cfg == NULL) { zdb_set_skip_mmp(poolname); error = spa_get_stats(poolname, &cfg, NULL, 0); if (error != 0) { fatal("Tried to read config of pool \"%s\" but " "spa_get_stats() failed with error %d\n", poolname, error); } } if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) return (NULL); fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); error = spa_import(bogus_name, cfg, NULL, ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | ZFS_IMPORT_SKIP_MMP); if (error != 0) { fatal("Tried to import pool \"%s\" but spa_import() failed " "with error %d\n", bogus_name, error); } if (new_path != NULL && path_start != NULL) { if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) { if (path_start != NULL) free(poolname); return (NULL); } } if (target != poolname) free(poolname); return (bogus_name); } typedef struct verify_checkpoint_sm_entry_cb_arg { vdev_t *vcsec_vd; /* the following fields are only used for printing progress */ uint64_t vcsec_entryid; uint64_t vcsec_num_entries; } verify_checkpoint_sm_entry_cb_arg_t; #define ENTRIES_PER_PROGRESS_UPDATE 10000 static int verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) { verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; vdev_t *vd = vcsec->vcsec_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { (void) fprintf(stderr, "\rverifying vdev %llu, space map entry %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vcsec->vcsec_entryid, (longlong_t)vcsec->vcsec_num_entries); } vcsec->vcsec_entryid++; /* * See comment in checkpoint_sm_exclude_entry_cb() */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * The entries in the vdev_checkpoint_sm should be marked as * allocated in the checkpointed state of the pool, therefore * their respective ms_allocateable trees should not contain them. */ mutex_enter(&ms->ms_lock); range_tree_verify_not_present(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); return (0); } /* * Verify that all segments in the vdev_checkpoint_sm are allocated * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's * ms_allocatable). * * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of * each vdev in the current state of the pool to the metaslab space maps * (ms_sm) of the checkpointed state of the pool. * * Note that the function changes the state of the ms_allocatable * trees of the current spa_t. The entries of these ms_allocatable * trees are cleared out and then repopulated from with the free * entries of their respective ms_sm space maps. */ static void verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; vdev_t *current_vd = current_rvd->vdev_child[c]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * Since we don't allow device removal in a pool * that has a checkpoint, we expect that all removed * vdevs were removed from the pool before the * checkpoint. */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } /* * If the checkpoint space map doesn't exist, then nothing * here is checkpointed so there's nothing to verify. */ if (current_vd->vdev_top_zap == 0 || zap_contains(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), checkpoint_sm_obj, 0, current_vd->vdev_asize, current_vd->vdev_ashift)); verify_checkpoint_sm_entry_cb_arg_t vcsec; vcsec.vcsec_vd = ckpoint_vd; vcsec.vcsec_entryid = 0; vcsec.vcsec_num_entries = space_map_length(checkpoint_sm) / sizeof (uint64_t); VERIFY0(space_map_iterate(checkpoint_sm, space_map_length(checkpoint_sm), verify_checkpoint_sm_entry_cb, &vcsec)); if (dump_opt['m'] > 3) dump_spacemap(current->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } /* * If we've added vdevs since we took the checkpoint, ensure * that their checkpoint space maps are empty. */ if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { for (uint64_t c = ckpoint_rvd->vdev_children; c < current_rvd->vdev_children; c++) { vdev_t *current_vd = current_rvd->vdev_child[c]; ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } /* * Verifies that all space that's allocated in the checkpoint is * still allocated in the current version, by checking that everything * in checkpoint's ms_allocatable (which is actually allocated, not * allocatable/free) is not present in current's ms_allocatable. * * Note that the function changes the state of the ms_allocatable * trees of both spas when called. The entries of all ms_allocatable * trees are cleared out and then repopulated from their respective * ms_sm space maps. In the checkpointed state we load the allocated * entries, and in the current state we load the free entries. */ static void verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); load_concrete_ms_allocatable_trees(current, SM_FREE); for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; vdev_t *current_vd = current_rvd->vdev_child[i]; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * See comment in verify_checkpoint_vdev_spacemaps() */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; metaslab_t *current_msp = current_vd->vdev_ms[m]; (void) fprintf(stderr, "\rverifying vdev %llu of %llu, " "metaslab %llu of %llu ...", (longlong_t)current_vd->vdev_id, (longlong_t)current_rvd->vdev_children, (longlong_t)current_vd->vdev_ms[m]->ms_id, (longlong_t)current_vd->vdev_ms_count); /* * We walk through the ms_allocatable trees that * are loaded with the allocated blocks from the * ms_sm spacemaps of the checkpoint. For each * one of these ranges we ensure that none of them * exists in the ms_allocatable trees of the * current state which are loaded with the ranges * that are currently free. * * This way we ensure that none of the blocks that * are part of the checkpoint were freed by mistake. */ range_tree_walk(ckpoint_msp->ms_allocatable, (range_tree_func_t *)range_tree_verify_not_present, current_msp->ms_allocatable); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } static void verify_checkpoint_blocks(spa_t *spa) { ASSERT(!dump_opt['L']); spa_t *checkpoint_spa; char *checkpoint_pool; nvlist_t *config = NULL; int error = 0; /* * We import the checkpointed state of the pool (under a different * name) so we can do verification on it against the current state * of the pool. */ checkpoint_pool = import_checkpointed_state(spa->spa_name, config, NULL); ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but spa_open() failed with " "error %d\n", checkpoint_pool, error); } /* * Ensure that ranges in the checkpoint space maps of each vdev * are allocated according to the checkpointed state's metaslab * space maps. */ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); /* * Ensure that allocated ranges in the checkpoint's metaslab * space maps remain allocated in the metaslab space maps of * the current state. */ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); /* * Once we are done, we get rid of the checkpointed state. */ spa_close(checkpoint_spa, FTAG); free(checkpoint_pool); } static void dump_leftover_checkpoint_blocks(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (vd->vdev_top_zap == 0) continue; if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); dump_spacemap(spa->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } } static int verify_checkpoint(spa_t *spa) { uberblock_t checkpoint; int error; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (0); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT && !dump_opt['L']) { /* * If the feature is active but the uberblock is missing * then we must be in the middle of discarding the * checkpoint. */ (void) printf("\nPartially discarded checkpoint " "state found:\n"); if (dump_opt['m'] > 3) dump_leftover_checkpoint_blocks(spa); return (0); } else if (error != 0) { (void) printf("lookup error %d when looking for " "checkpointed uberblock in MOS\n", error); return (error); } dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); if (checkpoint.ub_checkpoint_txg == 0) { (void) printf("\nub_checkpoint_txg not set in checkpointed " "uberblock\n"); error = 3; } if (error == 0 && !dump_opt['L']) verify_checkpoint_blocks(spa); return (error); } /* ARGSUSED */ static void mos_leaks_cb(void *arg, uint64_t start, uint64_t size) { for (uint64_t i = start; i < size; i++) { (void) printf("MOS object %llu referenced but not allocated\n", (u_longlong_t)i); } } static void mos_obj_refd(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL) range_tree_add(mos_refd_objs, obj, 1); } /* * Call on a MOS object that may already have been referenced. */ static void mos_obj_refd_multiple(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL && !range_tree_contains(mos_refd_objs, obj, 1)) range_tree_add(mos_refd_objs, obj, 1); } static void mos_leak_vdev_top_zap(vdev_t *vd) { uint64_t ms_flush_data_obj; int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); if (error == ENOENT) return; ASSERT0(error); mos_obj_refd(ms_flush_data_obj); } static void mos_leak_vdev(vdev_t *vd) { mos_obj_refd(vd->vdev_dtl_object); mos_obj_refd(vd->vdev_ms_array); mos_obj_refd(vd->vdev_indirect_config.vic_births_object); mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); mos_obj_refd(vd->vdev_leaf_zap); if (vd->vdev_checkpoint_sm != NULL) mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); if (vd->vdev_indirect_mapping != NULL) { mos_obj_refd(vd->vdev_indirect_mapping-> vim_phys->vimp_counts_object); } if (vd->vdev_obsolete_sm != NULL) mos_obj_refd(vd->vdev_obsolete_sm->sm_object); for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *ms = vd->vdev_ms[m]; mos_obj_refd(space_map_object(ms->ms_sm)); } if (vd->vdev_top_zap != 0) { mos_obj_refd(vd->vdev_top_zap); mos_leak_vdev_top_zap(vd); } for (uint64_t c = 0; c < vd->vdev_children; c++) { mos_leak_vdev(vd->vdev_child[c]); } } static void mos_leak_log_spacemaps(spa_t *spa) { uint64_t spacemap_zap; int error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); if (error == ENOENT) return; ASSERT0(error); mos_obj_refd(spacemap_zap); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) mos_obj_refd(sls->sls_sm_obj); } static int dump_mos_leaks(spa_t *spa) { int rv = 0; objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; /* Visit and mark all referenced objects in the MOS */ mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); mos_obj_refd(spa->spa_pool_props_object); mos_obj_refd(spa->spa_config_object); mos_obj_refd(spa->spa_ddt_stat_object); mos_obj_refd(spa->spa_feat_desc_obj); mos_obj_refd(spa->spa_feat_enabled_txg_obj); mos_obj_refd(spa->spa_feat_for_read_obj); mos_obj_refd(spa->spa_feat_for_write_obj); mos_obj_refd(spa->spa_history); mos_obj_refd(spa->spa_errlog_last); mos_obj_refd(spa->spa_errlog_scrub); mos_obj_refd(spa->spa_all_vdev_zaps); mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); bpobj_count_refd(&spa->spa_deferred_bpobj); mos_obj_refd(dp->dp_empty_bpobj); bpobj_count_refd(&dp->dp_obsolete_bpobj); bpobj_count_refd(&dp->dp_free_bpobj); mos_obj_refd(spa->spa_l2cache.sav_object); mos_obj_refd(spa->spa_spares.sav_object); if (spa->spa_syncing_log_sm != NULL) mos_obj_refd(spa->spa_syncing_log_sm->sm_object); mos_leak_log_spacemaps(spa); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_next_mapping_object); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_prev_obsolete_sm_object); if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(mos, spa->spa_condensing_indirect_phys.scip_next_mapping_object); mos_obj_refd(vim->vim_phys->vimp_counts_object); vdev_indirect_mapping_close(vim); } deleted_livelists_dump_mos(spa); if (dp->dp_origin_snap != NULL) { dsl_dataset_t *ds; dsl_pool_config_enter(dp, FTAG); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, FTAG, &ds)); count_ds_mos_objects(ds); dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); count_ds_mos_objects(dp->dp_origin_snap); dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); } count_dir_mos_objects(dp->dp_mos_dir); if (dp->dp_free_dir != NULL) count_dir_mos_objects(dp->dp_free_dir); if (dp->dp_leak_dir != NULL) count_dir_mos_objects(dp->dp_leak_dir); mos_leak_vdev(spa->spa_root_vdev); for (uint64_t class = 0; class < DDT_CLASSES; class++) { for (uint64_t type = 0; type < DDT_TYPES; type++) { for (uint64_t cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { ddt_t *ddt = spa->spa_ddt[cksum]; mos_obj_refd(ddt->ddt_object[type][class]); } } } /* * Visit all allocated objects and make sure they are referenced. */ uint64_t object = 0; while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { if (range_tree_contains(mos_refd_objs, object, 1)) { range_tree_remove(mos_refd_objs, object, 1); } else { dmu_object_info_t doi; const char *name; dmu_object_info(mos, object, &doi); if (doi.doi_type & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(doi.doi_type); name = dmu_ot_byteswap[bswap].ob_name; } else { name = dmu_ot[doi.doi_type].ot_name; } (void) printf("MOS object %llu (%s) leaked\n", (u_longlong_t)object, name); rv = 2; } } (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); if (!range_tree_is_empty(mos_refd_objs)) rv = 2; range_tree_vacate(mos_refd_objs, NULL, NULL); range_tree_destroy(mos_refd_objs); return (rv); } typedef struct log_sm_obsolete_stats_arg { uint64_t lsos_current_txg; uint64_t lsos_total_entries; uint64_t lsos_valid_entries; uint64_t lsos_sm_entries; uint64_t lsos_valid_sm_entries; } log_sm_obsolete_stats_arg_t; static int log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { log_sm_obsolete_stats_arg_t *lsos = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; if (lsos->lsos_current_txg == 0) { /* this is the first log */ lsos->lsos_current_txg = txg; } else if (lsos->lsos_current_txg < txg) { /* we just changed log - print stats and reset */ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", (u_longlong_t)lsos->lsos_valid_sm_entries, (u_longlong_t)lsos->lsos_sm_entries, (u_longlong_t)lsos->lsos_current_txg); lsos->lsos_valid_sm_entries = 0; lsos->lsos_sm_entries = 0; lsos->lsos_current_txg = txg; } ASSERT3U(lsos->lsos_current_txg, ==, txg); lsos->lsos_sm_entries++; lsos->lsos_total_entries++; vdev_t *vd = vdev_lookup_top(spa, vdev_id); if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); lsos->lsos_valid_sm_entries++; lsos->lsos_valid_entries++; return (0); } static void dump_log_spacemap_obsolete_stats(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; log_sm_obsolete_stats_arg_t lsos; bzero(&lsos, sizeof (lsos)); (void) printf("Log Space Map Obsolete Entry Statistics:\n"); iterate_through_spacemap_logs(spa, log_spacemap_obsolete_stats_cb, &lsos); /* print stats for latest log */ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", (u_longlong_t)lsos.lsos_valid_sm_entries, (u_longlong_t)lsos.lsos_sm_entries, (u_longlong_t)lsos.lsos_current_txg); (void) printf("%-8llu valid entries out of %-8llu - total\n\n", (u_longlong_t)lsos.lsos_valid_entries, (u_longlong_t)lsos.lsos_total_entries); } static void dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; if (dump_opt['S']) { dump_simulated_ddt(spa); return; } if (!dump_opt['e'] && dump_opt['C'] > 1) { (void) printf("\nCached configuration:\n"); dump_nvlist(spa->spa_config, 8); } if (dump_opt['C']) dump_config(spa); if (dump_opt['u']) dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); if (dump_opt['D']) dump_all_ddts(spa); if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa); if (dump_opt['d'] > 2 || dump_opt['m']) { dump_log_spacemaps(spa); dump_log_spacemap_obsolete_stats(spa); } if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; mos_refd_objs = range_tree_create(NULL, NULL); dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dsl_pool_t *dp = spa->spa_dsl_pool; dump_full_bpobj(&spa->spa_deferred_bpobj, "Deferred frees", 0); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { dump_full_bpobj(&dp->dp_free_bpobj, "Pool snapshot frees", 0); } if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); dump_full_bpobj(&dp->dp_obsolete_bpobj, "Pool obsolete blocks", 0); } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { dump_bptree(spa->spa_meta_objset, dp->dp_bptree_obj, "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); } for (spa_feature_t f = 0; f < SPA_FEATURES; f++) global_feature_count[f] = UINT64_MAX; global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; global_feature_count[SPA_FEATURE_LIVELIST] = 0; (void) dmu_objset_find(spa_name(spa), dump_one_objset, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); if (rc == 0 && !dump_opt['L']) rc = dump_mos_leaks(spa); for (f = 0; f < SPA_FEATURES; f++) { uint64_t refcount; uint64_t *arr; if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) { if (global_feature_count[f] == UINT64_MAX) continue; if (!spa_feature_is_enabled(spa, f)) { ASSERT0(global_feature_count[f]); continue; } arr = global_feature_count; } else { if (!spa_feature_is_enabled(spa, f)) { ASSERT0(dataset_feature_count[f]); continue; } arr = dataset_feature_count; } if (feature_get_refcount(spa, &spa_feature_table[f], &refcount) == ENOTSUP) continue; if (arr[f] != refcount) { (void) printf("%s feature refcount mismatch: " "%lld consumers != %lld refcount\n", spa_feature_table[f].fi_uname, (longlong_t)arr[f], (longlong_t)refcount); rc = 2; } else { (void) printf("Verified %s feature refcount " "of %llu is correct\n", spa_feature_table[f].fi_uname, (longlong_t)refcount); } } if (rc == 0) rc = verify_device_removal_feature_counts(spa); } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); if (rc == 0) rc = verify_spacemap_refcounts(spa); if (dump_opt['s']) show_pool_stats(spa); if (dump_opt['h']) dump_history(spa); if (rc == 0) rc = verify_checkpoint(spa); if (rc != 0) { dump_debug_buffer(); exit(rc); } } #define ZDB_FLAG_CHECKSUM 0x0001 #define ZDB_FLAG_DECOMPRESS 0x0002 #define ZDB_FLAG_BSWAP 0x0004 #define ZDB_FLAG_GBH 0x0008 #define ZDB_FLAG_INDIRECT 0x0010 #define ZDB_FLAG_PHYS 0x0020 #define ZDB_FLAG_RAW 0x0040 #define ZDB_FLAG_PRINT_BLKPTR 0x0080 static int flagbits[256]; static void zdb_print_blkptr(blkptr_t *bp, int flags) { char blkbuf[BP_SPRINTF_LEN]; if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static void zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) { int i; for (i = 0; i < nbps; i++) zdb_print_blkptr(&bp[i], flags); } static void zdb_dump_gbh(void *buf, int flags) { zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); } static void zdb_dump_block_raw(void *buf, uint64_t size, int flags) { if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array(buf, size); VERIFY(write(fileno(stdout), buf, size) == size); } static void zdb_dump_block(char *label, void *buf, uint64_t size, int flags) { uint64_t *d = (uint64_t *)buf; unsigned nwords = size / sizeof (uint64_t); int do_bswap = !!(flags & ZDB_FLAG_BSWAP); unsigned i, j; const char *hdr; char *c; if (do_bswap) hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; else hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); #ifdef _LITTLE_ENDIAN /* correct the endianness */ do_bswap = !do_bswap; #endif for (i = 0; i < nwords; i += 2) { (void) printf("%06llx: %016llx %016llx ", (u_longlong_t)(i * sizeof (uint64_t)), (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); c = (char *)&d[i]; for (j = 0; j < 2 * sizeof (uint64_t); j++) (void) printf("%c", isprint(c[j]) ? c[j] : '.'); (void) printf("\n"); } } /* * There are two acceptable formats: * leaf_name - For example: c1t0d0 or /tmp/ztest.0a * child[.child]* - For example: 0.1.1 * * The second form can be used to specify arbitrary vdevs anywhere * in the hierarchy. For example, in a pool with a mirror of * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . */ static vdev_t * zdb_vdev_lookup(vdev_t *vdev, const char *path) { char *s, *p, *q; unsigned i; if (vdev == NULL) return (NULL); /* First, assume the x.x.x.x format */ i = strtoul(path, &s, 10); if (s == path || (s && *s != '.' && *s != '\0')) goto name; if (i >= vdev->vdev_children) return (NULL); vdev = vdev->vdev_child[i]; if (s && *s == '\0') return (vdev); return (zdb_vdev_lookup(vdev, s+1)); name: for (i = 0; i < vdev->vdev_children; i++) { vdev_t *vc = vdev->vdev_child[i]; if (vc->vdev_path == NULL) { vc = zdb_vdev_lookup(vc, path); if (vc == NULL) continue; else return (vc); } p = strrchr(vc->vdev_path, '/'); p = p ? p + 1 : vc->vdev_path; q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; if (strcmp(vc->vdev_path, path) == 0) return (vc); if (strcmp(p, path) == 0) return (vc); if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) return (vc); } return (NULL); } /* * Read a block from a pool and print it out. The syntax of the * block descriptor is: * * pool:vdev_specifier:offset:size[:flags] * * pool - The name of the pool you wish to read from * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) * offset - offset, in hex, in bytes * size - Amount of data to read, in hex, in bytes * flags - A string of characters specifying options * b: Decode a blkptr at given offset within block * *c: Calculate and display checksums * d: Decompress data before dumping * e: Byteswap data before dumping * g: Display data as a gang block header * i: Display as an indirect block * p: Do I/O to physical offset * r: Dump raw data to stdout * * * = not yet implemented */ static void zdb_read_block(char *thing, spa_t *spa) { blkptr_t blk, *bp = &blk; dva_t *dva = bp->blk_dva; int flags = 0; uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; abd_t *pabd; void *lbuf, *buf; const char *s, *vdev; char *p, *dup, *flagstr; int i, error; boolean_t borrowed = B_FALSE; dup = strdup(thing); s = strtok(dup, ":"); vdev = s ? s : ""; s = strtok(NULL, ":"); offset = strtoull(s ? s : "", NULL, 16); s = strtok(NULL, ":"); size = strtoull(s ? s : "", NULL, 16); s = strtok(NULL, ":"); if (s) flagstr = strdup(s); else flagstr = strdup(""); s = NULL; if (size == 0) s = "size must not be zero"; if (!IS_P2ALIGNED(size, DEV_BSIZE)) s = "size must be a multiple of sector size"; if (!IS_P2ALIGNED(offset, DEV_BSIZE)) s = "offset must be a multiple of sector size"; if (s) { (void) printf("Invalid block specifier: %s - %s\n", thing, s); free(flagstr); free(dup); return; } for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { for (i = 0; flagstr[i]; i++) { int bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { (void) printf("***Invalid flag: %c\n", flagstr[i]); continue; } flags |= bit; /* If it's not something with an argument, keep going */ if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_PRINT_BLKPTR)) == 0) continue; p = &flagstr[i + 1]; if (bit == ZDB_FLAG_PRINT_BLKPTR) { blkptr_offset = strtoull(p, &p, 16); i = p - &flagstr[i + 1]; } if (*p != ':' && *p != '\0') { (void) printf("***Invalid flag arg: '%s'\n", s); free(flagstr); free(dup); return; } } } free(flagstr); vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { (void) printf("***Invalid vdev: %s\n", vdev); free(dup); return; } else { if (vd->vdev_path) (void) fprintf(stderr, "Found vdev: %s\n", vd->vdev_path); else (void) fprintf(stderr, "Found vdev type: %s\n", vd->vdev_ops->vdev_op_type); } psize = size; lsize = size; pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); DVA_SET_VDEV(&dva[0], vd->vdev_id); DVA_SET_OFFSET(&dva[0], offset); DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, lsize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, 0); if (vd == vd->vdev_top) { /* * Treat this as a normal block read. */ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); if (error) { (void) printf("Read of %s failed, error: %d\n", thing, error); goto out; } if (flags & ZDB_FLAG_DECOMPRESS) { /* * We don't know how the data was compressed, so just try * every decompress function at every inflated blocksize. */ enum zio_compress c; void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); /* * XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB, * this could take a while and we should let the user know * we are not stuck. On the other hand, printing progress * info gets old after a while. What to do? */ for (lsize = psize + SPA_MINBLOCKSIZE; lsize <= SPA_MAXBLOCKSIZE; lsize += SPA_MINBLOCKSIZE) { for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) { /* * ZLE can easily decompress non zle stream. * So have an option to disable it. */ if (c == ZIO_COMPRESS_ZLE && getenv("ZDB_NO_ZLE")) continue; (void) fprintf(stderr, "Trying %05llx -> %05llx (%s)\n", (u_longlong_t)psize, (u_longlong_t)lsize, zio_compress_table[c].ci_name); /* * We randomize lbuf2, and decompress to both * lbuf and lbuf2. This way, we will know if * decompression fill exactly to lsize. */ VERIFY0(random_get_pseudo_bytes(lbuf2, lsize)); if (zio_decompress_data(c, pabd, lbuf, psize, lsize) == 0 && zio_decompress_data(c, pabd, lbuf2, psize, lsize) == 0 && bcmp(lbuf, lbuf2, lsize) == 0) break; } if (c != ZIO_COMPRESS_FUNCTIONS) break; } umem_free(lbuf2, SPA_MAXBLOCKSIZE); if (lsize > SPA_MAXBLOCKSIZE) { (void) printf("Decompress of %s failed\n", thing); goto out; } buf = lbuf; size = lsize; } else { size = psize; buf = abd_borrow_buf_copy(pabd, size); borrowed = B_TRUE; } if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); else if (flags & ZDB_FLAG_RAW) zdb_dump_block_raw(buf, size, flags); else if (flags & ZDB_FLAG_INDIRECT) zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) zdb_dump_gbh(buf, flags); else zdb_dump_block(thing, buf, size, flags); if (borrowed) abd_return_buf_copy(pabd, buf, size); out: abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); } static void zdb_embedded_block(char *thing) { blkptr_t bp; unsigned long long *words = (void *)&bp; char *buf; int err; bzero(&bp, sizeof (bp)); err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", words + 0, words + 1, words + 2, words + 3, words + 4, words + 5, words + 6, words + 7, words + 8, words + 9, words + 10, words + 11, words + 12, words + 13, words + 14, words + 15); if (err != 16) { (void) fprintf(stderr, "invalid input format\n"); exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); exit(1); } err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { (void) fprintf(stderr, "decode failed: %u\n", err); exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); free(buf); } int main(int argc, char **argv) { int c; struct rlimit rl = { 1024, 1024 }; spa_t *spa = NULL; objset_t *os = NULL; int dump_all = 1; int verbose = 0; int error = 0; char **searchdirs = NULL; int nsearch = 0; char *target, *target_pool; nvlist_t *policy = NULL; uint64_t max_txg = UINT64_MAX; int flags = ZFS_IMPORT_MISSING_LOG; int rewind = ZPOOL_NEVER_REWIND; char *spa_config_path_env; boolean_t target_is_spa = B_TRUE; nvlist_t *cfg = NULL; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); dprintf_setup(&argc, argv); /* * If there is an environment variable SPA_CONFIG_PATH it overrides * default spa_config_path setting. If -U flag is specified it will * override this environment variable settings once again. */ spa_config_path_env = getenv("SPA_CONFIG_PATH"); if (spa_config_path_env != NULL) spa_config_path = spa_config_path_env; while ((c = getopt(argc, argv, "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) { switch (c) { case 'b': case 'c': case 'C': case 'd': case 'D': case 'E': case 'G': case 'h': case 'i': case 'l': case 'm': case 'M': case 'O': case 'R': case 's': case 'S': case 'u': dump_opt[c]++; dump_all = 0; break; case 'A': case 'e': case 'F': case 'k': case 'L': case 'P': case 'q': case 'X': dump_opt[c]++; break; case 'Y': zfs_reconstruct_indirect_combinations_max = INT_MAX; zfs_deadman_enabled = 0; break; /* NB: Sort single match options below. */ case 'I': max_inflight = strtoull(optarg, NULL, 0); if (max_inflight == 0) { (void) fprintf(stderr, "maximum number " "of inflight I/Os must be greater " "than 0\n"); usage(); } break; case 'o': error = set_global_var(optarg); if (error != 0) usage(); break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); } else { char **tmp = umem_alloc((nsearch + 1) * sizeof (char *), UMEM_NOFAIL); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 't': max_txg = strtoull(optarg, NULL, 0); if (max_txg < TXG_INITIAL) { (void) fprintf(stderr, "incorrect txg " "specified: %s\n", optarg); usage(); } break; case 'U': spa_config_path = optarg; if (spa_config_path[0] != '/') { (void) fprintf(stderr, "cachefile must be an absolute path " "(i.e. start with a slash)\n"); usage(); } break; case 'v': verbose++; break; case 'V': flags = ZFS_IMPORT_VERBATIM; break; case 'x': vn_dumpdir = optarg; break; default: usage(); break; } } if (!dump_opt['e'] && searchdirs != NULL) { (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } #if defined(_LP64) /* * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; #endif /* * "zdb -c" uses checksum-verifying scrub i/os which are async reads. * "zdb -b" uses traversal prefetch which uses async reads. * For good performance, let several of them be active at once. */ zfs_vdev_async_read_max_active = 10; /* * Disable reference tracking for better performance. */ reference_tracking_enable = B_FALSE; /* * Do not fail spa_load when spa_load_verify fails. This is needed * to load non-idle pools. */ spa_load_verify_dryrun = B_TRUE; kernel_init(FREAD); if (dump_all) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { if (dump_all && strchr("AeEFklLOPRSX", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); zfs_recover = (dump_opt['A'] > 1); argc -= optind; argv += optind; if (argc < 2 && dump_opt['R']) usage(); if (dump_opt['E']) { if (argc != 1) usage(); zdb_embedded_block(argv[0]); return (0); } if (argc < 1) { if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); return (0); } usage(); } if (dump_opt['l']) return (dump_label(argv[0])); if (dump_opt['O']) { if (argc != 2) usage(); dump_opt['v'] = verbose + 3; return (dump_path(argv[0], argv[1])); } if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) fatal("internal error: %s", strerror(ENOMEM)); error = 0; target = argv[0]; if (strpbrk(target, "/@") != NULL) { size_t targetlen; target_pool = strdup(target); *strpbrk(target_pool, "/@") = '\0'; target_is_spa = B_FALSE; targetlen = strlen(target); if (targetlen && target[targetlen - 1] == '/') target[targetlen - 1] = '\0'; } else { target_pool = target; } if (dump_opt['e']) { importargs_t args = { 0 }; args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_TRUE; error = zpool_find_config(NULL, target_pool, &cfg, &args, &libzpool_config_ops); if (error == 0) { if (nvlist_add_nvlist(cfg, ZPOOL_LOAD_POLICY, policy) != 0) { fatal("can't open '%s': %s", target, strerror(ENOMEM)); } if (dump_opt['C'] > 1) { (void) printf("\nConfiguration for import:\n"); dump_nvlist(cfg, 8); } /* * Disable the activity check to allow examination of * active pools. */ error = spa_import(target_pool, cfg, NULL, flags | ZFS_IMPORT_SKIP_MMP); } } /* * import_checkpointed_state makes the assumption that the * target pool that we pass it is already part of the spa * namespace. Because of that we need to make sure to call * it always after the -e option has been processed, which * imports the pool to the namespace if it's not in the * cachefile. */ char *checkpoint_pool = NULL; char *checkpoint_target = NULL; if (dump_opt['k']) { checkpoint_pool = import_checkpointed_state(target, cfg, &checkpoint_target); if (checkpoint_target != NULL) target = checkpoint_target; } if (target_pool != target) free(target_pool); if (error == 0) { if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { ASSERT(checkpoint_pool != NULL); ASSERT(checkpoint_target == NULL); error = spa_open(checkpoint_pool, &spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but " "spa_open() failed with error %d\n", checkpoint_pool, error); } } else if (target_is_spa || dump_opt['R']) { zdb_set_skip_mmp(target); error = spa_open_rewind(target, &spa, FTAG, policy, NULL); if (error) { /* * If we're missing the log device then * try opening the pool after clearing the * log state. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL && spa->spa_log_state == SPA_LOG_MISSING) { spa->spa_log_state = SPA_LOG_CLEAR; error = 0; } mutex_exit(&spa_namespace_lock); if (!error) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); } } } else if (strpbrk(target, "#") != NULL) { dsl_pool_t *dp; error = dsl_pool_hold(target, FTAG, &dp); if (error != 0) { fatal("can't dump '%s': %s", target, strerror(error)); } error = dump_bookmark(dp, target, B_TRUE, verbose > 1); dsl_pool_rele(dp, FTAG); if (error != 0) { fatal("can't dump '%s': %s", target, strerror(error)); } return (error); } else { zdb_set_skip_mmp(target); error = open_objset(target, FTAG, &os); if (error == 0) spa = dmu_objset_spa(os); } } nvlist_free(policy); if (error) fatal("can't open '%s': %s", target, strerror(error)); /* * Set the pool failure mode to panic in order to prevent the pool * from suspending. A suspended I/O will have no way to resume and * can prevent the zdb(8) command from terminating as expected. */ if (spa != NULL) spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; argv++; argc--; if (!dump_opt['R']) { if (argc > 0) { zopt_objects = argc; zopt_object = calloc(zopt_objects, sizeof (uint64_t)); for (unsigned i = 0; i < zopt_objects; i++) { errno = 0; zopt_object[i] = strtoull(argv[i], NULL, 0); if (zopt_object[i] == 0 && errno != 0) fatal("bad number %s: %s", argv[i], strerror(errno)); } } if (os != NULL) { dump_objset(os); } else if (zopt_objects > 0 && !dump_opt['m']) { dump_objset(spa->spa_meta_objset); } else { dump_zpool(spa); } } else { flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; flagbits['c'] = ZDB_FLAG_CHECKSUM; flagbits['d'] = ZDB_FLAG_DECOMPRESS; flagbits['e'] = ZDB_FLAG_BSWAP; flagbits['g'] = ZDB_FLAG_GBH; flagbits['i'] = ZDB_FLAG_INDIRECT; flagbits['p'] = ZDB_FLAG_PHYS; flagbits['r'] = ZDB_FLAG_RAW; for (int i = 0; i < argc; i++) zdb_read_block(argv[i], spa); } if (dump_opt['k']) { free(checkpoint_pool); if (!target_is_spa) free(checkpoint_target); } if (os != NULL) { close_objset(os, FTAG); } else { spa_close(spa, FTAG); } fuid_table_destroy(); dump_debug_buffer(); kernel_fini(); return (error); } diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 973f15d75713..7dd5fe2b54c7 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -1,144 +1,144 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #ifndef _SYS_METASLAB_H #define _SYS_METASLAB_H #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct metaslab_ops { uint64_t (*msop_alloc)(metaslab_t *, uint64_t); } metaslab_ops_t; extern metaslab_ops_t *zfs_metaslab_ops; int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, metaslab_t **); void metaslab_fini(metaslab_t *); void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *); void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *); uint64_t metaslab_unflushed_txg(metaslab_t *); uint64_t metaslab_estimated_condensed_size(metaslab_t *); int metaslab_sort_by_flushed(const void *, const void *); uint64_t metaslab_unflushed_changes_memused(metaslab_t *); int metaslab_load(metaslab_t *); void metaslab_potentially_unload(metaslab_t *, uint64_t); void metaslab_unload(metaslab_t *); boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *); uint64_t metaslab_allocated_space(metaslab_t *); void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); -uint64_t metaslab_block_maxsize(metaslab_t *); +uint64_t metaslab_largest_allocatable(metaslab_t *); /* * metaslab alloc flags */ #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 #define METASLAB_GANG_HEADER 0x2 #define METASLAB_GANG_CHILD 0x4 #define METASLAB_ASYNC_ALLOC 0x8 #define METASLAB_DONT_THROTTLE 0x10 #define METASLAB_MUST_RESERVE 0x20 #define METASLAB_FASTWRITE 0x40 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, int); int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *); void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); void metaslab_alloc_trace_init(void); void metaslab_alloc_trace_fini(void); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *); void metaslab_class_destroy(metaslab_class_t *); int metaslab_class_validate(metaslab_class_t *); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, zio_t *, int); void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); uint64_t metaslab_class_get_deferred(metaslab_class_t *); void metaslab_space_update(vdev_t *, metaslab_class_t *, int64_t, int64_t, int64_t); metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); void metaslab_group_passivate(metaslab_group_t *); boolean_t metaslab_group_initialized(metaslab_group_t *); uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, boolean_t); void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int); void metaslab_recalculate_weight_and_sort(metaslab_t *); void metaslab_disable(metaslab_t *); void metaslab_enable(metaslab_t *, boolean_t); extern int metaslab_debug_load; #ifdef __cplusplus } #endif #endif /* _SYS_METASLAB_H */ diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 29bc8cd5e118..08ee8d279ddd 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -1,535 +1,542 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2011, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H #define _SYS_METASLAB_IMPL_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Metaslab allocation tracing record. */ typedef struct metaslab_alloc_trace { list_node_t mat_list_node; metaslab_group_t *mat_mg; metaslab_t *mat_msp; uint64_t mat_size; uint64_t mat_weight; uint32_t mat_dva_id; uint64_t mat_offset; int mat_allocator; } metaslab_alloc_trace_t; /* * Used by the metaslab allocation tracing facility to indicate * error conditions. These errors are stored to the offset member * of the metaslab_alloc_trace_t record and displayed by mdb. */ typedef enum trace_alloc_type { TRACE_ALLOC_FAILURE = -1ULL, TRACE_TOO_SMALL = -2ULL, TRACE_FORCE_GANG = -3ULL, TRACE_NOT_ALLOCATABLE = -4ULL, TRACE_GROUP_FAILURE = -5ULL, TRACE_ENOSPC = -6ULL, TRACE_CONDENSING = -7ULL, TRACE_VDEV_ERROR = -8ULL, TRACE_DISABLED = -9ULL, } trace_alloc_type_t; #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_WEIGHT_CLAIM (1ULL << 61) #define METASLAB_WEIGHT_TYPE (1ULL << 60) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ METASLAB_WEIGHT_CLAIM) /* * The metaslab weight is used to encode the amount of free space in a * metaslab, such that the "best" metaslab appears first when sorting the * metaslabs by weight. The weight (and therefore the "best" metaslab) can * be determined in two different ways: by computing a weighted sum of all * the free space in the metaslab (a space based weight) or by counting only * the free segments of the largest size (a segment based weight). We prefer * the segment based weight because it reflects how the free space is * comprised, but we cannot always use it -- legacy pools do not have the * space map histogram information necessary to determine the largest * contiguous regions. Pools that have the space map histogram determine * the segment weight by looking at each bucket in the histogram and * determining the free space whose size in bytes is in the range: * [2^i, 2^(i+1)) * We then encode the largest index, i, that contains regions into the * segment-weighted value. * * Space-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * |PSC1| weighted-free space | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation * C - indicates activation for claimed block zio * space - the fragmentation-weighted space * * Segment-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * |PSC0| idx| count of segments in region | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation * C - indicates activation for claimed block zio * idx - index for the highest bucket in the histogram * count - number of segments in the specified bucket */ #define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) #define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) #define WEIGHT_IS_SPACEBASED(weight) \ ((weight) == 0 || BF64_GET((weight), 60, 1)) #define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) /* * These macros are only applicable to segment-based weighting. */ #define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) #define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) #define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) #define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) /* * A metaslab class encompasses a category of allocatable top-level vdevs. * Each top-level vdev is associated with a metaslab group which defines * the allocatable region for that vdev. Examples of these categories include * "normal" for data block allocations (i.e. main pool allocations) or "log" * for allocations designated for intent log devices (i.e. slog devices). * When a block allocation is requested from the SPA it is associated with a * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging * to the class can be used to satisfy that request. Allocations are done * by traversing the metaslab groups that are linked off of the mc_rotor field. * This rotor points to the next metaslab group where allocations will be * attempted. Allocating a block is a 3 step process -- select the metaslab * group, select the metaslab, and then allocate the block. The metaslab * class defines the low-level block allocator that will be used as the * final step in allocation. These allocators are pluggable allowing each class * to use a block allocator that best suits that class. */ struct metaslab_class { kmutex_t mc_lock; spa_t *mc_spa; metaslab_group_t *mc_rotor; metaslab_ops_t *mc_ops; uint64_t mc_aliquot; /* * Track the number of metaslab groups that have been initialized * and can accept allocations. An initialized metaslab group is * one has been completely added to the config (i.e. we have * updated the MOS config and the space has been added to the pool). */ uint64_t mc_groups; /* * Toggle to enable/disable the allocation throttle. */ boolean_t mc_alloc_throttle_enabled; /* * The allocation throttle works on a reservation system. Whenever * an asynchronous zio wants to perform an allocation it must * first reserve the number of blocks that it wants to allocate. * If there aren't sufficient slots available for the pending zio * then that I/O is throttled until more slots free up. The current * number of reserved allocations is maintained by the mc_alloc_slots * refcount. The mc_alloc_max_slots value determines the maximum * number of allocations that the system allows. Gang blocks are * allowed to reserve slots even if we've reached the maximum * number of allocations allowed. */ uint64_t *mc_alloc_max_slots; zfs_refcount_t *mc_alloc_slots; uint64_t mc_alloc_groups; /* # of allocatable groups */ uint64_t mc_alloc; /* total allocated space */ uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; }; /* * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) * of a top-level vdev. They are linked together to form a circular linked * list and can belong to only one metaslab class. Metaslab groups may become * ineligible for allocations for a number of reasons such as limited free * space, fragmentation, or going offline. When this happens the allocator will * simply find the next metaslab group in the linked list and attempt * to allocate from that group instead. */ struct metaslab_group { kmutex_t mg_lock; metaslab_t **mg_primaries; metaslab_t **mg_secondaries; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; boolean_t mg_allocatable; /* can we allocate? */ uint64_t mg_ms_ready; /* * A metaslab group is considered to be initialized only after * we have updated the MOS config and added the space to the pool. * We only allow allocation attempts to a metaslab group if it * has been initialized. */ boolean_t mg_initialized; uint64_t mg_free_capacity; /* percentage free */ int64_t mg_bias; int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; taskq_t *mg_taskq; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; /* * In order for the allocation throttle to function properly, we cannot * have too many IOs going to each disk by default; the throttle * operates by allocating more work to disks that finish quickly, so * allocating larger chunks to each disk reduces its effectiveness. * However, if the number of IOs going to each allocator is too small, * we will not perform proper aggregation at the vdev_queue layer, * also resulting in decreased performance. Therefore, we will use a * ramp-up strategy. * * Each allocator in each metaslab group has a current queue depth * (mg_alloc_queue_depth[allocator]) and a current max queue depth * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group * has an absolute max queue depth (mg_max_alloc_queue_depth). We * add IOs to an allocator until the mg_alloc_queue_depth for that * allocator hits the cur_max. Every time an IO completes for a given * allocator on a given metaslab group, we increment its cur_max until * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to * help protect against disks that decrease in performance over time. * * It's possible for an allocator to handle more allocations than * its max. This can occur when gang blocks are required or when other * groups are unable to handle their share of allocations. */ uint64_t mg_max_alloc_queue_depth; uint64_t *mg_cur_max_alloc_queue_depth; zfs_refcount_t *mg_alloc_queue_depth; int mg_allocators; /* * A metalab group that can no longer allocate the minimum block * size will set mg_no_free_space. Once a metaslab group is out * of space then its share of work must be distributed to other * groups. */ boolean_t mg_no_free_space; uint64_t mg_allocations; uint64_t mg_failed_allocations; uint64_t mg_fragmentation; uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; int mg_ms_disabled; boolean_t mg_disabled_updating; kmutex_t mg_ms_disabled_lock; kcondvar_t mg_ms_disabled_cv; }; /* * This value defines the number of elements in the ms_lbas array. The value * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. * This is the equivalent of highbit(UINT64_MAX). */ #define MAX_LBAS 64 /* * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. As blocks are * allocated, the allocated segment are removed from the ms_allocatable and * added to a per txg allocation tree (ms_allocating). As blocks are * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context * where it is safe to update the on-disk space maps. An additional set * of in-core trees is maintained to track deferred frees * (ms_defer). Once a block is freed it will move from the * ms_freed to the ms_defer tree. A deferred free means that a block * has been freed but cannot be used by the pool until TXG_DEFER_SIZE * transactions groups later. For example, a block that is freed in txg * 50 will not be available for reallocation until txg 52 (50 + * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback. * A pool could be safely rolled back TXG_DEFERS_SIZE transactions * groups and ensure that no block has been reallocated. * * The simplified transition diagram looks like this: * * * ALLOCATE * | * V * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map) * ^ * | ms_freeing <--- FREE * | | * | v * | ms_freed * | | * +-------- ms_defer[2] <-------+-------> (write to space map) * * * Each metaslab's space is tracked in a single space map in the MOS, * which is only updated in syncing context. Each time we sync a txg, * we append the allocs and frees from that txg to the space map. The * pool space is only updated once all metaslabs have finished syncing. * * To load the in-core free tree we read the space map from disk. This * object contains a series of alloc and free records that are combined * to make up the list of all free segments in this metaslab. These * segments are represented in-core by the ms_allocatable and are stored * in an AVL tree. * * As the space map grows (as a result of the appends) it will * eventually become space-inefficient. When the metaslab's in-core * free tree is zfs_condense_pct/100 times the size of the minimal * on-disk representation, we rewrite it in its minimized form. If a * metaslab needs to condense then we must set the ms_condensing flag to * ensure that allocations are not performed on the metaslab that is * being written. */ struct metaslab { /* * This is the main lock of the metaslab and its purpose is to * coordinate our allocations and frees [e.g metaslab_block_alloc(), * metaslab_free_concrete(), ..etc] with our various syncing * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. * * The lock is also used during some miscellaneous operations like * using the metaslab's histogram for the metaslab group's histogram * aggregation, or marking the metaslab for initialization. */ kmutex_t ms_lock; /* * Acquired together with the ms_lock whenever we expect to * write to metaslab data on-disk (i.e flushing entries to * the metaslab's space map). It helps coordinate readers of * the metaslab's space map [see spa_vdev_remove_thread()] * with writers [see metaslab_sync() or metaslab_flush()]. * * Note that metaslab_load(), even though a reader, uses * a completely different mechanism to deal with the reading * of the metaslab's space map based on ms_synced_length. That * said, the function still uses the ms_sync_lock after it * has read the ms_sm [see relevant comment in metaslab_load() * as to why]. */ kmutex_t ms_sync_lock; kcondvar_t ms_load_cv; space_map_t *ms_sm; uint64_t ms_id; uint64_t ms_start; uint64_t ms_size; uint64_t ms_fragmentation; range_tree_t *ms_allocating[TXG_SIZE]; range_tree_t *ms_allocatable; uint64_t ms_allocated_this_txg; /* * The following range trees are accessed only from syncing context. * ms_free*tree only have entries while syncing, and are empty * between syncs. */ range_tree_t *ms_freeing; /* to free this syncing txg */ range_tree_t *ms_freed; /* already freed this syncing txg */ range_tree_t *ms_defer[TXG_DEFER_SIZE]; range_tree_t *ms_checkpointing; /* to add to the checkpoint */ /* * The ms_trim tree is the set of allocatable segments which are * eligible for trimming. (When the metaslab is loaded, it's a * subset of ms_allocatable.) It's kept in-core as long as the * autotrim property is set and is not vacated when the metaslab * is unloaded. Its purpose is to aggregate freed ranges to * facilitate efficient trimming. */ range_tree_t *ms_trim; boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; /* * The number of consumers which have disabled the metaslab. */ uint64_t ms_disabled; /* * We must always hold the ms_lock when modifying ms_loaded * and ms_loading. */ boolean_t ms_loaded; boolean_t ms_loading; kcondvar_t ms_flush_cv; boolean_t ms_flushing; /* * The following histograms count entries that are in the * metaslab's space map (and its histogram) but are not in * ms_allocatable yet, because they are in ms_freed, ms_freeing, * or ms_defer[]. * * When the metaslab is not loaded, its ms_weight needs to * reflect what is allocatable (i.e. what will be part of * ms_allocatable if it is loaded). The weight is computed from * the spacemap histogram, but that includes ranges that are * not yet allocatable (because they are in ms_freed, * ms_freeing, or ms_defer[]). Therefore, when calculating the * weight, we need to remove those ranges. * * The ranges in the ms_freed and ms_defer[] range trees are all * present in the spacemap. However, the spacemap may have * multiple entries to represent a contiguous range, because it * is written across multiple sync passes, but the changes of * all sync passes are consolidated into the range trees. * Adjacent ranges that are freed in different sync passes of * one txg will be represented separately (as 2 or more entries) * in the space map (and its histogram), but these adjacent * ranges will be consolidated (represented as one entry) in the * ms_freed/ms_defer[] range trees (and their histograms). * * When calculating the weight, we can not simply subtract the * range trees' histograms from the spacemap's histogram, * because the range trees' histograms may have entries in * higher buckets than the spacemap, due to consolidation. * Instead we must subtract the exact entries that were added to * the spacemap's histogram. ms_synchist and ms_deferhist[] * represent these exact entries, so we can subtract them from * the spacemap's histogram when calculating ms_weight. * * ms_synchist represents the same ranges as ms_freeing + * ms_freed, but without consolidation across sync passes. * * ms_deferhist[i] represents the same ranges as ms_defer[i], * but without consolidation across sync passes. */ uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE]; uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE]; /* * Tracks the exact amount of allocated space of this metaslab * (and specifically the metaslab's space map) up to the most * recently completed sync pass [see usage in metaslab_sync()]. */ uint64_t ms_allocated_space; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ uint64_t ms_activation_weight; /* activation weight */ /* * Track of whenever a metaslab is selected for loading or allocation. * We use this value to determine how long the metaslab should * stay cached. */ uint64_t ms_selected_txg; + /* + * ms_load/unload_time can be used for performance monitoring + * (e.g. by dtrace or mdb). + */ + hrtime_t ms_load_time; /* time last loaded */ + hrtime_t ms_unload_time; /* time last unloaded */ uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ /* * -1 if it's not active in an allocator, otherwise set to the allocator * this metaslab is active for. */ int ms_allocator; boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ /* * The metaslab block allocators can optionally use a size-ordered * range tree and/or an array of LBAs. Not all allocators use * this functionality. The ms_allocatable_by_size should always * contain the same number of segments as the ms_allocatable. The * only difference is that the ms_allocatable_by_size is ordered by * segment sizes. */ avl_tree_t ms_allocatable_by_size; + avl_tree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */ /* * Allocs and frees that are committed to the vdev log spacemap but * not yet to this metaslab's spacemap. */ range_tree_t *ms_unflushed_allocs; range_tree_t *ms_unflushed_frees; /* * We have flushed entries up to but not including this TXG. In * other words, all changes from this TXG and onward should not * be in this metaslab's space map and must be read from the * log space maps. */ uint64_t ms_unflushed_txg; /* updated every time we are done syncing the metaslab's space map */ uint64_t ms_synced_length; boolean_t ms_new; }; typedef struct metaslab_unflushed_phys { /* on-disk counterpart of ms_unflushed_txg */ uint64_t msp_unflushed_txg; } metaslab_unflushed_phys_t; #ifdef __cplusplus } #endif #endif /* _SYS_METASLAB_IMPL_H */ diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index fce1df68ddd2..e299613b86d3 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -1,132 +1,134 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_RANGE_TREE_H #define _SYS_RANGE_TREE_H #include #include #ifdef __cplusplus extern "C" { #endif #define RANGE_TREE_HISTOGRAM_SIZE 64 typedef struct range_tree_ops range_tree_ops_t; /* * Note: the range_tree may not be accessed concurrently; consumers * must provide external locking if required. */ typedef struct range_tree { avl_tree_t rt_root; /* offset-ordered segment AVL tree */ uint64_t rt_space; /* sum of all segments in the map */ uint64_t rt_gap; /* allowable inter-segment gap */ range_tree_ops_t *rt_ops; /* rt_avl_compare should only be set if rt_arg is an AVL tree */ void *rt_arg; int (*rt_avl_compare)(const void *, const void *); /* * The rt_histogram maintains a histogram of ranges. Each bucket, * rt_histogram[i], contains the number of ranges whose size is: * 2^i <= size of range in bytes < 2^(i+1) */ uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; } range_tree_t; typedef struct range_seg { avl_node_t rs_node; /* AVL node */ avl_node_t rs_pp_node; /* AVL picker-private node */ uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ uint64_t rs_fill; /* actual fill if gap mode is on */ } range_seg_t; struct range_tree_ops { void (*rtop_create)(range_tree_t *rt, void *arg); void (*rtop_destroy)(range_tree_t *rt, void *arg); void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg); void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg); void (*rtop_vacate)(range_tree_t *rt, void *arg); }; typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); void range_tree_init(void); void range_tree_fini(void); range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, int (*avl_compare) (const void *, const void *), uint64_t gap); range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize); void range_tree_verify_not_present(range_tree_t *rt, uint64_t start, uint64_t size); range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize); uint64_t range_tree_space(range_tree_t *rt); uint64_t range_tree_numsegs(range_tree_t *rt); boolean_t range_tree_is_empty(range_tree_t *rt); void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); void range_tree_stat_verify(range_tree_t *rt); uint64_t range_tree_min(range_tree_t *rt); uint64_t range_tree_max(range_tree_t *rt); uint64_t range_tree_span(range_tree_t *rt); void range_tree_add(void *arg, uint64_t start, uint64_t size); void range_tree_remove(void *arg, uint64_t start, uint64_t size); void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta); void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg); range_seg_t *range_tree_first(range_tree_t *rt); void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, range_tree_t *removefrom, range_tree_t *addto); void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, range_tree_t *addto); void rt_avl_create(range_tree_t *rt, void *arg); void rt_avl_destroy(range_tree_t *rt, void *arg); void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg); void rt_avl_vacate(range_tree_t *rt, void *arg); extern struct range_tree_ops rt_avl_ops; #ifdef __cplusplus } #endif #endif /* _SYS_RANGE_TREE_H */ diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 3b88d974817d..6c15f11b7eb4 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1,3645 +1,3661 @@ '\" te .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2019 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .TH ZFS-MODULE-PARAMETERS 5 "Feb 15, 2019" .SH NAME zfs\-module\-parameters \- ZFS module parameters .SH DESCRIPTION .sp .LP Description of the different parameters to the ZFS module. .SS "Module parameters" .sp .LP .sp .ne 2 .na \fBdbuf_cache_max_bytes\fR (ulong) .ad .RS 12n Maximum size in bytes of the dbuf cache. When \fB0\fR this value will default to \fB1/2^dbuf_cache_shift\fR (1/32) of the target ARC size, otherwise the provided value in bytes will be used. The behavior of the dbuf cache and its associated settings can be observed via the \fB/proc/spl/kstat/zfs/dbufstats\fR kstat. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBdbuf_metadata_cache_max_bytes\fR (ulong) .ad .RS 12n Maximum size in bytes of the metadata dbuf cache. When \fB0\fR this value will default to \fB1/2^dbuf_cache_shift\fR (1/16) of the target ARC size, otherwise the provided value in bytes will be used. The behavior of the metadata dbuf cache and its associated settings can be observed via the \fB/proc/spl/kstat/zfs/dbufstats\fR kstat. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBdbuf_cache_hiwater_pct\fR (uint) .ad .RS 12n The percentage over \fBdbuf_cache_max_bytes\fR when dbufs must be evicted directly. .sp Default value: \fB10\fR%. .RE .sp .ne 2 .na \fBdbuf_cache_lowater_pct\fR (uint) .ad .RS 12n The percentage below \fBdbuf_cache_max_bytes\fR when the evict thread stops evicting dbufs. .sp Default value: \fB10\fR%. .RE .sp .ne 2 .na \fBdbuf_cache_shift\fR (int) .ad .RS 12n Set the size of the dbuf cache, \fBdbuf_cache_max_bytes\fR, to a log2 fraction of the target arc size. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBdbuf_metadata_cache_shift\fR (int) .ad .RS 12n Set the size of the dbuf metadata cache, \fBdbuf_metadata_cache_max_bytes\fR, to a log2 fraction of the target arc size. .sp Default value: \fB6\fR. .RE .sp .ne 2 .na \fBdmu_prefetch_max\fR (int) .ad .RS 12n Limit the amount we can prefetch with one call to this amount (in bytes). This helps to limit the amount of memory that can be used by prefetching. .sp Default value: \fB134,217,728\fR (128MB). .RE .sp .ne 2 .na \fBignore_hole_birth\fR (int) .ad .RS 12n This is an alias for \fBsend_holes_without_birth_time\fR. .RE .sp .ne 2 .na \fBl2arc_feed_again\fR (int) .ad .RS 12n Turbo L2ARC warm-up. When the L2ARC is cold the fill interval will be set as fast as possible. .sp Use \fB1\fR for yes (default) and \fB0\fR to disable. .RE .sp .ne 2 .na \fBl2arc_feed_min_ms\fR (ulong) .ad .RS 12n Min feed interval in milliseconds. Requires \fBl2arc_feed_again=1\fR and only applicable in related situations. .sp Default value: \fB200\fR. .RE .sp .ne 2 .na \fBl2arc_feed_secs\fR (ulong) .ad .RS 12n Seconds between L2ARC writing .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBl2arc_headroom\fR (ulong) .ad .RS 12n How far through the ARC lists to search for L2ARC cacheable content, expressed as a multiplier of \fBl2arc_write_max\fR .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBl2arc_headroom_boost\fR (ulong) .ad .RS 12n Scales \fBl2arc_headroom\fR by this percentage when L2ARC contents are being successfully compressed before writing. A value of 100 disables this feature. .sp Default value: \fB200\fR%. .RE .sp .ne 2 .na \fBl2arc_noprefetch\fR (int) .ad .RS 12n Do not write buffers to L2ARC if they were prefetched but not used by applications .sp Use \fB1\fR for yes (default) and \fB0\fR to disable. .RE .sp .ne 2 .na \fBl2arc_norw\fR (int) .ad .RS 12n No reads during writes .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBl2arc_write_boost\fR (ulong) .ad .RS 12n Cold L2ARC devices will have \fBl2arc_write_max\fR increased by this amount while they remain cold. .sp Default value: \fB8,388,608\fR. .RE .sp .ne 2 .na \fBl2arc_write_max\fR (ulong) .ad .RS 12n Max write bytes per interval .sp Default value: \fB8,388,608\fR. .RE .sp .ne 2 .na \fBmetaslab_aliquot\fR (ulong) .ad .RS 12n Metaslab granularity, in bytes. This is roughly similar to what would be referred to as the "stripe size" in traditional RAID arrays. In normal operation, ZFS will try to write this amount of data to a top-level vdev before moving on to the next one. .sp Default value: \fB524,288\fR. .RE .sp .ne 2 .na \fBmetaslab_bias_enabled\fR (int) .ad .RS 12n Enable metaslab group biasing based on its vdev's over- or under-utilization relative to the pool. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBmetaslab_force_ganging\fR (ulong) .ad .RS 12n Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. .sp Default value: \fB16,777,217\fR. .RE .sp .ne 2 .na \fBzfs_keep_log_spacemaps_at_export\fR (int) .ad .RS 12n Prevent log spacemaps from being destroyed during pool exports and destroys. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_metaslab_segment_weight_enabled\fR (int) .ad .RS 12n Enable/disable segment-based metaslab selection. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBzfs_metaslab_switch_threshold\fR (int) .ad .RS 12n When using segment-based metaslab selection, continue allocating from the active metaslab until \fBzfs_metaslab_switch_threshold\fR worth of buckets have been exhausted. .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBmetaslab_debug_load\fR (int) .ad .RS 12n Load all metaslabs during pool import. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBmetaslab_debug_unload\fR (int) .ad .RS 12n Prevent metaslabs from being unloaded. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBmetaslab_fragmentation_factor_enabled\fR (int) .ad .RS 12n Enable use of the fragmentation metric in computing metaslab weights. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBmetaslab_df_max_search\fR (int) .ad .RS 12n Maximum distance to search forward from the last offset. Without this limit, fragmented pools can see >100,000 iterations and metaslab_block_picker() becomes the performance limiting factor on high-performance storage. With the default setting of 16MB, we typically see less than 500 iterations, even with very fragmented, ashift=9 pools. The maximum number of iterations possible is: \fBmetaslab_df_max_search / (2 * (1<> \fBzfs_arc_overflow_shift\fR". The default value of 8 causes the ARC to be considered to be overflowing if it exceeds the target size by 1/256th (0.3%) of the target size. When the ARC is overflowing, new buffer allocations are stalled until the reclaim thread catches up and the overflow condition no longer exists. .sp Default value: \fB8\fR. .RE .sp .ne 2 .na \fBzfs_arc_p_min_shift\fR (int) .ad .RS 12n If set to a non zero value, this will update arc_p_min_shift (default 4) with the new value. arc_p_min_shift is used to shift of arc_c for calculating both min and max max arc_p .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_p_dampener_disable\fR (int) .ad .RS 12n Disable arc_p adapt dampener .sp Use \fB1\fR for yes (default) and \fB0\fR to disable. .RE .sp .ne 2 .na \fBzfs_arc_shrink_shift\fR (int) .ad .RS 12n If set to a non zero value, this will update arc_shrink_shift (default 7) with the new value. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_pc_percent\fR (uint) .ad .RS 12n Percent of pagecache to reclaim arc to This tunable allows ZFS arc to play more nicely with the kernel's LRU pagecache. It can guarantee that the arc size won't collapse under scanning pressure on the pagecache, yet still allows arc to be reclaimed down to zfs_arc_min if necessary. This value is specified as percent of pagecache size (as measured by NR_FILE_PAGES) where that percent may exceed 100. This only operates during memory pressure/reclaim. .sp Default value: \fB0\fR% (disabled). .RE .sp .ne 2 .na \fBzfs_arc_sys_free\fR (ulong) .ad .RS 12n The target number of bytes the ARC should leave as free memory on the system. Defaults to the larger of 1/64 of physical memory or 512K. Setting this option to a non-zero value will override the default. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_autoimport_disable\fR (int) .ad .RS 12n Disable pool import at module load by ignoring the cache file (typically \fB/etc/zfs/zpool.cache\fR). .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBzfs_checksums_per_second\fR (int) .ad .RS 12n Rate limit checksum events to this many per second. Note that this should not be set below the zed thresholds (currently 10 checksums over 10 sec) or else zed may not trigger any action. .sp Default value: 20 .RE .sp .ne 2 .na \fBzfs_commit_timeout_pct\fR (int) .ad .RS 12n This controls the amount of time that a ZIL block (lwb) will remain "open" when it isn't "full", and it has a thread waiting for it to be committed to stable storage. The timeout is scaled based on a percentage of the last lwb latency to avoid significantly impacting the latency of each individual transaction record (itx). .sp Default value: \fB5\fR%. .RE .sp .ne 2 .na \fBzfs_condense_indirect_vdevs_enable\fR (int) .ad .RS 12n Enable condensing indirect vdev mappings. When set to a non-zero value, attempt to condense indirect vdev mappings if the mapping uses more than \fBzfs_condense_min_mapping_bytes\fR bytes of memory and if the obsolete space map object uses more than \fBzfs_condense_max_obsolete_bytes\fR bytes on-disk. The condensing process is an attempt to save memory by removing obsolete mappings. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_condense_max_obsolete_bytes\fR (ulong) .ad .RS 12n Only attempt to condense indirect vdev mappings if the on-disk size of the obsolete space map object is greater than this number of bytes (see \fBfBzfs_condense_indirect_vdevs_enable\fR). .sp Default value: \fB1,073,741,824\fR. .RE .sp .ne 2 .na \fBzfs_condense_min_mapping_bytes\fR (ulong) .ad .RS 12n Minimum size vdev mapping to attempt to condense (see \fBzfs_condense_indirect_vdevs_enable\fR). .sp Default value: \fB131,072\fR. .RE .sp .ne 2 .na \fBzfs_dbgmsg_enable\fR (int) .ad .RS 12n Internally ZFS keeps a small log to facilitate debugging. By default the log is disabled, to enable it set this option to 1. The contents of the log can be accessed by reading the /proc/spl/kstat/zfs/dbgmsg file. Writing 0 to this proc file clears the log. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_dbgmsg_maxsize\fR (int) .ad .RS 12n The maximum size in bytes of the internal ZFS debug log. .sp Default value: \fB4M\fR. .RE .sp .ne 2 .na \fBzfs_dbuf_state_index\fR (int) .ad .RS 12n This feature is currently unused. It is normally used for controlling what reporting is available under /proc/spl/kstat/zfs. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_deadman_enabled\fR (int) .ad .RS 12n When a pool sync operation takes longer than \fBzfs_deadman_synctime_ms\fR milliseconds, or when an individual I/O takes longer than \fBzfs_deadman_ziotime_ms\fR milliseconds, then the operation is considered to be "hung". If \fBzfs_deadman_enabled\fR is set then the deadman behavior is invoked as described by the \fBzfs_deadman_failmode\fR module option. By default the deadman is enabled and configured to \fBwait\fR which results in "hung" I/Os only being logged. The deadman is automatically disabled when a pool gets suspended. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_deadman_failmode\fR (charp) .ad .RS 12n Controls the failure behavior when the deadman detects a "hung" I/O. Valid values are \fBwait\fR, \fBcontinue\fR, and \fBpanic\fR. .sp \fBwait\fR - Wait for a "hung" I/O to complete. For each "hung" I/O a "deadman" event will be posted describing that I/O. .sp \fBcontinue\fR - Attempt to recover from a "hung" I/O by re-dispatching it to the I/O pipeline if possible. .sp \fBpanic\fR - Panic the system. This can be used to facilitate an automatic fail-over to a properly configured fail-over partner. .sp Default value: \fBwait\fR. .RE .sp .ne 2 .na \fBzfs_deadman_checktime_ms\fR (int) .ad .RS 12n Check time in milliseconds. This defines the frequency at which we check for hung I/O and potentially invoke the \fBzfs_deadman_failmode\fR behavior. .sp Default value: \fB60,000\fR. .RE .sp .ne 2 .na \fBzfs_deadman_synctime_ms\fR (ulong) .ad .RS 12n Interval in milliseconds after which the deadman is triggered and also the interval after which a pool sync operation is considered to be "hung". Once this limit is exceeded the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR milliseconds until the pool sync completes. .sp Default value: \fB600,000\fR. .RE .sp .ne 2 .na \fBzfs_deadman_ziotime_ms\fR (ulong) .ad .RS 12n Interval in milliseconds after which the deadman is triggered and an individual I/O operation is considered to be "hung". As long as the I/O remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR milliseconds until the I/O completes. .sp Default value: \fB300,000\fR. .RE .sp .ne 2 .na \fBzfs_dedup_prefetch\fR (int) .ad .RS 12n Enable prefetching dedup-ed blks .sp Use \fB1\fR for yes and \fB0\fR to disable (default). .RE .sp .ne 2 .na \fBzfs_delay_min_dirty_percent\fR (int) .ad .RS 12n Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of \fBzfs_dirty_data_max\fR. This value should be >= zfs_vdev_async_write_active_max_dirty_percent. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB60\fR%. .RE .sp .ne 2 .na \fBzfs_delay_scale\fR (int) .ad .RS 12n This controls how quickly the transaction delay approaches infinity. Larger values cause longer delays for a given amount of dirty data. .sp For the smoothest delay, this value should be about 1 billion divided by the maximum number of operations per second. This will smoothly handle between 10x and 1/10th this number. .sp See the section "ZFS TRANSACTION DELAY". .sp Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64. .sp Default value: \fB500,000\fR. .RE .sp .ne 2 .na \fBzfs_slow_io_events_per_second\fR (int) .ad .RS 12n Rate limit delay zevents (which report slow I/Os) to this many per second. .sp Default value: 20 .RE .sp .ne 2 .na \fBzfs_unflushed_max_mem_amt\fR (ulong) .ad .RS 12n Upper-bound limit for unflushed metadata changes to be held by the log spacemap in memory (in bytes). .sp Default value: \fB1,073,741,824\fR (1GB). .RE .sp .ne 2 .na \fBzfs_unflushed_max_mem_ppm\fR (ulong) .ad .RS 12n Percentage of the overall system memory that ZFS allows to be used for unflushed metadata changes by the log spacemap. (value is calculated over 1000000 for finer granularity). .sp Default value: \fB1000\fR (which is divided by 1000000, resulting in the limit to be \fB0.1\fR% of memory) .RE .sp .ne 2 .na \fBzfs_unflushed_log_block_max\fR (ulong) .ad .RS 12n Describes the maximum number of log spacemap blocks allowed for each pool. The default value of 262144 means that the space in all the log spacemaps can add up to no more than 262144 blocks (which means 32GB of logical space before compression and ditto blocks, assuming that blocksize is 128k). .sp This tunable is important because it involves a trade-off between import time after an unclean export and the frequency of flushing metaslabs. The higher this number is, the more log blocks we allow when the pool is active which means that we flush metaslabs less often and thus decrease the number of I/Os for spacemap updates per TXG. At the same time though, that means that in the event of an unclean export, there will be more log spacemap blocks for us to read, inducing overhead in the import time of the pool. The lower the number, the amount of flushing increases destroying log blocks quicker as they become obsolete faster, which leaves less blocks to be read during import time after a crash. .sp Each log spacemap block existing during pool import leads to approximately one extra logical I/O issued. This is the reason why this tunable is exposed in terms of blocks rather than space used. .sp Default value: \fB262144\fR (256K). .RE .sp .ne 2 .na \fBzfs_unflushed_log_block_min\fR (ulong) .ad .RS 12n If the number of metaslabs is small and our incoming rate is high, we could get into a situation that we are flushing all our metaslabs every TXG. Thus we always allow at least this many log blocks. .sp Default value: \fB1000\fR. .RE .sp .ne 2 .na \fBzfs_unflushed_log_block_pct\fR (ulong) .ad .RS 12n Tunable used to determine the number of blocks that can be used for the spacemap log, expressed as a percentage of the total number of metaslabs in the pool. .sp Default value: \fB400\fR (read as \fB400\fR% - meaning that the number of log spacemap blocks are capped at 4 times the number of metaslabs in the pool). .RE .sp .ne 2 .na \fBzfs_unlink_suspend_progress\fR (uint) .ad .RS 12n When enabled, files will not be asynchronously removed from the list of pending unlinks and the space they consume will be leaked. Once this option has been disabled and the dataset is remounted, the pending unlinks will be processed and the freed space returned to the pool. This option is used by the test suite to facilitate testing. .sp Uses \fB0\fR (default) to allow progress and \fB1\fR to pause progress. .RE .sp .ne 2 .na \fBzfs_delete_blocks\fR (ulong) .ad .RS 12n This is the used to define a large file for the purposes of delete. Files containing more than \fBzfs_delete_blocks\fR will be deleted asynchronously while smaller files are deleted synchronously. Decreasing this value will reduce the time spent in an unlink(2) system call at the expense of a longer delay before the freed space is available. .sp Default value: \fB20,480\fR. .RE .sp .ne 2 .na \fBzfs_dirty_data_max\fR (int) .ad .RS 12n Determines the dirty space limit in bytes. Once this limit is exceeded, new writes are halted until space frees up. This parameter takes precedence over \fBzfs_dirty_data_max_percent\fR. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB10\fR% of physical RAM, capped at \fBzfs_dirty_data_max_max\fR. .RE .sp .ne 2 .na \fBzfs_dirty_data_max_max\fR (int) .ad .RS 12n Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes. This limit is only enforced at module load time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed. This parameter takes precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB25\fR% of physical RAM. .RE .sp .ne 2 .na \fBzfs_dirty_data_max_max_percent\fR (int) .ad .RS 12n Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a percentage of physical RAM. This limit is only enforced at module load time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed. The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this one. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB25\fR%. .RE .sp .ne 2 .na \fBzfs_dirty_data_max_percent\fR (int) .ad .RS 12n Determines the dirty space limit, expressed as a percentage of all memory. Once this limit is exceeded, new writes are halted until space frees up. The parameter \fBzfs_dirty_data_max\fR takes precedence over this one. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB10\fR%, subject to \fBzfs_dirty_data_max_max\fR. .RE .sp .ne 2 .na \fBzfs_dirty_data_sync_percent\fR (int) .ad .RS 12n Start syncing out a transaction group if there's at least this much dirty data as a percentage of \fBzfs_dirty_data_max\fR. This should be less than \fBzfs_vdev_async_write_active_min_dirty_percent\fR. .sp Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR. .RE .sp .ne 2 .na \fBzfs_fletcher_4_impl\fR (string) .ad .RS 12n Select a fletcher 4 implementation. .sp Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR, \fBavx2\fR, \fBavx512f\fR, and \fBaarch64_neon\fR. All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction set extensions to be available and will only appear if ZFS detects that they are present at runtime. If multiple implementations of fletcher 4 are available, the \fBfastest\fR will be chosen using a micro benchmark. Selecting \fBscalar\fR results in the original, CPU based calculation, being used. Selecting any option other than \fBfastest\fR and \fBscalar\fR results in vector instructions from the respective CPU instruction set being used. .sp Default value: \fBfastest\fR. .RE .sp .ne 2 .na \fBzfs_free_bpobj_enabled\fR (int) .ad .RS 12n Enable/disable the processing of the free_bpobj object. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_async_block_max_blocks\fR (ulong) .ad .RS 12n Maximum number of blocks freed in a single txg. .sp Default value: \fB100,000\fR. .RE .sp .ne 2 .na \fBzfs_override_estimate_recordsize\fR (ulong) .ad .RS 12n Record size calculation override for zfs send estimates. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_read_max_active\fR (int) .ad .RS 12n Maximum asynchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB3\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_read_min_active\fR (int) .ad .RS 12n Minimum asynchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_active_max_dirty_percent\fR (int) .ad .RS 12n When the pool has more than \fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use \fBzfs_vdev_async_write_max_active\fR to limit active async writes. If the dirty data is between min and max, the active I/O limit is linearly interpolated. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB60\fR%. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_active_min_dirty_percent\fR (int) .ad .RS 12n When the pool has less than \fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use \fBzfs_vdev_async_write_min_active\fR to limit active async writes. If the dirty data is between min and max, the active I/O limit is linearly interpolated. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB30\fR%. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_max_active\fR (int) .ad .RS 12n Maximum asynchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_min_active\fR (int) .ad .RS 12n Minimum asynchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Lower values are associated with better latency on rotational media but poorer resilver performance. The default value of 2 was chosen as a compromise. A value of 3 has been shown to improve resilver performance further at a cost of further increasing latency. .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_vdev_initializing_max_active\fR (int) .ad .RS 12n Maximum initializing I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_initializing_min_active\fR (int) .ad .RS 12n Minimum initializing I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_max_active\fR (int) .ad .RS 12n The maximum number of I/Os active to each device. Ideally, this will be >= the sum of each queue's max_active. It must be at least the sum of each queue's min_active. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1,000\fR. .RE .sp .ne 2 .na \fBzfs_vdev_removal_max_active\fR (int) .ad .RS 12n Maximum removal I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_vdev_removal_min_active\fR (int) .ad .RS 12n Minimum removal I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_scrub_max_active\fR (int) .ad .RS 12n Maximum scrub I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_vdev_scrub_min_active\fR (int) .ad .RS 12n Minimum scrub I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_sync_read_max_active\fR (int) .ad .RS 12n Maximum synchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_sync_read_min_active\fR (int) .ad .RS 12n Minimum synchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_sync_write_max_active\fR (int) .ad .RS 12n Maximum synchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_sync_write_min_active\fR (int) .ad .RS 12n Minimum synchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_trim_max_active\fR (int) .ad .RS 12n Maximum trim/discard I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_vdev_trim_min_active\fR (int) .ad .RS 12n Minimum trim/discard I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_queue_depth_pct\fR (int) .ad .RS 12n Maximum number of queued allocations per top-level vdev expressed as a percentage of \fBzfs_vdev_async_write_max_active\fR which allows the system to detect devices that are more capable of handling allocations and to allocate more blocks to those devices. It allows for dynamic allocation distribution when devices are imbalanced as fuller devices will tend to be slower than empty devices. See also \fBzio_dva_throttle_enabled\fR. .sp Default value: \fB1000\fR%. .RE .sp .ne 2 .na \fBzfs_expire_snapshot\fR (int) .ad .RS 12n Seconds to expire .zfs/snapshot .sp Default value: \fB300\fR. .RE .sp .ne 2 .na \fBzfs_admin_snapshot\fR (int) .ad .RS 12n Allow the creation, removal, or renaming of entries in the .zfs/snapshot directory to cause the creation, destruction, or renaming of snapshots. When enabled this functionality works both locally and over NFS exports which have the 'no_root_squash' option set. This functionality is disabled by default. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_flags\fR (int) .ad .RS 12n Set additional debugging flags. The following flags may be bitwise-or'd together. .sp .TS box; rB lB lB lB r l. Value Symbolic Name Description _ 1 ZFS_DEBUG_DPRINTF Enable dprintf entries in the debug log. _ 2 ZFS_DEBUG_DBUF_VERIFY * Enable extra dbuf verifications. _ 4 ZFS_DEBUG_DNODE_VERIFY * Enable extra dnode verifications. _ 8 ZFS_DEBUG_SNAPNAMES Enable snapshot name verification. _ 16 ZFS_DEBUG_MODIFY Check for illegally modified ARC buffers. _ 64 ZFS_DEBUG_ZIO_FREE Enable verification of block frees. _ 128 ZFS_DEBUG_HISTOGRAM_VERIFY Enable extra spacemap histogram verifications. _ 256 ZFS_DEBUG_METASLAB_VERIFY Verify space accounting on disk matches in-core range_trees. _ 512 ZFS_DEBUG_SET_ERROR Enable SET_ERROR and dprintf entries in the debug log. _ 1024 ZFS_DEBUG_INDIRECT_REMAP Verify split blocks created by device removal. _ 2048 ZFS_DEBUG_TRIM Verify TRIM ranges are always within the allocatable range tree. _ 4096 ZFS_DEBUG_LOG_SPACEMAP Verify that the log summary is consistent with the spacemap log and enable zfs_dbgmsgs for metaslab loading and flushing. .TE .sp * Requires debug build. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_free_leak_on_eio\fR (int) .ad .RS 12n If destroy encounters an EIO while reading metadata (e.g. indirect blocks), space referenced by the missing metadata can not be freed. Normally this causes the background destroy to become "stalled", as it is unable to make forward progress. While in this stalled state, all remaining space to free from the error-encountering filesystem is "temporarily leaked". Set this flag to cause it to ignore the EIO, permanently leak the space from indirect blocks that can not be read, and continue to free everything else that it can. The default, "stalling" behavior is useful if the storage partially fails (i.e. some but not all i/os fail), and then later recovers. In this case, we will be able to continue pool operations while it is partially failed, and when it recovers, we can continue to free the space, with no leaks. However, note that this case is actually fairly rare. Typically pools either (a) fail completely (but perhaps temporarily, e.g. a top-level vdev going offline), or (b) have localized, permanent errors (e.g. disk returns the wrong data due to bit flip or firmware bug). In case (a), this setting does not matter because the pool will be suspended and the sync thread will not be able to make forward progress regardless. In case (b), because the error is permanent, the best we can do is leak the minimum amount of space, which is what setting this flag will do. Therefore, it is reasonable for this flag to normally be set, but we chose the more conservative approach of not setting it, so that there is no possibility of leaking space in the "partial temporary" failure case. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_free_min_time_ms\fR (int) .ad .RS 12n During a \fBzfs destroy\fR operation using \fBfeature@async_destroy\fR a minimum of this much time will be spent working on freeing blocks per txg. .sp Default value: \fB1,000\fR. .RE .sp .ne 2 .na \fBzfs_immediate_write_sz\fR (long) .ad .RS 12n Largest data block to write to zil. Larger blocks will be treated as if the dataset being written to had the property setting \fBlogbias=throughput\fR. .sp Default value: \fB32,768\fR. .RE .sp .ne 2 .na \fBzfs_initialize_value\fR (ulong) .ad .RS 12n Pattern written to vdev free space by \fBzpool initialize\fR. .sp Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee). .RE .sp .ne 2 .na \fBzfs_livelist_max_entries\fR (ulong) .ad .RS 12n The threshold size (in block pointers) at which we create a new sub-livelist. Larger sublists are more costly from a memory perspective but the fewer sublists there are, the lower the cost of insertion. .sp Default value: \fB500,000\fR. .RE .sp .ne 2 .na \fBzfs_livelist_min_percent_shared\fR (int) .ad .RS 12n If the amount of shared space between a snapshot and its clone drops below this threshold, the clone turns off the livelist and reverts to the old deletion method. This is in place because once a clone has been overwritten enough livelists no long give us a benefit. .sp Default value: \fB75\fR. .RE .sp .ne 2 .na \fBzfs_livelist_condense_new_alloc\fR (int) .ad .RS 12n Incremented each time an extra ALLOC blkptr is added to a livelist entry while it is being condensed. This option is used by the test suite to track race conditions. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_livelist_condense_sync_cancel\fR (int) .ad .RS 12n Incremented each time livelist condensing is canceled while in spa_livelist_condense_sync. This option is used by the test suite to track race conditions. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_livelist_condense_sync_pause\fR (int) .ad .RS 12n When set, the livelist condense process pauses indefinitely before executing the synctask - spa_livelist_condense_sync. This option is used by the test suite to trigger race conditions. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_livelist_condense_zthr_cancel\fR (int) .ad .RS 12n Incremented each time livelist condensing is canceled while in spa_livelist_condense_cb. This option is used by the test suite to track race conditions. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_livelist_condense_zthr_pause\fR (int) .ad .RS 12n When set, the livelist condense process pauses indefinitely before executing the open context condensing work in spa_livelist_condense_cb. This option is used by the test suite to trigger race conditions. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_lua_max_instrlimit\fR (ulong) .ad .RS 12n The maximum execution time limit that can be set for a ZFS channel program, specified as a number of Lua instructions. .sp Default value: \fB100,000,000\fR. .RE .sp .ne 2 .na \fBzfs_lua_max_memlimit\fR (ulong) .ad .RS 12n The maximum memory limit that can be set for a ZFS channel program, specified in bytes. .sp Default value: \fB104,857,600\fR. .RE .sp .ne 2 .na \fBzfs_max_dataset_nesting\fR (int) .ad .RS 12n The maximum depth of nested datasets. This value can be tuned temporarily to fix existing datasets that exceed the predefined limit. .sp Default value: \fB50\fR. .RE .sp .ne 2 .na \fBzfs_max_log_walking\fR (ulong) .ad .RS 12n The number of past TXGs that the flushing algorithm of the log spacemap feature uses to estimate incoming log blocks. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBzfs_max_logsm_summary_length\fR (ulong) .ad .RS 12n Maximum number of rows allowed in the summary of the spacemap log. .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_max_recordsize\fR (int) .ad .RS 12n We currently support block sizes from 512 bytes to 16MB. The benefits of larger blocks, and thus larger I/O, need to be weighed against the cost of COWing a giant block to modify one byte. Additionally, very large blocks can have an impact on i/o latency, and also potentially on the memory allocator. Therefore, we do not allow the recordsize to be set larger than zfs_max_recordsize (default 1MB). Larger blocks can be created by changing this tunable, and pools with larger blocks can always be imported and used, regardless of this setting. .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfs_allow_redacted_dataset_mount\fR (int) .ad .RS 12n Allow datasets received with redacted send/receive to be mounted. Normally disabled because these datasets may be missing key data. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_min_metaslabs_to_flush\fR (ulong) .ad .RS 12n Minimum number of metaslabs to flush per dirty TXG .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_metaslab_fragmentation_threshold\fR (int) .ad .RS 12n Allow metaslabs to keep their active state as long as their fragmentation percentage is less than or equal to this value. An active metaslab that exceeds this threshold will no longer keep its active status allowing better metaslabs to be selected. .sp Default value: \fB70\fR. .RE .sp .ne 2 .na \fBzfs_mg_fragmentation_threshold\fR (int) .ad .RS 12n Metaslab groups are considered eligible for allocations if their fragmentation metric (measured as a percentage) is less than or equal to this value. If a metaslab group exceeds this threshold then it will be skipped unless all metaslab groups within the metaslab class have also crossed this threshold. .sp Default value: \fB95\fR. .RE .sp .ne 2 .na \fBzfs_mg_noalloc_threshold\fR (int) .ad .RS 12n Defines a threshold at which metaslab groups should be eligible for allocations. The value is expressed as a percentage of free space beyond which a metaslab group is always eligible for allocations. If a metaslab group's free space is less than or equal to the threshold, the allocator will avoid allocating to that group unless all groups in the pool have reached the threshold. Once all groups have reached the threshold, all groups are allowed to accept allocations. The default value of 0 disables the feature and causes all metaslab groups to be eligible for allocations. This parameter allows one to deal with pools having heavily imbalanced vdevs such as would be the case when a new vdev has been added. Setting the threshold to a non-zero percentage will stop allocations from being made to vdevs that aren't filled to the specified percentage and allow lesser filled vdevs to acquire more allocations than they otherwise would under the old \fBzfs_mg_alloc_failures\fR facility. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_ddt_data_is_special\fR (int) .ad .RS 12n If enabled, ZFS will place DDT data into the special allocation class. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_user_indirect_is_special\fR (int) .ad .RS 12n If enabled, ZFS will place user data (both file and zvol) indirect blocks into the special allocation class. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_multihost_history\fR (int) .ad .RS 12n Historical statistics for the last N multihost updates will be available in \fB/proc/spl/kstat/zfs//multihost\fR .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_multihost_interval\fR (ulong) .ad .RS 12n Used to control the frequency of multihost writes which are performed when the \fBmultihost\fR pool property is on. This is one factor used to determine the length of the activity check during import. .sp The multihost write period is \fBzfs_multihost_interval / leaf-vdevs\fR milliseconds. On average a multihost write will be issued for each leaf vdev every \fBzfs_multihost_interval\fR milliseconds. In practice, the observed period can vary with the I/O load and this observed value is the delay which is stored in the uberblock. .sp Default value: \fB1000\fR. .RE .sp .ne 2 .na \fBzfs_multihost_import_intervals\fR (uint) .ad .RS 12n Used to control the duration of the activity test on import. Smaller values of \fBzfs_multihost_import_intervals\fR will reduce the import time but increase the risk of failing to detect an active pool. The total activity check time is never allowed to drop below one second. .sp On import the activity check waits a minimum amount of time determined by \fBzfs_multihost_interval * zfs_multihost_import_intervals\fR, or the same product computed on the host which last had the pool imported (whichever is greater). The activity check time may be further extended if the value of mmp delay found in the best uberblock indicates actual multihost updates happened at longer intervals than \fBzfs_multihost_interval\fR. A minimum value of \fB100ms\fR is enforced. .sp A value of 0 is ignored and treated as if it was set to 1. .sp Default value: \fB20\fR. .RE .sp .ne 2 .na \fBzfs_multihost_fail_intervals\fR (uint) .ad .RS 12n Controls the behavior of the pool when multihost write failures or delays are detected. .sp When \fBzfs_multihost_fail_intervals = 0\fR, multihost write failures or delays are ignored. The failures will still be reported to the ZED which depending on its configuration may take action such as suspending the pool or offlining a device. .sp When \fBzfs_multihost_fail_intervals > 0\fR, the pool will be suspended if \fBzfs_multihost_fail_intervals * zfs_multihost_interval\fR milliseconds pass without a successful mmp write. This guarantees the activity test will see mmp writes if the pool is imported. A value of 1 is ignored and treated as if it was set to 2. This is necessary to prevent the pool from being suspended due to normal, small I/O latency variations. .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_no_scrub_io\fR (int) .ad .RS 12n Set for no scrub I/O. This results in scrubs not actually scrubbing data and simply doing a metadata crawl of the pool instead. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_no_scrub_prefetch\fR (int) .ad .RS 12n Set to disable block prefetching for scrubs. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_nocacheflush\fR (int) .ad .RS 12n Disable cache flush operations on disks when writing. Setting this will cause pool corruption on power loss if a volatile out-of-order write cache is enabled. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_nopwrite_enabled\fR (int) .ad .RS 12n Enable NOP writes .sp Use \fB1\fR for yes (default) and \fB0\fR to disable. .RE .sp .ne 2 .na \fBzfs_dmu_offset_next_sync\fR (int) .ad .RS 12n Enable forcing txg sync to find holes. When enabled forces ZFS to act like prior versions when SEEK_HOLE or SEEK_DATA flags are used, which when a dnode is dirty causes txg's to be synced so that this data can be found. .sp Use \fB1\fR for yes and \fB0\fR to disable (default). .RE .sp .ne 2 .na \fBzfs_pd_bytes_max\fR (int) .ad .RS 12n The number of bytes which should be prefetched during a pool traversal (eg: \fBzfs send\fR or other data crawling operations) .sp Default value: \fB52,428,800\fR. .RE .sp .ne 2 .na \fBzfs_per_txg_dirty_frees_percent \fR (ulong) .ad .RS 12n Tunable to control percentage of dirtied indirect blocks from frees allowed into one TXG. After this threshold is crossed, additional frees will wait until the next TXG. A value of zero will disable this throttle. .sp Default value: \fB5\fR, set to \fB0\fR to disable. .RE .sp .ne 2 .na \fBzfs_prefetch_disable\fR (int) .ad .RS 12n This tunable disables predictive prefetch. Note that it leaves "prescient" prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, prescient prefetch never issues i/os that end up not being needed, so it can't hurt performance. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_qat_checksum_disable\fR (int) .ad .RS 12n This tunable disables qat hardware acceleration for sha256 checksums. It may be set after the zfs modules have been loaded to initialize the qat hardware as long as support is compiled in and the qat driver is present. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_qat_compress_disable\fR (int) .ad .RS 12n This tunable disables qat hardware acceleration for gzip compression. It may be set after the zfs modules have been loaded to initialize the qat hardware as long as support is compiled in and the qat driver is present. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_qat_encrypt_disable\fR (int) .ad .RS 12n This tunable disables qat hardware acceleration for AES-GCM encryption. It may be set after the zfs modules have been loaded to initialize the qat hardware as long as support is compiled in and the qat driver is present. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_read_chunk_size\fR (long) .ad .RS 12n Bytes to read per chunk .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfs_read_history\fR (int) .ad .RS 12n Historical statistics for the last N reads will be available in \fB/proc/spl/kstat/zfs//reads\fR .sp Default value: \fB0\fR (no data is kept). .RE .sp .ne 2 .na \fBzfs_read_history_hits\fR (int) .ad .RS 12n Include cache hits in read history .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_reconstruct_indirect_combinations_max\fR (int) .ad .RS 12na If an indirect split block contains more than this many possible unique combinations when being reconstructed, consider it too computationally expensive to check them all. Instead, try at most \fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected combinations each time the block is accessed. This allows all segment copies to participate fairly in the reconstruction when all combinations cannot be checked and prevents repeated use of one bad copy. .sp Default value: \fB4096\fR. .RE .sp .ne 2 .na \fBzfs_recover\fR (int) .ad .RS 12n Set to attempt to recover from fatal errors. This should only be used as a last resort, as it typically results in leaked space, or worse. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_removal_ignore_errors\fR (int) .ad .RS 12n .sp Ignore hard IO errors during device removal. When set, if a device encounters a hard IO error during the removal process the removal will not be cancelled. This can result in a normally recoverable block becoming permanently damaged and is not recommended. This should only be used as a last resort when the pool cannot be returned to a healthy state prior to removing the device. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_removal_suspend_progress\fR (int) .ad .RS 12n .sp This is used by the test suite so that it can ensure that certain actions happen while in the middle of a removal. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_remove_max_segment\fR (int) .ad .RS 12n .sp The largest contiguous segment that we will attempt to allocate when removing a device. This can be no larger than 16MB. If there is a performance problem with attempting to allocate large blocks, consider decreasing this. .sp Default value: \fB16,777,216\fR (16MB). .RE .sp .ne 2 .na \fBzfs_resilver_min_time_ms\fR (int) .ad .RS 12n Resilvers are processed by the sync thread. While resilvering it will spend at least this much time working on a resilver between txg flushes. .sp Default value: \fB3,000\fR. .RE .sp .ne 2 .na \fBzfs_scan_ignore_errors\fR (int) .ad .RS 12n If set to a nonzero value, remove the DTL (dirty time list) upon completion of a pool scan (scrub) even if there were unrepairable errors. It is intended to be used during pool repair or recovery to stop resilvering when the pool is next imported. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_scrub_min_time_ms\fR (int) .ad .RS 12n Scrubs are processed by the sync thread. While scrubbing it will spend at least this much time working on a scrub between txg flushes. .sp Default value: \fB1,000\fR. .RE .sp .ne 2 .na \fBzfs_scan_checkpoint_intval\fR (int) .ad .RS 12n To preserve progress across reboots the sequential scan algorithm periodically needs to stop metadata scanning and issue all the verifications I/Os to disk. The frequency of this flushing is determined by the \fBzfs_scan_checkpoint_intval\fR tunable. .sp Default value: \fB7200\fR seconds (every 2 hours). .RE .sp .ne 2 .na \fBzfs_scan_fill_weight\fR (int) .ad .RS 12n This tunable affects how scrub and resilver I/O segments are ordered. A higher number indicates that we care more about how filled in a segment is, while a lower number indicates we care more about the size of the extent without considering the gaps within a segment. This value is only tunable upon module insertion. Changing the value afterwards will have no affect on scrub or resilver performance. .sp Default value: \fB3\fR. .RE .sp .ne 2 .na \fBzfs_scan_issue_strategy\fR (int) .ad .RS 12n Determines the order that data will be verified while scrubbing or resilvering. If set to \fB1\fR, data will be verified as sequentially as possible, given the amount of memory reserved for scrubbing (see \fBzfs_scan_mem_lim_fact\fR). This may improve scrub performance if the pool's data is very fragmented. If set to \fB2\fR, the largest mostly-contiguous chunk of found data will be verified first. By deferring scrubbing of small segments, we may later find adjacent data to coalesce and increase the segment size. If set to \fB0\fR, zfs will use strategy \fB1\fR during normal verification and strategy \fB2\fR while taking a checkpoint. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_scan_legacy\fR (int) .ad .RS 12n A value of 0 indicates that scrubs and resilvers will gather metadata in memory before issuing sequential I/O. A value of 1 indicates that the legacy algorithm will be used where I/O is initiated as soon as it is discovered. Changing this value to 0 will not affect scrubs or resilvers that are already in progress. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_scan_max_ext_gap\fR (int) .ad .RS 12n Indicates the largest gap in bytes between scrub / resilver I/Os that will still be considered sequential for sorting purposes. Changing this value will not affect scrubs or resilvers that are already in progress. .sp Default value: \fB2097152 (2 MB)\fR. .RE .sp .ne 2 .na \fBzfs_scan_mem_lim_fact\fR (int) .ad .RS 12n Maximum fraction of RAM used for I/O sorting by sequential scan algorithm. This tunable determines the hard limit for I/O sorting memory usage. When the hard limit is reached we stop scanning metadata and start issuing data verification I/O. This is done until we get below the soft limit. .sp Default value: \fB20\fR which is 5% of RAM (1/20). .RE .sp .ne 2 .na \fBzfs_scan_mem_lim_soft_fact\fR (int) .ad .RS 12n The fraction of the hard limit used to determined the soft limit for I/O sorting by the sequential scan algorithm. When we cross this limit from bellow no action is taken. When we cross this limit from above it is because we are issuing verification I/O. In this case (unless the metadata scan is done) we stop issuing verification I/O and start scanning metadata again until we get to the hard limit. .sp Default value: \fB20\fR which is 5% of the hard limit (1/20). .RE .sp .ne 2 .na \fBzfs_scan_vdev_limit\fR (int) .ad .RS 12n Maximum amount of data that can be concurrently issued at once for scrubs and resilvers per leaf device, given in bytes. .sp Default value: \fB41943040\fR. .RE .sp .ne 2 .na \fBzfs_send_corrupt_data\fR (int) .ad .RS 12n Allow sending of corrupt data (ignore read/checksum errors when sending data) .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_send_unmodified_spill_blocks\fR (int) .ad .RS 12n Include unmodified spill blocks in the send stream. Under certain circumstances previous versions of ZFS could incorrectly remove the spill block from an existing object. Including unmodified copies of the spill blocks creates a backwards compatible stream which will recreate a spill block if it was incorrectly removed. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBzfs_send_no_prefetch_queue_ff\fR (int) .ad .RS 12n The fill fraction of the \fBzfs send\fR internal queues. The fill fraction controls the timing with which internal threads are woken up. .sp Default value: \fB20\fR. .RE .sp .ne 2 .na \fBzfs_send_no_prefetch_queue_length\fR (int) .ad .RS 12n The maximum number of bytes allowed in \fBzfs send\fR's internal queues. .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfs_send_queue_ff\fR (int) .ad .RS 12n The fill fraction of the \fBzfs send\fR prefetch queue. The fill fraction controls the timing with which internal threads are woken up. .sp Default value: \fB20\fR. .RE .sp .ne 2 .na \fBzfs_send_queue_length\fR (int) .ad .RS 12n The maximum number of bytes allowed that will be prefetched by \fBzfs send\fR. This value must be at least twice the maximum block size in use. .sp Default value: \fB16,777,216\fR. .RE .sp .ne 2 .na \fBzfs_recv_queue_ff\fR (int) .ad .RS 12n The fill fraction of the \fBzfs receive\fR queue. The fill fraction controls the timing with which internal threads are woken up. .sp Default value: \fB20\fR. .RE .sp .ne 2 .na \fBzfs_recv_queue_length\fR (int) .ad .RS 12n The maximum number of bytes allowed in the \fBzfs receive\fR queue. This value must be at least twice the maximum block size in use. .sp Default value: \fB16,777,216\fR. .RE .sp .ne 2 .na \fBzfs_override_estimate_recordsize\fR (ulong) .ad .RS 12n Setting this variable overrides the default logic for estimating block sizes when doing a zfs send. The default heuristic is that the average block size will be the current recordsize. Override this value if most data in your dataset is not of that size and you require accurate zfs send size estimates. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_sync_pass_deferred_free\fR (int) .ad .RS 12n Flushing of data to disk is done in passes. Defer frees starting in this pass .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_spa_discard_memory_limit\fR (int) .ad .RS 12n Maximum memory used for prefetching a checkpoint's space map on each vdev while discarding the checkpoint. .sp Default value: \fB16,777,216\fR. .RE .sp .ne 2 .na \fBzfs_special_class_metadata_reserve_pct\fR (int) .ad .RS 12n Only allow small data blocks to be allocated on the special and dedup vdev types when the available free space percentage on these vdevs exceeds this value. This ensures reserved space is available for pool meta data as the special vdevs approach capacity. .sp Default value: \fB25\fR. .RE .sp .ne 2 .na \fBzfs_sync_pass_dont_compress\fR (int) .ad .RS 12n Starting in this sync pass, we disable compression (including of metadata). With the default setting, in practice, we don't have this many sync passes, so this has no effect. .sp The original intent was that disabling compression would help the sync passes to converge. However, in practice disabling compression increases the average number of sync passes, because when we turn compression off, a lot of block's size will change and thus we have to re-allocate (not overwrite) them. It also increases the number of 128KB allocations (e.g. for indirect blocks and spacemaps) because these will not be compressed. The 128K allocations are especially detrimental to performance on highly fragmented systems, which may have very few free segments of this size, and may need to load new metaslabs to satisfy 128K allocations. .sp Default value: \fB8\fR. .RE .sp .ne 2 .na \fBzfs_sync_pass_rewrite\fR (int) .ad .RS 12n Rewrite new block pointers starting in this pass .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_sync_taskq_batch_pct\fR (int) .ad .RS 12n This controls the number of threads used by the dp_sync_taskq. The default value of 75% will create a maximum of one thread per cpu. .sp Default value: \fB75\fR%. .RE .sp .ne 2 .na \fBzfs_trim_extent_bytes_max\fR (unsigned int) .ad .RS 12n Maximum size of TRIM command. Ranges larger than this will be split in to chunks no larger than \fBzfs_trim_extent_bytes_max\fR bytes before being issued to the device. .sp Default value: \fB134,217,728\fR. .RE .sp .ne 2 .na \fBzfs_trim_extent_bytes_min\fR (unsigned int) .ad .RS 12n Minimum size of TRIM commands. TRIM ranges smaller than this will be skipped unless they're part of a larger range which was broken in to chunks. This is done because it's common for these small TRIMs to negatively impact overall performance. This value can be set to 0 to TRIM all unallocated space. .sp Default value: \fB32,768\fR. .RE .sp .ne 2 .na \fBzfs_trim_metaslab_skip\fR (unsigned int) .ad .RS 12n Skip uninitialized metaslabs during the TRIM process. This option is useful for pools constructed from large thinly-provisioned devices where TRIM operations are slow. As a pool ages an increasing fraction of the pools metaslabs will be initialized progressively degrading the usefulness of this option. This setting is stored when starting a manual TRIM and will persist for the duration of the requested TRIM. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_trim_queue_limit\fR (unsigned int) .ad .RS 12n Maximum number of queued TRIMs outstanding per leaf vdev. The number of concurrent TRIM commands issued to the device is controlled by the \fBzfs_vdev_trim_min_active\fR and \fBzfs_vdev_trim_max_active\fR module options. .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_trim_txg_batch\fR (unsigned int) .ad .RS 12n The number of transaction groups worth of frees which should be aggregated before TRIM operations are issued to the device. This setting represents a trade-off between issuing larger, more efficient TRIM operations and the delay before the recently trimmed space is available for use by the device. .sp Increasing this value will allow frees to be aggregated for a longer time. This will result is larger TRIM operations and potentially increased memory usage. Decreasing this value will have the opposite effect. The default value of 32 was determined to be a reasonable compromise. .sp Default value: \fB32\fR. .RE .sp .ne 2 .na \fBzfs_txg_history\fR (int) .ad .RS 12n Historical statistics for the last N txgs will be available in \fB/proc/spl/kstat/zfs//txgs\fR .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_txg_timeout\fR (int) .ad .RS 12n Flush dirty data to disk at least every N seconds (maximum txg duration) .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBzfs_vdev_aggregate_trim\fR (int) .ad .RS 12n Allow TRIM I/Os to be aggregated. This is normally not helpful because the extents to be trimmed will have been already been aggregated by the metaslab. This option is provided for debugging and performance analysis. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_aggregation_limit\fR (int) .ad .RS 12n Max vdev I/O aggregation size .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfs_vdev_aggregation_limit_non_rotating\fR (int) .ad .RS 12n Max vdev I/O aggregation size for non-rotating media .sp Default value: \fB131,072\fR. .RE .sp .ne 2 .na \fBzfs_vdev_cache_bshift\fR (int) .ad .RS 12n Shift size to inflate reads too .sp Default value: \fB16\fR (effectively 65536). .RE .sp .ne 2 .na \fBzfs_vdev_cache_max\fR (int) .ad .RS 12n Inflate reads smaller than this value to meet the \fBzfs_vdev_cache_bshift\fR size (default 64k). .sp Default value: \fB16384\fR. .RE .sp .ne 2 .na \fBzfs_vdev_cache_size\fR (int) .ad .RS 12n Total size of the per-disk cache in bytes. .sp Currently this feature is disabled as it has been found to not be helpful for performance and in some cases harmful. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_rotating_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O immediately follows its predecessor on rotational vdevs for the purpose of making decisions based on load. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_rotating_seek_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O lacks locality as defined by the zfs_vdev_mirror_rotating_seek_offset. I/Os within this that are not immediately following the previous I/O are incremented by half. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_rotating_seek_offset\fR (int) .ad .RS 12n The maximum distance for the last queued I/O in which the balancing algorithm considers an I/O to have locality. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1048576\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_non_rotating_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member on non-rotational vdevs when I/Os do not immediately follow one another. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_non_rotating_seek_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O lacks locality as defined by the zfs_vdev_mirror_rotating_seek_offset. I/Os within this that are not immediately following the previous I/O are incremented by half. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_read_gap_limit\fR (int) .ad .RS 12n Aggregate read I/O operations if the gap on-disk between them is within this threshold. .sp Default value: \fB32,768\fR. .RE .sp .ne 2 .na \fBzfs_vdev_scheduler\fR (charp) .ad .RS 12n Set the Linux I/O scheduler on whole disk vdevs to this scheduler. Valid options are noop, cfq, bfq & deadline .sp Default value: \fBnoop\fR. .RE .sp .ne 2 .na \fBzfs_vdev_write_gap_limit\fR (int) .ad .RS 12n Aggregate write I/O over gap .sp Default value: \fB4,096\fR. .RE .sp .ne 2 .na \fBzfs_vdev_raidz_impl\fR (string) .ad .RS 12n Parameter for selecting raidz parity implementation to use. Options marked (always) below may be selected on module load as they are supported on all systems. The remaining options may only be set after the module is loaded, as they are available only if the implementations are compiled in and supported on the running system. Once the module is loaded, the content of /sys/module/zfs/parameters/zfs_vdev_raidz_impl will show available options with the currently selected one enclosed in []. Possible options are: fastest - (always) implementation selected using built-in benchmark original - (always) original raidz implementation scalar - (always) scalar raidz implementation sse2 - implementation using SSE2 instruction set (64bit x86 only) ssse3 - implementation using SSSE3 instruction set (64bit x86 only) avx2 - implementation using AVX2 instruction set (64bit x86 only) avx512f - implementation using AVX512F instruction set (64bit x86 only) avx512bw - implementation using AVX512F & AVX512BW instruction sets (64bit x86 only) aarch64_neon - implementation using NEON (Aarch64/64 bit ARMv8 only) aarch64_neonx2 - implementation using NEON with more unrolling (Aarch64/64 bit ARMv8 only) .sp Default value: \fBfastest\fR. .RE .sp .ne 2 .na \fBzfs_zevent_cols\fR (int) .ad .RS 12n When zevents are logged to the console use this as the word wrap width. .sp Default value: \fB80\fR. .RE .sp .ne 2 .na \fBzfs_zevent_console\fR (int) .ad .RS 12n Log events to the console .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_zevent_len_max\fR (int) .ad .RS 12n Max event queue length. A value of 0 will result in a calculated value which increases with the number of CPUs in the system (minimum 64 events). Events in the queue can be viewed with the \fBzpool events\fR command. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_zil_clean_taskq_maxalloc\fR (int) .ad .RS 12n The maximum number of taskq entries that are allowed to be cached. When this limit is exceeded transaction records (itxs) will be cleaned synchronously. .sp Default value: \fB1048576\fR. .RE .sp .ne 2 .na \fBzfs_zil_clean_taskq_minalloc\fR (int) .ad .RS 12n The number of taskq entries that are pre-populated when the taskq is first created and are immediately available for use. .sp Default value: \fB1024\fR. .RE .sp .ne 2 .na \fBzfs_zil_clean_taskq_nthr_pct\fR (int) .ad .RS 12n This controls the number of threads used by the dp_zil_clean_taskq. The default value of 100% will create a maximum of one thread per cpu. .sp Default value: \fB100\fR%. .RE .sp .ne 2 .na \fBzil_maxblocksize\fR (int) .ad .RS 12n This sets the maximum block size used by the ZIL. On very fragmented pools, lowering this (typically to 36KB) can improve performance. .sp Default value: \fB131072\fR (128KB). .RE .sp .ne 2 .na \fBzil_nocacheflush\fR (int) .ad .RS 12n Disable the cache flush commands that are normally sent to the disk(s) by the ZIL after an LWB write has completed. Setting this will cause ZIL corruption on power loss if a volatile out-of-order write cache is enabled. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzil_replay_disable\fR (int) .ad .RS 12n Disable intent logging replay. Can be disabled for recovery from corrupted ZIL .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzil_slog_bulk\fR (ulong) .ad .RS 12n Limit SLOG write size per commit executed with synchronous priority. Any writes above that will be executed with lower (asynchronous) priority to limit potential SLOG device abuse by single active ZIL writer. .sp Default value: \fB786,432\fR. .RE .sp .ne 2 .na \fBzio_deadman_log_all\fR (int) .ad .RS 12n If non-zero, the zio deadman will produce debugging messages (see \fBzfs_dbgmsg_enable\fR) for all zios, rather than only for leaf zios possessing a vdev. This is meant to be used by developers to gain diagnostic information for hang conditions which don't involve a mutex or other locking primitive; typically conditions in which a thread in the zio pipeline is looping indefinitely. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzio_decompress_fail_fraction\fR (int) .ad .RS 12n If non-zero, this value represents the denominator of the probability that zfs should induce a decompression failure. For instance, for a 5% decompression failure rate, this value should be set to 20. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzio_slow_io_ms\fR (int) .ad .RS 12n When an I/O operation takes more than \fBzio_slow_io_ms\fR milliseconds to complete is marked as a slow I/O. Each slow I/O causes a delay zevent. Slow I/O counters can be seen with "zpool status -s". .sp Default value: \fB30,000\fR. .RE .sp .ne 2 .na \fBzio_dva_throttle_enabled\fR (int) .ad .RS 12n Throttle block allocations in the I/O pipeline. This allows for dynamic allocation distribution when devices are imbalanced. When enabled, the maximum number of pending allocations per top-level vdev is limited by \fBzfs_vdev_queue_depth_pct\fR. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzio_requeue_io_start_cut_in_line\fR (int) .ad .RS 12n Prioritize requeued I/O .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzio_taskq_batch_pct\fR (uint) .ad .RS 12n Percentage of online CPUs (or CPU cores, etc) which will run a worker thread for I/O. These workers are responsible for I/O work such as compression and checksum calculations. Fractional number of CPUs will be rounded down. .sp The default value of 75 was chosen to avoid using all CPUs which can result in latency issues and inconsistent application performance, especially when high compression is enabled. .sp Default value: \fB75\fR. .RE .sp .ne 2 .na \fBzvol_inhibit_dev\fR (uint) .ad .RS 12n Do not create zvol device nodes. This may slightly improve startup time on systems with a very large number of zvols. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzvol_major\fR (uint) .ad .RS 12n Major number for zvol block devices .sp Default value: \fB230\fR. .RE .sp .ne 2 .na \fBzvol_max_discard_blocks\fR (ulong) .ad .RS 12n Discard (aka TRIM) operations done on zvols will be done in batches of this many blocks, where block size is determined by the \fBvolblocksize\fR property of a zvol. .sp Default value: \fB16,384\fR. .RE .sp .ne 2 .na \fBzvol_prefetch_bytes\fR (uint) .ad .RS 12n When adding a zvol to the system prefetch \fBzvol_prefetch_bytes\fR from the start and end of the volume. Prefetching these regions of the volume is desirable because they are likely to be accessed immediately by \fBblkid(8)\fR or by the kernel scanning for a partition table. .sp Default value: \fB131,072\fR. .RE .sp .ne 2 .na \fBzvol_request_sync\fR (uint) .ad .RS 12n When processing I/O requests for a zvol submit them synchronously. This effectively limits the queue depth to 1 for each I/O submitter. When set to 0 requests are handled asynchronously by a thread pool. The number of requests which can be handled concurrently is controller by \fBzvol_threads\fR. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzvol_threads\fR (uint) .ad .RS 12n Max number of threads which can handle zvol I/O requests concurrently. .sp Default value: \fB32\fR. .RE .sp .ne 2 .na \fBzvol_volmode\fR (uint) .ad .RS 12n Defines zvol block devices behaviour when \fBvolmode\fR is set to \fBdefault\fR. Valid values are \fB1\fR (full), \fB2\fR (dev) and \fB3\fR (none). .sp Default value: \fB1\fR. .RE .SH ZFS I/O SCHEDULER ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os. The I/O scheduler determines when and in what order those operations are issued. The I/O scheduler divides operations into five I/O classes prioritized in the following order: sync read, sync write, async read, async write, and scrub/resilver. Each queue defines the minimum and maximum number of concurrent operations that may be issued to the device. In addition, the device has an aggregate maximum, \fBzfs_vdev_max_active\fR. Note that the sum of the per-queue minimums must not exceed the aggregate maximum. If the sum of the per-queue maximums exceeds the aggregate maximum, then the number of active I/Os may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will be issued regardless of whether all per-queue minimums have been met. .sp For many physical devices, throughput increases with the number of concurrent operations, but latency typically suffers. Further, physical devices typically have a limit at which more concurrent operations have no effect on throughput or can actually cause it to decrease. .sp The scheduler selects the next operation to issue by first looking for an I/O class whose minimum has not been satisfied. Once all are satisfied and the aggregate maximum has not been hit, the scheduler looks for classes whose maximum has not been satisfied. Iteration through the I/O classes is done in the order specified above. No further operations are issued if the aggregate maximum number of concurrent operations has been hit or if there are no operations queued for an I/O class that has not hit its maximum. Every time an I/O is queued or an operation completes, the I/O scheduler looks for new operations to issue. .sp In general, smaller max_active's will lead to lower latency of synchronous operations. Larger max_active's may lead to higher overall throughput, depending on underlying storage. .sp The ratio of the queues' max_actives determines the balance of performance between reads, writes, and scrubs. E.g., increasing \fBzfs_vdev_scrub_max_active\fR will cause the scrub or resilver to complete more quickly, but reads and writes to have higher latency and lower throughput. .sp All I/O classes have a fixed maximum number of outstanding operations except for the async write class. Asynchronous writes represent the data that is committed to stable storage during the syncing stage for transaction groups. Transaction groups enter the syncing state periodically so the number of queued async writes will quickly burst up and then bleed down to zero. Rather than servicing them as quickly as possible, the I/O scheduler changes the maximum number of active async write I/Os according to the amount of dirty data in the pool. Since both throughput and latency typically increase with the number of concurrent operations issued to physical devices, reducing the burstiness in the number of concurrent operations also stabilizes the response time of operations from other -- and in particular synchronous -- queues. In broad strokes, the I/O scheduler will issue more concurrent operations from the async write queue as there's more dirty data in the pool. .sp Async Writes .sp The number of concurrent operations issued for the async write I/O class follows a piece-wise linear function defined by a few adjustable points. .nf | o---------| <-- zfs_vdev_async_write_max_active ^ | /^ | | | / | | active | / | | I/O | / | | count | / | | | / | | |-------o | | <-- zfs_vdev_async_write_min_active 0|_______^______|_________| 0% | | 100% of zfs_dirty_data_max | | | `-- zfs_vdev_async_write_active_max_dirty_percent `--------- zfs_vdev_async_write_active_min_dirty_percent .fi Until the amount of dirty data exceeds a minimum percentage of the dirty data allowed in the pool, the I/O scheduler will limit the number of concurrent operations to the minimum. As that threshold is crossed, the number of concurrent operations issued increases linearly to the maximum at the specified maximum percentage of the dirty data allowed in the pool. .sp Ideally, the amount of dirty data on a busy pool will stay in the sloped part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. If it exceeds the maximum percentage, this indicates that the rate of incoming data is greater than the rate that the backend storage can handle. In this case, we must further throttle incoming writes, as described in the next section. .SH ZFS TRANSACTION DELAY We delay transactions when we've determined that the backend storage isn't able to accommodate the rate of incoming writes. .sp If there is already a transaction waiting, we delay relative to when that transaction will finish waiting. This way the calculated delay time is independent of the number of threads concurrently executing transactions. .sp If we are the only waiter, wait relative to when the transaction started, rather than the current time. This credits the transaction for "time already served", e.g. reading indirect blocks. .sp The minimum time for a transaction to take is calculated as: .nf min_time = zfs_delay_scale * (dirty - min) / (max - dirty) min_time is then capped at 100 milliseconds. .fi .sp The delay has two degrees of freedom that can be adjusted via tunables. The percentage of dirty data at which we start to delay is defined by \fBzfs_delay_min_dirty_percent\fR. This should typically be at or above \fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to delay after writing at full speed has failed to keep up with the incoming write rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking, this variable determines the amount of delay at the midpoint of the curve. .sp .nf delay 10ms +-------------------------------------------------------------*+ | *| 9ms + *+ | *| 8ms + *+ | * | 7ms + * + | * | 6ms + * + | * | 5ms + * + | * | 4ms + * + | * | 3ms + * + | * | 2ms + (midpoint) * + | | ** | 1ms + v *** + | zfs_delay_scale ----------> ******** | 0 +-------------------------------------*********----------------+ 0% <- zfs_dirty_data_max -> 100% .fi .sp Note that since the delay is added to the outstanding time remaining on the most recent transaction, the delay is effectively the inverse of IOPS. Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve was chosen such that small changes in the amount of accumulated dirty data in the first 3/4 of the curve yield relatively small differences in the amount of delay. .sp The effects can be easier to understand when the amount of delay is represented on a log scale: .sp .nf delay 100ms +-------------------------------------------------------------++ + + | | + *+ 10ms + *+ + ** + | (midpoint) ** | + | ** + 1ms + v **** + + zfs_delay_scale ----------> ***** + | **** | + **** + 100us + ** + + * + | * | + * + 10us + * + + + | | + + +--------------------------------------------------------------+ 0% <- zfs_dirty_data_max -> 100% .fi .sp Note here that only as the amount of dirty data approaches its limit does the delay start to increase rapidly. The goal of a properly tuned system should be to keep the amount of dirty data out of that range by first ensuring that the appropriate limits are set for the I/O scheduler to reach optimal throughput on the backend storage, and then by changing the value of \fBzfs_delay_scale\fR to increase the steepness of the curve. diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 02b91378072a..9a9a5e0cf8a2 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1,5620 +1,5715 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #define WITH_DF_BLOCK_ALLOCATOR #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) /* * Metaslab granularity, in bytes. This is roughly similar to what would be * referred to as the "stripe size" in traditional RAID arrays. In normal * operation, we will try to write this amount of data to a top-level vdev * before moving on to the next one. */ unsigned long metaslab_aliquot = 512 << 10; /* * For testing, make some blocks above a certain size be gang blocks. */ unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered * around the disk. So a sane default for the space map block size * is 8~16K. */ int zfs_metaslab_sm_blksz_no_log = (1 << 14); /* * When the log space map feature is enabled, we accumulate a lot of * changes per metaslab that are flushed once in a while so we benefit * from a bigger block size like 128K for the metaslab space maps. */ int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ int zfs_condense_pct = 200; /* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksz), so a metaslab might use the * same number of blocks after condensing. Since the goal of condensing is to * reduce the number of IOPs required to read the space map, we only want to * condense when we can be sure we will reduce the number of blocks used by the * space map. Unfortunately, we cannot precisely compute whether or not this is * the case in metaslab_should_condense since we are holding ms_lock. Instead, * we apply the following heuristic: do not condense a spacemap unless the * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold * blocks. */ int zfs_metaslab_condense_block_threshold = 4; /* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of * free space. Metaslab groups that have more free space than * zfs_mg_noalloc_threshold are always eligible for allocations. Once * a metaslab group's free space is less than or equal to the * zfs_mg_noalloc_threshold the allocator will avoid allocating to that * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. * Once all groups in the pool reach zfs_mg_noalloc_threshold then all * groups are allowed to accept allocations. Gang blocks are always * eligible to allocate on any metaslab group. The default value of 0 means * no metaslab group will be excluded based on this criterion. */ int zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their * fragmenation metric (measured as a percentage) is less than or * equal to zfs_mg_fragmentation_threshold. If a metaslab group * exceeds this threshold then it will be skipped unless all metaslab * groups within the metaslab class have also crossed this threshold. * * This tunable was introduced to avoid edge cases where we continue * allocating from very fragmented disks in our pool while other, less * fragmented disks, exists. On the other hand, if all disks in the * pool are uniformly approaching the threshold, the threshold can * be a speed bump in performance, where we keep switching the disks * that we allocate from (e.g. we allocate some segments from disk A * making it bypassing the threshold while freeing segments from disk * B getting its fragmentation below the threshold). * * Empirically, we've seen that our vdev selection for allocations is * good enough that fragmentation increases uniformly across all vdevs * the majority of the time. Thus we set the threshold percentage high * enough to avoid hitting the speed bump on pools that are being pushed * to the edge. */ int zfs_mg_fragmentation_threshold = 95; /* * Allow metaslabs to keep their active state as long as their fragmentation * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An * active metaslab that exceeds this threshold will no longer keep its active * status allowing better metaslabs to be selected. */ int zfs_metaslab_fragmentation_threshold = 70; /* * When set will load all metaslabs when pool is first opened. */ int metaslab_debug_load = 0; /* * When set will prevent metaslabs from being unloaded. */ int metaslab_debug_unload = 0; /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ int metaslab_df_free_pct = 4; /* * Maximum distance to search forward from the last offset. Without this * limit, fragmented pools can see >100,000 iterations and * metaslab_block_picker() becomes the performance limiting factor on * high-performance storage. * * With the default setting of 16MB, we typically see less than 500 * iterations, even with very fragmented, ashift=9 pools. The maximum number * of iterations possible is: * metaslab_df_max_search / (2 * (1<mc_spa = spa; mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * sizeof (zfs_refcount_t), KM_SLEEP); mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * sizeof (uint64_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); return (mc); } void metaslab_class_destroy(metaslab_class_t *mc) { ASSERT(mc->mc_rotor == NULL); ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) zfs_refcount_destroy(&mc->mc_alloc_slots[i]); kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * sizeof (zfs_refcount_t)); kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * sizeof (uint64_t)); mutex_destroy(&mc->mc_lock); kmem_free(mc, sizeof (metaslab_class_t)); } int metaslab_class_validate(metaslab_class_t *mc) { metaslab_group_t *mg; vdev_t *vd; /* * Must hold one of the spa_config locks. */ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); if ((mg = mc->mc_rotor) == NULL) return (0); do { vd = mg->mg_vd; ASSERT(vd->vdev_mg != NULL); ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); } while ((mg = mg->mg_next) != mc->mc_rotor); return (0); } static void metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { atomic_add_64(&mc->mc_alloc, alloc_delta); atomic_add_64(&mc->mc_deferred, defer_delta); atomic_add_64(&mc->mc_space, space_delta); atomic_add_64(&mc->mc_dspace, dspace_delta); } uint64_t metaslab_class_get_alloc(metaslab_class_t *mc) { return (mc->mc_alloc); } uint64_t metaslab_class_get_deferred(metaslab_class_t *mc) { return (mc->mc_deferred); } uint64_t metaslab_class_get_space(metaslab_class_t *mc) { return (mc->mc_space); } uint64_t metaslab_class_get_dspace(metaslab_class_t *mc) { return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } void metaslab_class_histogram_verify(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t *mc_hist; int i; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; /* * Skip any holes, uninitialized top-levels, or * vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } /* * Calculate the metaslab class's fragmentation metric. The metric * is weighted based on the space contribution of each metaslab group. * The return value will be a number between 0 and 100 (inclusive), or * ZFS_FRAG_INVALID if the metric has not been set. See comment above the * zfs_frag_table for more information about the metric. */ uint64_t metaslab_class_fragmentation(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t fragmentation = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; /* * Skip any holes, uninitialized top-levels, * or vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * If a metaslab group does not contain a fragmentation * metric then just bail out. */ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (ZFS_FRAG_INVALID); } /* * Determine how much this metaslab_group is contributing * to the overall pool fragmentation metric. */ fragmentation += mg->mg_fragmentation * metaslab_group_get_space(mg); } fragmentation /= metaslab_class_get_space(mc); ASSERT3U(fragmentation, <=, 100); spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (fragmentation); } /* * Calculate the amount of expandable space that is available in * this metaslab class. If a device is expanded then its expandable * space will be the amount of allocatable space that is currently not * part of this metaslab class. */ uint64_t metaslab_class_expandable_space(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t space = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * Calculate if we have enough space to add additional * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. */ space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); } static int metaslab_compare(const void *x1, const void *x2) { const metaslab_t *m1 = (const metaslab_t *)x1; const metaslab_t *m2 = (const metaslab_t *)x2; int sort1 = 0; int sort2 = 0; if (m1->ms_allocator != -1 && m1->ms_primary) sort1 = 1; else if (m1->ms_allocator != -1 && !m1->ms_primary) sort1 = 2; if (m2->ms_allocator != -1 && m2->ms_primary) sort2 = 1; else if (m2->ms_allocator != -1 && !m2->ms_primary) sort2 = 2; /* * Sort inactive metaslabs first, then primaries, then secondaries. When * selecting a metaslab to allocate from, an allocator first tries its * primary, then secondary active metaslab. If it doesn't have active * metaslabs, or can't allocate from them, it searches for an inactive * metaslab to activate. If it can't find a suitable one, it will steal * a primary or secondary metaslab from another allocator. */ if (sort1 < sort2) return (-1); if (sort1 > sort2) return (1); int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); return (AVL_CMP(m1->ms_start, m2->ms_start)); } /* * ========================================================================== * Metaslab groups * ========================================================================== */ /* * Update the allocatable flag and the metaslab group's capacity. * The allocatable flag is set to true if the capacity is below * the zfs_mg_noalloc_threshold or has a fragmentation value that is * greater than zfs_mg_fragmentation_threshold. If a metaslab group * transitions from allocatable to non-allocatable or vice versa then the * metaslab group's class is updated to reflect the transition. */ static void metaslab_group_alloc_update(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; metaslab_class_t *mc = mg->mg_class; vdev_stat_t *vs = &vd->vdev_stat; boolean_t was_allocatable; boolean_t was_initialized; ASSERT(vd == vd->vdev_top); ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, SCL_ALLOC); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; was_initialized = mg->mg_initialized; mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); mutex_enter(&mc->mc_lock); /* * If the metaslab group was just added then it won't * have any space until we finish syncing out this txg. * At that point we will consider it initialized and available * for allocations. We also don't consider non-activated * metaslab groups (e.g. vdevs that are in the middle of being removed) * to be initialized, because they can't be used for allocation. */ mg->mg_initialized = metaslab_group_initialized(mg); if (!was_initialized && mg->mg_initialized) { mc->mc_groups++; } else if (was_initialized && !mg->mg_initialized) { ASSERT3U(mc->mc_groups, >, 0); mc->mc_groups--; } if (mg->mg_initialized) mg->mg_no_free_space = B_FALSE; /* * A metaslab group is considered allocatable if it has plenty * of free space or is not heavily fragmented. We only take * fragmentation into account if the metaslab group has a valid * fragmentation metric (i.e. a value between 0 and 100). */ mg->mg_allocatable = (mg->mg_activation_count > 0 && mg->mg_free_capacity > zfs_mg_noalloc_threshold && (mg->mg_fragmentation == ZFS_FRAG_INVALID || mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); /* * The mc_alloc_groups maintains a count of the number of * groups in this metaslab class that are still above the * zfs_mg_noalloc_threshold. This is used by the allocating * threads to determine if they should avoid allocations to * a given group. The allocator will avoid allocations to a group * if that group has reached or is below the zfs_mg_noalloc_threshold * and there are still other groups that are above the threshold. * When a group transitions from allocatable to non-allocatable or * vice versa we update the metaslab class to reflect that change. * When the mc_alloc_groups value drops to 0 that means that all * groups have reached the zfs_mg_noalloc_threshold making all groups * eligible for allocations. This effectively means that all devices * are balanced again. */ if (was_allocatable && !mg->mg_allocatable) mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } int metaslab_sort_by_flushed(const void *va, const void *vb) { const metaslab_t *a = va; const metaslab_t *b = vb; int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); if (likely(cmp)) return (cmp); uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; cmp = AVL_CMP(a_vdev_id, b_vdev_id); if (cmp) return (cmp); return (AVL_CMP(a->ms_id, b->ms_id)); } metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; mg->mg_initialized = B_FALSE; mg->mg_no_free_space = B_TRUE; mg->mg_allocators = allocators; mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (zfs_refcount_t), KM_SLEEP); mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * sizeof (uint64_t), KM_SLEEP); for (int i = 0; i < allocators; i++) { zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); mg->mg_cur_max_alloc_queue_depth[i] = 0; } mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); return (mg); } void metaslab_group_destroy(metaslab_group_t *mg) { ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); /* * We may have gone below zero with the activation count * either because we never activated in the first place or * because we're done, and possibly removing the vdev. */ ASSERT(mg->mg_activation_count <= 0); taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); kmem_free(mg->mg_secondaries, mg->mg_allocators * sizeof (metaslab_t *)); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_disabled_lock); cv_destroy(&mg->mg_ms_disabled_cv); for (int i = 0; i < mg->mg_allocators; i++) { zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]); mg->mg_cur_max_alloc_queue_depth[i] = 0; } kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * sizeof (zfs_refcount_t)); kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * sizeof (uint64_t)); kmem_free(mg, sizeof (metaslab_group_t)); } void metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; metaslab_group_t *mgprev, *mgnext; ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); ASSERT(mc->mc_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count <= 0); if (++mg->mg_activation_count <= 0) return; mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_rotor) == NULL) { mg->mg_prev = mg; mg->mg_next = mg; } else { mgnext = mgprev->mg_next; mg->mg_prev = mgprev; mg->mg_next = mgnext; mgprev->mg_next = mg; mgnext->mg_prev = mg; } mc->mc_rotor = mg; } /* * Passivate a metaslab group and remove it from the allocation rotor. * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating * a metaslab group. This function will momentarily drop spa_config_locks * that are lower than the SCL_ALLOC lock (see comment below). */ void metaslab_group_passivate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { ASSERT(mc->mc_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count < 0); return; } /* * The spa_config_lock is an array of rwlocks, ordered as * follows (from highest to lowest): * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > * SCL_ZIO > SCL_FREE > SCL_VDEV * (For more information about the spa_config_lock see spa_misc.c) * The higher the lock, the broader its coverage. When we passivate * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO * config locks. However, the metaslab group's taskq might be trying * to preload metaslabs so we must drop the SCL_ZIO lock and any * lower locks to allow the I/O to complete. At a minimum, * we continue to hold the SCL_ALLOC lock, which prevents any future * allocations from taking place and any changes to the vdev tree. */ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); taskq_wait_outstanding(mg->mg_taskq, 0); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < mg->mg_allocators; i++) { metaslab_t *msp = mg->mg_primaries[i]; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } msp = mg->mg_secondaries[i]; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } } mgprev = mg->mg_prev; mgnext = mg->mg_next; if (mg == mgnext) { mc->mc_rotor = NULL; } else { mc->mc_rotor = mgnext; mgprev->mg_next = mgnext; mgnext->mg_prev = mgprev; } mg->mg_prev = NULL; mg->mg_next = NULL; } boolean_t metaslab_group_initialized(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; vdev_stat_t *vs = &vd->vdev_stat; return (vs->vs_space != 0 && mg->mg_activation_count > 0); } uint64_t metaslab_group_get_space(metaslab_group_t *mg) { return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); } void metaslab_group_histogram_verify(metaslab_group_t *mg) { uint64_t *mg_hist; vdev_t *vd = mg->mg_vd; uint64_t ashift = vd->vdev_ashift; int i; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; /* skip if not active or not a member */ if (msp->ms_sm == NULL || msp->ms_group != mg) continue; for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) mg_hist[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } static void metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mg->mg_lock); } void metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { ASSERT3U(mg->mg_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mg->mg_lock); } static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { ASSERT(msp->ms_group == NULL); mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); mutex_enter(&msp->ms_lock); metaslab_group_histogram_add(mg, msp); mutex_exit(&msp->ms_lock); } static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { mutex_enter(&msp->ms_lock); metaslab_group_histogram_remove(mg, msp); mutex_exit(&msp->ms_lock); mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_group = NULL; mutex_exit(&mg->mg_lock); } static void metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&mg->mg_lock)); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_weight = weight; avl_add(&mg->mg_metaslab_tree, msp); } static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { /* * Although in principle the weight can be any value, in * practice we do not use values in the range [1, 511]. */ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } /* * Calculate the fragmentation for a given metaslab group. We can use * a simple average here since all metaslabs within the group must have * the same size. The return value will be a value between 0 and 100 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this * group have a fragmentation metric. */ uint64_t metaslab_group_fragmentation(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; uint64_t fragmentation = 0; uint64_t valid_ms = 0; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp->ms_fragmentation == ZFS_FRAG_INVALID) continue; if (msp->ms_group != mg) continue; valid_ms++; fragmentation += msp->ms_fragmentation; } if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) return (ZFS_FRAG_INVALID); fragmentation /= valid_ms; ASSERT3U(fragmentation, <=, 100); return (fragmentation); } /* * Determine if a given metaslab group should skip allocations. A metaslab * group should avoid allocations if its free capacity is less than the * zfs_mg_noalloc_threshold or its fragmentation metric is greater than * zfs_mg_fragmentation_threshold and there is at least one metaslab group * that can still handle allocations. If the allocation throttle is enabled * then we skip allocations to devices that have reached their maximum * allocation queue depth unless the selected metaslab group is the only * eligible group remaining. */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, uint64_t psize, int allocator, int d) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; /* * We can only consider skipping this metaslab group if it's * in the normal metaslab class and there are other metaslab * groups to select from. Otherwise, we always consider it eligible * for allocations. */ if ((mc != spa_normal_class(spa) && mc != spa_special_class(spa) && mc != spa_dedup_class(spa)) || mc->mc_groups <= 1) return (B_TRUE); /* * If the metaslab group's mg_allocatable flag is set (see comments * in metaslab_group_alloc_update() for more information) and * the allocation throttle is disabled then allow allocations to this * device. However, if the allocation throttle is enabled then * check if we have reached our allocation limit (mg_alloc_queue_depth) * to determine if we should allow allocations to this metaslab group. * If all metaslab groups are no longer considered allocatable * (mc_alloc_groups == 0) or we're trying to allocate the smallest * gang block size then we allow allocations on this metaslab group * regardless of the mg_allocatable or throttle settings. */ if (mg->mg_allocatable) { metaslab_group_t *mgp; int64_t qdepth; uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; if (!mc->mc_alloc_throttle_enabled) return (B_TRUE); /* * If this metaslab group does not have any free space, then * there is no point in looking further. */ if (mg->mg_no_free_space) return (B_FALSE); /* * Relax allocation throttling for ditto blocks. Due to * random imbalances in allocation it tends to push copies * to one vdev, that looks a bit better at the moment. */ qmax = qmax * (4 + d) / 4; qdepth = zfs_refcount_count( &mg->mg_alloc_queue_depth[allocator]); /* * If this metaslab group is below its qmax or it's * the only allocatable metasable group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) return (B_TRUE); ASSERT3U(mc->mc_alloc_groups, >, 1); /* * Since this metaslab group is at or over its qmax, we * need to determine if there are metaslab groups after this * one that might be able to handle this allocation. This is * racy since we can't hold the locks for all metaslab * groups at the same time when we make this check. */ for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; qmax = qmax * (4 + d) / 4; qdepth = zfs_refcount_count( &mgp->mg_alloc_queue_depth[allocator]); /* * If there is another metaslab group that * might be able to handle the allocation, then * we return false so that we skip this group. */ if (qdepth < qmax && !mgp->mg_no_free_space) return (B_FALSE); } /* * We didn't find another group to handle the allocation * so we can't skip this metaslab group even though * we are at or over our qmax. */ return (B_TRUE); } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { return (B_TRUE); } return (B_FALSE); } /* * ========================================================================== * Range tree callbacks * ========================================================================== */ /* * Comparison function for the private size-ordered tree. Tree is sorted * by size, larger sizes at the end of the tree. */ static int metaslab_rangesize_compare(const void *x1, const void *x2) { const range_seg_t *r1 = x1; const range_seg_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = AVL_CMP(rs_size1, rs_size2); if (likely(cmp)) return (cmp); return (AVL_CMP(r1->rs_start, r2->rs_start)); } /* * ========================================================================== * Common allocator routines * ========================================================================== */ /* * Return the maximum contiguous segment within the metaslab. */ uint64_t -metaslab_block_maxsize(metaslab_t *msp) +metaslab_largest_allocatable(metaslab_t *msp) { avl_tree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; - if (t == NULL || (rs = avl_last(t)) == NULL) - return (0ULL); + if (t == NULL) + return (0); + rs = avl_last(t); + if (rs == NULL) + return (0); return (rs->rs_end - rs->rs_start); } +/* + * Return the maximum contiguous segment within the unflushed frees of this + * metaslab. + */ +uint64_t +metaslab_largest_unflushed_free(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if (msp->ms_unflushed_frees == NULL) + return (0); + + range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size); + if (rs == NULL) + return (0); + + /* + * When a range is freed from the metaslab, that range is added to + * both the unflushed frees and the deferred frees. While the block + * will eventually be usable, if the metaslab were loaded the range + * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE + * txgs had passed. As a result, when attempting to estimate an upper + * bound for the largest currently-usable free segment in the + * metaslab, we need to not consider any ranges currently in the defer + * trees. This algorithm approximates the largest available chunk in + * the largest range in the unflushed_frees tree by taking the first + * chunk. While this may be a poor estimate, it should only remain so + * briefly and should eventually self-correct as frees are no longer + * deferred. Similar logic applies to the ms_freed tree. See + * metaslab_load() for more details. + * + * There are two primary sources of innacuracy in this estimate. Both + * are tolerated for performance reasons. The first source is that we + * only check the largest segment for overlaps. Smaller segments may + * have more favorable overlaps with the other trees, resulting in + * larger usable chunks. Second, we only look at the first chunk in + * the largest segment; there may be other usable chunks in the + * largest segment, but we ignore them. + */ + uint64_t rstart = rs->rs_start; + uint64_t rsize = rs->rs_end - rstart; + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, + rsize, &start, &size); + if (found) { + if (rstart == start) + return (0); + rsize = start - rstart; + } + } + + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_freed, rstart, + rsize, &start, &size); + if (found) + rsize = start - rstart; + + return (rsize); +} + static range_seg_t * metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) { range_seg_t *rs, rsearch; avl_index_t where; rsearch.rs_start = start; rsearch.rs_end = start + size; rs = avl_find(t, &rsearch, &where); if (rs == NULL) { rs = avl_nearest(t, where, AVL_AFTER); } return (rs); } #if defined(WITH_DF_BLOCK_ALLOCATOR) || \ defined(WITH_CF_BLOCK_ALLOCATOR) /* * This is a helper function that can be used by the allocator to find * a suitable block to allocate. This will search the specified AVL * tree looking for a block that matches the specified criteria. */ static uint64_t metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, uint64_t max_search) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); uint64_t first_found; if (rs != NULL) first_found = rs->rs_start; while (rs != NULL && rs->rs_start - first_found <= max_search) { uint64_t offset = rs->rs_start; if (offset + size <= rs->rs_end) { *cursor = offset + size; return (offset); } rs = AVL_NEXT(t, rs); } *cursor = 0; return (-1ULL); } #endif /* WITH_DF/CF_BLOCK_ALLOCATOR */ #if defined(WITH_DF_BLOCK_ALLOCATOR) /* * ========================================================================== * Dynamic Fit (df) block allocator * * Search for a free chunk of at least this size, starting from the last * offset (for this alignment of block) looking for up to * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not * found within 16MB, then return a free chunk of exactly the requested size (or * larger). * * If it seems like searching from the last offset will be unproductive, skip * that and just return a free chunk of exactly the requested size (or larger). * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This * mechanism is probably not very useful and may be removed in the future. * * The behavior when not searching can be changed to return the largest free * chunk, instead of a free chunk of exactly the requested size, by setting * metaslab_df_use_largest_segment. * ========================================================================== */ static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size) { /* * Find the largest power of 2 block size that evenly divides the * requested size. This is used to try to allocate blocks with similar * alignment from the same area of the metaslab (i.e. same cursor * bucket) but it does not guarantee that other allocations sizes * may exist in the same region. */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(&rt->rt_root), ==, avl_numnodes(&msp->ms_allocatable_by_size)); /* * If we're running low on space, find a segment based on size, * rather than iterating based on offset. */ - if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold || + if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { offset = -1; } else { offset = metaslab_block_picker(&rt->rt_root, cursor, size, metaslab_df_max_search); } if (offset == -1) { range_seg_t *rs; if (metaslab_df_use_largest_segment) { /* use largest free segment */ rs = avl_last(&msp->ms_allocatable_by_size); } else { /* use segment of this size, or next largest */ rs = metaslab_block_find(&msp->ms_allocatable_by_size, 0, size); } if (rs != NULL && rs->rs_start + size <= rs->rs_end) { offset = rs->rs_start; *cursor = offset + size; } } return (offset); } static metaslab_ops_t metaslab_df_ops = { metaslab_df_alloc }; metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; #endif /* WITH_DF_BLOCK_ALLOCATOR */ #if defined(WITH_CF_BLOCK_ALLOCATOR) /* * ========================================================================== * Cursor fit block allocator - * Select the largest region in the metaslab, set the cursor to the beginning * of the range and the cursor_end to the end of the range. As allocations * are made advance the cursor. Continue allocating from the cursor until * the range is exhausted and then find a new range. * ========================================================================== */ static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { range_tree_t *rt = msp->ms_allocatable; avl_tree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); ASSERT3U(*cursor_end, >=, *cursor); if ((*cursor + size) > *cursor_end) { range_seg_t *rs; rs = avl_last(&msp->ms_allocatable_by_size); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) return (-1ULL); *cursor = rs->rs_start; *cursor_end = rs->rs_end; } offset = *cursor; *cursor += size; return (offset); } static metaslab_ops_t metaslab_cf_ops = { metaslab_cf_alloc }; metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops; #endif /* WITH_CF_BLOCK_ALLOCATOR */ #if defined(WITH_NDF_BLOCK_ALLOCATOR) /* * ========================================================================== * New dynamic fit allocator - * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift * contiguous blocks. If no region is found then just use the largest segment * that remains. * ========================================================================== */ /* * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) * to request from the allocator. */ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { avl_tree_t *t = &msp->ms_allocatable->rt_root; avl_index_t where; range_seg_t *rs, rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; - uint64_t max_size = metaslab_block_maxsize(msp); + uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); rsearch.rs_start = *cursor; rsearch.rs_end = *cursor + size; rs = avl_find(t, &rsearch, &where); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { t = &msp->ms_allocatable_by_size; rsearch.rs_start = 0; rsearch.rs_end = MIN(max_size, 1ULL << (hbit + metaslab_ndf_clump_shift)); rs = avl_find(t, &rsearch, &where); if (rs == NULL) rs = avl_nearest(t, where, AVL_AFTER); ASSERT(rs != NULL); } if ((rs->rs_end - rs->rs_start) >= size) { *cursor = rs->rs_start + size; return (rs->rs_start); } return (-1ULL); } static metaslab_ops_t metaslab_ndf_ops = { metaslab_ndf_alloc }; metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; #endif /* WITH_NDF_BLOCK_ALLOCATOR */ /* * ========================================================================== * Metaslabs * ========================================================================== */ /* * Wait for any in-progress metaslab loads to complete. */ void metaslab_load_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_loading) { ASSERT(!msp->ms_loaded); cv_wait(&msp->ms_load_cv, &msp->ms_lock); } } /* * Wait for any in-progress flushing to complete. */ void metaslab_flush_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_flushing) cv_wait(&msp->ms_flush_cv, &msp->ms_lock); } uint64_t metaslab_allocated_space(metaslab_t *msp) { return (msp->ms_allocated_space); } /* * Verify that the space accounting on disk matches the in-core range_trees. */ static void metaslab_verify_space(metaslab_t *msp, uint64_t txg) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t allocating = 0; uint64_t sm_free_space, msp_free_space; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(!msp->ms_condensing); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can only verify the metaslab space when we're called * from syncing context with a loaded metaslab that has an * allocated space map. Calling this in non-syncing context * does not provide a consistent view of the metaslab since * we're performing allocations in the future. */ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || !msp->ms_loaded) return; /* * Even though the smp_alloc field can get negative, * when it comes to a metaslab's space map, that should * never be the case. */ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); ASSERT3U(space_map_allocated(msp->ms_sm), >=, range_tree_space(msp->ms_unflushed_frees)); ASSERT3U(metaslab_allocated_space(msp), ==, space_map_allocated(msp->ms_sm) + range_tree_space(msp->ms_unflushed_allocs) - range_tree_space(msp->ms_unflushed_frees)); sm_free_space = msp->ms_size - metaslab_allocated_space(msp); /* * Account for future allocations since we would have * already deducted that space from the ms_allocatable. */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { allocating += range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } ASSERT3U(msp->ms_deferspace, ==, range_tree_space(msp->ms_defer[0]) + range_tree_space(msp->ms_defer[1])); msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + msp->ms_deferspace + range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); } static void metaslab_aux_histograms_clear(metaslab_t *msp) { /* * Auxiliary histograms are only cleared when resetting them, * which can only happen while the metaslab is loaded. */ ASSERT(msp->ms_loaded); bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); for (int t = 0; t < TXG_DEFER_SIZE; t++) bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); } static void metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, range_tree_t *rt) { /* * This is modeled after space_map_histogram_add(), so refer to that * function for implementation details. We want this to work like * the space map histogram, and not the range tree histogram, as we * are essentially constructing a delta that will be later subtracted * from the space map histogram. */ int idx = 0; for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT3U(i, >=, idx + shift); histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { ASSERT3U(idx + shift, ==, i); idx++; ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); } } } /* * Called at every sync pass that the metaslab gets synced. * * The reason is that we want our auxiliary histograms to be updated * wherever the metaslab's space map histogram is updated. This way * we stay consistent on which parts of the metaslab space map's * histogram are currently not available for allocations (e.g because * they are in the defer, freed, and freeing trees). */ static void metaslab_aux_histograms_update(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; ASSERT(sm != NULL); /* * This is similar to the metaslab's space map histogram updates * that take place in metaslab_sync(). The only difference is that * we only care about segments that haven't made it into the * ms_allocatable tree yet. */ if (msp->ms_loaded) { metaslab_aux_histograms_clear(msp); metaslab_aux_histogram_add(msp->ms_synchist, sm->sm_shift, msp->ms_freed); for (int t = 0; t < TXG_DEFER_SIZE; t++) { metaslab_aux_histogram_add(msp->ms_deferhist[t], sm->sm_shift, msp->ms_defer[t]); } } metaslab_aux_histogram_add(msp->ms_synchist, sm->sm_shift, msp->ms_freeing); } /* * Called every time we are done syncing (writing to) the metaslab, * i.e. at the end of each sync pass. * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] */ static void metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; space_map_t *sm = msp->ms_sm; if (sm == NULL) { /* * We came here from metaslab_init() when creating/opening a * pool, looking at a metaslab that hasn't had any allocations * yet. */ return; } /* * This is similar to the actions that we take for the ms_freed * and ms_defer trees in metaslab_sync_done(). */ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; if (defer_allowed) { bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], sizeof (msp->ms_synchist)); } else { bzero(msp->ms_deferhist[hist_index], sizeof (msp->ms_deferhist[hist_index])); } bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); } /* * Ensure that the metaslab's weight and fragmentation are consistent * with the contents of the histogram (either the range tree's histogram * or the space map's depending whether the metaslab is loaded). */ static void metaslab_verify_weight_and_frag(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can end up here from vdev_remove_complete(), in which case we * cannot do these assertions because we hold spa config locks and * thus we are not allowed to read from the DMU. * * We check if the metaslab group has been removed and if that's * the case we return immediately as that would mean that we are * here from the aforementioned code path. */ if (msp->ms_group == NULL) return; /* * Devices being removed always return a weight of 0 and leave * fragmentation and ms_max_size as is - there is nothing for * us to verify here. */ vdev_t *vd = msp->ms_group->mg_vd; if (vd->vdev_removing) return; /* * If the metaslab is dirty it probably means that we've done * some allocations or frees that have changed our histograms * and thus the weight. */ for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&vd->vdev_ms_list, msp, t)) return; } /* * This verification checks that our in-memory state is consistent * with what's on disk. If the pool is read-only then there aren't * any changes and we just have the initially-loaded state. */ if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) return; /* some extra verification for in-core tree if you can */ if (msp->ms_loaded) { range_tree_stat_verify(msp->ms_allocatable); VERIFY(space_map_histogram_verify(msp->ms_sm, msp->ms_allocatable)); } uint64_t weight = msp->ms_weight; uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); uint64_t frag = msp->ms_fragmentation; uint64_t max_segsize = msp->ms_max_size; msp->ms_weight = 0; msp->ms_fragmentation = 0; - msp->ms_max_size = 0; /* * This function is used for verification purposes. Regardless of * whether metaslab_weight() thinks this metaslab should be active or * not, we want to ensure that the actual weight (and therefore the * value of ms_weight) would be the same if it was to be recalculated * at this point. */ msp->ms_weight = metaslab_weight(msp) | was_active; VERIFY3U(max_segsize, ==, msp->ms_max_size); /* * If the weight type changed then there is no point in doing * verification. Revert fields to their original values. */ if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { msp->ms_fragmentation = frag; msp->ms_weight = weight; return; } VERIFY3U(msp->ms_fragmentation, ==, frag); VERIFY3U(msp->ms_weight, ==, weight); } static int metaslab_load_impl(metaslab_t *msp) { int error = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loading); ASSERT(!msp->ms_condensing); /* * We temporarily drop the lock to unblock other operations while we * are reading the space map. Therefore, metaslab_sync() and * metaslab_sync_done() can run at the same time as we do. * * If we are using the log space maps, metaslab_sync() can't write to * the metaslab's space map while we are loading as we only write to * it when we are flushing the metaslab, and that can't happen while * we are loading it. * * If we are not using log space maps though, metaslab_sync() can * append to the space map while we are loading. Therefore we load * only entries that existed when we started the load. Additionally, * metaslab_sync_done() has to wait for the load to complete because * there are potential races like metaslab_load() loading parts of the * space map that are currently being appended by metaslab_sync(). If * we didn't, the ms_allocatable would have entries that * metaslab_sync_done() would try to re-add later. * * That's why before dropping the lock we remember the synced length * of the metaslab and read up to that point of the space map, * ignoring entries appended by metaslab_sync() that happen after we * drop the lock. */ uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); hrtime_t load_start = gethrtime(); if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); } else { /* * The space map has not been allocated yet, so treat * all the space in the metaslab as free and add it to the * ms_allocatable tree. */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); if (msp->ms_freed != NULL) { /* * If the ms_sm doesn't exist, this means that this * metaslab hasn't gone through metaslab_sync() and * thus has never been dirtied. So we shouldn't * expect any unflushed allocs or frees from previous * TXGs. * * Note: ms_freed and all the other trees except for * the ms_allocatable, can be NULL at this point only * if this is a new metaslab of a vdev that just got * expanded. */ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); } } /* * We need to grab the ms_sync_lock to prevent metaslab_sync() from * changing the ms_sm (or log_sm) and the metaslab's range trees * while we are about to use them and populate the ms_allocatable. * The ms_lock is insufficient for this because metaslab_sync() doesn't * hold the ms_lock while writing the ms_checkpointing tree to disk. */ mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); ASSERT(!msp->ms_condensing); ASSERT(!msp->ms_flushing); if (error != 0) { mutex_exit(&msp->ms_sync_lock); return (error); } ASSERT3P(msp->ms_group, !=, NULL); msp->ms_loaded = B_TRUE; /* * Apply all the unflushed changes to ms_allocatable right * away so any manipulations we do below have a clear view * of what is allocated and what is free. */ range_tree_walk(msp->ms_unflushed_allocs, range_tree_remove, msp->ms_allocatable); range_tree_walk(msp->ms_unflushed_frees, range_tree_add, msp->ms_allocatable); msp->ms_loaded = B_TRUE; ASSERT3P(msp->ms_group, !=, NULL); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (spa_syncing_log_sm(spa) != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); /* * If we use a log space map we add all the segments * that are in ms_unflushed_frees so they are available * for allocation. * * ms_allocatable needs to contain all free segments * that are ready for allocations (thus not segments * from ms_freeing, ms_freed, and the ms_defer trees). * But if we grab the lock in this code path at a sync * pass later that 1, then it also contains the * segments of ms_freed (they were added to it earlier * in this path through ms_unflushed_frees). So we * need to remove all the segments that exist in * ms_freed from ms_allocatable as they will be added * later in metaslab_sync_done(). * * When there's no log space map, the ms_allocatable * correctly doesn't contain any segments that exist * in ms_freed [see ms_synced_length]. */ range_tree_walk(msp->ms_freed, range_tree_remove, msp->ms_allocatable); } /* * If we are not using the log space map, ms_allocatable * contains the segments that exist in the ms_defer trees * [see ms_synced_length]. Thus we need to remove them * from ms_allocatable as they will be added again in * metaslab_sync_done(). * * If we are using the log space map, ms_allocatable still * contains the segments that exist in the ms_defer trees. * Not because it read them through the ms_sm though. But * because these segments are part of ms_unflushed_frees * whose segments we add to ms_allocatable earlier in this * code path. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, msp->ms_allocatable); } /* * Call metaslab_recalculate_weight_and_sort() now that the * metaslab is loaded so we get the metaslab's real weight. * * Unless this metaslab was created with older software and * has not yet been converted to use segment-based weight, we * expect the new weight to be better or equal to the weight * that the metaslab had while it was not loaded. This is * because the old weight does not take into account the * consolidation of adjacent segments between TXGs. [see * comment for ms_synchist and ms_deferhist[] for more info] */ uint64_t weight = msp->ms_weight; + uint64_t max_size = msp->ms_max_size; metaslab_recalculate_weight_and_sort(msp); if (!WEIGHT_IS_SPACEBASED(weight)) ASSERT3U(weight, <=, msp->ms_weight); - msp->ms_max_size = metaslab_block_maxsize(msp); - + msp->ms_max_size = metaslab_largest_allocatable(msp); + ASSERT3U(max_size, <=, msp->ms_max_size); hrtime_t load_end = gethrtime(); + msp->ms_load_time = load_end; if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, smp_length %llu, " "unflushed_allocs %llu, unflushed_frees %llu, " "freed %llu, defer %llu + %llu, " - "loading_time %lld ms", + "loading_time %lld ms, ms_max_size %llu, " + "max size error %llu", spa_syncing_txg(spa), spa_name(spa), msp->ms_group->mg_vd->vdev_id, msp->ms_id, space_map_length(msp->ms_sm), range_tree_space(msp->ms_unflushed_allocs), range_tree_space(msp->ms_unflushed_frees), range_tree_space(msp->ms_freed), range_tree_space(msp->ms_defer[0]), range_tree_space(msp->ms_defer[1]), - (longlong_t)((load_end - load_start) / 1000000)); + (longlong_t)((load_end - load_start) / 1000000), + msp->ms_max_size, msp->ms_max_size - max_size); } metaslab_verify_space(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_sync_lock); return (0); } int metaslab_load(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * There may be another thread loading the same metaslab, if that's * the case just wait until the other thread is done and return. */ metaslab_load_wait(msp); if (msp->ms_loaded) return (0); VERIFY(!msp->ms_loading); ASSERT(!msp->ms_condensing); /* * We set the loading flag BEFORE potentially dropping the lock to * wait for an ongoing flush (see ms_flushing below). This way other * threads know that there is already a thread that is loading this * metaslab. */ msp->ms_loading = B_TRUE; /* * Wait for any in-progress flushing to finish as we drop the ms_lock * both here (during space_map_load()) and in metaslab_flush() (when * we flush our changes to the ms_sm). */ if (msp->ms_flushing) metaslab_flush_wait(msp); /* * In the possibility that we were waiting for the metaslab to be * flushed (where we temporarily dropped the ms_lock), ensure that * no one else loaded the metaslab somehow. */ ASSERT(!msp->ms_loaded); int error = metaslab_load_impl(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; cv_broadcast(&msp->ms_load_cv); return (error); } void metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); metaslab_verify_weight_and_frag(msp); range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; + msp->ms_unload_time = gethrtime(); msp->ms_activation_weight = 0; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; - msp->ms_max_size = 0; /* * We explicitly recalculate the metaslab's weight based on its space * map (as it is now not loaded). We want unload metaslabs to always * have their weights calculated from the space map histograms, while * loaded ones have it calculated from their in-core range tree * [see metaslab_load()]. This way, the weight reflects the information * available in-core, whether it is loaded or not. * * If ms_group == NULL means that we came here from metaslab_fini(), * at which point it doesn't make sense for us to do the recalculation * and the sorting. */ if (msp->ms_group != NULL) metaslab_recalculate_weight_and_sort(msp); } void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { vdev_space_update(vd, alloc_delta, defer_delta, space_delta); ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, vdev_deflated_space(vd, space_delta)); } int metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; metaslab_t *ms; int error; ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; ms->ms_allocator = -1; ms->ms_new = B_TRUE; /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. * * Note: * When called from vdev_expand(), we can't call into the DMU as * we are holding the spa_config_lock as a writer and we would * deadlock [see relevant comment in vdev_metaslab_init()]. in * that case, the object parameter is zero though, so we won't * call into the DMU. */ if (object != 0) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, ms->ms_size, vd->vdev_ashift); if (error != 0) { kmem_free(ms, sizeof (metaslab_t)); return (error); } ASSERT(ms->ms_sm != NULL); ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } /* * We create the ms_allocatable here, but we don't create the * other range trees until metaslab_sync_done(). This serves * two purposes: it allows metaslab_sync_done() to detect the * addition of new space; and for debugging, it ensures that * we'd data fault on any attempt to use this metaslab before * it's ready. */ ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); ms->ms_trim = range_tree_create(NULL, NULL); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. * If we're adding space to an existing pool, the new space * does not become available until after this txg has synced. * The metaslab's weight will also be initialized when we sync * out this txg. This ensures that we don't attempt to allocate * from it before we have initialized it completely. */ if (txg <= TXG_INITIAL) { metaslab_sync_done(ms, 0); metaslab_space_update(vd, mg->mg_class, metaslab_allocated_space(ms), 0, 0); } if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); } *msp = ms; return (0); } static void metaslab_fini_flush_data(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (metaslab_unflushed_txg(msp) == 0) { ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, NULL); return; } ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); } uint64_t metaslab_unflushed_changes_memused(metaslab_t *ms) { return ((range_tree_numsegs(ms->ms_unflushed_allocs) + range_tree_numsegs(ms->ms_unflushed_frees)) * sizeof (range_seg_t)); } void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; metaslab_fini_flush_data(msp); metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); metaslab_space_update(vd, mg->mg_class, -metaslab_allocated_space(msp), 0, -msp->ms_size); space_map_close(msp->ms_sm); msp->ms_sm = NULL; metaslab_unload(msp); range_tree_destroy(msp->ms_allocatable); range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); range_tree_destroy(msp->ms_unflushed_allocs); range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); range_tree_destroy(msp->ms_unflushed_frees); for (int t = 0; t < TXG_SIZE; t++) { range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); range_tree_destroy(msp->ms_checkpointing); for (int t = 0; t < TXG_SIZE; t++) ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); range_tree_vacate(msp->ms_trim, NULL, NULL); range_tree_destroy(msp->ms_trim); mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); cv_destroy(&msp->ms_flush_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } #define FRAGMENTATION_TABLE_SIZE 17 /* * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done * by calculating the space in each bucket of the spacemap histogram and * multiplying that by the fragmentation metric in this table. Doing * this for all buckets and dividing it by the total amount of free * space in this metaslab (i.e. the total free space in all buckets) gives * us the fragmentation metric. This means that a high fragmentation metric * equates to most of the free space being comprised of small segments. * Conversely, if the metric is low, then most of the free space is in * large segments. A 10% change in fragmentation equates to approximately * double the number of segments. * * This table defines 0% fragmented space using 16MB segments. Testing has * shown that segments that are greater than or equal to 16MB do not suffer * from drastic performance problems. Using this value, we derive the rest * of the table. Since the fragmentation value is never stored on disk, it * is possible to change these calculations in the future. */ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 100, /* 512B */ 100, /* 1K */ 98, /* 2K */ 95, /* 4K */ 90, /* 8K */ 80, /* 16K */ 70, /* 32K */ 60, /* 64K */ 50, /* 128K */ 40, /* 256K */ 30, /* 512K */ 20, /* 1M */ 15, /* 2M */ 10, /* 4M */ 5, /* 8M */ 0 /* 16M */ }; /* * Calculate the metaslab's fragmentation metric and set ms_fragmentation. * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not * been upgraded and does not support this metric. Otherwise, the return * value should be in the range [0, 100]. */ static void metaslab_set_fragmentation(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t fragmentation = 0; uint64_t total = 0; boolean_t feature_enabled = spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM); if (!feature_enabled) { msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } /* * A null space map means that the entire metaslab is free * and thus is not fragmented. */ if (msp->ms_sm == NULL) { msp->ms_fragmentation = 0; return; } /* * If this metaslab's space map has not been upgraded, flag it * so that we upgrade next time we encounter it. */ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { uint64_t txg = spa_syncing_txg(spa); vdev_t *vd = msp->ms_group->mg_vd; /* * If we've reached the final dirty txg, then we must * be shutting down the pool. We don't want to dirty * any data past this point so skip setting the condense * flag. We can retry this action the next time the pool * is imported. */ if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { msp->ms_condense_wanted = B_TRUE; vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); zfs_dbgmsg("txg %llu, requesting force condense: " "ms_id %llu, vdev_id %llu", txg, msp->ms_id, vd->vdev_id); } msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { uint64_t space = 0; uint8_t shift = msp->ms_sm->sm_shift; int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, FRAGMENTATION_TABLE_SIZE - 1); if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) continue; space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); total += space; ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); fragmentation += space * zfs_frag_table[idx]; } if (total > 0) fragmentation /= total; ASSERT3U(fragmentation, <=, 100); msp->ms_fragmentation = fragmentation; } /* * Compute a weight -- a selection preference value -- for the given metaslab. * This is based on the amount of free space, the level of fragmentation, * the LBA range, and whether the metaslab is loaded. */ static uint64_t metaslab_space_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(!vd->vdev_removing); /* * The baseline weight is the metaslab's free space. */ space = msp->ms_size - metaslab_allocated_space(msp); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { /* * Use the fragmentation information to inversely scale * down the baseline weight. We need to ensure that we * don't exclude this metaslab completely when it's 100% * fragmented. To avoid this we reduce the fragmented value * by 1. */ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; /* * If space < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. The fragmentation metric may have * decreased the space to something smaller than * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE * so that we can consume any remaining space. */ if (space > 0 && space < SPA_MINBLOCKSIZE) space = SPA_MINBLOCKSIZE; } weight = space; /* * Modern disks have uniform bit density and constant angular velocity. * Therefore, the outer recording zones are faster (higher bandwidth) * than the inner zones by the ratio of outer to inner track diameter, * which is typically around 2:1. We account for this by assigning * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; ASSERT(weight >= space && weight <= 2 * space); } /* * If this metaslab is one we're actively using, adjust its * weight to make it preferable to any inactive metaslab so * we'll polish it off. If the fragmentation on this metaslab * has exceed our threshold, then don't mark it active. */ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); } WEIGHT_SET_SPACEBASED(weight); return (weight); } /* * Return the weight of the specified metaslab, according to the segment-based * weighting algorithm. The metaslab must be loaded. This function can * be called within a sync pass since it relies only on the metaslab's * range tree which is always accurate when the metaslab is loaded. */ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp) { uint64_t weight = 0; uint32_t segments = 0; ASSERT(msp->ms_loaded); for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; i--) { uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; segments <<= 1; segments += msp->ms_allocatable->rt_histogram[i]; /* * The range tree provides more precision than the space map * and must be downgraded so that all values fit within the * space map's histogram. This allows us to compare loaded * vs. unloaded metaslabs to determine which metaslab is * considered "best". */ if (i > max_idx) continue; if (segments != 0) { WEIGHT_SET_COUNT(weight, segments); WEIGHT_SET_INDEX(weight, i); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Calculate the weight based on the on-disk histogram. Should be applied * only to unloaded metaslabs (i.e no incoming allocations) in-order to * give results consistent with the on-disk state */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; ASSERT(!msp->ms_loaded); ASSERT(sm != NULL); ASSERT3U(space_map_object(sm), !=, 0); ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * Create a joint histogram from all the segments that have made * it to the metaslab's space map histogram, that are not yet * available for allocation because they are still in the freeing * pipeline (e.g. freeing, freed, and defer trees). Then subtract * these segments from the space map's histogram to get a more * accurate weight. */ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) deferspace_histogram[i] += msp->ms_synchist[i]; for (int t = 0; t < TXG_DEFER_SIZE; t++) { for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { deferspace_histogram[i] += msp->ms_deferhist[t][i]; } } uint64_t weight = 0; for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { ASSERT3U(sm->sm_phys->smp_histogram[i], >=, deferspace_histogram[i]); uint64_t count = sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; if (count != 0) { WEIGHT_SET_COUNT(weight, count); WEIGHT_SET_INDEX(weight, i + sm->sm_shift); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Compute a segment-based weight for the specified metaslab. The weight * is determined by highest bucket in the histogram. The information * for the highest bucket is encoded into the weight value. */ static uint64_t metaslab_segment_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; uint64_t weight = 0; uint8_t shift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The metaslab is completely free. */ if (metaslab_allocated_space(msp) == 0) { int idx = highbit64(msp->ms_size) - 1; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; if (idx < max_idx) { WEIGHT_SET_COUNT(weight, 1ULL); WEIGHT_SET_INDEX(weight, idx); } else { WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); WEIGHT_SET_INDEX(weight, max_idx); } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); return (weight); } ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * If the metaslab is fully allocated then just make the weight 0. */ if (metaslab_allocated_space(msp) == msp->ms_size) return (0); /* * If the metaslab is already loaded, then use the range tree to * determine the weight. Otherwise, we rely on the space map information * to generate the weight. */ if (msp->ms_loaded) { weight = metaslab_weight_from_range_tree(msp); } else { weight = metaslab_weight_from_spacemap(msp); } /* * If the metaslab was active the last time we calculated its weight * then keep it active. We want to consume the entire region that * is associated with this weight. */ if (msp->ms_activation_weight != 0 && weight != 0) WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); return (weight); } /* * Determine if we should attempt to allocate from this metaslab. If the * metaslab is loaded, then we can determine if the desired allocation * can be satisfied by looking at the size of the maximum free segment * on that metaslab. Otherwise, we make our decision based on the metaslab's * weight. For segment-based weighting we can determine the maximum * allocation based on the index encoded in its value. For space-based * weights we rely on the entire weight (excluding the weight-type bit). */ boolean_t -metaslab_should_allocate(metaslab_t *msp, uint64_t asize) +metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { - if (msp->ms_loaded) { + /* + * If the metaslab is loaded, ms_max_size is definitive and we can use + * the fast check. If it's not, the ms_max_size is a lower bound (once + * set), and we should use the fast check as long as we're not in + * try_hard and it's been less than zfs_metaslab_max_size_cache_sec + * seconds since the metaslab was unloaded. + */ + if (msp->ms_loaded || + (msp->ms_max_size != 0 && !try_hard && gethrtime() < + msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) return (msp->ms_max_size >= asize); - } else { - ASSERT0(msp->ms_max_size); - } boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { /* * The metaslab segment weight indicates segments in the * range [2^i, 2^(i+1)), where i is the index in the weight. * Since the asize might be in the middle of the range, we * should attempt the allocation if asize < 2^(i+1). */ should_allocate = (asize < 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); } else { should_allocate = (asize <= (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); } return (should_allocate); } static uint64_t metaslab_weight(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; uint64_t weight; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If this vdev is in the process of being removed, there is nothing * for us to do here. */ if (vd->vdev_removing) return (0); metaslab_set_fragmentation(msp); /* - * Update the maximum size if the metaslab is loaded. This will + * Update the maximum size. If the metaslab is loaded, this will * ensure that we get an accurate maximum size if newly freed space - * has been added back into the free tree. + * has been added back into the free tree. If the metaslab is + * unloaded, we check if there's a larger free segment in the + * unflushed frees. This is a lower bound on the largest allocatable + * segment size. Coalescing of adjacent entries may reveal larger + * allocatable segments, but we aren't aware of those until loading + * the space map into a range tree. */ - if (msp->ms_loaded) - msp->ms_max_size = metaslab_block_maxsize(msp); - else - ASSERT0(msp->ms_max_size); + if (msp->ms_loaded) { + msp->ms_max_size = metaslab_largest_allocatable(msp); + } else { + msp->ms_max_size = MAX(msp->ms_max_size, + metaslab_largest_unflushed_free(msp)); + } /* * Segment-based weighting requires space map histogram support. */ if (zfs_metaslab_segment_weight_enabled && spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == sizeof (space_map_phys_t))) { weight = metaslab_segment_weight(msp); } else { weight = metaslab_space_weight(msp); } return (weight); } void metaslab_recalculate_weight_and_sort(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* note: we preserve the mask (e.g. indication of primary, etc..) */ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(msp->ms_group, msp, metaslab_weight(msp) | was_active); } static int metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If we're activating for the claim code, we don't want to actually * set the metaslab up for a specific allocator. */ if (activation_weight == METASLAB_WEIGHT_CLAIM) return (0); metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? mg->mg_primaries : mg->mg_secondaries); mutex_enter(&mg->mg_lock); if (arr[allocator] != NULL) { mutex_exit(&mg->mg_lock); return (EEXIST); } arr[allocator] = msp; ASSERT3S(msp->ms_allocator, ==, -1); msp->ms_allocator = allocator; msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); mutex_exit(&mg->mg_lock); return (0); } static int metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The current metaslab is already activated for us so there * is nothing to do. Already activated though, doesn't mean * that this metaslab is activated for our allocator nor our * requested activation weight. The metaslab could have started * as an active one for our allocator but changed allocators * while we were waiting to grab its ms_lock or we stole it * [see find_valid_metaslab()]. This means that there is a * possibility of passivating a metaslab of another allocator * or from a different activation mask, from this thread. */ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { ASSERT(msp->ms_loaded); return (0); } int error = metaslab_load(msp); if (error != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } /* * When entering metaslab_load() we may have dropped the * ms_lock because we were loading this metaslab, or we * were waiting for another thread to load it for us. In * that scenario, we recheck the weight of the metaslab * to see if it was activated by another thread. * * If the metaslab was activated for another allocator or * it was activated with a different activation weight (e.g. * we wanted to make it a primary but it was activated as * secondary) we return error (EBUSY). * * If the metaslab was activated for the same allocator * and requested activation mask, skip activating it. */ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { if (msp->ms_allocator != allocator) return (EBUSY); if ((msp->ms_weight & activation_weight) == 0) return (SET_ERROR(EBUSY)); EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), msp->ms_primary); return (0); } /* * If the metaslab has literally 0 space, it will have weight 0. In * that case, don't bother activating it. This can happen if the * metaslab had space during find_valid_metaslab, but another thread * loaded it and used all that space while we were waiting to grab the * lock. */ if (msp->ms_weight == 0) { ASSERT0(range_tree_space(msp->ms_allocatable)); return (SET_ERROR(ENOSPC)); } if ((error = metaslab_activate_allocator(msp->ms_group, msp, allocator, activation_weight)) != 0) { return (error); } ASSERT0(msp->ms_activation_weight); msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); return (0); } static void metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { metaslab_group_sort(mg, msp, weight); return; } mutex_enter(&mg->mg_lock); ASSERT3P(msp->ms_group, ==, mg); ASSERT3S(0, <=, msp->ms_allocator); ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); if (msp->ms_primary) { ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); mg->mg_primaries[msp->ms_allocator] = NULL; } else { ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); mg->mg_secondaries[msp->ms_allocator] = NULL; } msp->ms_allocator = -1; metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE); /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_allocatable) == 0); ASSERT0(weight & METASLAB_ACTIVE_MASK); ASSERT(msp->ms_activation_weight != 0); msp->ms_activation_weight = 0; metaslab_passivate_allocator(msp->ms_group, msp, weight); ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); } /* * Segment-based metaslabs are activated once and remain active until * we either fail an allocation attempt (similar to space-based metaslabs) * or have exhausted the free space in zfs_metaslab_switch_threshold * buckets since the metaslab was activated. This function checks to see * if we've exhaused the zfs_metaslab_switch_threshold buckets in the * metaslab and passivates it proactively. This will allow us to select a * metaslab with a larger contiguous region, if any, remaining within this * metaslab group. If we're in sync pass > 1, then we continue using this * metaslab so that we don't dirty more block and cause more sync passes. */ void metaslab_segment_may_passivate(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) return; /* * Since we are in the middle of a sync pass, the most accurate * information that is accessible to us is the in-core range tree * histogram; calculate the new weight based on that information. */ uint64_t weight = metaslab_weight_from_range_tree(msp); int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); int current_idx = WEIGHT_GET_INDEX(weight); if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) metaslab_passivate(msp, weight); } static void metaslab_preload(void *arg) { metaslab_t *msp = arg; spa_t *spa = msp->ms_group->mg_vd->vdev_spa; fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); mutex_enter(&msp->ms_lock); (void) metaslab_load(msp); msp->ms_selected_txg = spa_syncing_txg(spa); mutex_exit(&msp->ms_lock); spl_fstrans_unmark(cookie); } static void metaslab_group_preload(metaslab_group_t *mg) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp; avl_tree_t *t = &mg->mg_metaslab_tree; int m = 0; if (spa_shutting_down(spa) || !metaslab_preload_enabled) { taskq_wait_outstanding(mg->mg_taskq, 0); return; } mutex_enter(&mg->mg_lock); /* * Load the next potential metaslabs */ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { ASSERT3P(msp->ms_group, ==, mg); /* * We preload only the maximum number of metaslabs specified * by metaslab_preload_limit. If a metaslab is being forced * to condense then we preload it too. This will ensure * that force condensing happens in the next txg. */ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { continue; } VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, msp, TQ_SLEEP) != TASKQID_INVALID); } mutex_exit(&mg->mg_lock); } /* * Determine if the space map's on-disk footprint is past our tolerance for * inefficiency. We would like to use the following criteria to make our * decision: * * 1. Do not condense if the size of the space map object would dramatically * increase as a result of writing out the free space range tree. * * 2. Condense if the on on-disk space map representation is at least * zfs_condense_pct/100 times the size of the optimal representation * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). * * 3. Do not condense if the on-disk size of the space map does not actually * decrease. * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. * Instead, we apply the heuristic described in the block comment for * zfs_metaslab_condense_block_threshold - we only condense if the space used * is greater than a threshold number of blocks. */ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); ASSERT(sm != NULL); ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); /* * We always condense metaslabs that are empty and metaslabs for * which a condense request has been made. */ if (avl_is_empty(&msp->ms_allocatable_by_size) || msp->ms_condense_wanted) return (B_TRUE); uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); uint64_t object_size = space_map_length(sm); uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } /* * Condense the on-disk space map representation to its minimized form. * The minimized form consists of a small number of allocations followed * by the entries of the free range tree (ms_allocatable). The condensed * spacemap contains all the entries of previous TXGs (including those in * the pool-wide log spacemaps; thus this is effectively a superset of * metaslab_flush()), but this TXG's entries still need to be written. */ static void metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) { range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); ASSERT(msp->ms_sm != NULL); /* * In order to condense the space map, we need to change it so it * only describes which segments are currently allocated and free. * * All the current free space resides in the ms_allocatable, all * the ms_defer trees, and all the ms_allocating trees. We ignore * ms_freed because it is empty because we're in sync pass 1. We * ignore ms_freeing because these changes are not yet reflected * in the spacemap (they will be written later this txg). * * So to truncate the space map to represent all the entries of * previous TXGs we do the following: * * 1] We create a range tree (condense tree) that is 100% allocated. * 2] We remove from it all segments found in the ms_defer trees * as those segments are marked as free in the original space * map. We do the same with the ms_allocating trees for the same * reason. Removing these segments should be a relatively * inexpensive operation since we expect these trees to have a * small number of nodes. * 3] We vacate any unflushed allocs as they should already exist * in the condense tree. Then we vacate any unflushed frees as * they should already be part of ms_allocatable. * 4] At this point, we would ideally like to remove all segments * in the ms_allocatable tree from the condense tree. This way * we would write all the entries of the condense tree as the * condensed space map, which would only contain allocated * segments with everything else assumed to be freed. * * Doing so can be prohibitively expensive as ms_allocatable can * be large, and therefore computationally expensive to subtract * from the condense_tree. Instead we first sync out the * condense_tree and then the ms_allocatable, in the condensed * space map. While this is not optimal, it is typically close to * optimal and more importantly much cheaper to compute. * * 5] Finally, as both of the unflushed trees were written to our * new and condensed metaslab space map, we basically flushed * all the unflushed changes to disk, thus we call * metaslab_flush_update(). */ ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, " "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, spa->spa_name, space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_allocatable->rt_root), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, condense_tree); } for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_remove, condense_tree); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); /* * We're about to drop the metaslab's lock thus allowing other * consumers to change it's content. Set the metaslab's ms_condensing * flag to ensure that allocations on this metaslab do not occur * while we're in the middle of committing it to disk. This is only * critical for ms_allocatable as all other range trees use per TXG * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); uint64_t object = space_map_object(msp->ms_sm); space_map_truncate(sm, spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); /* * space_map_truncate() may have reallocated the spacemap object. * If so, update the vdev_ms_array. */ if (space_map_object(msp->ms_sm) != object) { object = space_map_object(msp->ms_sm); dmu_write(spa->spa_meta_objset, msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &object, tx); } /* * Note: * When the log space map feature is enabled, each space map will * always have ALLOCS followed by FREES for each sync pass. This is * typically true even when the log space map feature is disabled, * except from the case where a metaslab goes through metaslab_sync() * and gets condensed. In that case the metaslab's space map will have * ALLOCS followed by FREES (due to condensing) followed by ALLOCS * followed by FREES (due to space_map_write() in metaslab_sync()) for * sync pass 1. */ space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; metaslab_flush_update(msp, tx); } /* * Called when the metaslab has been flushed (its own spacemap now reflects * all the contents of the pool-wide spacemap log). Updates the metaslab's * metadata and any pool-wide related log space map data (e.g. summary, * obsolete logs, etc..) to reflect that. */ static void metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); /* * Just because a metaslab got flushed, that doesn't mean that * it will pass through metaslab_sync_done(). Thus, make sure to * update ms_synced_length here in case it doesn't. */ msp->ms_synced_length = space_map_length(msp->ms_sm); /* * We may end up here from metaslab_condense() without the * feature being active. In that case this is a no-op. */ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); /* update metaslab's position in our flushing tree */ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); /* update metaslab counts of spa_log_sm_t nodes */ spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); spa_log_sm_increment_current_mscount(spa); /* cleanup obsolete logs if any */ uint64_t log_blocks_before = spa_log_sm_nblocks(spa); spa_cleanup_old_sm_logs(spa, tx); uint64_t log_blocks_after = spa_log_sm_nblocks(spa); VERIFY3U(log_blocks_after, <=, log_blocks_before); /* update log space map summary */ uint64_t blocks_gone = log_blocks_before - log_blocks_after; spa_log_summary_add_flushed_metaslab(spa); spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); spa_log_summary_decrement_blkcount(spa, blocks_gone); } boolean_t metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); /* * There is nothing wrong with flushing the same metaslab twice, as * this codepath should work on that case. However, the current * flushing scheme makes sure to avoid this situation as we would be * making all these calls without having anything meaningful to write * to disk. We assert this behavior here. */ ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); /* * We can not flush while loading, because then we would * not load the ms_unflushed_{allocs,frees}. */ if (msp->ms_loading) return (B_FALSE); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); /* * Metaslab condensing is effectively flushing. Therefore if the * metaslab can be condensed we can just condense it instead of * flushing it. * * Note that metaslab_condense() does call metaslab_flush_update() * so we can just return immediately after condensing. We also * don't need to care about setting ms_flushing or broadcasting * ms_flush_cv, even if we temporarily drop the ms_lock in * metaslab_condense(), as the metaslab is already loaded. */ if (msp->ms_loaded && metaslab_should_condense(msp)) { metaslab_group_t *mg = msp->ms_group; /* * For all histogram operations below refer to the * comments of metaslab_sync() where we follow a * similar procedure. */ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); metaslab_condense(msp, tx); space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); ASSERT(range_tree_is_empty(msp->ms_freed)); for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); /* * Since we recreated the histogram (and potentially * the ms_sm too while condensing) ensure that the * weight is updated too because we are not guaranteed * that this metaslab is dirty and will go through * metaslab_sync_done(). */ metaslab_recalculate_weight_and_sort(msp); return (B_TRUE); } msp->ms_flushing = B_TRUE; uint64_t sm_len_before = space_map_length(msp->ms_sm); mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); uint64_t sm_len_after = space_map_length(msp->ms_sm); if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa), msp->ms_group->mg_vd->vdev_id, msp->ms_id, range_tree_space(msp->ms_unflushed_allocs), range_tree_space(msp->ms_unflushed_frees), (sm_len_after - sm_len_before)); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); metaslab_flush_update(msp, tx); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); msp->ms_flushing = B_FALSE; cv_broadcast(&msp->ms_flush_cv); return (B_TRUE); } /* * Write a metaslab to disk in the context of the specified transaction group. */ void metaslab_sync(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); /* * This metaslab has just been added so there's no work to do now. */ if (msp->ms_freeing == NULL) { ASSERT3P(alloctree, ==, NULL); return; } ASSERT3P(alloctree, !=, NULL); ASSERT3P(msp->ms_freeing, !=, NULL); ASSERT3P(msp->ms_freed, !=, NULL); ASSERT3P(msp->ms_checkpointing, !=, NULL); ASSERT3P(msp->ms_trim, !=, NULL); /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being * forced to condense and it's loaded, we need to let it through. */ if (range_tree_is_empty(alloctree) && range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing) && !(msp->ms_loaded && msp->ms_condense_wanted)) return; VERIFY(txg <= spa_final_dirty_txg(spa)); /* * The only state that can actually be changing concurrently * with metaslab_sync() is the metaslab's ms_allocatable. No * other thread can be modifying this txg's alloc, freeing, * freed, or space_map_phys_t. We drop ms_lock whenever we * could call into the DMU, because the DMU can call down to * us (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state * concurrently, and it is locked out by the ms_sync_lock. * Note that the ms_lock is insufficient for this, because it * is dropped by space_map_write(). */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); /* * Generate a log space map if one doesn't exist already. */ spa_generate_syncing_log_sm(spa, tx); if (msp->ms_sm == NULL) { uint64_t new_object = space_map_alloc(mos, spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); VERIFY3U(new_object, !=, 0); dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &new_object, tx); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); ASSERT(msp->ms_sm != NULL); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); ASSERT0(metaslab_allocated_space(msp)); } if (metaslab_unflushed_txg(msp) == 0 && spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { ASSERT(spa_syncing_log_sm(spa) != NULL); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); spa_log_sm_increment_current_mscount(spa); spa_log_summary_add_flushed_metaslab(spa); ASSERT(msp->ms_sm != NULL); mutex_enter(&spa->spa_flushed_ms_lock); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); } if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, zfs_vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * We save the space map object as an entry in vdev_top_zap * so it can be retrieved when the pool is reopened after an * export or through zdb. */ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (new_object), 1, &new_object, tx)); } mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * Note: metaslab_condense() clears the space map's histogram. * Therefore we must verify and remove this histogram before * condensing. */ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); if (spa->spa_sync_pass == 1 && msp->ms_loaded && metaslab_should_condense(msp)) metaslab_condense(msp, tx); /* * We'll be going to disk to sync our space accounting, thus we * drop the ms_lock during that time so allocations coming from * open-context (ZIL) for future TXGs do not block. */ mutex_exit(&msp->ms_lock); space_map_t *log_sm = spa_syncing_log_sm(spa); if (log_sm != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); space_map_write(log_sm, alloctree, SM_ALLOC, vd->vdev_id, tx); space_map_write(log_sm, msp->ms_freeing, SM_FREE, vd->vdev_id, tx); mutex_enter(&msp->ms_lock); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_remove_xor_add(alloctree, msp->ms_unflushed_frees, msp->ms_unflushed_allocs); range_tree_remove_xor_add(msp->ms_freeing, msp->ms_unflushed_allocs, msp->ms_unflushed_frees); spa->spa_unflushed_stats.sus_memused += metaslab_unflushed_changes_memused(msp); } else { ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); } msp->ms_allocated_space += range_tree_space(alloctree); ASSERT3U(msp->ms_allocated_space, >=, range_tree_space(msp->ms_freeing)); msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); if (!range_tree_is_empty(msp->ms_checkpointing)) { ASSERT(spa_has_checkpoint(spa)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the * ms_lock while writing to the checkpoint space map, for the * same reason mentioned above. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); spa->spa_checkpoint_info.sci_dspace += range_tree_space(msp->ms_checkpointing); vd->vdev_stat.vs_checkpoint_space += range_tree_space(msp->ms_checkpointing); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, -space_map_allocated(vd->vdev_checkpoint_sm)); range_tree_vacate(msp->ms_checkpointing, NULL, NULL); } if (msp->ms_loaded) { /* * When the space map is loaded, we have an accurate * histogram in the range tree. This gives us an opportunity * to bring the space map's histogram up-to-date so we clear * it first before updating it. */ space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); /* * Since we've cleared the histogram we need to add back * any free space that has already been processed, plus * any deferred space. This allows the on-disk histogram * to accurately reflect all free space even if some space * is not yet available for allocation (i.e. deferred). */ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); /* * Add back any deferred free space that has not been * added back into the in-core free tree yet. This will * ensure that we don't end up with a space map histogram * that is completely empty unless the metaslab is fully * allocated. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } } /* * Always add the free space from this sync pass to the space * map histogram. We want to make sure that the on-disk histogram * accounts for all free space. If the space map is not loaded, * then we will lose some accuracy but will correct it the next * time we load the space map. */ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); /* * For sync pass 1, we avoid traversing this txg's free range tree * and instead will just swap the pointers for freeing and freed. * We can safely do this since the freed_tree is guaranteed to be * empty on the initial pass. * * Keep in mind that even if we are currently using a log spacemap * we want current frees to end up in the ms_allocatable (but not * get appended to the ms_sm) so their ranges can be reused as usual. */ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); ASSERT0(msp->ms_allocated_this_txg); } else { range_tree_vacate(msp->ms_freeing, range_tree_add, msp->ms_freed); } msp->ms_allocated_this_txg += range_tree_space(alloctree); range_tree_vacate(alloctree, NULL, NULL); ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); /* * Verify that the space map object ID has been recorded in the * vdev_ms_array. */ uint64_t object; VERIFY0(dmu_read(mos, vd->vdev_ms_array, msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); VERIFY3U(object, ==, space_map_object(msp->ms_sm)); mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } void metaslab_potentially_unload(metaslab_t *msp, uint64_t txg) { /* * If the metaslab is loaded and we've not tried to load or allocate * from it in 'metaslab_unload_delay' txgs, then unload it. */ if (msp->ms_loaded && msp->ms_disabled == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } if (msp->ms_allocator != -1) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } if (!metaslab_debug_unload) metaslab_unload(msp); } } /* * Called after a transaction group has completely synced to mark * all of the metaslab's free space as usable. */ void metaslab_sync_done(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; range_tree_t **defer_tree; int64_t alloc_delta, defer_delta; boolean_t defer_allowed = B_TRUE; ASSERT(!vd->vdev_ishole); mutex_enter(&msp->ms_lock); /* * If this metaslab is just becoming available, initialize its * range trees and add its capacity to the vdev. */ if (msp->ms_freed == NULL) { for (int t = 0; t < TXG_SIZE; t++) { ASSERT(msp->ms_allocating[t] == NULL); msp->ms_allocating[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_freeing, ==, NULL); msp->ms_freeing = range_tree_create(NULL, NULL); ASSERT3P(msp->ms_freed, ==, NULL); msp->ms_freed = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ASSERT3P(msp->ms_defer[t], ==, NULL); msp->ms_defer[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_checkpointing, ==, NULL); msp->ms_checkpointing = range_tree_create(NULL, NULL); ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); ASSERT3P(msp->ms_unflushed_frees, ==, NULL); - msp->ms_unflushed_frees = range_tree_create(NULL, NULL); + msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops, + &msp->ms_unflushed_frees_by_size, + metaslab_rangesize_compare, 0); metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { defer_allowed = B_FALSE; } defer_delta = 0; alloc_delta = msp->ms_allocated_this_txg - range_tree_space(msp->ms_freed); if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); } metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0); if (spa_syncing_log_sm(spa) == NULL) { /* * If there's a metaslab_load() in progress and we don't have * a log space map, it means that we probably wrote to the * metaslab's space map. If this is the case, we need to * make sure that we wait for the load to complete so that we * have a consistent view at the in-core side of the metaslab. */ metaslab_load_wait(msp); } else { ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); } /* * When auto-trimming is enabled, free ranges which are added to * ms_allocatable are also be added to ms_trim. The ms_trim tree is * periodically consumed by the vdev_autotrim_thread() which issues * trims for all ranges and then vacates the tree. The ms_trim tree * can be discarded at any time with the sole consequence of recent * frees not being trimmed. */ if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); if (!defer_allowed) { range_tree_walk(msp->ms_freed, range_tree_add, msp->ms_trim); } } else { range_tree_vacate(msp->ms_trim, NULL, NULL); } /* * Move the frees from the defer_tree back to the free * range tree (if it's loaded). Swap the freed_tree and * the defer_tree -- this is safe to do because we've * just emptied out the defer_tree. */ range_tree_vacate(*defer_tree, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { range_tree_swap(&msp->ms_freed, defer_tree); } else { range_tree_vacate(msp->ms_freed, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); } msp->ms_synced_length = space_map_length(msp->ms_sm); msp->ms_deferspace += defer_delta; ASSERT3S(msp->ms_deferspace, >=, 0); ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); if (msp->ms_deferspace != 0) { /* * Keep syncing this metaslab until all deferred frees * are back in circulation. */ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } metaslab_aux_histograms_update_done(msp, defer_allowed); if (msp->ms_new) { msp->ms_new = B_FALSE; mutex_enter(&mg->mg_lock); mg->mg_ms_ready++; mutex_exit(&mg->mg_lock); } /* * Re-sort metaslab within its group now that we've adjusted * its allocatable space. */ metaslab_recalculate_weight_and_sort(msp); ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); } void metaslab_sync_reassess(metaslab_group_t *mg) { spa_t *spa = mg->mg_class->mc_spa; spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); metaslab_group_alloc_update(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); /* * Preload the next potential metaslabs but only on active * metaslab groups. We can get into a state where the metaslab * is no longer active since we dirty metaslabs as we remove a * a device, thus potentially making the metaslab group eligible * for preloading. */ if (mg->mg_activation_count > 0) { metaslab_group_preload(mg); } spa_config_exit(spa, SCL_ALLOC, FTAG); } /* * When writing a ditto block (i.e. more than one DVA for a given BP) on * the same vdev as an existing DVA of this BP, then try to allocate it * on a different metaslab than existing DVAs (i.e. a unique metaslab). */ static boolean_t metaslab_is_unique(metaslab_t *msp, dva_t *dva) { uint64_t dva_ms_id; if (DVA_GET_ASIZE(dva) == 0) return (B_TRUE); if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (B_TRUE); dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; return (msp->ms_id != dva_ms_id); } /* * ========================================================================== * Metaslab allocation tracing facility * ========================================================================== */ #ifdef _METASLAB_TRACING kstat_t *metaslab_trace_ksp; kstat_named_t metaslab_trace_over_limit; void metaslab_alloc_trace_init(void) { ASSERT(metaslab_alloc_trace_cache == NULL); metaslab_alloc_trace_cache = kmem_cache_create( "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); if (metaslab_trace_ksp != NULL) { metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; kstat_named_init(&metaslab_trace_over_limit, "metaslab_trace_over_limit", KSTAT_DATA_UINT64); kstat_install(metaslab_trace_ksp); } } void metaslab_alloc_trace_fini(void) { if (metaslab_trace_ksp != NULL) { kstat_delete(metaslab_trace_ksp); metaslab_trace_ksp = NULL; } kmem_cache_destroy(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = NULL; } /* * Add an allocation trace element to the allocation tracing list. */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, int allocator) { metaslab_alloc_trace_t *mat; if (!metaslab_trace_enabled) return; /* * When the tracing list reaches its maximum we remove * the second element in the list before adding a new one. * By removing the second element we preserve the original * entry as a clue to what allocations steps have already been * performed. */ if (zal->zal_size == metaslab_trace_max_entries) { metaslab_alloc_trace_t *mat_next; #ifdef DEBUG panic("too many entries in allocation list"); #endif atomic_inc_64(&metaslab_trace_over_limit.value.ui64); zal->zal_size--; mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); list_remove(&zal->zal_list, mat_next); kmem_cache_free(metaslab_alloc_trace_cache, mat_next); } mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); list_link_init(&mat->mat_list_node); mat->mat_mg = mg; mat->mat_msp = msp; mat->mat_size = psize; mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; /* * The list is part of the zio so locking is not required. Only * a single thread will perform allocations for a given zio. */ list_insert_tail(&zal->zal_list, mat); zal->zal_size++; ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); } void metaslab_trace_init(zio_alloc_list_t *zal) { list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), offsetof(metaslab_alloc_trace_t, mat_list_node)); zal->zal_size = 0; } void metaslab_trace_fini(zio_alloc_list_t *zal) { metaslab_alloc_trace_t *mat; while ((mat = list_remove_head(&zal->zal_list)) != NULL) kmem_cache_free(metaslab_alloc_trace_cache, mat); list_destroy(&zal->zal_list); zal->zal_size = 0; } #else #define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) void metaslab_alloc_trace_init(void) { } void metaslab_alloc_trace_fini(void) { } void metaslab_trace_init(zio_alloc_list_t *zal) { } void metaslab_trace_fini(zio_alloc_list_t *zal) { } #endif /* _METASLAB_TRACING */ /* * ========================================================================== * Metaslab block operations * ========================================================================== */ static void metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); } static void metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) { uint64_t max = mg->mg_max_alloc_queue_depth; uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; while (cur < max) { if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], cur, cur + 1) == cur) { atomic_inc_64( &mg->mg_class->mc_alloc_max_slots[allocator]); return; } cur = mg->mg_cur_max_alloc_queue_depth[allocator]; } } void metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); if (io_complete) metaslab_group_increment_qdepth(mg, allocator); } void metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, int allocator) { #ifdef ZFS_DEBUG const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); for (int d = 0; d < ndvas; d++) { uint64_t vdev = DVA_GET_VDEV(&dva[d]); metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; VERIFY(zfs_refcount_not_held( &mg->mg_alloc_queue_depth[allocator], tag)); } #endif } static uint64_t metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) { uint64_t start; range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); range_tree_remove(rt, start, size); range_tree_clear(msp->ms_trim, start, size); if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); /* Track the last successful allocation */ msp->ms_alloc_txg = txg; metaslab_verify_space(msp, txg); } /* * Now that we've attempted the allocation we need to update the * metaslab's maximum block size since it may have changed. */ - msp->ms_max_size = metaslab_block_maxsize(msp); + msp->ms_max_size = metaslab_largest_allocatable(msp); return (start); } /* * Find the metaslab with the highest weight that is less than what we've * already tried. In the common case, this means that we will examine each * metaslab at most once. Note that concurrent callers could reorder metaslabs * by activation/passivation once we have dropped the mg_lock. If a metaslab is * activated by another thread, and we fail to allocate from the metaslab we * have selected, we may not try the newly-activated metaslab, and instead * activate another metaslab. This is not optimal, but generally does not cause * any problems (a possible exception being if every metaslab is completely full * except for the the newly-activated metaslab which we fail to examine). */ static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, - zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) + boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, + boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; metaslab_t *msp = avl_find(t, search, &idx); if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; - if (!metaslab_should_allocate(msp, asize)) { + if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; } /* * If the selected metaslab is condensing or disabled, * skip it. */ if (msp->ms_condensing || msp->ms_disabled > 0) continue; *was_active = msp->ms_allocator != -1; /* * If we're activating as primary, this is our first allocation * from this disk, so we don't need to check how close we are. * If the metaslab under consideration was already active, * we're getting desperate enough to steal another allocator's * metaslab, so we still don't care about distances. */ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) break; for (i = 0; i < d; i++) { if (want_unique && !metaslab_is_unique(msp, &dva[i])) break; /* try another metaslab */ } if (i == d) break; } if (msp != NULL) { search->ms_weight = msp->ms_weight; search->ms_start = msp->ms_start + 1; search->ms_allocator = msp->ms_allocator; search->ms_primary = msp->ms_primary; } return (msp); } void metaslab_active_mask_verify(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) return; if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(!msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY3S(msp->ms_allocator, ==, -1); return; } } /* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; for (int i = 0; i < d; i++) { if (activation_weight == METASLAB_WEIGHT_PRIMARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_CLAIM; break; } } /* * If we don't have enough metaslabs active to fill the entire array, we * just use the 0th slot. */ if (mg->mg_ms_ready < mg->mg_allocators * 3) allocator = 0; ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; /* * At the end of the metaslab tree are the already-active metaslabs, * first the primaries, then the secondaries. When we resume searching * through the tree, we need to consider ms_allocator and ms_primary so * we start in the location right after where we left off, and don't * accidentally loop forever considering the same metaslabs. */ search->ms_allocator = -1; search->ms_primary = B_TRUE; for (;;) { boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); if (activation_weight == METASLAB_WEIGHT_PRIMARY && mg->mg_primaries[allocator] != NULL) { msp = mg->mg_primaries[allocator]; /* * Even though we don't hold the ms_lock for the * primary metaslab, those fields should not * change while we hold the mg_lock. Thus is is * safe to make assertions on them. */ ASSERT(msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && mg->mg_secondaries[allocator] != NULL) { msp = mg->mg_secondaries[allocator]; /* * See comment above about the similar assertions * for the primary metaslab. */ ASSERT(!msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, - want_unique, asize, allocator, zal, search, - &was_active); + want_unique, asize, allocator, try_hard, zal, + search, &was_active); } mutex_exit(&mg->mg_lock); if (msp == NULL) { kmem_free(search, sizeof (*search)); return (-1ULL); } mutex_enter(&msp->ms_lock); metaslab_active_mask_verify(msp); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. */ #if 0 DTRACE_PROBE3(ms__activation__attempt, metaslab_t *, msp, uint64_t, activation_weight, boolean_t, was_active); #endif /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. We check the * active status first to see if we need to reselect * a new metaslab. */ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { ASSERT3S(msp->ms_allocator, ==, -1); mutex_exit(&msp->ms_lock); continue; } /* * If the metaslab was activated for another allocator * while we were waiting in the ms_lock above, or it's * a primary and we're seeking a secondary (or vice versa), * we go back and select a new metaslab. */ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && (msp->ms_allocator != -1) && (msp->ms_allocator != allocator || ((activation_weight == METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { ASSERT(msp->ms_loaded); ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || msp->ms_allocator != -1); mutex_exit(&msp->ms_lock); continue; } /* * This metaslab was used for claiming regions allocated * by the ZIL during pool import. Once these regions are * claimed we don't need to keep the CLAIM bit set * anymore. Passivate this metaslab to zero its activation * mask. */ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && activation_weight != METASLAB_WEIGHT_CLAIM) { ASSERT(msp->ms_loaded); ASSERT3S(msp->ms_allocator, ==, -1); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } msp->ms_selected_txg = txg; int activation_error = metaslab_activate(msp, allocator, activation_weight); metaslab_active_mask_verify(msp); /* * If the metaslab was activated by another thread for * another allocator or activation_weight (EBUSY), or it * failed because another metaslab was assigned as primary * for this allocator (EEXIST) we continue using this * metaslab for our allocation, rather than going on to a * worse metaslab (we waited for that metaslab to be loaded * after all). * * If the activation failed due to an I/O error or ENOSPC we * skip to the next metaslab. */ boolean_t activated; if (activation_error == 0) { activated = B_TRUE; } else if (activation_error == EBUSY || activation_error == EEXIST) { activated = B_FALSE; } else { mutex_exit(&msp->ms_lock); continue; } ASSERT(msp->ms_loaded); /* * Now that we have the lock, recheck to see if we should * continue to use this metaslab for this allocation. The * the metaslab is now loaded so metaslab_should_allocate() * can accurately determine if the allocation attempt should * proceed. */ - if (!metaslab_should_allocate(msp, asize)) { + if (!metaslab_should_allocate(msp, asize, try_hard)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); goto next; } /* * If this metaslab is currently condensing then pick again * as we can't manipulate this metaslab until it's committed * to disk. If this metaslab is being initialized, we shouldn't * allocate from it since the allocated region might be * overwritten after allocation. */ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } else if (msp->ms_disabled > 0) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_DISABLED, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } offset = metaslab_block_alloc(msp, asize, txg); metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ if (activated) metaslab_segment_may_passivate(msp); break; } next: ASSERT(msp->ms_loaded); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. */ #if 0 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, uint64_t, asize); #endif /* * We were unable to allocate from this metaslab so determine * a new weight for this metaslab. Now that we have loaded * the metaslab we can provide a better hint to the metaslab * selector. * * For space-based metaslabs, we use the maximum block size. * This information is only available when the metaslab * is loaded and is more accurate than the generic free * space weight that was calculated by metaslab_weight(). * This information allows us to quickly compare the maximum * available allocation in the metaslab to the allocation * size being requested. * * For segment-based metaslabs, determine the new weight * based on the highest bucket in the range tree. We * explicitly use the loaded segment weight (i.e. the range * tree histogram) since it contains the space that is * currently available for allocation and is accurate * even within a sync pass. */ uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - weight = metaslab_block_maxsize(msp); + weight = metaslab_largest_allocatable(msp); WEIGHT_SET_SPACEBASED(weight); } else { weight = metaslab_weight_from_range_tree(msp); } if (activated) { metaslab_passivate(msp, weight); } else { /* * For the case where we use the metaslab that is * active for another allocator we want to make * sure that we retain the activation mask. * * Note that we could attempt to use something like * metaslab_recalculate_weight_and_sort() that * retains the activation mask here. That function * uses metaslab_weight() to set the weight though * which is not as accurate as the calculations * above. */ weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(mg, msp, weight); } metaslab_active_mask_verify(msp); /* * We have just failed an allocation attempt, check * that metaslab_should_allocate() agrees. Otherwise, * we may end up in an infinite loop retrying the same * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize)); + ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); mutex_exit(&msp->ms_lock); } mutex_exit(&msp->ms_lock); kmem_free(search, sizeof (*search)); return (offset); } static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, - dva, d, allocator); + dva, d, allocator, try_hard); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { mg->mg_failed_allocations++; metaslab_trace_add(zal, mg, NULL, asize, d, TRACE_GROUP_FAILURE, allocator); if (asize == SPA_GANGBLOCKSIZE) { /* * This metaslab group was unable to allocate * the minimum gang block size so it must be out of * space. We must notify the allocation throttle * to start skipping allocation attempts to this * metaslab group until more space becomes available. * Note: this failure cannot be caused by the * allocation throttle since the allocation throttle * is only responsible for skipping devices and * not failing block allocations. */ mg->mg_no_free_space = B_TRUE; } } mg->mg_allocations++; mutex_exit(&mg->mg_lock); return (offset); } /* * Allocate a block for the specified i/o. */ int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal, int allocator) { metaslab_group_t *mg, *fast_mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; ASSERT(!DVA_IS_VALID(&dva[d])); /* * For testing, make some blocks above a certain size be gang blocks. * This will result in more split blocks when using device removal, * and a large number of split blocks coupled with ztest-induced * damage can result in extremely long reconstruction times. This * will also test spilling from special to normal. */ if (psize >= metaslab_force_ganging && (spa_get_random(100) < 3)) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); } /* * Start at the rotor and loop through all mgs until we find something. * Note that there's no locking on mc_rotor or mc_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * * If we are doing ditto or log blocks, try to spread them across * consecutive vdevs. If we're forced to reuse a vdev before we've * allocated all of our ditto blocks, then try and spread them out on * that vdev as much as possible. If it turns out to not be possible, * gradually lower our standards until anything becomes acceptable. * Also, allocating on consecutive vdevs (as opposed to random vdevs) * gives us hope of containing our fault domains to something we're * able to reason about. Otherwise, any two top-level vdev failures * will guarantee the loss of data. With consecutive allocation, * only two adjacent top-level vdev failures will result in data loss. * * If we are doing gang blocks (hintdva is non-NULL), try to keep * ourselves on the same vdev as our gang block header. That * way, we can hope for locality in vdev_cache, plus it makes our * fault domains something tractable. */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); /* * It's possible the vdev we're using as the hint no * longer exists or its mg has been closed (e.g. by * device removal). Consult the rotor when * all else fails. */ if (vd != NULL && vd->vdev_mg != NULL) { mg = vd->vdev_mg; if (flags & METASLAB_HINTBP_AVOID && mg->mg_next != NULL) mg = mg->mg_next; } else { mg = mc->mc_rotor; } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; } else if (flags & METASLAB_FASTWRITE) { mg = fast_mg = mc->mc_rotor; do { if (fast_mg->mg_vd->vdev_pending_fastwrite < mg->mg_vd->vdev_pending_fastwrite) mg = fast_mg; } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); } else { ASSERT(mc->mc_rotor != NULL); mg = mc->mc_rotor; } /* * If the hint put us into the wrong metaslab class, or into a * metaslab group that has been passivated, just follow the rotor. */ if (mg->mg_class != mc || mg->mg_activation_count <= 0) mg = mc->mc_rotor; rotor = mg; top: do { boolean_t allocatable; ASSERT(mg->mg_activation_count == 1); vd = mg->mg_vd; /* * Don't allocate from faulted devices. */ if (try_hard) { spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); allocatable = vdev_allocatable(vd); spa_config_exit(spa, SCL_ZIO, FTAG); } else { allocatable = vdev_allocatable(vd); } /* * Determine if the selected metaslab group is eligible * for allocations. If we're ganging then don't allow * this metaslab group to skip allocations since that would * inadvertently return ENOSPC and suspend the pool * even though space is still available. */ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, psize, allocator, d); } if (!allocatable) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_NOT_ALLOCATABLE, allocator); goto next; } ASSERT(mg->mg_initialized); /* * Avoid writing single-copy data to a failing, * non-redundant vdev, unless we've already tried all * other vdevs. */ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_VDEV_ERROR, allocator); goto next; } ASSERT(mg->mg_class == mc); uint64_t asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); /* * If we don't need to try hard, then require that the * block be on an different metaslab from any other DVAs * in this BP (unique=true). If we are trying hard, then * allow any metaslab to be used (unique=false). */ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - !try_hard, dva, d, allocator); + !try_hard, dva, d, allocator, try_hard); if (offset != -1ULL) { /* * If we've just selected this metaslab group, * figure out whether the corresponding vdev is * over- or under-used relative to the pool, * and set an allocation bias to even it out. * * Bias is also used to compensate for unequally * sized vdevs so that space is allocated fairly. */ if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vs_free = vs->vs_space - vs->vs_alloc; int64_t mc_free = mc->mc_space - mc->mc_alloc; int64_t ratio; /* * Calculate how much more or less we should * try to allocate from this device during * this iteration around the rotor. * * This basically introduces a zero-centered * bias towards the devices with the most * free space, while compensating for vdev * size differences. * * Examples: * vdev V1 = 16M/128M * vdev V2 = 16M/128M * ratio(V1) = 100% ratio(V2) = 100% * * vdev V1 = 16M/128M * vdev V2 = 64M/128M * ratio(V1) = 127% ratio(V2) = 72% * * vdev V1 = 16M/128M * vdev V2 = 64M/512M * ratio(V1) = 40% ratio(V2) = 160% */ ratio = (vs_free * mc->mc_alloc_groups * 100) / (mc_free + 1); mg->mg_bias = ((ratio - 100) * (int64_t)mg->mg_aliquot) / 100; } else if (!metaslab_bias_enabled) { mg->mg_bias = 0; } if ((flags & METASLAB_FASTWRITE) || atomic_add_64_nv(&mc->mc_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mc->mc_rotor = mg->mg_next; mc->mc_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_OFFSET(&dva[d], offset); DVA_SET_GANG(&dva[d], ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); DVA_SET_ASIZE(&dva[d], asize); if (flags & METASLAB_FASTWRITE) { atomic_add_64(&vd->vdev_pending_fastwrite, psize); } return (0); } next: mc->mc_rotor = mg->mg_next; mc->mc_aliquot = 0; } while ((mg = mg->mg_next) != rotor); /* * If we haven't tried hard, do so now. */ if (!try_hard) { try_hard = B_TRUE; goto top; } bzero(&dva[d], sizeof (dva_t)); metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); } void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, boolean_t checkpoint) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; ASSERT(vdev_is_concrete(vd)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); metaslab_check_free_impl(vd, offset, asize); mutex_enter(&msp->ms_lock); if (range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing)) { vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); } if (checkpoint) { ASSERT(spa_has_checkpoint(spa)); range_tree_add(msp->ms_checkpointing, offset, asize); } else { range_tree_add(msp->ms_freeing, offset, asize); } mutex_exit(&msp->ms_lock); } /* ARGSUSED */ void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { boolean_t *checkpoint = arg; ASSERT3P(checkpoint, !=, NULL); if (vd->vdev_ops->vdev_op_remap != NULL) vdev_indirect_mark_obsolete(vd, offset, size); else metaslab_free_impl(vd, offset, size, *checkpoint); } static void metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, boolean_t checkpoint) { spa_t *spa = vd->vdev_spa; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) return; if (spa->spa_vdev_removal != NULL && spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && vdev_is_concrete(vd)) { /* * Note: we check if the vdev is concrete because when * we complete the removal, we first change the vdev to be * an indirect vdev (in open context), and then (in syncing * context) clear spa_vdev_removal. */ free_from_removing_vdev(vd, offset, size); } else if (vd->vdev_ops->vdev_op_remap != NULL) { vdev_indirect_mark_obsolete(vd, offset, size); vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_free_impl_cb, &checkpoint); } else { metaslab_free_concrete(vd, offset, size, checkpoint); } } typedef struct remap_blkptr_cb_arg { blkptr_t *rbca_bp; spa_remap_cb_t rbca_cb; vdev_t *rbca_remap_vd; uint64_t rbca_remap_offset; void *rbca_cb_arg; } remap_blkptr_cb_arg_t; void remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { remap_blkptr_cb_arg_t *rbca = arg; blkptr_t *bp = rbca->rbca_bp; /* We can not remap split blocks. */ if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) return; ASSERT0(inner_offset); if (rbca->rbca_cb != NULL) { /* * At this point we know that we are not handling split * blocks and we invoke the callback on the previous * vdev which must be indirect. */ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); /* set up remap_blkptr_cb_arg for the next call */ rbca->rbca_remap_vd = vd; rbca->rbca_remap_offset = offset; } /* * The phys birth time is that of dva[0]. This ensures that we know * when each dva was written, so that resilver can determine which * blocks need to be scrubbed (i.e. those written during the time * the vdev was offline). It also ensures that the key used in * the ARC hash table is unique (i.e. dva[0] + phys_birth). If * we didn't change the phys_birth, a lookup in the ARC for a * remapped BP could find the data that was previously stored at * this vdev + offset. */ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(&bp->blk_dva[0])); vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); } /* * If the block pointer contains any indirect DVAs, modify them to refer to * concrete DVAs. Note that this will sometimes not be possible, leaving * the indirect DVA in place. This happens if the indirect DVA spans multiple * segments in the mapping (i.e. it is a "split block"). * * If the BP was remapped, calls the callback on the original dva (note the * callback can be called multiple times if the original indirect DVA refers * to another indirect DVA, etc). * * Returns TRUE if the BP was remapped. */ boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) { remap_blkptr_cb_arg_t rbca; if (!zfs_remap_blkptr_enable) return (B_FALSE); if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) return (B_FALSE); /* * Dedup BP's can not be remapped, because ddt_phys_select() depends * on DVA[0] being the same in the BP as in the DDT (dedup table). */ if (BP_GET_DEDUP(bp)) return (B_FALSE); /* * Gang blocks can not be remapped, because * zio_checksum_gang_verifier() depends on the DVA[0] that's in * the BP used to read the gang block header (GBH) being the same * as the DVA[0] that we allocated for the GBH. */ if (BP_IS_GANG(bp)) return (B_FALSE); /* * Embedded BP's have no DVA to remap. */ if (BP_GET_NDVAS(bp) < 1) return (B_FALSE); /* * Note: we only remap dva[0]. If we remapped other dvas, we * would no longer know what their phys birth txg is. */ dva_t *dva = &bp->blk_dva[0]; uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd->vdev_ops->vdev_op_remap == NULL) return (B_FALSE); rbca.rbca_bp = bp; rbca.rbca_cb = callback; rbca.rbca_remap_vd = vd; rbca.rbca_remap_offset = offset; rbca.rbca_cb_arg = arg; /* * remap_blkptr_cb() will be called in order for each level of * indirection, until a concrete vdev is reached or a split block is * encountered. old_vd and old_offset are updated within the callback * as we go from the one indirect vdev to the next one (either concrete * or indirect again) in that order. */ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); /* Check if the DVA wasn't remapped because it is a split block */ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) return (B_FALSE); return (B_TRUE); } /* * Undo the allocation of a DVA which happened in the given transaction group. */ void metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { metaslab_t *msp; vdev_t *vd; uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (txg > spa_freeze_txg(spa)) return; if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) || (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu", (u_longlong_t)vdev, (u_longlong_t)offset, (u_longlong_t)size); return; } ASSERT(!vd->vdev_removing); ASSERT(vdev_is_concrete(vd)); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); if (DVA_GET_GANG(dva)) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } /* * Free the block represented by the given DVA. */ void metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, vdev); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (DVA_GET_GANG(dva)) { size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); } metaslab_free_impl(vd, offset, size, checkpoint); } /* * Reserve some allocation slots. The reservation system must be called * before we call into the allocator. If there aren't any available slots * then the I/O will be throttled until an I/O completes and its slots are * freed up. The function returns true if it was successful in placing * the reservation. */ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio, int flags) { uint64_t available_slots = 0; boolean_t slot_reserved = B_FALSE; uint64_t max = mc->mc_alloc_max_slots[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); uint64_t reserved_slots = zfs_refcount_count(&mc->mc_alloc_slots[allocator]); if (reserved_slots < max) available_slots = max - reserved_slots; if (slots <= available_slots || GANG_ALLOCATION(flags) || flags & METASLAB_MUST_RESERVE) { /* * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. */ for (int d = 0; d < slots; d++) { reserved_slots = zfs_refcount_add(&mc->mc_alloc_slots[allocator], zio); } zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; slot_reserved = B_TRUE; } mutex_exit(&mc->mc_lock); return (slot_reserved); } void metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio) { ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); for (int d = 0; d < slots; d++) { (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], zio); } mutex_exit(&mc->mc_lock); } static int metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; int error = 0; if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) return (SET_ERROR(ENXIO)); ASSERT3P(vd->vdev_ms, !=, NULL); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) { error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); if (error == EBUSY) { ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); error = 0; } } if (error == 0 && !range_tree_contains(msp->ms_allocatable, offset, size)) error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } VERIFY(!msp->ms_condensing); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, msp->ms_size); range_tree_remove(msp->ms_allocatable, offset, size); range_tree_clear(msp->ms_trim, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], offset, size); } mutex_exit(&msp->ms_lock); return (0); } typedef struct metaslab_claim_cb_arg_t { uint64_t mcca_txg; int mcca_error; } metaslab_claim_cb_arg_t; /* ARGSUSED */ static void metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { metaslab_claim_cb_arg_t *mcca_arg = arg; if (mcca_arg->mcca_error == 0) { mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, size, mcca_arg->mcca_txg); } } int metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { if (vd->vdev_ops->vdev_op_remap != NULL) { metaslab_claim_cb_arg_t arg; /* * Only zdb(1M) can claim on indirect vdevs. This is used * to detect leaks of mapped space (that are not accounted * for in the obsolete counts, spacemap, or bpobj). */ ASSERT(!spa_writeable(vd->vdev_spa)); arg.mcca_error = 0; arg.mcca_txg = txg; vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_claim_impl_cb, &arg); if (arg.mcca_error == 0) { arg.mcca_error = metaslab_claim_concrete(vd, offset, size, txg); } return (arg.mcca_error); } else { return (metaslab_claim_concrete(vd, offset, size, txg)); } } /* * Intent log support: upon opening the pool after a crash, notify the SPA * of blocks that the intent log has allocated for immediate write, but * which are still considered free by the SPA because the last transaction * group didn't commit yet. */ static int metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { return (SET_ERROR(ENXIO)); } ASSERT(DVA_IS_VALID(dva)); if (DVA_GET_GANG(dva)) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); return (metaslab_claim_impl(vd, offset, size, txg)); } int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, zio_t *zio, int allocator) { dva_t *dva = bp->blk_dva; dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; int error = 0; ASSERT(bp->blk_birth == 0); ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); if (mc->mc_rotor == NULL) { /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); return (SET_ERROR(ENOSPC)); } ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, !=, NULL); for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator, B_FALSE); bzero(&dva[d], sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); } else { /* * Update the metaslab group's queue depth * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); spa_config_exit(spa, SCL_ALLOC, FTAG); BP_SET_BIRTH(bp, txg, 0); return (0); } void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that * the blocks that we free that are part of the checkpoint won't be * reused until the checkpoint is discarded or we revert to it. * * The checkpoint flag is passed down the metaslab_free code path * and is set whenever we want to add a block to the checkpoint's * accounting. That is, we "checkpoint" blocks that existed at the * time the checkpoint was created and are therefore referenced by * the checkpointed uberblock. * * Note that, we don't checkpoint any blocks if the current * syncing txg <= spa_checkpoint_txg. We want these frees to sync * normally as they will be referenced by the checkpointed uberblock. */ boolean_t checkpoint = B_FALSE; if (bp->blk_birth <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint * there is no way it was created in the current txg. */ ASSERT(!now); ASSERT3U(spa_syncing_txg(spa), ==, txg); checkpoint = B_TRUE; } spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { if (now) { metaslab_unalloc_dva(spa, &dva[d], txg); } else { ASSERT3U(txg, ==, spa_syncing_txg(spa)); metaslab_free_dva(spa, &dva[d], checkpoint); } } spa_config_exit(spa, SCL_FREE, FTAG); } int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); int error = 0; ASSERT(!BP_IS_HOLE(bp)); if (txg != 0) { /* * First do a dry run to make sure all DVAs are claimable, * so we don't have to unwind from partial failures below. */ if ((error = metaslab_claim(spa, bp, 0)) != 0) return (error); } spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { error = metaslab_claim_dva(spa, &dva[d], txg); if (error != 0) break; } spa_config_exit(spa, SCL_ALLOC, FTAG); ASSERT(error == 0 || txg == 0); return (error); } void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); uint64_t psize = BP_GET_PSIZE(bp); int d; vdev_t *vd; ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(psize > 0); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (d = 0; d < ndvas; d++) { if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) continue; atomic_add_64(&vd->vdev_pending_fastwrite, psize); } spa_config_exit(spa, SCL_VDEV, FTAG); } void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); uint64_t psize = BP_GET_PSIZE(bp); int d; vdev_t *vd; ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(psize > 0); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (d = 0; d < ndvas; d++) { if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) continue; ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); atomic_sub_64(&vd->vdev_pending_fastwrite, psize); } spa_config_exit(spa, SCL_VDEV, FTAG); } /* ARGSUSED */ static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { if (vd->vdev_ops == &vdev_indirect_ops) return; metaslab_check_free_impl(vd, offset, size); } static void metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) { metaslab_t *msp; ASSERTV(spa_t *spa = vd->vdev_spa); if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; if (vd->vdev_ops->vdev_op_remap != NULL) { vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_check_free_impl_cb, NULL); return; } ASSERT(vdev_is_concrete(vd)); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if (msp->ms_loaded) { range_tree_verify_not_present(msp->ms_allocatable, offset, size); } /* * Check all segments that currently exist in the freeing pipeline. * * It would intuitively make sense to also check the current allocating * tree since metaslab_unalloc_dva() exists for extents that are * allocated and freed in the same sync pass withing the same txg. * Unfortunately there are places (e.g. the ZIL) where we allocate a * segment but then we free part of it within the same txg * [see zil_sync()]. Thus, we don't call range_tree_verify() in the * current allocating tree. */ range_tree_verify_not_present(msp->ms_freeing, offset, size); range_tree_verify_not_present(msp->ms_checkpointing, offset, size); range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) range_tree_verify_not_present(msp->ms_defer[j], offset, size); range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } void metaslab_check_free(spa_t *spa, const blkptr_t *bp) { if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); if (DVA_GET_GANG(&bp->blk_dva[i])) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); ASSERT3P(vd, !=, NULL); metaslab_check_free_impl(vd, offset, size); } spa_config_exit(spa, SCL_VDEV, FTAG); } static void metaslab_group_disable_wait(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); while (mg->mg_disabled_updating) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } } static void metaslab_group_disabled_increment(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); ASSERT(mg->mg_disabled_updating); while (mg->mg_ms_disabled >= max_disabled_ms) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } mg->mg_ms_disabled++; ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); } /* * Mark the metaslab as disabled to prevent any allocations on this metaslab. * We must also track how many metaslabs are currently disabled within a * metaslab group and limit them to prevent allocation failures from * occurring because all metaslabs are disabled. */ void metaslab_disable(metaslab_t *msp) { ASSERT(!MUTEX_HELD(&msp->ms_lock)); metaslab_group_t *mg = msp->ms_group; mutex_enter(&mg->mg_ms_disabled_lock); /* * To keep an accurate count of how many threads have disabled * a specific metaslab group, we only allow one thread to mark * the metaslab group at a time. This ensures that the value of * ms_disabled will be accurate when we decide to mark a metaslab * group as disabled. To do this we force all other threads * to wait till the metaslab's mg_disabled_updating flag is no * longer set. */ metaslab_group_disable_wait(mg); mg->mg_disabled_updating = B_TRUE; if (msp->ms_disabled == 0) { metaslab_group_disabled_increment(mg); } mutex_enter(&msp->ms_lock); msp->ms_disabled++; mutex_exit(&msp->ms_lock); mg->mg_disabled_updating = B_FALSE; cv_broadcast(&mg->mg_ms_disabled_cv); mutex_exit(&mg->mg_ms_disabled_lock); } void metaslab_enable(metaslab_t *msp, boolean_t sync) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; /* * Wait for the outstanding IO to be synced to prevent newly * allocated blocks from being overwritten. This used by * initialize and TRIM which are modifying unallocated space. */ if (sync) txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&mg->mg_ms_disabled_lock); mutex_enter(&msp->ms_lock); if (--msp->ms_disabled == 0) { mg->mg_ms_disabled--; cv_broadcast(&mg->mg_ms_disabled_cv); } mutex_exit(&msp->ms_lock); mutex_exit(&mg->mg_ms_disabled_lock); } static void metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) { vdev_t *vd = ms->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); metaslab_unflushed_phys_t entry = { .msp_unflushed_txg = metaslab_unflushed_txg(ms), }; uint64_t entry_size = sizeof (entry); uint64_t entry_offset = ms->ms_id * entry_size; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) { object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object, tx)); } else { VERIFY0(err); } dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, &entry, tx); } void metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) { spa_t *spa = ms->ms_group->mg_vd->vdev_spa; if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; ms->ms_unflushed_txg = txg; metaslab_update_ondisk_flush_data(ms, tx); } uint64_t metaslab_unflushed_txg(metaslab_t *ms) { return (ms->ms_unflushed_txg); } #if defined(_KERNEL) /* BEGIN CSTYLED */ module_param(metaslab_aliquot, ulong, 0644); MODULE_PARM_DESC(metaslab_aliquot, "allocation granularity (a.k.a. stripe size)"); module_param(metaslab_debug_load, int, 0644); MODULE_PARM_DESC(metaslab_debug_load, "load all metaslabs when pool is first opened"); module_param(metaslab_debug_unload, int, 0644); MODULE_PARM_DESC(metaslab_debug_unload, "prevent metaslabs from being unloaded"); module_param(metaslab_preload_enabled, int, 0644); MODULE_PARM_DESC(metaslab_preload_enabled, "preload potential metaslabs during reassessment"); module_param(zfs_mg_noalloc_threshold, int, 0644); MODULE_PARM_DESC(zfs_mg_noalloc_threshold, "percentage of free space for metaslab group to allow allocation"); module_param(zfs_mg_fragmentation_threshold, int, 0644); MODULE_PARM_DESC(zfs_mg_fragmentation_threshold, "fragmentation for metaslab group to allow allocation"); module_param(zfs_metaslab_fragmentation_threshold, int, 0644); MODULE_PARM_DESC(zfs_metaslab_fragmentation_threshold, "fragmentation for metaslab to allow allocation"); module_param(metaslab_fragmentation_factor_enabled, int, 0644); MODULE_PARM_DESC(metaslab_fragmentation_factor_enabled, "use the fragmentation metric to prefer less fragmented metaslabs"); module_param(metaslab_lba_weighting_enabled, int, 0644); MODULE_PARM_DESC(metaslab_lba_weighting_enabled, "prefer metaslabs with lower LBAs"); module_param(metaslab_bias_enabled, int, 0644); MODULE_PARM_DESC(metaslab_bias_enabled, "enable metaslab group biasing"); module_param(zfs_metaslab_segment_weight_enabled, int, 0644); MODULE_PARM_DESC(zfs_metaslab_segment_weight_enabled, "enable segment-based metaslab selection"); module_param(zfs_metaslab_switch_threshold, int, 0644); MODULE_PARM_DESC(zfs_metaslab_switch_threshold, "segment-based metaslab selection maximum buckets before switching"); module_param(metaslab_force_ganging, ulong, 0644); MODULE_PARM_DESC(metaslab_force_ganging, "blocks larger than this size are forced to be gang blocks"); module_param(metaslab_df_max_search, int, 0644); MODULE_PARM_DESC(metaslab_df_max_search, "max distance (bytes) to search forward before using size tree"); module_param(metaslab_df_use_largest_segment, int, 0644); MODULE_PARM_DESC(metaslab_df_use_largest_segment, "when looking in size tree, use largest segment instead of exact fit"); + +module_param(zfs_metaslab_max_size_cache_sec, ulong, 0644); +MODULE_PARM_DESC(zfs_metaslab_max_size_cache_sec, + "how long to trust the cached max chunk size of a metaslab"); /* END CSTYLED */ #endif diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 5919236d9a76..0e12972148bf 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -1,745 +1,775 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013, 2019 by Delphix. All rights reserved. */ #include #include #include #include #include #include /* * Range trees are tree-based data structures that can be used to * track free space or generally any space allocation information. * A range tree keeps track of individual segments and automatically * provides facilities such as adjacent extent merging and extent * splitting in response to range add/remove requests. * * A range tree starts out completely empty, with no segments in it. * Adding an allocation via range_tree_add to the range tree can either: * 1) create a new extent * 2) extend an adjacent extent * 3) merge two adjacent extents * Conversely, removing an allocation via range_tree_remove can: * 1) completely remove an extent * 2) shorten an extent (if the allocation was near one of its ends) * 3) split an extent into two extents, in effect punching a hole * * A range tree is also capable of 'bridging' gaps when adding * allocations. This is useful for cases when close proximity of * allocations is an important detail that needs to be represented * in the range tree. See range_tree_set_gap(). The default behavior * is not to bridge gaps (i.e. the maximum allowed gap size is 0). * * In order to traverse a range tree, use either the range_tree_walk() * or range_tree_vacate() functions. * * To obtain more accurate information on individual segment * operations that the range tree performs "under the hood", you can * specify a set of callbacks by passing a range_tree_ops_t structure * to the range_tree_create function. Any callbacks that are non-NULL * are then called at the appropriate times. * * The range tree code also supports a special variant of range trees * that can bridge small gaps between segments. This kind of tree is used * by the dsl scanning code to group I/Os into mostly sequential chunks to * optimize disk performance. The code here attempts to do this with as * little memory and computational overhead as possible. One limitation of * this implementation is that segments of range trees with gaps can only * support removing complete segments. */ kmem_cache_t *range_seg_cache; /* Generic ops for managing an AVL tree alongside a range tree */ struct range_tree_ops rt_avl_ops = { .rtop_create = rt_avl_create, .rtop_destroy = rt_avl_destroy, .rtop_add = rt_avl_add, .rtop_remove = rt_avl_remove, .rtop_vacate = rt_avl_vacate, }; void range_tree_init(void) { ASSERT(range_seg_cache == NULL); range_seg_cache = kmem_cache_create("range_seg_cache", sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void range_tree_fini(void) { kmem_cache_destroy(range_seg_cache); range_seg_cache = NULL; } void range_tree_stat_verify(range_tree_t *rt) { range_seg_t *rs; uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; int i; for (rs = avl_first(&rt->rt_root); rs != NULL; rs = AVL_NEXT(&rt->rt_root, rs)) { uint64_t size = rs->rs_end - rs->rs_start; int idx = highbit64(size) - 1; hist[idx]++; ASSERT3U(hist[idx], !=, 0); } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { if (hist[i] != rt->rt_histogram[i]) { zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu", i, hist, hist[i], rt->rt_histogram[i]); } VERIFY3U(hist[i], ==, rt->rt_histogram[i]); } } static void range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) { uint64_t size = rs->rs_end - rs->rs_start; int idx = highbit64(size) - 1; ASSERT(size != 0); ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); rt->rt_histogram[idx]++; ASSERT3U(rt->rt_histogram[idx], !=, 0); } static void range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) { uint64_t size = rs->rs_end - rs->rs_start; int idx = highbit64(size) - 1; ASSERT(size != 0); ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); ASSERT3U(rt->rt_histogram[idx], !=, 0); rt->rt_histogram[idx]--; } /* * NOTE: caller is responsible for all locking. */ static int range_tree_seg_compare(const void *x1, const void *x2) { const range_seg_t *r1 = (const range_seg_t *)x1; const range_seg_t *r2 = (const range_seg_t *)x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } range_tree_t * range_tree_create_impl(range_tree_ops_t *ops, void *arg, int (*avl_compare) (const void *, const void *), uint64_t gap) { range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); avl_create(&rt->rt_root, range_tree_seg_compare, sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); rt->rt_ops = ops; rt->rt_gap = gap; rt->rt_arg = arg; rt->rt_avl_compare = avl_compare; if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); return (rt); } range_tree_t * range_tree_create(range_tree_ops_t *ops, void *arg) { return (range_tree_create_impl(ops, arg, NULL, 0)); } void range_tree_destroy(range_tree_t *rt) { VERIFY0(rt->rt_space); if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); avl_destroy(&rt->rt_root); kmem_free(rt, sizeof (*rt)); } void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) { ASSERT3U(rs->rs_fill + delta, !=, 0); ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); rs->rs_fill += delta; if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } static void range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) { range_tree_t *rt = arg; avl_index_t where; range_seg_t rsearch, *rs_before, *rs_after, *rs; uint64_t end = start + size, gap = rt->rt_gap; uint64_t bridge_size = 0; boolean_t merge_before, merge_after; ASSERT3U(size, !=, 0); ASSERT3U(fill, <=, size); rsearch.rs_start = start; rsearch.rs_end = end; rs = avl_find(&rt->rt_root, &rsearch, &where); if (gap == 0 && rs != NULL && rs->rs_start <= start && rs->rs_end >= end) { zfs_panic_recover("zfs: allocating allocated segment" "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n", (longlong_t)start, (longlong_t)size, (longlong_t)rs->rs_start, (longlong_t)rs->rs_end - rs->rs_start); return; } /* * If this is a gap-supporting range tree, it is possible that we * are inserting into an existing segment. In this case simply * bump the fill count and call the remove / add callbacks. If the * new range will extend an existing segment, we remove the * existing one, apply the new extent to it and re-insert it using * the normal code paths. */ if (rs != NULL) { ASSERT3U(gap, !=, 0); if (rs->rs_start <= start && rs->rs_end >= end) { range_tree_adjust_fill(rt, rs, fill); return; } avl_remove(&rt->rt_root, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); range_tree_stat_decr(rt, rs); rt->rt_space -= rs->rs_end - rs->rs_start; fill += rs->rs_fill; start = MIN(start, rs->rs_start); end = MAX(end, rs->rs_end); size = end - start; range_tree_add_impl(rt, start, size, fill); kmem_cache_free(range_seg_cache, rs); return; } ASSERT3P(rs, ==, NULL); /* * Determine whether or not we will have to merge with our neighbors. * If gap != 0, we might need to merge with our neighbors even if we * aren't directly touching. */ rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap); merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap); if (merge_before && gap != 0) bridge_size += start - rs_before->rs_end; if (merge_after && gap != 0) bridge_size += rs_after->rs_start - end; if (merge_before && merge_after) { avl_remove(&rt->rt_root, rs_before); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); } range_tree_stat_decr(rt, rs_before); range_tree_stat_decr(rt, rs_after); rs_after->rs_fill += rs_before->rs_fill + fill; rs_after->rs_start = rs_before->rs_start; kmem_cache_free(range_seg_cache, rs_before); rs = rs_after; } else if (merge_before) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); range_tree_stat_decr(rt, rs_before); rs_before->rs_fill += fill; rs_before->rs_end = end; rs = rs_before; } else if (merge_after) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); range_tree_stat_decr(rt, rs_after); rs_after->rs_fill += fill; rs_after->rs_start = start; rs = rs_after; } else { rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); rs->rs_fill = fill; rs->rs_start = start; rs->rs_end = end; avl_insert(&rt->rt_root, rs, where); } if (gap != 0) ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start); else ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); range_tree_stat_incr(rt, rs); rt->rt_space += size + bridge_size; } void range_tree_add(void *arg, uint64_t start, uint64_t size) { range_tree_add_impl(arg, start, size, size); } static void range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, boolean_t do_fill) { avl_index_t where; range_seg_t rsearch, *rs, *newseg; uint64_t end = start + size; boolean_t left_over, right_over; VERIFY3U(size, !=, 0); VERIFY3U(size, <=, rt->rt_space); rsearch.rs_start = start; rsearch.rs_end = end; rs = avl_find(&rt->rt_root, &rsearch, &where); /* Make sure we completely overlap with someone */ if (rs == NULL) { zfs_panic_recover("zfs: freeing free segment " "(offset=%llu size=%llu)", (longlong_t)start, (longlong_t)size); return; } /* * Range trees with gap support must only remove complete segments * from the tree. This allows us to maintain accurate fill accounting * and to ensure that bridged sections are not leaked. If we need to * remove less than the full segment, we can only adjust the fill count. */ if (rt->rt_gap != 0) { if (do_fill) { if (rs->rs_fill == size) { start = rs->rs_start; end = rs->rs_end; size = end - start; } else { range_tree_adjust_fill(rt, rs, -size); return; } } else if (rs->rs_start != start || rs->rs_end != end) { zfs_panic_recover("zfs: freeing partial segment of " "gap tree (offset=%llu size=%llu) of " "(offset=%llu size=%llu)", (longlong_t)start, (longlong_t)size, (longlong_t)rs->rs_start, (longlong_t)rs->rs_end - rs->rs_start); return; } } VERIFY3U(rs->rs_start, <=, start); VERIFY3U(rs->rs_end, >=, end); left_over = (rs->rs_start != start); right_over = (rs->rs_end != end); range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); newseg->rs_start = end; newseg->rs_end = rs->rs_end; newseg->rs_fill = newseg->rs_end - newseg->rs_start; range_tree_stat_incr(rt, newseg); rs->rs_end = start; avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); } else if (left_over) { rs->rs_end = start; } else if (right_over) { rs->rs_start = end; } else { avl_remove(&rt->rt_root, rs); kmem_cache_free(range_seg_cache, rs); rs = NULL; } if (rs != NULL) { /* * The fill of the leftover segment will always be equal to * the size, since we do not support removing partial segments * of range trees with gaps. */ rs->rs_fill = rs->rs_end - rs->rs_start; range_tree_stat_incr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } rt->rt_space -= size; } void range_tree_remove(void *arg, uint64_t start, uint64_t size) { range_tree_remove_impl(arg, start, size, B_FALSE); } void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size) { range_tree_remove_impl(rt, start, size, B_TRUE); } void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize) { int64_t delta = newsize - (rs->rs_end - rs->rs_start); range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); rs->rs_start = newstart; rs->rs_end = newstart + newsize; range_tree_stat_incr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); rt->rt_space += delta; } static range_seg_t * range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t rsearch; uint64_t end = start + size; VERIFY(size != 0); rsearch.rs_start = start; rsearch.rs_end = end; return (avl_find(&rt->rt_root, &rsearch, NULL)); } range_seg_t * range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t *rs = range_tree_find_impl(rt, start, size); if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size) return (rs); return (NULL); } void range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size) { range_seg_t *rs = range_tree_find(rt, off, size); if (rs != NULL) panic("segment already in tree; rs=%p", (void *)rs); } boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) { return (range_tree_find(rt, start, size) != NULL); } +/* + * Returns the first subset of the given range which overlaps with the range + * tree. Returns true if there is a segment in the range, and false if there + * isn't. + */ +boolean_t +range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize) +{ + range_seg_t rsearch; + rsearch.rs_start = start; + rsearch.rs_end = start + 1; + + avl_index_t where; + range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where); + if (rs != NULL) { + *ostart = start; + *osize = MIN(size, rs->rs_end - start); + return (B_TRUE); + } + + rs = avl_nearest(&rt->rt_root, where, AVL_AFTER); + if (rs == NULL || rs->rs_start > start + size) + return (B_FALSE); + + *ostart = rs->rs_start; + *osize = MIN(start + size, rs->rs_end) - rs->rs_start; + return (B_TRUE); +} + /* * Ensure that this range is not in the tree, regardless of whether * it is currently in the tree. */ void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t *rs; if (size == 0) return; while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { uint64_t free_start = MAX(rs->rs_start, start); uint64_t free_end = MIN(rs->rs_end, start + size); range_tree_remove(rt, free_start, free_end - free_start); } } void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) { range_tree_t *rt; ASSERT0(range_tree_space(*rtdst)); ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); rt = *rtsrc; *rtsrc = *rtdst; *rtdst = rt; } void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) { range_seg_t *rs; void *cookie = NULL; if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { if (func != NULL) func(arg, rs->rs_start, rs->rs_end - rs->rs_start); kmem_cache_free(range_seg_cache, rs); } bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); rt->rt_space = 0; } void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { for (range_seg_t *rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) { func(arg, rs->rs_start, rs->rs_end - rs->rs_start); } } range_seg_t * range_tree_first(range_tree_t *rt) { return (avl_first(&rt->rt_root)); } uint64_t range_tree_space(range_tree_t *rt) { return (rt->rt_space); } uint64_t range_tree_numsegs(range_tree_t *rt) { return ((rt == NULL) ? 0 : avl_numnodes(&rt->rt_root)); } boolean_t range_tree_is_empty(range_tree_t *rt) { ASSERT(rt != NULL); return (range_tree_space(rt) == 0); } /* Generic range tree functions for maintaining segments in an AVL tree. */ void rt_avl_create(range_tree_t *rt, void *arg) { avl_tree_t *tree = arg; avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); } void rt_avl_destroy(range_tree_t *rt, void *arg) { avl_tree_t *tree = arg; ASSERT0(avl_numnodes(tree)); avl_destroy(tree); } void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg) { avl_tree_t *tree = arg; avl_add(tree, rs); } void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { avl_tree_t *tree = arg; avl_remove(tree, rs); } void rt_avl_vacate(range_tree_t *rt, void *arg) { /* * Normally one would walk the tree freeing nodes along the way. * Since the nodes are shared with the range trees we can avoid * walking all nodes and just reinitialize the avl tree. The nodes * will be freed by the range tree, so we don't want to free them here. */ rt_avl_create(rt, arg); } uint64_t range_tree_min(range_tree_t *rt) { range_seg_t *rs = avl_first(&rt->rt_root); return (rs != NULL ? rs->rs_start : 0); } uint64_t range_tree_max(range_tree_t *rt) { range_seg_t *rs = avl_last(&rt->rt_root); return (rs != NULL ? rs->rs_end : 0); } uint64_t range_tree_span(range_tree_t *rt) { return (range_tree_max(rt) - range_tree_min(rt)); } /* * Remove any overlapping ranges between the given segment [start, end) * from removefrom. Add non-overlapping leftovers to addto. */ void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, range_tree_t *removefrom, range_tree_t *addto) { avl_index_t where; range_seg_t starting_rs = { .rs_start = start, .rs_end = start + 1 }; range_seg_t *curr = avl_find(&removefrom->rt_root, &starting_rs, &where); if (curr == NULL) curr = avl_nearest(&removefrom->rt_root, where, AVL_AFTER); range_seg_t *next; for (; curr != NULL; curr = next) { next = AVL_NEXT(&removefrom->rt_root, curr); if (start == end) return; VERIFY3U(start, <, end); /* there is no overlap */ if (end <= curr->rs_start) { range_tree_add(addto, start, end - start); return; } uint64_t overlap_start = MAX(curr->rs_start, start); uint64_t overlap_end = MIN(curr->rs_end, end); uint64_t overlap_size = overlap_end - overlap_start; ASSERT3S(overlap_size, >, 0); range_tree_remove(removefrom, overlap_start, overlap_size); if (start < overlap_start) range_tree_add(addto, start, overlap_start - start); start = overlap_end; } VERIFY3P(curr, ==, NULL); if (start != end) { VERIFY3U(start, <, end); range_tree_add(addto, start, end - start); } else { VERIFY3U(start, ==, end); } } /* * For each entry in rt, if it exists in removefrom, remove it * from removefrom. Otherwise, add it to addto. */ void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, range_tree_t *addto) { for (range_seg_t *rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) { range_tree_remove_xor_add_segment(rs->rs_start, rs->rs_end, removefrom, addto); } }