Index: vendor/illumos/dist/cmd/zdb/zdb.c =================================================================== --- vendor/illumos/dist/cmd/zdb/zdb.c (revision 354382) +++ vendor/illumos/dist/cmd/zdb/zdb.c (revision 354383) @@ -1,5685 +1,5700 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. * Copyright 2017 RackTop Systems. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef verify #include #include "zdb.h" #define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ zio_compress_table[(idx)].ci_name : "UNKNOWN") #define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ zio_checksum_table[(idx)].ci_name : "UNKNOWN") #define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \ dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \ dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN") #define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ DMU_OT_ZAP_OTHER : \ (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \ DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) #ifndef lint extern int reference_tracking_enable; extern boolean_t zfs_recover; extern uint64_t zfs_arc_max, zfs_arc_meta_limit; extern int zfs_vdev_async_read_max_active; extern int aok; extern boolean_t spa_load_verify_dryrun; #else int reference_tracking_enable; boolean_t zfs_recover; uint64_t zfs_arc_max, zfs_arc_meta_limit; int zfs_vdev_async_read_max_active; int aok; boolean_t spa_load_verify_dryrun; #endif static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); uint64_t *zopt_object = NULL; static unsigned zopt_objects = 0; libzfs_handle_t *g_zfs; uint64_t max_inflight = 1000; static int leaked_objects = 0; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *); static void mos_obj_refd(uint64_t); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init() { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void usage(void) { (void) fprintf(stderr, "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p ...]] " "[-I ]\n" "\t\t[-o =]... [-t ] [-U ] [-x ]\n" "\t\t[ [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] " "[ ...]\n" "\t%s -C [-A] [-U ]\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" "\t%s -O \n" "\t%s -R [-A] [-e [-V] [-p ...]] [-U ]\n" "\t\t ::[:]\n" "\t%s -E [-A] word0:word1:...:word15\n" "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); (void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n"); (void) fprintf(stderr, " If object numbers are specified, only " "those objects are dumped\n\n"); (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b block statistics\n"); (void) fprintf(stderr, " -c checksum all metadata (twice for " "all data) blocks\n"); (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); (void) fprintf(stderr, " -d dataset(s)\n"); (void) fprintf(stderr, " -D dedup statistics\n"); (void) fprintf(stderr, " -E decode and display block from an " "embedded block pointer\n"); (void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -i intent logs\n"); (void) fprintf(stderr, " -l read label contents\n"); (void) fprintf(stderr, " -k examine the checkpointed state " "of the pool\n"); (void) fprintf(stderr, " -L disable leak tracking (do not " "load spacemaps)\n"); (void) fprintf(stderr, " -m metaslabs\n"); (void) fprintf(stderr, " -M metaslab groups\n"); (void) fprintf(stderr, " -O perform object lookups by path\n"); (void) fprintf(stderr, " -R read and display block from a " "device\n"); (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v verbose (applies to all " "others)\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A ignore assertions (-A), enable " "panic recovery (-AA) or both (-AAA)\n"); (void) fprintf(stderr, " -e pool is exported/destroyed/" "has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -F attempt automatic rewind within " "safe range of transaction groups\n"); (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " "exiting\n"); (void) fprintf(stderr, " -I -- " "specify the maximum number of " "checksumming I/Os [default is 200]\n"); (void) fprintf(stderr, " -o = set global " "variable to an unsigned 32-bit integer value\n"); (void) fprintf(stderr, " -p -- use one or more with " "-e to specify path to vdev dir\n"); (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -q don't print label contents\n"); (void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n"); (void) fprintf(stderr, " -u uberblock\n"); (void) fprintf(stderr, " -U -- use alternate " "cachefile\n"); (void) fprintf(stderr, " -V do verbatim import\n"); (void) fprintf(stderr, " -x -- " "dump all read blocks into specified directory\n"); (void) fprintf(stderr, " -X attempt extreme rewind (does not " "work with dataset)\n\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); exit(1); } static void dump_debug_buffer() { if (dump_opt['G']) { (void) printf("\n"); zfs_dbgmsg_print("zdb"); } } /* * Called for usage errors that are discovered after a call to spa_open(), * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. */ static void fatal(const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) fprintf(stderr, "%s: ", cmdname); (void) vfprintf(stderr, fmt, ap); va_end(ap); (void) fprintf(stderr, "\n"); dump_debug_buffer(); exit(1); } /* ARGSUSED */ static void dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) { nvlist_t *nv; size_t nvsize = *(uint64_t *)data; char *packed = umem_alloc(nvsize, UMEM_NOFAIL); VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); umem_free(packed, nvsize); dump_nvlist(nv, 8); nvlist_free(nv); } /* ARGSUSED */ static void dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) { spa_history_phys_t *shp = data; if (shp == NULL) return; (void) printf("\t\tpool_create_len = %llu\n", (u_longlong_t)shp->sh_pool_create_len); (void) printf("\t\tphys_max_off = %llu\n", (u_longlong_t)shp->sh_phys_max_off); (void) printf("\t\tbof = %llu\n", (u_longlong_t)shp->sh_bof); (void) printf("\t\teof = %llu\n", (u_longlong_t)shp->sh_eof); (void) printf("\t\trecords_lost = %llu\n", (u_longlong_t)shp->sh_records_lost); } static void zdb_nicenum(uint64_t num, char *buf, size_t buflen) { if (dump_opt['P']) (void) snprintf(buf, buflen, "%llu", (longlong_t)num); else nicenum(num, buf, sizeof (buf)); } static const char histo_stars[] = "****************************************"; static const uint64_t histo_width = sizeof (histo_stars) - 1; static void dump_histogram(const uint64_t *histo, int size, int offset) { int i; int minidx = size - 1; int maxidx = 0; uint64_t max = 0; for (i = 0; i < size; i++) { if (histo[i] > max) max = histo[i]; if (histo[i] > 0 && i > maxidx) maxidx = i; if (histo[i] > 0 && i < minidx) minidx = i; } if (max < histo_width) max = histo_width; for (i = minidx; i <= maxidx; i++) { (void) printf("\t\t\t%3u: %6llu %s\n", i + offset, (u_longlong_t)histo[i], &histo_stars[(max - histo[i]) * histo_width / max]); } } static void dump_zap_stats(objset_t *os, uint64_t object) { int error; zap_stats_t zs; error = zap_get_stats(os, object, &zs); if (error) return; if (zs.zs_ptrtbl_len == 0) { ASSERT(zs.zs_num_blocks == 1); (void) printf("\tmicrozap: %llu bytes, %llu entries\n", (u_longlong_t)zs.zs_blocksize, (u_longlong_t)zs.zs_num_entries); return; } (void) printf("\tFat ZAP stats:\n"); (void) printf("\t\tPointer table:\n"); (void) printf("\t\t\t%llu elements\n", (u_longlong_t)zs.zs_ptrtbl_len); (void) printf("\t\t\tzt_blk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_blk); (void) printf("\t\t\tzt_numblks: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_numblks); (void) printf("\t\t\tzt_shift: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_shift); (void) printf("\t\t\tzt_blks_copied: %llu\n", (u_longlong_t)zs.zs_ptrtbl_blks_copied); (void) printf("\t\t\tzt_nextblk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_nextblk); (void) printf("\t\tZAP entries: %llu\n", (u_longlong_t)zs.zs_num_entries); (void) printf("\t\tLeaf blocks: %llu\n", (u_longlong_t)zs.zs_num_leafs); (void) printf("\t\tTotal blocks: %llu\n", (u_longlong_t)zs.zs_num_blocks); (void) printf("\t\tzap_block_type: 0x%llx\n", (u_longlong_t)zs.zs_block_type); (void) printf("\t\tzap_magic: 0x%llx\n", (u_longlong_t)zs.zs_magic); (void) printf("\t\tzap_salt: 0x%llx\n", (u_longlong_t)zs.zs_salt); (void) printf("\t\tLeafs with 2^n pointers:\n"); dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks with n*5 entries:\n"); dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks n/10 full:\n"); dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tEntries with n chunks:\n"); dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBuckets with n entries:\n"); dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); } /*ARGSUSED*/ static void dump_none(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) { (void) printf("\tUNKNOWN OBJECT TYPE\n"); } /*ARGSUSED*/ static void dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_zap(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; void *prop; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } prop = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); (void) zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, prop); if (attr.za_integer_length == 1) { (void) printf("%s", (char *)prop); } else { for (i = 0; i < attr.za_num_integers; i++) { switch (attr.za_integer_length) { case 2: (void) printf("%u ", ((uint16_t *)prop)[i]); break; case 4: (void) printf("%u ", ((uint32_t *)prop)[i]); break; case 8: (void) printf("%lld ", (u_longlong_t)((int64_t *)prop)[i]); break; } } } (void) printf("\n"); umem_free(prop, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } static void dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) { bpobj_phys_t *bpop = data; char bytes[32], comp[32], uncomp[32]; /* make sure the output won't get truncated */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (bpop == NULL) return; zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); (void) printf("\t\tnum_blkptrs = %llu\n", (u_longlong_t)bpop->bpo_num_blkptrs); (void) printf("\t\tbytes = %s\n", bytes); if (size >= BPOBJ_SIZE_V1) { (void) printf("\t\tcomp = %s\n", comp); (void) printf("\t\tuncomp = %s\n", uncomp); } if (size >= sizeof (*bpop)) { (void) printf("\t\tsubobjs = %llu\n", (u_longlong_t)bpop->bpo_subobjs); (void) printf("\t\tnum_subobjs = %llu\n", (u_longlong_t)bpop->bpo_num_subobjs); } if (dump_opt['d'] < 5) return; for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) { char blkbuf[BP_SPRINTF_LEN]; blkptr_t bp; int err = dmu_read(os, object, i * sizeof (bp), sizeof (bp), &bp, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); break; } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp); (void) printf("\t%s\n", blkbuf); } } /* ARGSUSED */ static void dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) { dmu_object_info_t doi; VERIFY0(dmu_object_info(os, object, &doi)); uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(subobjs, doi.doi_max_offset); return; } int64_t last_nonzero = -1; for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) { if (subobjs[i] != 0) last_nonzero = i; } for (int64_t i = 0; i <= last_nonzero; i++) { (void) printf("\t%llu\n", (longlong_t)subobjs[i]); } kmem_free(subobjs, doi.doi_max_offset); } /*ARGSUSED*/ static void dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) { dump_zap_stats(os, object); /* contents are printed elsewhere, properly decoded */ } /*ARGSUSED*/ static void dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } (void) printf(" %llx : [%d:%d:%d]\n", (u_longlong_t)attr.za_first_integer, (int)ATTR_LENGTH(attr.za_first_integer), (int)ATTR_BSWAP(attr.za_first_integer), (int)ATTR_NUM(attr.za_first_integer)); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; uint16_t *layout_attrs; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = [", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } VERIFY(attr.za_integer_length == 2); layout_attrs = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); VERIFY(zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, layout_attrs) == 0); for (i = 0; i != attr.za_num_integers; i++) (void) printf(" %d ", (int)layout_attrs[i]); (void) printf("]\n"); umem_free(layout_attrs, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; const char *typenames[] = { /* 0 */ "not specified", /* 1 */ "FIFO", /* 2 */ "Character Device", /* 3 */ "3 (invalid)", /* 4 */ "Directory", /* 5 */ "5 (invalid)", /* 6 */ "Block Device", /* 7 */ "7 (invalid)", /* 8 */ "Regular File", /* 9 */ "9 (invalid)", /* 10 */ "Symbolic Link", /* 11 */ "11 (invalid)", /* 12 */ "Socket", /* 13 */ "Door", /* 14 */ "Event Port", /* 15 */ "15 (invalid)", }; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = %lld (type: %s)\n", attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); } zap_cursor_fini(&zc); } static int get_dtl_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_ops->vdev_op_leaf) { space_map_t *sm = vd->vdev_dtl_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) return (1); return (0); } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_dtl_refcount(vd->vdev_child[c]); return (refcount); } static int get_metaslab_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd) { for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { space_map_t *sm = vd->vdev_ms[m]->ms_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) refcount++; } } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_metaslab_refcount(vd->vdev_child[c]); return (refcount); } static int get_obsolete_refcount(vdev_t *vd) { int refcount = 0; uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); if (vd->vdev_top == vd && obsolete_sm_obj != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, obsolete_sm_obj, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { refcount++; } } else { ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); ASSERT3U(obsolete_sm_obj, ==, 0); } for (unsigned c = 0; c < vd->vdev_children; c++) { refcount += get_obsolete_refcount(vd->vdev_child[c]); } return (refcount); } static int get_prev_obsolete_spacemap_refcount(spa_t *spa) { uint64_t prev_obj = spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; if (prev_obj != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { return (1); } } return (0); } static int get_checkpoint_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && zap_contains(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) refcount++; for (uint64_t c = 0; c < vd->vdev_children; c++) refcount += get_checkpoint_refcount(vd->vdev_child[c]); return (refcount); } static int verify_spacemap_refcounts(spa_t *spa) { uint64_t expected_refcount = 0; uint64_t actual_refcount; (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], &expected_refcount); actual_refcount = get_dtl_refcount(spa->spa_root_vdev); actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); actual_refcount += get_prev_obsolete_spacemap_refcount(spa); actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " "actual %lld\n", (longlong_t)expected_refcount, (longlong_t)actual_refcount); return (2); } return (0); } static void dump_spacemap(objset_t *os, space_map_t *sm) { char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", "INVALID", "INVALID", "INVALID", "INVALID" }; if (sm == NULL) return; (void) printf("space map object %llu:\n", - (longlong_t)sm->sm_phys->smp_object); - (void) printf(" smp_objsize = 0x%llx\n", - (longlong_t)sm->sm_phys->smp_objsize); + (longlong_t)sm->sm_object); + (void) printf(" smp_length = 0x%llx\n", + (longlong_t)sm->sm_phys->smp_length); (void) printf(" smp_alloc = 0x%llx\n", (longlong_t)sm->sm_phys->smp_alloc); + if (dump_opt['d'] < 6 && dump_opt['m'] < 4) + return; + /* * Print out the freelist entries in both encoded and decoded form. */ uint8_t mapshift = sm->sm_shift; int64_t alloc = 0; - uint64_t word; + uint64_t word, entry_id = 0; for (uint64_t offset = 0; offset < space_map_length(sm); offset += sizeof (word)) { VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (word), &word, DMU_READ_PREFETCH)); if (sm_entry_is_debug(word)) { - (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n", - (u_longlong_t)(offset / sizeof (word)), + (void) printf("\t [%6llu] %s: txg %llu pass %llu\n", + (u_longlong_t)entry_id, ddata[SM_DEBUG_ACTION_DECODE(word)], (u_longlong_t)SM_DEBUG_TXG_DECODE(word), (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word)); + entry_id++; continue; } uint8_t words; char entry_type; uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; if (sm_entry_is_single_word(word)) { entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM_OFFSET_DECODE(word) << mapshift) + sm->sm_start; entry_run = SM_RUN_DECODE(word) << mapshift; words = 1; } else { /* it is a two-word entry so we read another word */ ASSERT(sm_entry_is_double_word(word)); uint64_t extra_word; offset += sizeof (extra_word); VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (extra_word), &extra_word, DMU_READ_PREFETCH)); ASSERT3U(offset, <=, space_map_length(sm)); entry_run = SM2_RUN_DECODE(word) << mapshift; entry_vdev = SM2_VDEV_DECODE(word); entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM2_OFFSET_DECODE(extra_word) << mapshift) + sm->sm_start; words = 2; } (void) printf("\t [%6llu] %c range:" " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", - (u_longlong_t)(offset / sizeof (word)), + (u_longlong_t)entry_id, entry_type, (u_longlong_t)entry_off, (u_longlong_t)(entry_off + entry_run), (u_longlong_t)entry_run, (u_longlong_t)entry_vdev, words); if (entry_type == 'A') alloc += entry_run; else alloc -= entry_run; + entry_id++; } - if ((uint64_t)alloc != space_map_allocated(sm)) { + if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%lld) INCONSISTENT " "with space map summary (%lld)\n", (longlong_t)space_map_allocated(sm), (longlong_t)alloc); } } static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; range_tree_t *rt = msp->ms_allocatable; avl_tree_t *t = &msp->ms_allocatable_by_size; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", avl_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; space_map_t *sm = msp->ms_sm; char freebuf[32]; zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, sizeof (freebuf)); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, (u_longlong_t)space_map_object(sm), freebuf); if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); range_tree_stat_verify(msp->ms_allocatable); dump_metaslab_stats(msp); metaslab_unload(msp); mutex_exit(&msp->ms_lock); } if (dump_opt['m'] > 1 && sm != NULL && spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { /* * The space map histogram represents free space in chunks * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). */ (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", (u_longlong_t)msp->ms_fragmentation); dump_histogram(sm->sm_phys->smp_histogram, SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } - if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); - - dump_spacemap(spa->spa_meta_objset, msp->ms_sm); - } + ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); } static void print_vdev_metaslab_header(vdev_t *vd) { vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *bias_str; bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : vd->vdev_islog ? "log" : ""; (void) printf("\tvdev %10llu %s\n" "\t%-10s%5llu %-19s %-15s %-12s\n", (u_longlong_t)vd->vdev_id, bias_str, "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %12s\n", "---------------", "-------------------", "---------------", "------------"); } static void dump_metaslab_groups(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; metaslab_class_t *mc = spa_normal_class(spa); uint64_t fragmentation; metaslab_class_histogram_verify(mc); for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || mg->mg_class != mc) continue; metaslab_group_histogram_verify(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" "fragmentation", (u_longlong_t)tvd->vdev_id, (u_longlong_t)tvd->vdev_ms_count); if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { (void) printf("%3s\n", "-"); } else { (void) printf("%3llu%%\n", (u_longlong_t)mg->mg_fragmentation); } dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } (void) printf("\tpool %s\tfragmentation", spa_name(spa)); fragmentation = metaslab_class_fragmentation(mc); if (fragmentation == ZFS_FRAG_INVALID) (void) printf("\t%3s\n", "-"); else (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void print_vdev_indirect(vdev_t *vd) { vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib = vd->vdev_indirect_births; if (vim == NULL) { ASSERT3P(vib, ==, NULL); return; } ASSERT3U(vdev_indirect_mapping_object(vim), ==, vic->vic_mapping_object); ASSERT3U(vdev_indirect_births_object(vib), ==, vic->vic_births_object); (void) printf("indirect births obj %llu:\n", (longlong_t)vic->vic_births_object); (void) printf(" vib_count = %llu\n", (longlong_t)vdev_indirect_births_count(vib)); for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { vdev_indirect_birth_entry_phys_t *cur_vibe = &vib->vib_entries[i]; (void) printf("\toffset %llx -> txg %llu\n", (longlong_t)cur_vibe->vibe_offset, (longlong_t)cur_vibe->vibe_phys_birth_txg); } (void) printf("\n"); (void) printf("indirect mapping obj %llu:\n", (longlong_t)vic->vic_mapping_object); (void) printf(" vim_max_offset = 0x%llx\n", (longlong_t)vdev_indirect_mapping_max_offset(vim)); (void) printf(" vim_bytes_mapped = 0x%llx\n", (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); (void) printf(" vim_count = %llu\n", (longlong_t)vdev_indirect_mapping_num_entries(vim)); if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) return; uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; (void) printf("\t<%llx:%llx:%llx> -> " "<%llx:%llx:%llx> (%x obsolete)\n", (longlong_t)vd->vdev_id, (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), counts[i]); } (void) printf("\n"); uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); if (obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; (void) printf("obsolete space map object %llu:\n", (u_longlong_t)obsolete_sm_object); ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, obsolete_sm_object); dump_spacemap(mos, vd->vdev_obsolete_sm); (void) printf("\n"); } } static void dump_metaslabs(spa_t *spa) { vdev_t *vd, *rvd = spa->spa_root_vdev; uint64_t m, c = 0, children = rvd->vdev_children; (void) printf("\nMetaslabs:\n"); if (!dump_opt['d'] && zopt_objects > 0) { c = zopt_object[0]; if (c >= children) (void) fatal("bad vdev id: %llu", (u_longlong_t)c); if (zopt_objects > 1) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); for (m = 1; m < zopt_objects; m++) { if (zopt_object[m] < vd->vdev_ms_count) dump_metaslab( vd->vdev_ms[zopt_object[m]]); else (void) fprintf(stderr, "bad metaslab " "number %llu\n", (u_longlong_t)zopt_object[m]); } (void) printf("\n"); return; } children = c + 1; } for (; c < children; c++) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); print_vdev_indirect(vd); for (m = 0; m < vd->vdev_ms_count; m++) dump_metaslab(vd->vdev_ms[m]); (void) printf("\n"); } } static void dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) { const ddt_phys_t *ddp = dde->dde_phys; const ddt_key_t *ddk = &dde->dde_key; const char *types[4] = { "ditto", "single", "double", "triple" }; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu %s %s\n", (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, types[p], blkbuf); } } static void dump_dedup_ratio(const ddt_stat_t *dds) { double rL, rP, rD, D, dedup, compress, copies; if (dds->dds_blocks == 0) return; rL = (double)dds->dds_ref_lsize; rP = (double)dds->dds_ref_psize; rD = (double)dds->dds_ref_dsize; D = (double)dds->dds_dsize; dedup = rD / D; compress = rL / rP; copies = rD / rP; (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " "dedup * compress / copies = %.2f\n\n", dedup, compress, copies, dedup * compress / copies); } static void dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) { char name[DDT_NAMELEN]; ddt_entry_t dde; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; int error; error = ddt_object_info(ddt, type, class, &doi); if (error == ENOENT) return; ASSERT(error == 0); if ((count = ddt_object_count(ddt, type, class)) == 0) return; dspace = doi.doi_physical_blocks_512 << 9; mspace = doi.doi_fill_count * doi.doi_data_block_size; ddt_object_name(ddt, type, class, name); (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", name, (u_longlong_t)count, (u_longlong_t)(dspace / count), (u_longlong_t)(mspace / count)); if (dump_opt['D'] < 3) return; zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); if (dump_opt['D'] < 4) return; if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) return; (void) printf("%s contents:\n\n", name); while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) dump_dde(ddt, &dde, walk); ASSERT3U(error, ==, ENOENT); (void) printf("\n"); } static void dump_all_ddts(spa_t *spa) { ddt_histogram_t ddh_total; ddt_stat_t dds_total; bzero(&ddh_total, sizeof (ddh_total)); bzero(&dds_total, sizeof (dds_total)); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; for (enum ddt_type type = 0; type < DDT_TYPES; type++) { for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { dump_ddt(ddt, type, class); } } } ddt_get_dedup_stats(spa, &dds_total); if (dds_total.dds_blocks == 0) { (void) printf("All DDTs are empty\n"); return; } (void) printf("\n"); if (dump_opt['D'] > 1) { (void) printf("DDT histogram (aggregated over all DDTs):\n"); ddt_get_dedup_histogram(spa, &ddh_total); zpool_dump_ddt(&dds_total, &ddh_total); } dump_dedup_ratio(&dds_total); } static void dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { char *prefix = arg; (void) printf("%s [%llu,%llu) length %llu\n", prefix, (u_longlong_t)start, (u_longlong_t)(start + size), (u_longlong_t)(size)); } static void dump_dtl(vdev_t *vd, int indent) { spa_t *spa = vd->vdev_spa; boolean_t required; const char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" }; char prefix[256]; spa_vdev_state_enter(spa, SCL_NONE); required = vdev_dtl_required(vd); (void) spa_vdev_state_exit(spa, NULL, 0); if (indent == 0) (void) printf("\nDirty time logs:\n\n"); (void) printf("\t%*s%s [%s]\n", indent, "", vd->vdev_path ? vd->vdev_path : vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), required ? "DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { range_tree_t *rt = vd->vdev_dtl[t]; if (range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); range_tree_walk(rt, dump_dtl_seg, prefix); if (dump_opt['d'] > 5 && vd->vdev_children == 0) dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } for (unsigned c = 0; c < vd->vdev_children; c++) dump_dtl(vd->vdev_child[c], indent + 4); } static void dump_history(spa_t *spa) { nvlist_t **events = NULL; char buf[SPA_MAXBLOCKSIZE]; uint64_t resid, len, off = 0; uint_t num = 0; int error; time_t tsec; struct tm t; char tbuf[30]; char internalstr[MAXPATHLEN]; do { len = sizeof (buf); if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { (void) fprintf(stderr, "Unable to read history: " "error %d\n", error); return; } if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) break; off -= resid; } while (len != 0); (void) printf("\nHistory:\n"); for (unsigned i = 0; i < num; i++) { uint64_t time, txg, ievent; char *cmd, *intstr; boolean_t printed = B_FALSE; if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, &time) != 0) goto next; if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, &cmd) != 0) { if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_INT_EVENT, &ievent) != 0) goto next; verify(nvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG, &txg) == 0); verify(nvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR, &intstr) == 0); if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) goto next; (void) snprintf(internalstr, sizeof (internalstr), "[internal %s txg:%ju] %s", zfs_history_event_names[ievent], (uintmax_t)txg, intstr); cmd = internalstr; } tsec = time; (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); (void) printf("%s %s\n", tbuf, cmd); printed = B_TRUE; next: if (dump_opt['h'] > 1) { if (!printed) (void) printf("unrecognized record:\n"); dump_nvlist(events[i], 2); } } } /*ARGSUSED*/ static void dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) { } static uint64_t blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { if (dnp == NULL) { ASSERT(zb->zb_level < 0); if (zb->zb_object == 0) return (zb->zb_blkid); return (zb->zb_blkid * BP_GET_LSIZE(bp)); } ASSERT(zb->zb_level >= 0); return ((zb->zb_blkid << (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } static void snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; if (dump_opt['b'] >= 6) { snprintf_blkptr(blkbuf, buflen, bp); return; } if (BP_IS_EMBEDDED(bp)) { (void) sprintf(blkbuf, "EMBEDDED et=%u %llxL/%llxP B=%llu", (int)BPE_GET_ETYPE(bp), (u_longlong_t)BPE_GET_LSIZE(bp), (u_longlong_t)BPE_GET_PSIZE(bp), (u_longlong_t)bp->blk_birth); return; } blkbuf[0] = '\0'; for (int i = 0; i < ndvas; i++) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); if (BP_IS_HOLE(bp)) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL B=%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)bp->blk_birth); } else { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); } } static void print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; int l; if (!BP_IS_EMBEDDED(bp)) { ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); } (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { (void) printf("L%llx", (u_longlong_t)zb->zb_level); } else { (void) printf(" "); } } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_phys_t *zb) { int err = 0; if (bp->blk_birth == 0) return (0); print_indirect(bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); ASSERT(buf->b_data); /* recursively visit blocks below this */ cbp = buf->b_data; for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = visit_indirect(spa, dnp, cbp, &czb); if (err) break; fill += BP_GET_FILL(cbp); } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); arc_buf_destroy(buf, &buf); } return (err); } /*ARGSUSED*/ static void dump_indirect(dnode_t *dn) { dnode_phys_t *dnp = dn->dn_phys; int j; zbookmark_phys_t czb; (void) printf("Indirect blocks:\n"); SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), dn->dn_object, dnp->dn_nlevels - 1, 0); for (j = 0; j < dnp->dn_nblkptr; j++) { czb.zb_blkid = j; (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, &dnp->dn_blkptr[j], &czb); } (void) printf("\n"); } /*ARGSUSED*/ static void dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dir_phys_t *dd = data; time_t crtime; char nice[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ); if (dd == NULL) return; ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); crtime = dd->dd_creation_time; (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\thead_dataset_obj = %llu\n", (u_longlong_t)dd->dd_head_dataset_obj); (void) printf("\t\tparent_dir_obj = %llu\n", (u_longlong_t)dd->dd_parent_obj); (void) printf("\t\torigin_obj = %llu\n", (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", (u_longlong_t)dd->dd_child_dir_zapobj); zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); (void) printf("\t\tused_bytes = %s\n", nice); zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); (void) printf("\t\tcompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); (void) printf("\t\tuncompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); (void) printf("\t\tquota = %s\n", nice); zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); (void) printf("\t\tdeleg_zapobj = %llu\n", (u_longlong_t)dd->dd_deleg_zapobj); (void) printf("\t\tflags = %llx\n", (u_longlong_t)dd->dd_flags); #define DO(which) \ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ sizeof (nice)); \ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) DO(HEAD); DO(SNAP); DO(CHILD); DO(CHILD_RSRV); DO(REFRSRV); #undef DO (void) printf("\t\tclones = %llu\n", (u_longlong_t)dd->dd_clones); } /*ARGSUSED*/ static void dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dataset_phys_t *ds = data; time_t crtime; char used[32], compressed[32], uncompressed[32], unique[32]; char blkbuf[BP_SPRINTF_LEN]; /* make sure nicenum has enough space */ CTASSERT(sizeof (used) >= NN_NUMBUF_SZ); CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ); CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ); if (ds == NULL) return; ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, sizeof (uncompressed)); zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); (void) printf("\t\tprev_snap_obj = %llu\n", (u_longlong_t)ds->ds_prev_snap_obj); (void) printf("\t\tprev_snap_txg = %llu\n", (u_longlong_t)ds->ds_prev_snap_txg); (void) printf("\t\tnext_snap_obj = %llu\n", (u_longlong_t)ds->ds_next_snap_obj); (void) printf("\t\tsnapnames_zapobj = %llu\n", (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); (void) printf("\t\tuserrefs_obj = %llu\n", (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); (void) printf("\t\tdeadlist_obj = %llu\n", (u_longlong_t)ds->ds_deadlist_obj); (void) printf("\t\tused_bytes = %s\n", used); (void) printf("\t\tcompressed_bytes = %s\n", compressed); (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); (void) printf("\t\tunique = %s\n", unique); (void) printf("\t\tfsid_guid = %llu\n", (u_longlong_t)ds->ds_fsid_guid); (void) printf("\t\tguid = %llu\n", (u_longlong_t)ds->ds_guid); (void) printf("\t\tflags = %llx\n", (u_longlong_t)ds->ds_flags); (void) printf("\t\tnext_clones_obj = %llu\n", (u_longlong_t)ds->ds_next_clones_obj); (void) printf("\t\tprops_obj = %llu\n", (u_longlong_t)ds->ds_props_obj); (void) printf("\t\tbp = %s\n", blkbuf); } /* ARGSUSED */ static int dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; if (bp->blk_birth != 0) { snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } return (0); } static void dump_bptree(objset_t *os, uint64_t obj, const char *name) { char bytes[32]; bptree_phys_t *bt; dmu_buf_t *db; /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); bt = db->db_data; zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); dmu_buf_rele(db, FTAG); if (dump_opt['d'] < 5) return; (void) printf("\n"); (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); } /* ARGSUSED */ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); return (0); } static void dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) { char bytes[32]; char comp[32]; char uncomp[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); (void) printf(" %*s: object %llu, %llu local blkptrs, " "%llu subobjs in object %llu, %s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } dump_full_bpobj(&subbpo, "subobj", indent + 1); bpobj_close(&subbpo); } } else { (void) printf(" %*s: object %llu, %llu blkptrs, %s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes); } if (dump_opt['d'] < 5) return; if (indent == 0) { (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); (void) printf("\n"); } } static void bpobj_count_refd(bpobj_t *bpo) { mos_obj_refd(bpo->bpo_object); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { mos_obj_refd(bpo->bpo_phys->bpo_subobjs); for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } bpobj_count_refd(&subbpo); bpobj_close(&subbpo); } } } static void dump_deadlist(dsl_deadlist_t *dl) { dsl_deadlist_entry_t *dle; uint64_t unused; char bytes[32]; char comp[32]; char uncomp[32]; uint64_t empty_bpobj = dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj; /* force the tree to be loaded */ dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); if (dl->dl_oldfmt) { if (dl->dl_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dl->dl_bpobj); } else { mos_obj_refd(dl->dl_object); for (dle = avl_first(&dl->dl_tree); dle; dle = AVL_NEXT(&dl->dl_tree, dle)) { if (dle->dle_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dle->dle_bpobj); } } /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; if (dl->dl_oldfmt) { dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); return; } zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); (void) printf("\n Deadlist: %s (%s/%s comp)\n", bytes, comp, uncomp); if (dump_opt['d'] < 4) return; (void) printf("\n"); for (dle = avl_first(&dl->dl_tree); dle; dle = AVL_NEXT(&dl->dl_tree, dle)) { if (dump_opt['d'] >= 5) { char buf[128]; (void) snprintf(buf, sizeof (buf), "mintxg %llu -> obj %llu", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); dump_full_bpobj(&dle->dle_bpobj, buf, 0); } else { (void) printf("mintxg %llu -> obj %llu\n", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); } } } static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; static objset_t *sa_os = NULL; static sa_attr_type_t *sa_attr_table = NULL; static int open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp) { int err; uint64_t sa_attrs = 0; uint64_t version = 0; VERIFY3P(sa_os, ==, NULL); err = dmu_objset_own(path, type, B_TRUE, tag, osp); if (err != 0) { (void) fprintf(stderr, "failed to own dataset '%s': %s\n", path, strerror(err)); return (err); } if (dmu_objset_type(*osp) == DMU_OST_ZFS) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &version); if (version >= ZPL_VERSION_SA) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_attrs); } err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, &sa_attr_table); if (err != 0) { (void) fprintf(stderr, "sa_setup failed: %s\n", strerror(err)); dmu_objset_disown(*osp, tag); *osp = NULL; } } sa_os = *osp; return (0); } static void close_objset(objset_t *os, void *tag) { VERIFY3P(os, ==, sa_os); if (os->os_sa != NULL) sa_tear_down(os); dmu_objset_disown(os, tag); sa_attr_table = NULL; sa_os = NULL; } static void fuid_table_destroy() { if (fuid_table_loaded) { zfs_fuid_table_destroy(&idx_tree, &domain_tree); fuid_table_loaded = B_FALSE; } } /* * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. * For CIFS files with FUID the fuid is printed in hex followed by * the domain-rid string. */ static void print_idstr(uint64_t id, const char *id_type) { if (FUID_INDEX(id)) { char *domain; domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); (void) printf("\t%s %llx [%s-%d]\n", id_type, (u_longlong_t)id, domain, (int)FUID_RID(id)); } else { (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); } } static void dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) { uint32_t uid_idx, gid_idx; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); /* Load domain table, if not already loaded */ if (!fuid_table_loaded && (uid_idx || gid_idx)) { uint64_t fuid_obj; /* first find the fuid object. It lives in the master node */ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj) == 0); zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; } print_idstr(uid, "uid"); print_idstr(gid, "gid"); } /*ARGSUSED*/ static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) { char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ sa_handle_t *hdl; uint64_t xattr, rdev, gen; uint64_t uid, gid, mode, fsize, parent, links; uint64_t pflags; uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; time_t z_crtime, z_atime, z_mtime, z_ctime; sa_bulk_attr_t bulk[12]; int idx = 0; int error; VERIFY3P(os, ==, sa_os); if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { (void) printf("Failed to get handle for SA znode\n"); return; } SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, &fsize, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, acctm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, modtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, crtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, chgtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, &pflags, 8); if (sa_bulk_lookup(hdl, bulk, idx)) { (void) sa_handle_destroy(hdl); return; } z_crtime = (time_t)crtm[0]; z_atime = (time_t)acctm[0]; z_mtime = (time_t)modtm[0]; z_ctime = (time_t)chgtm[0]; if (dump_opt['d'] > 4) { error = zfs_obj_to_path(os, object, path, sizeof (path)); if (error == ESTALE) { (void) snprintf(path, sizeof (path), "on delete queue"); } else if (error != 0) { leaked_objects++; (void) snprintf(path, sizeof (path), "path not found, possibly leaked"); } (void) printf("\tpath %s\n", path); } dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); (void) printf("\tctime %s", ctime(&z_ctime)); (void) printf("\tcrtime %s", ctime(&z_crtime)); (void) printf("\tgen %llu\n", (u_longlong_t)gen); (void) printf("\tmode %llo\n", (u_longlong_t)mode); (void) printf("\tsize %llu\n", (u_longlong_t)fsize); (void) printf("\tparent %llu\n", (u_longlong_t)parent); (void) printf("\tlinks %llu\n", (u_longlong_t)links); (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, sizeof (uint64_t)) == 0) (void) printf("\txattr %llu\n", (u_longlong_t)xattr); if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, sizeof (uint64_t)) == 0) (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); sa_handle_destroy(hdl); } /*ARGSUSED*/ static void dump_acl(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) { } static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_none, /* unallocated */ dump_zap, /* object directory */ dump_uint64, /* object array */ dump_none, /* packed nvlist */ dump_packed_nvlist, /* packed nvlist size */ dump_none, /* bpobj */ dump_bpobj, /* bpobj header */ dump_none, /* SPA space map header */ dump_none, /* SPA space map */ dump_none, /* ZIL intent log */ dump_dnode, /* DMU dnode */ dump_dmu_objset, /* DMU objset */ dump_dsl_dir, /* DSL directory */ dump_zap, /* DSL directory child map */ dump_zap, /* DSL dataset snap map */ dump_zap, /* DSL props */ dump_dsl_dataset, /* DSL dataset */ dump_znode, /* ZFS znode */ dump_acl, /* ZFS V0 ACL */ dump_uint8, /* ZFS plain file */ dump_zpldir, /* ZFS directory */ dump_zap, /* ZFS master node */ dump_zap, /* ZFS delete queue */ dump_uint8, /* zvol object */ dump_zap, /* zvol prop */ dump_uint8, /* other uint8[] */ dump_uint64, /* other uint64[] */ dump_zap, /* other ZAP */ dump_zap, /* persistent error log */ dump_uint8, /* SPA history */ dump_history_offsets, /* SPA history offsets */ dump_zap, /* Pool properties */ dump_zap, /* DSL permissions */ dump_acl, /* ZFS ACL */ dump_uint8, /* ZFS SYSACL */ dump_none, /* FUID nvlist */ dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ dump_zap, /* ZFS user/group used */ dump_zap, /* ZFS user/group quota */ dump_zap, /* snapshot refcount tags */ dump_ddt_zap, /* DDT ZAP object */ dump_zap, /* DDT statistics */ dump_znode, /* SA object */ dump_zap, /* SA Master Node */ dump_sa_attrs, /* SA attribute registration */ dump_sa_layouts, /* SA attribute layouts */ dump_zap, /* DSL scrub translations */ dump_none, /* fake dedup BP */ dump_zap, /* deadlist */ dump_none, /* deadlist hdr */ dump_zap, /* dsl clones */ dump_bpobj_subobjs, /* bpobj subobjs */ dump_unknown, /* Unknown type, must be last */ }; static void dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, uint64_t *dnode_slots_used) { dmu_buf_t *db = NULL; dmu_object_info_t doi; dnode_t *dn; void *bonus = NULL; size_t bsize = 0; char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; char bonus_size[32]; char aux[50]; int error; /* make sure nicenum has enough space */ CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ); CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ); CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); if (*print_header) { (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", "lsize", "%full", "type"); *print_header = 0; } if (object == 0) { dn = DMU_META_DNODE(os); } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); bonus = db->db_data; bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } dmu_object_info_from_dnode(dn, &doi); if (dnode_slots_used != NULL) *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / doi.doi_max_offset); aux[0] = '\0'; if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); } if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); } (void) printf("%10" PRIu64 " %3u %5s %5s %5s %5s %5s %6s %s%s\n", object, doi.doi_indirection, iblk, dblk, asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", "", "", "", "", "", "", bonus_size, "bonus", ZDB_OT_NAME(doi.doi_bonus_type)); } if (verbosity >= 4) { (void) printf("\tdnode flags: %s%s%s\n", (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? "USED_BYTES " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? "USERUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? "SPILL_BLKPTR" : ""); (void) printf("\tdnode maxblkid: %llu\n", (longlong_t)dn->dn_phys->dn_maxblkid); object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, bonus, bsize); object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); *print_header = 1; } if (verbosity >= 5) dump_indirect(dn); if (verbosity >= 5) { /* * Report the list of segments that comprise the object. */ uint64_t start = 0; uint64_t end; uint64_t blkfill = 1; int minlvl = 1; if (dn->dn_type == DMU_OT_DNODE) { minlvl = 0; blkfill = DNODES_PER_BLOCK; } for (;;) { char segsize[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ); error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) break; end = start; error = dnode_next_offset(dn, DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); zdb_nicenum(end - start, segsize, sizeof (segsize)); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start, (u_longlong_t)end, segsize); if (error) break; start = end; } } if (db != NULL) dmu_buf_rele(db, FTAG); } static void count_dir_mos_objects(dsl_dir_t *dd) { mos_obj_refd(dd->dd_object); mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_clones); } static void count_ds_mos_objects(dsl_dataset_t *ds) { mos_obj_refd(ds->ds_object); mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); if (!dsl_dataset_is_snapshot(ds)) { count_dir_mos_objects(ds->ds_dir); } } static const char *objset_types[DMU_OST_NUMTYPES] = { "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; static void dump_dir(objset_t *os) { dmu_objset_stats_t dds; uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[32]; char blkbuf[BP_SPRINTF_LEN + 20]; char osname[ZFS_MAX_DATASET_NAME_LEN]; const char *type = "UNKNOWN"; int verbosity = dump_opt['d']; int print_header = 1; unsigned i; int error; uint64_t total_slots_used = 0; uint64_t max_slot_used = 0; uint64_t dnode_slots; /* make sure nicenum has enough space */ CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; usedobjs = BP_GET_FILL(os->os_rootbp); refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); if (verbosity >= 4) { (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); (void) snprintf_blkptr(blkbuf + strlen(blkbuf), sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } dmu_objset_name(os, osname); (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " "%s, %llu objects%s%s\n", osname, type, (u_longlong_t)dmu_objset_id(os), (u_longlong_t)dds.dds_creation_txg, numbuf, (u_longlong_t)usedobjs, blkbuf, (dds.dds_inconsistent) ? " (inconsistent)" : ""); if (zopt_objects != 0) { for (i = 0; i < zopt_objects; i++) dump_object(os, zopt_object[i], verbosity, &print_header, NULL); (void) printf("\n"); return; } if (dump_opt['i'] != 0 || verbosity >= 2) dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) { dsl_dataset_t *ds = dmu_objset_ds(os); dump_deadlist(&ds->ds_deadlist); if (dsl_dataset_remap_deadlist_exists(ds)) { (void) printf("ds_remap_deadlist:\n"); dump_deadlist(&ds->ds_remap_deadlist); } count_ds_mos_objects(ds); } if (verbosity < 2) return; if (BP_IS_HOLE(os->os_rootbp)) return; dump_object(os, 0, verbosity, &print_header, NULL); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, NULL); dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, NULL); } object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { dump_object(os, object, verbosity, &print_header, &dnode_slots); object_count++; total_slots_used += dnode_slots; max_slot_used = object + dnode_slots - 1; } ASSERT3U(object_count, ==, usedobjs); (void) printf("\n"); (void) printf(" Dnode slots:\n"); (void) printf("\tTotal used: %10llu\n", (u_longlong_t)total_slots_used); (void) printf("\tMax used: %10llu\n", (u_longlong_t)max_slot_used); (void) printf("\tPercent empty: %10lf\n", (double)(max_slot_used - total_slots_used)*100 / (double)max_slot_used); (void) printf("\n"); if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); } if (leaked_objects != 0) { (void) printf("%d potentially leaked objects detected\n", leaked_objects); leaked_objects = 0; } } static void dump_uberblock(uberblock_t *ub, const char *header, const char *footer) { time_t timestamp = ub->ub_timestamp; (void) printf("%s", header ? header : ""); (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); (void) printf("\tmmp_magic = %016llx\n", (u_longlong_t)ub->ub_mmp_magic); if (ub->ub_mmp_magic == MMP_MAGIC) (void) printf("\tmmp_delay = %0llu\n", (u_longlong_t)ub->ub_mmp_delay); if (dump_opt['u'] >= 3) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf("\tcheckpoint_txg = %llu\n", (u_longlong_t)ub->ub_checkpoint_txg); (void) printf("%s", footer ? footer : ""); } static void dump_config(spa_t *spa) { dmu_buf_t *db; size_t nvsize = 0; int error = 0; error = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object, FTAG, &db); if (error == 0) { nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); dump_packed_nvlist(spa->spa_meta_objset, spa->spa_config_object, (void *)&nvsize, 1); } else { (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", (u_longlong_t)spa->spa_config_object, error); } } static void dump_cachefile(const char *cachefile) { int fd; struct stat64 statbuf; char *buf; nvlist_t *config; if ((fd = open64(cachefile, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", cachefile, strerror(errno)); exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", cachefile, strerror(errno)); exit(1); } if ((buf = malloc(statbuf.st_size)) == NULL) { (void) fprintf(stderr, "failed to allocate %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) fprintf(stderr, "failed to read %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { (void) fprintf(stderr, "failed to unpack nvlist\n"); exit(1); } free(buf); dump_nvlist(config, 0); nvlist_free(config); } #define ZDB_MAX_UB_HEADER_SIZE 32 static void dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift) { vdev_t vd; vdev_t *vdp = &vd; char header[ZDB_MAX_UB_HEADER_SIZE]; vd.vdev_ashift = ashift; vdp->vdev_top = vdp; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i); uberblock_t *ub = (void *)((char *)lbl + uoff); if (uberblock_verify(ub)) continue; if ((dump_opt['u'] < 4) && (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) continue; (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, "Uberblock[%d]\n", i); dump_uberblock(ub, header, ""); } } static char curpath[PATH_MAX]; /* * Iterate through the path components, recursively passing * current one's obj and remaining path until we find the obj * for the last one. */ static int dump_path_impl(objset_t *os, uint64_t obj, char *name) { int err; int header = 1; uint64_t child_obj; char *s; dmu_buf_t *db; dmu_object_info_t doi; if ((s = strchr(name, '/')) != NULL) *s = '\0'; err = zap_lookup(os, obj, name, 8, 1, &child_obj); (void) strlcat(curpath, name, sizeof (curpath)); if (err != 0) { (void) fprintf(stderr, "failed to lookup %s: %s\n", curpath, strerror(err)); return (err); } child_obj = ZFS_DIRENT_OBJ(child_obj); err = sa_buf_hold(os, child_obj, FTAG, &db); if (err != 0) { (void) fprintf(stderr, "failed to get SA dbuf for obj %llu: %s\n", (u_longlong_t)child_obj, strerror(err)); return (EINVAL); } dmu_object_info_from_db(db, &doi); sa_buf_rele(db, FTAG); if (doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) { (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", doi.doi_bonus_type, (u_longlong_t)child_obj); return (EINVAL); } if (dump_opt['v'] > 6) { (void) printf("obj=%llu %s type=%d bonustype=%d\n", (u_longlong_t)child_obj, curpath, doi.doi_type, doi.doi_bonus_type); } (void) strlcat(curpath, "/", sizeof (curpath)); switch (doi.doi_type) { case DMU_OT_DIRECTORY_CONTENTS: if (s != NULL && *(s + 1) != '\0') return (dump_path_impl(os, child_obj, s + 1)); /*FALLTHROUGH*/ case DMU_OT_PLAIN_FILE_CONTENTS: dump_object(os, child_obj, dump_opt['v'], &header, NULL); return (0); default: (void) fprintf(stderr, "object %llu has non-file/directory " "type %d\n", (u_longlong_t)obj, doi.doi_type); break; } return (EINVAL); } /* * Dump the blocks for the object specified by path inside the dataset. */ static int dump_path(char *ds, char *path) { int err; objset_t *os; uint64_t root_obj; err = open_objset(ds, DMU_OST_ZFS, FTAG, &os); if (err != 0) return (err); err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); if (err != 0) { (void) fprintf(stderr, "can't lookup root znode: %s\n", strerror(err)); dmu_objset_disown(os, FTAG); return (EINVAL); } (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); err = dump_path_impl(os, root_obj, path); close_objset(os, FTAG); return (err); } static int dump_label(const char *dev) { int fd; vdev_label_t label; char path[MAXPATHLEN]; char *buf = label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist); struct stat64 statbuf; uint64_t psize, ashift; boolean_t label_found = B_FALSE; (void) strlcpy(path, dev, sizeof (path)); if (dev[0] == '/') { if (strncmp(dev, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0) { (void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD)); } } else if (stat64(path, &statbuf) != 0) { char *s; (void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD, dev); if (((s = strrchr(dev, 's')) == NULL && (s = strchr(dev, 'p')) == NULL) || !isdigit(*(s + 1))) (void) strlcat(path, "s0", sizeof (path)); } if ((fd = open64(path, O_RDONLY)) < 0) { (void) fprintf(stderr, "cannot open '%s': %s\n", path, strerror(errno)); exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) fprintf(stderr, "failed to stat '%s': %s\n", path, strerror(errno)); (void) close(fd); exit(1); } if (S_ISBLK(statbuf.st_mode)) { (void) fprintf(stderr, "cannot use '%s': character device required\n", path); (void) close(fd); exit(1); } psize = statbuf.st_size; psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *config = NULL; if (!dump_opt['q']) { (void) printf("------------------------------------\n"); (void) printf("LABEL %d\n", l); (void) printf("------------------------------------\n"); } if (pread64(fd, &label, sizeof (label), vdev_label_offset(psize, l, 0)) != sizeof (label)) { if (!dump_opt['q']) (void) printf("failed to read label %d\n", l); continue; } if (nvlist_unpack(buf, buflen, &config, 0) != 0) { if (!dump_opt['q']) (void) printf("failed to unpack label %d\n", l); ashift = SPA_MINBLOCKSHIFT; } else { nvlist_t *vdev_tree = NULL; if (!dump_opt['q']) dump_nvlist(config, 4); if ((nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) ashift = SPA_MINBLOCKSHIFT; nvlist_free(config); label_found = B_TRUE; } if (dump_opt['u']) dump_label_uberblocks(&label, ashift); } (void) close(fd); return (label_found ? 0 : 2); } static uint64_t dataset_feature_count[SPA_FEATURES]; static uint64_t remap_deadlist_count = 0; /*ARGSUSED*/ static int dump_one_dir(const char *dsname, void *arg) { int error; objset_t *os; error = open_objset(dsname, DMU_OST_ANY, FTAG, &os); if (error != 0) return (0); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (!dmu_objset_ds(os)->ds_feature_inuse[f]) continue; ASSERT(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); dataset_feature_count[f]++; } if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { remap_deadlist_count++; } dump_dir(os); close_objset(os, FTAG); fuid_table_destroy(); return (0); } /* * Block statistics. */ #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { uint64_t zb_asize; uint64_t zb_lsize; uint64_t zb_psize; uint64_t zb_count; uint64_t zb_gangs; uint64_t zb_ditto_samevdev; uint64_t zb_ditto_same_ms; uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; /* * Extended object types to report deferred frees and dedup auto-ditto blocks. */ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static const char *zdb_ot_extname[] = { "deferred free", "dedup ditto", "other", "Total", }; #define ZB_TOTAL DN_MAX_LEVELS typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] [BPE_PAYLOAD_SIZE]; uint64_t zcb_start; hrtime_t zcb_lastprint; uint64_t zcb_totalasize; uint64_t zcb_errors[256]; int zcb_readfails; int zcb_haderrors; spa_t *zcb_spa; uint32_t **zcb_vd_obsolete_counts; } zdb_cb_t; /* test if two DVA offsets from same vdev are within the same metaslab */ static boolean_t same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) { vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t ms_shift = vd->vdev_ms_shift; return ((off1 >> ms_shift) == (off2 >> ms_shift)); } static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { uint64_t refcnt = 0; ASSERT(type < ZDB_OT_TOTAL); if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; int equal; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_count++; /* * The histogram is only big enough to record blocks up to * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, * "other", bucket. */ unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); zb->zb_psize_histogram[idx]++; zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) { zb->zb_ditto_samevdev++; if (same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[1]))) zb->zb_ditto_same_ms++; } break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal != 0) { zb->zb_ditto_samevdev++; if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[1]))) zb->zb_ditto_same_ms++; else if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[2]))) zb->zb_ditto_same_ms++; else if (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[1]), DVA_GET_OFFSET(&bp->blk_dva[1]), DVA_GET_OFFSET(&bp->blk_dva[2]))) zb->zb_ditto_same_ms++; } break; } } spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); if (BP_IS_EMBEDDED(bp)) { zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] [BPE_GET_PSIZE(bp)]++; return; } if (dump_opt['L']) return; if (BP_GET_DEDUP(bp)) { ddt_t *ddt; ddt_entry_t *dde; ddt = ddt_select(zcb->zcb_spa, bp); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_FALSE); if (dde == NULL) { refcnt = 0; } else { ddt_phys_t *ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); refcnt = ddp->ddp_refcnt; if (ddt_phys_total_refcnt(dde) == 0) ddt_remove(ddt, dde); } ddt_exit(ddt); } VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } static void zdb_blkptr_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; int ioerr = zio->io_error; zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { char blkbuf[BP_SPRINTF_LEN]; zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); else blkbuf[0] = '\0'; (void) printf("zdb_blkptr_cb: " "Got error %d reading " "<%llu, %llu, %lld, %llx> %s -- skipping\n", ioerr, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, blkbuf); } mutex_exit(&spa->spa_scrub_lock); } static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; dmu_object_type_t type; boolean_t is_metadata; if (bp == NULL) return (0); if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("objset %llu object %llu " "level %lld offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (u_longlong_t)blkid2offset(dnp, bp, zb), blkbuf); } if (BP_IS_HOLE(bp)) return (0); type = BP_GET_TYPE(bp); zdb_count_block(zcb, zilog, bp, (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > max_inflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; /* only call gethrtime() every 100 blocks */ static int iters; if (++iters > 100) iters = 0; else return (0); if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); char buf[10]; uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; int kb_per_sec = 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); int sec_remaining = (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; /* make sure nicenum has enough space */ CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ); zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "\r%5s completed (%4dMB/s) " "estimated time remaining: %uhr %02umin %02usec ", buf, kb_per_sec / 1024, sec_remaining / 60 / 60, sec_remaining / 60 % 60, sec_remaining % 60); zcb->zcb_lastprint = now; } return (0); } static void zdb_leak(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); } static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; static void zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) { ddt_bookmark_t ddb; ddt_entry_t dde; int error; + ASSERT(!dump_opt['L']); + bzero(&ddb, sizeof (ddb)); while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { blkptr_t blk; ddt_phys_t *ddp = dde.dde_phys; if (ddb.ddb_class == DDT_CLASS_UNIQUE) return; ASSERT(ddt_phys_total_refcnt(&dde) > 1); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddb.ddb_checksum, &dde.dde_key, ddp, &blk); if (p == DDT_PHYS_DITTO) { zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); } else { zcb->zcb_dedup_asize += BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); zcb->zcb_dedup_blocks++; } } - if (!dump_opt['L']) { - ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; - ddt_enter(ddt); - VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); - ddt_exit(ddt); - } + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + ddt_enter(ddt); + VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); + ddt_exit(ddt); } ASSERT(error == ENOENT); } /* ARGSUSED */ static void claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { /* * This callback was called through a remap from * a device being removed. Therefore, the vdev that * this callback is applied to is a concrete * vdev. */ ASSERT(vdev_is_concrete(vd)); VERIFY0(metaslab_claim_impl(vd, offset, size, spa_min_claim_txg(vd->vdev_spa))); } static void claim_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; vdev_indirect_ops.vdev_op_remap(vd, offset, size, claim_segment_impl_cb, NULL); } /* * After accounting for all allocated blocks that are directly referenced, * we might have missed a reference to a block from a partially complete * (and thus unused) indirect mapping object. We perform a secondary pass * through the metaslabs we have already mapped and claim the destination * blocks. */ static void zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) { + if (dump_opt['L']) + return; + if (spa->spa_vdev_removal == NULL) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) break; ASSERT0(range_tree_space(svr->svr_allocd_segs)); if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); /* * Clear everything past what has been synced unless * it's past the spacemap, because we have not allocated * mappings for it yet. */ uint64_t vim_max_offset = vdev_indirect_mapping_max_offset(vim); uint64_t sm_end = msp->ms_sm->sm_start + msp->ms_sm->sm_size; if (sm_end > vim_max_offset) range_tree_clear(svr->svr_allocd_segs, vim_max_offset, sm_end - vim_max_offset); } zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); } spa_config_exit(spa, SCL_CONFIG, FTAG); } /* ARGSUSED */ static int increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zdb_cb_t *zcb = arg; spa_t *spa = zcb->zcb_spa; vdev_t *vd; const dva_t *dva = &bp->blk_dva[0]; ASSERT(!dump_opt['L']); ASSERT3U(BP_GET_NDVAS(bp), ==, 1); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); vdev_indirect_mapping_increment_obsolete_count( vd->vdev_indirect_mapping, DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), zcb->zcb_vd_obsolete_counts[vd->vdev_id]); return (0); } static uint32_t * zdb_load_obsolete_counts(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint32_t *counts; EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL); counts = vdev_indirect_mapping_load_obsolete_counts(vim); if (vd->vdev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, vd->vdev_obsolete_sm); } if (scip->scip_vdev == vd->vdev_id && scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(prev_obsolete_sm); vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, prev_obsolete_sm); space_map_close(prev_obsolete_sm); } return (counts); } typedef struct checkpoint_sm_exclude_entry_arg { vdev_t *cseea_vd; uint64_t cseea_checkpoint_size; } checkpoint_sm_exclude_entry_arg_t; static int checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) { checkpoint_sm_exclude_entry_arg_t *cseea = arg; vdev_t *vd = cseea->cseea_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); /* * Since the vdev_checkpoint_sm exists in the vdev level * and the ms_sm space maps exist in the metaslab level, * an entry in the checkpoint space map could theoretically * cross the boundaries of the metaslab that it belongs. * * In reality, because of the way that we populate and * manipulate the checkpoint's space maps currently, * there shouldn't be any entries that cross metaslabs. * Hence the assertion below. * * That said, there is no fundamental requirement that * the checkpoint's space map entries should not cross * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * By removing the entry from the allocated segments we * also verify that the entry is there to begin with. */ mutex_enter(&ms->ms_lock); range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); cseea->cseea_checkpoint_size += sme->sme_run; return (0); } static void zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) { spa_t *spa = vd->vdev_spa; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; /* * If there is no vdev_top_zap, we are in a pool whose * version predates the pool checkpoint feature. */ if (vd->vdev_top_zap == 0) return; /* * If there is no reference of the vdev_checkpoint_sm in * the vdev_top_zap, then one of the following scenarios * is true: * * 1] There is no checkpoint * 2] There is a checkpoint, but no checkpointed blocks * have been freed yet * 3] The current vdev is indirect * * In these cases we return immediately. */ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) return; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); checkpoint_sm_exclude_entry_arg_t cseea; cseea.cseea_vd = vd; cseea.cseea_checkpoint_size = 0; VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); - space_map_update(checkpoint_sm); VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), checkpoint_sm_exclude_entry_cb, &cseea)); space_map_close(checkpoint_sm); zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; } static void zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) { + ASSERT(!dump_opt['L']); + vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); } } static void load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; ASSERT3U(i, ==, vd->vdev_id); if (vd->vdev_ops == &vdev_indirect_ops) continue; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; (void) fprintf(stderr, "\rloading concrete vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)msp->ms_id, (longlong_t)vd->vdev_ms_count); mutex_enter(&msp->ms_lock); metaslab_unload(msp); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, msp->ms_allocatable, maptype)); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } } } /* * vm_idxp is an in-out parameter which (for indirect vdevs) is the * index in vim_entries that has the first entry in this metaslab. * On return, it will be set to the first entry after this metaslab. */ static void load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, uint64_t *vim_idxp) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; mutex_enter(&msp->ms_lock); metaslab_unload(msp); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); (*vim_idxp)++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[*vim_idxp]; uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); ASSERT3U(ent_offset, >=, msp->ms_start); if (ent_offset >= msp->ms_start + msp->ms_size) break; /* * Mappings do not cross metaslab boundaries, * because we create them by walking the metaslabs. */ ASSERT3U(ent_offset + ent_len, <=, msp->ms_start + msp->ms_size); range_tree_add(msp->ms_allocatable, ent_offset, ent_len); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } static void zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) { + ASSERT(!dump_opt['L']); + vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; ASSERT3U(c, ==, vd->vdev_id); if (vd->vdev_ops != &vdev_indirect_ops) continue; /* * Note: we don't check for mapping leaks on * removing vdevs because their ms_allocatable's * are used to look for leaks in allocated space. */ zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); /* * Normally, indirect vdevs don't have any * metaslabs. We want to set them up for * zio_claim(). */ VERIFY0(vdev_metaslab_init(vd, 0)); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t vim_idx = 0; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { (void) fprintf(stderr, "\rloading indirect vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vd->vdev_ms[m]->ms_id, (longlong_t)vd->vdev_ms_count); load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], &vim_idx); } ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); } } static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; - if (!dump_opt['L']) { - dsl_pool_t *dp = spa->spa_dsl_pool; - vdev_t *rvd = spa->spa_root_vdev; + if (dump_opt['L']) + return; - /* - * We are going to be changing the meaning of the metaslab's - * ms_allocatable. Ensure that the allocator doesn't try to - * use the tree. - */ - spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; - spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + dsl_pool_t *dp = spa->spa_dsl_pool; + vdev_t *rvd = spa->spa_root_vdev; - zcb->zcb_vd_obsolete_counts = - umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), - UMEM_NOFAIL); + /* + * We are going to be changing the meaning of the metaslab's + * ms_allocatable. Ensure that the allocator doesn't try to + * use the tree. + */ + spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; + spa->spa_log_class->mc_ops = &zdb_metaslab_ops; - /* - * For leak detection, we overload the ms_allocatable trees - * to contain allocated segments instead of free segments. - * As a result, we can't use the normal metaslab_load/unload - * interfaces. - */ - zdb_leak_init_prepare_indirect_vdevs(spa, zcb); - load_concrete_ms_allocatable_trees(spa, SM_ALLOC); + zcb->zcb_vd_obsolete_counts = + umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), + UMEM_NOFAIL); - /* - * On load_concrete_ms_allocatable_trees() we loaded all the - * allocated entries from the ms_sm to the ms_allocatable for - * each metaslab. If the pool has a checkpoint or is in the - * middle of discarding a checkpoint, some of these blocks - * may have been freed but their ms_sm may not have been - * updated because they are referenced by the checkpoint. In - * order to avoid false-positives during leak-detection, we - * go through the vdev's checkpoint space map and exclude all - * its entries from their relevant ms_allocatable. - * - * We also aggregate the space held by the checkpoint and add - * it to zcb_checkpoint_size. - * - * Note that at this point we are also verifying that all the - * entries on the checkpoint_sm are marked as allocated in - * the ms_sm of their relevant metaslab. - * [see comment in checkpoint_sm_exclude_entry_cb()] - */ - zdb_leak_init_exclude_checkpoint(spa, zcb); + /* + * For leak detection, we overload the ms_allocatable trees + * to contain allocated segments instead of free segments. + * As a result, we can't use the normal metaslab_load/unload + * interfaces. + */ + zdb_leak_init_prepare_indirect_vdevs(spa, zcb); + load_concrete_ms_allocatable_trees(spa, SM_ALLOC); - /* for cleaner progress output */ - (void) fprintf(stderr, "\n"); + /* + * On load_concrete_ms_allocatable_trees() we loaded all the + * allocated entries from the ms_sm to the ms_allocatable for + * each metaslab. If the pool has a checkpoint or is in the + * middle of discarding a checkpoint, some of these blocks + * may have been freed but their ms_sm may not have been + * updated because they are referenced by the checkpoint. In + * order to avoid false-positives during leak-detection, we + * go through the vdev's checkpoint space map and exclude all + * its entries from their relevant ms_allocatable. + * + * We also aggregate the space held by the checkpoint and add + * it to zcb_checkpoint_size. + * + * Note that at this point we are also verifying that all the + * entries on the checkpoint_sm are marked as allocated in + * the ms_sm of their relevant metaslab. + * [see comment in checkpoint_sm_exclude_entry_cb()] + */ + zdb_leak_init_exclude_checkpoint(spa, zcb); + ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); - if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { - ASSERT(spa_feature_is_enabled(spa, - SPA_FEATURE_DEVICE_REMOVAL)); - (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, - increment_indirect_mapping_cb, zcb, NULL); - } - } else { - /* - * If leak tracing is disabled, we still need to consider - * any checkpointed space in our space verification. - */ - zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa); + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); + + if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_DEVICE_REMOVAL)); + (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, + increment_indirect_mapping_cb, zcb, NULL); } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); zdb_ddt_leak_init(spa, zcb); spa_config_exit(spa, SCL_CONFIG, FTAG); } static boolean_t zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) { boolean_t leaks = B_FALSE; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t total_leaked = 0; ASSERT(vim != NULL); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; uint64_t obsolete_bytes = 0; uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; /* * This is not very efficient but it's easy to * verify correctness. */ for (uint64_t inner_offset = 0; inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); inner_offset += 1 << vd->vdev_ashift) { if (range_tree_contains(msp->ms_allocatable, offset + inner_offset, 1 << vd->vdev_ashift)) { obsolete_bytes += 1 << vd->vdev_ashift; } } int64_t bytes_leaked = obsolete_bytes - zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); if (bytes_leaked != 0 && (vdev_obsolete_counts_are_precise(vd) || dump_opt['d'] >= 5)) { (void) printf("obsolete indirect mapping count " "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (u_longlong_t)bytes_leaked); } total_leaked += ABS(bytes_leaked); } if (!vdev_obsolete_counts_are_precise(vd) && total_leaked > 0) { int pct_leaked = total_leaked * 100 / vdev_indirect_mapping_bytes_mapped(vim); (void) printf("cannot verify obsolete indirect mapping " "counts of vdev %llu because precise feature was not " "enabled when it was removed: %d%% (%llx bytes) of mapping" "unreferenced\n", (u_longlong_t)vd->vdev_id, pct_leaked, (u_longlong_t)total_leaked); } else if (total_leaked > 0) { (void) printf("obsolete indirect mapping count mismatch " "for vdev %llu -- %llx total bytes mismatched\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)total_leaked); leaks |= B_TRUE; } vdev_indirect_mapping_free_obsolete_counts(vim, zcb->zcb_vd_obsolete_counts[vd->vdev_id]); zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; return (leaks); } static boolean_t zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) { + if (dump_opt['L']) + return (B_FALSE); + boolean_t leaks = B_FALSE; - if (!dump_opt['L']) { - vdev_t *rvd = spa->spa_root_vdev; - for (unsigned c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - metaslab_group_t *mg = vd->vdev_mg; - if (zcb->zcb_vd_obsolete_counts[c] != NULL) { - leaks |= zdb_check_for_obsolete_leaks(vd, zcb); - } + vdev_t *rvd = spa->spa_root_vdev; + for (unsigned c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; +#if DEBUG + metaslab_group_t *mg = vd->vdev_mg; +#endif - for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - ASSERT3P(mg, ==, msp->ms_group); + if (zcb->zcb_vd_obsolete_counts[c] != NULL) { + leaks |= zdb_check_for_obsolete_leaks(vd, zcb); + } - /* - * ms_allocatable has been overloaded - * to contain allocated segments. Now that - * we finished traversing all blocks, any - * block that remains in the ms_allocatable - * represents an allocated block that we - * did not claim during the traversal. - * Claimed blocks would have been removed - * from the ms_allocatable. For indirect - * vdevs, space remaining in the tree - * represents parts of the mapping that are - * not referenced, which is not a bug. - */ - if (vd->vdev_ops == &vdev_indirect_ops) { - range_tree_vacate(msp->ms_allocatable, - NULL, NULL); - } else { - range_tree_vacate(msp->ms_allocatable, - zdb_leak, vd); - } + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + ASSERT3P(mg, ==, msp->ms_group); - if (msp->ms_loaded) { - msp->ms_loaded = B_FALSE; - } + /* + * ms_allocatable has been overloaded + * to contain allocated segments. Now that + * we finished traversing all blocks, any + * block that remains in the ms_allocatable + * represents an allocated block that we + * did not claim during the traversal. + * Claimed blocks would have been removed + * from the ms_allocatable. For indirect + * vdevs, space remaining in the tree + * represents parts of the mapping that are + * not referenced, which is not a bug. + */ + if (vd->vdev_ops == &vdev_indirect_ops) { + range_tree_vacate(msp->ms_allocatable, + NULL, NULL); + } else { + range_tree_vacate(msp->ms_allocatable, + zdb_leak, vd); } + + if (msp->ms_loaded) { + msp->ms_loaded = B_FALSE; + } } - umem_free(zcb->zcb_vd_obsolete_counts, - rvd->vdev_children * sizeof (uint32_t *)); - zcb->zcb_vd_obsolete_counts = NULL; } + + umem_free(zcb->zcb_vd_obsolete_counts, + rvd->vdev_children * sizeof (uint32_t *)); + zcb->zcb_vd_obsolete_counts = NULL; + return (leaks); } /* ARGSUSED */ static int count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zdb_cb_t *zcb = arg; if (dump_opt['b'] >= 5) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("[%s] %s\n", "deferred free", blkbuf); } zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); return (0); } static int dump_block_stats(spa_t *spa) { zdb_cb_t zcb; zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; boolean_t leaks = B_FALSE; int err; bzero(&zcb, sizeof (zcb)); (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", dump_opt['c'] ? "checksums " : "", (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", !dump_opt['L'] ? "nothing leaked " : ""); /* - * Load all space maps as SM_ALLOC maps, then traverse the pool - * claiming each block we discover. If the pool is perfectly - * consistent, the space maps will be empty when we're done. - * Anything left over is a leak; any block we can't claim (because - * it's not part of any space map) is a double allocation, - * reference to a freed block, or an unclaimed log block. + * When leak detection is enabled we load all space maps as SM_ALLOC + * maps, then traverse the pool claiming each block we discover. If + * the pool is perfectly consistent, the segment trees will be empty + * when we're done. Anything left over is a leak; any block we can't + * claim (because it's not part of any space map) is a double + * allocation, reference to a freed block, or an unclaimed log block. + * + * When leak detection is disabled (-L option) we still traverse the + * pool claiming each block we discover, but we skip opening any space + * maps. */ + bzero(&zcb, sizeof (zdb_cb_t)); zdb_leak_init(spa, &zcb); /* * If there's a deferred-free bplist, process that first. */ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, count_block_cb, &zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, count_block_cb, &zcb, NULL); } zdb_claim_removing(spa, &zcb); if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, &zcb, NULL)); } if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); /* * If we've traversed the data blocks then we need to wait for those * I/Os to complete. We leverage "The Godfather" zio to wait on * all async I/Os to complete. */ if (dump_opt['c']) { for (int i = 0; i < max_ncpus; i++) { (void) zio_wait(spa->spa_async_zio_root[i]); spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } } /* * Done after zio_wait() since zcb_haderrors is modified in * zdb_blkptr_done() */ zcb.zcb_haderrors |= err; if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); for (int e = 0; e < 256; e++) { if (zcb.zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb.zcb_errors[e]); } } } /* * Report any leaked segments. */ leaks |= zdb_leak_fini(spa, &zcb); tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)); total_found = tzb->zb_asize - zcb.zcb_dedup_asize + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; - if (total_found == total_alloc) { - if (!dump_opt['L']) - (void) printf("\n\tNo leaks (block sum matches space" - " maps exactly)\n"); - } else { + if (total_found == total_alloc && !dump_opt['L']) { + (void) printf("\n\tNo leaks (block sum matches space" + " maps exactly)\n"); + } else if (!dump_opt['L']) { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", (u_longlong_t)total_found, (u_longlong_t)total_alloc, (dump_opt['L']) ? "unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); leaks = B_TRUE; } if (tzb->zb_count == 0) return (2); (void) printf("\n"); (void) printf("\t%-16s %14llu\n", "bp count:", (u_longlong_t)tzb->zb_count); (void) printf("\t%-16s %14llu\n", "ganged count:", (longlong_t)tzb->zb_gangs); (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp physical:", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp allocated:", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize, (u_longlong_t)zcb.zcb_dedup_blocks, (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); if (spa_special_class(spa)->mc_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_special_class(spa)); uint64_t space = metaslab_class_get_space( spa_special_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Special class", (u_longlong_t)alloc, 100.0 * alloc / space); } if (spa_dedup_class(spa)->mc_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_dedup_class(spa)); uint64_t space = metaslab_class_get_space( spa_dedup_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Dedup class", (u_longlong_t)alloc, 100.0 * alloc / space); } for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb.zcb_embedded_blocks[i] == 0) continue; (void) printf("\n"); (void) printf("\tadditional, non-pointer bps of type %u: " "%10llu\n", i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); if (dump_opt['b'] >= 3) { (void) printf("\t number of (compressed) bytes: " "number of bps\n"); dump_histogram(zcb.zcb_embedded_histogram[i], sizeof (zcb.zcb_embedded_histogram[i]) / sizeof (zcb.zcb_embedded_histogram[i][0]), 0); } } if (tzb->zb_ditto_samevdev != 0) { (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); } if (tzb->zb_ditto_same_ms != 0) { (void) printf("\tDittoed blocks in same metaslab: %llu\n", (longlong_t)tzb->zb_ditto_same_ms); } for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; if (vim == NULL) { continue; } char mem[32]; zdb_nicenum(vdev_indirect_mapping_num_entries(vim), mem, vdev_indirect_mapping_size(vim)); (void) printf("\tindirect vdev id %llu has %llu segments " "(%s in memory)\n", (longlong_t)vd->vdev_id, (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); } if (dump_opt['b'] >= 2) { int l, t, level; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); for (t = 0; t <= ZDB_OT_TOTAL; t++) { char csize[32], lsize[32], psize[32], asize[32]; char avg[32], gang[32]; const char *typename; /* make sure nicenum has enough space */ CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ); CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ); if (t < DMU_OT_NUMTYPES) typename = dmu_ot[t].ot_name; else typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" "\t%5s\t%5s\t%6s\t%s\n", "-", "-", "-", "-", "-", "-", "-", typename); continue; } for (l = ZB_TOTAL - 1; l >= -1; l--) { level = (l == -1 ? ZB_TOTAL : l); zb = &zcb.zcb_type[level][t]; if (zb->zb_asize == 0) continue; if (dump_opt['b'] < 3 && level != ZB_TOTAL) continue; if (level == 0 && zb->zb_asize == zcb.zcb_type[ZB_TOTAL][t].zb_asize) continue; zdb_nicenum(zb->zb_count, csize, sizeof (csize)); zdb_nicenum(zb->zb_lsize, lsize, sizeof (lsize)); zdb_nicenum(zb->zb_psize, psize, sizeof (psize)); zdb_nicenum(zb->zb_asize, asize, sizeof (asize)); zdb_nicenum(zb->zb_asize / zb->zb_count, avg, sizeof (avg)); zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)zb->zb_lsize / zb->zb_psize, 100.0 * zb->zb_asize / tzb->zb_asize); if (level == ZB_TOTAL) (void) printf("%s\n", typename); else (void) printf(" L%d %s\n", level, typename); if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { (void) printf("\t number of ganged " "blocks: %s\n", gang); } if (dump_opt['b'] >= 4) { (void) printf("psize " "(in 512-byte sectors): " "number of blocks\n"); dump_histogram(zb->zb_psize_histogram, PSIZE_HISTO_SIZE, 0); } } } } (void) printf("\n"); if (leaks) return (2); if (zcb.zcb_haderrors) return (3); return (0); } typedef struct zdb_ddt_entry { ddt_key_t zdde_key; uint64_t zdde_ref_blocks; uint64_t zdde_ref_lsize; uint64_t zdde_ref_psize; uint64_t zdde_ref_dsize; avl_node_t zdde_node; } zdb_ddt_entry_t; /* ARGSUSED */ static int zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { avl_tree_t *t = arg; avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { (void) printf("traversing objset %llu, %llu objects, " "%lu blocks so far\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)BP_GET_FILL(bp), avl_numnodes(t)); } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); zdde = avl_find(t, &zdde_search, &where); if (zdde == NULL) { zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); zdde->zdde_key = zdde_search.zdde_key; avl_insert(t, zdde, where); } zdde->zdde_ref_blocks += 1; zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); zdde->zdde_ref_psize += BP_GET_PSIZE(bp); zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); return (0); } static void dump_simulated_ddt(spa_t *spa) { avl_tree_t t; void *cookie = NULL; zdb_ddt_entry_t *zdde; ddt_histogram_t ddh_total; ddt_stat_t dds_total; bzero(&ddh_total, sizeof (ddh_total)); bzero(&dds_total, sizeof (dds_total)); avl_create(&t, ddt_entry_compare, sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zdb_ddt_add_cb, &t); spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { ddt_stat_t dds; uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; dds.dds_psize = zdde->zdde_ref_psize / refcnt; dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; dds.dds_ref_blocks = zdde->zdde_ref_blocks; dds.dds_ref_lsize = zdde->zdde_ref_lsize; dds.dds_ref_psize = zdde->zdde_ref_psize; dds.dds_ref_dsize = zdde->zdde_ref_dsize; ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], &dds, 0); umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); ddt_histogram_stat(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); zpool_dump_ddt(&dds_total, &ddh_total); dump_dedup_ratio(&dds_total); } static int verify_device_removal_feature_counts(spa_t *spa) { uint64_t dr_feature_refcount = 0; uint64_t oc_feature_refcount = 0; uint64_t indirect_vdev_count = 0; uint64_t precise_vdev_count = 0; uint64_t obsolete_counts_object_count = 0; uint64_t obsolete_sm_count = 0; uint64_t obsolete_counts_count = 0; uint64_t scip_count = 0; uint64_t obsolete_bpobj_count = 0; int ret = 0; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; if (scip->scip_next_mapping_object != 0) { vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); (void) printf("Condensing indirect vdev %llu: new mapping " "object %llu, prev obsolete sm %llu\n", (u_longlong_t)scip->scip_vdev, (u_longlong_t)scip->scip_next_mapping_object, (u_longlong_t)scip->scip_prev_obsolete_sm_object); if (scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(prev_obsolete_sm); dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); (void) printf("\n"); space_map_close(prev_obsolete_sm); } scip_count += 2; } for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (vic->vic_mapping_object != 0) { ASSERT(vd->vdev_ops == &vdev_indirect_ops || vd->vdev_removing); indirect_vdev_count++; if (vd->vdev_indirect_mapping->vim_havecounts) { obsolete_counts_count++; } } if (vdev_obsolete_counts_are_precise(vd)) { ASSERT(vic->vic_mapping_object != 0); precise_vdev_count++; } if (vdev_obsolete_sm_object(vd) != 0) { ASSERT(vic->vic_mapping_object != 0); obsolete_sm_count++; } } (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], &dr_feature_refcount); (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], &oc_feature_refcount); if (dr_feature_refcount != indirect_vdev_count) { ret = 1; (void) printf("Number of indirect vdevs (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)indirect_vdev_count, (u_longlong_t)dr_feature_refcount); } else { (void) printf("Verified device_removal feature refcount " \ "of %llu is correct\n", (u_longlong_t)dr_feature_refcount); } if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ) == 0) { obsolete_bpobj_count++; } obsolete_counts_object_count = precise_vdev_count; obsolete_counts_object_count += obsolete_sm_count; obsolete_counts_object_count += obsolete_counts_count; obsolete_counts_object_count += scip_count; obsolete_counts_object_count += obsolete_bpobj_count; obsolete_counts_object_count += remap_deadlist_count; if (oc_feature_refcount != obsolete_counts_object_count) { ret = 1; (void) printf("Number of obsolete counts objects (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)obsolete_counts_object_count, (u_longlong_t)oc_feature_refcount); (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " "ob:%llu rd:%llu\n", (u_longlong_t)precise_vdev_count, (u_longlong_t)obsolete_sm_count, (u_longlong_t)obsolete_counts_count, (u_longlong_t)scip_count, (u_longlong_t)obsolete_bpobj_count, (u_longlong_t)remap_deadlist_count); } else { (void) printf("Verified indirect_refcount feature refcount " \ "of %llu is correct\n", (u_longlong_t)oc_feature_refcount); } return (ret); } static void zdb_set_skip_mmp(char *target) { spa_t *spa; /* * Disable the activity check to allow examination of * active pools. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL) { spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; } mutex_exit(&spa_namespace_lock); } #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" /* * Import the checkpointed state of the pool specified by the target * parameter as readonly. The function also accepts a pool config * as an optional parameter, else it attempts to infer the config by * the name of the target pool. * * Note that the checkpointed state's pool name will be the name of * the original pool with the above suffix appened to it. In addition, * if the target is not a pool name (e.g. a path to a dataset) then * the new_path parameter is populated with the updated path to * reflect the fact that we are looking into the checkpointed state. * * The function returns a newly-allocated copy of the name of the * pool containing the checkpointed state. When this copy is no * longer needed it should be freed with free(3C). Same thing * applies to the new_path parameter if allocated. */ static char * import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) { int error = 0; char *poolname, *bogus_name; /* If the target is not a pool, the extract the pool name */ char *path_start = strchr(target, '/'); if (path_start != NULL) { size_t poolname_len = path_start - target; poolname = strndup(target, poolname_len); } else { poolname = target; } if (cfg == NULL) { zdb_set_skip_mmp(poolname); error = spa_get_stats(poolname, &cfg, NULL, 0); if (error != 0) { fatal("Tried to read config of pool \"%s\" but " "spa_get_stats() failed with error %d\n", poolname, error); } } (void) asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX); fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); error = spa_import(bogus_name, cfg, NULL, ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | ZFS_IMPORT_SKIP_MMP); if (error != 0) { fatal("Tried to import pool \"%s\" but spa_import() failed " "with error %d\n", bogus_name, error); } if (new_path != NULL && path_start != NULL) (void) asprintf(new_path, "%s%s", bogus_name, path_start); if (target != poolname) free(poolname); return (bogus_name); } typedef struct verify_checkpoint_sm_entry_cb_arg { vdev_t *vcsec_vd; /* the following fields are only used for printing progress */ uint64_t vcsec_entryid; uint64_t vcsec_num_entries; } verify_checkpoint_sm_entry_cb_arg_t; #define ENTRIES_PER_PROGRESS_UPDATE 10000 static int verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) { verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; vdev_t *vd = vcsec->vcsec_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { (void) fprintf(stderr, "\rverifying vdev %llu, space map entry %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vcsec->vcsec_entryid, (longlong_t)vcsec->vcsec_num_entries); } vcsec->vcsec_entryid++; /* * See comment in checkpoint_sm_exclude_entry_cb() */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * The entries in the vdev_checkpoint_sm should be marked as * allocated in the checkpointed state of the pool, therefore * their respective ms_allocateable trees should not contain them. */ mutex_enter(&ms->ms_lock); - range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run); + range_tree_verify_not_present(ms->ms_allocatable, + sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); return (0); } /* * Verify that all segments in the vdev_checkpoint_sm are allocated * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's * ms_allocatable). * * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of * each vdev in the current state of the pool to the metaslab space maps * (ms_sm) of the checkpointed state of the pool. * * Note that the function changes the state of the ms_allocatable * trees of the current spa_t. The entries of these ms_allocatable * trees are cleared out and then repopulated from with the free * entries of their respective ms_sm space maps. */ static void verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; vdev_t *current_vd = current_rvd->vdev_child[c]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * Since we don't allow device removal in a pool * that has a checkpoint, we expect that all removed * vdevs were removed from the pool before the * checkpoint. */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } /* * If the checkpoint space map doesn't exist, then nothing * here is checkpointed so there's nothing to verify. */ if (current_vd->vdev_top_zap == 0 || zap_contains(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), checkpoint_sm_obj, 0, current_vd->vdev_asize, current_vd->vdev_ashift)); - space_map_update(checkpoint_sm); verify_checkpoint_sm_entry_cb_arg_t vcsec; vcsec.vcsec_vd = ckpoint_vd; vcsec.vcsec_entryid = 0; vcsec.vcsec_num_entries = space_map_length(checkpoint_sm) / sizeof (uint64_t); VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), verify_checkpoint_sm_entry_cb, &vcsec)); dump_spacemap(current->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } /* * If we've added vdevs since we took the checkpoint, ensure * that their checkpoint space maps are empty. */ if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { for (uint64_t c = ckpoint_rvd->vdev_children; c < current_rvd->vdev_children; c++) { vdev_t *current_vd = current_rvd->vdev_child[c]; ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } /* * Verifies that all space that's allocated in the checkpoint is * still allocated in the current version, by checking that everything * in checkpoint's ms_allocatable (which is actually allocated, not * allocatable/free) is not present in current's ms_allocatable. * * Note that the function changes the state of the ms_allocatable * trees of both spas when called. The entries of all ms_allocatable * trees are cleared out and then repopulated from their respective * ms_sm space maps. In the checkpointed state we load the allocated * entries, and in the current state we load the free entries. */ static void verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); load_concrete_ms_allocatable_trees(current, SM_FREE); for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; vdev_t *current_vd = current_rvd->vdev_child[i]; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * See comment in verify_checkpoint_vdev_spacemaps() */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; metaslab_t *current_msp = current_vd->vdev_ms[m]; (void) fprintf(stderr, "\rverifying vdev %llu of %llu, " "metaslab %llu of %llu ...", (longlong_t)current_vd->vdev_id, (longlong_t)current_rvd->vdev_children, (longlong_t)current_vd->vdev_ms[m]->ms_id, (longlong_t)current_vd->vdev_ms_count); /* * We walk through the ms_allocatable trees that * are loaded with the allocated blocks from the * ms_sm spacemaps of the checkpoint. For each * one of these ranges we ensure that none of them * exists in the ms_allocatable trees of the * current state which are loaded with the ranges * that are currently free. * * This way we ensure that none of the blocks that * are part of the checkpoint were freed by mistake. */ range_tree_walk(ckpoint_msp->ms_allocatable, - (range_tree_func_t *)range_tree_verify, + (range_tree_func_t *)range_tree_verify_not_present, current_msp->ms_allocatable); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } static void verify_checkpoint_blocks(spa_t *spa) { + ASSERT(!dump_opt['L']); + spa_t *checkpoint_spa; char *checkpoint_pool; nvlist_t *config = NULL; int error = 0; /* * We import the checkpointed state of the pool (under a different * name) so we can do verification on it against the current state * of the pool. */ checkpoint_pool = import_checkpointed_state(spa->spa_name, config, NULL); ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but spa_open() failed with " "error %d\n", checkpoint_pool, error); } /* * Ensure that ranges in the checkpoint space maps of each vdev * are allocated according to the checkpointed state's metaslab * space maps. */ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); /* * Ensure that allocated ranges in the checkpoint's metaslab * space maps remain allocated in the metaslab space maps of * the current state. */ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); /* * Once we are done, we get rid of the checkpointed state. */ spa_close(checkpoint_spa, FTAG); free(checkpoint_pool); } static void dump_leftover_checkpoint_blocks(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (vd->vdev_top_zap == 0) continue; if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); - space_map_update(checkpoint_sm); dump_spacemap(spa->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } } static int verify_checkpoint(spa_t *spa) { uberblock_t checkpoint; int error; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (0); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT && !dump_opt['L']) { /* * If the feature is active but the uberblock is missing * then we must be in the middle of discarding the * checkpoint. */ (void) printf("\nPartially discarded checkpoint " "state found:\n"); dump_leftover_checkpoint_blocks(spa); return (0); } else if (error != 0) { (void) printf("lookup error %d when looking for " "checkpointed uberblock in MOS\n", error); return (error); } dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); if (checkpoint.ub_checkpoint_txg == 0) { (void) printf("\nub_checkpoint_txg not set in checkpointed " "uberblock\n"); error = 3; } if (error == 0 && !dump_opt['L']) verify_checkpoint_blocks(spa); return (error); } /* ARGSUSED */ static void mos_leaks_cb(void *arg, uint64_t start, uint64_t size) { for (uint64_t i = start; i < size; i++) { (void) printf("MOS object %llu referenced but not allocated\n", (u_longlong_t)i); } } static range_tree_t *mos_refd_objs; static void mos_obj_refd(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL) range_tree_add(mos_refd_objs, obj, 1); } static void mos_leak_vdev(vdev_t *vd) { mos_obj_refd(vd->vdev_dtl_object); mos_obj_refd(vd->vdev_ms_array); mos_obj_refd(vd->vdev_top_zap); mos_obj_refd(vd->vdev_indirect_config.vic_births_object); mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); mos_obj_refd(vd->vdev_leaf_zap); if (vd->vdev_checkpoint_sm != NULL) mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); if (vd->vdev_indirect_mapping != NULL) { mos_obj_refd(vd->vdev_indirect_mapping-> vim_phys->vimp_counts_object); } if (vd->vdev_obsolete_sm != NULL) mos_obj_refd(vd->vdev_obsolete_sm->sm_object); for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *ms = vd->vdev_ms[m]; mos_obj_refd(space_map_object(ms->ms_sm)); } for (uint64_t c = 0; c < vd->vdev_children; c++) { mos_leak_vdev(vd->vdev_child[c]); } } static int dump_mos_leaks(spa_t *spa) { int rv = 0; objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; /* Visit and mark all referenced objects in the MOS */ mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); mos_obj_refd(spa->spa_pool_props_object); mos_obj_refd(spa->spa_config_object); mos_obj_refd(spa->spa_ddt_stat_object); mos_obj_refd(spa->spa_feat_desc_obj); mos_obj_refd(spa->spa_feat_enabled_txg_obj); mos_obj_refd(spa->spa_feat_for_read_obj); mos_obj_refd(spa->spa_feat_for_write_obj); mos_obj_refd(spa->spa_history); mos_obj_refd(spa->spa_errlog_last); mos_obj_refd(spa->spa_errlog_scrub); mos_obj_refd(spa->spa_all_vdev_zaps); mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); bpobj_count_refd(&spa->spa_deferred_bpobj); mos_obj_refd(dp->dp_empty_bpobj); bpobj_count_refd(&dp->dp_obsolete_bpobj); bpobj_count_refd(&dp->dp_free_bpobj); mos_obj_refd(spa->spa_l2cache.sav_object); mos_obj_refd(spa->spa_spares.sav_object); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_next_mapping_object); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_prev_obsolete_sm_object); if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(mos, spa->spa_condensing_indirect_phys.scip_next_mapping_object); mos_obj_refd(vim->vim_phys->vimp_counts_object); vdev_indirect_mapping_close(vim); } if (dp->dp_origin_snap != NULL) { dsl_dataset_t *ds; dsl_pool_config_enter(dp, FTAG); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, FTAG, &ds)); count_ds_mos_objects(ds); dump_deadlist(&ds->ds_deadlist); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); count_ds_mos_objects(dp->dp_origin_snap); dump_deadlist(&dp->dp_origin_snap->ds_deadlist); } count_dir_mos_objects(dp->dp_mos_dir); if (dp->dp_free_dir != NULL) count_dir_mos_objects(dp->dp_free_dir); if (dp->dp_leak_dir != NULL) count_dir_mos_objects(dp->dp_leak_dir); mos_leak_vdev(spa->spa_root_vdev); for (uint64_t class = 0; class < DDT_CLASSES; class++) { for (uint64_t type = 0; type < DDT_TYPES; type++) { for (uint64_t cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { ddt_t *ddt = spa->spa_ddt[cksum]; mos_obj_refd(ddt->ddt_object[type][class]); } } } /* * Visit all allocated objects and make sure they are referenced. */ uint64_t object = 0; while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { if (range_tree_contains(mos_refd_objs, object, 1)) { range_tree_remove(mos_refd_objs, object, 1); } else { dmu_object_info_t doi; const char *name; dmu_object_info(mos, object, &doi); if (doi.doi_type & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(doi.doi_type); name = dmu_ot_byteswap[bswap].ob_name; } else { name = dmu_ot[doi.doi_type].ot_name; } (void) printf("MOS object %llu (%s) leaked\n", (u_longlong_t)object, name); rv = 2; } } (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); if (!range_tree_is_empty(mos_refd_objs)) rv = 2; range_tree_vacate(mos_refd_objs, NULL, NULL); range_tree_destroy(mos_refd_objs); return (rv); } static void dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; if (dump_opt['S']) { dump_simulated_ddt(spa); return; } if (!dump_opt['e'] && dump_opt['C'] > 1) { (void) printf("\nCached configuration:\n"); dump_nvlist(spa->spa_config, 8); } if (dump_opt['C']) dump_config(spa); if (dump_opt['u']) dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); if (dump_opt['D']) dump_all_ddts(spa); if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa); if (dump_opt['d'] || dump_opt['i']) { mos_refd_objs = range_tree_create(NULL, NULL); dump_dir(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dsl_pool_t *dp = spa->spa_dsl_pool; dump_full_bpobj(&spa->spa_deferred_bpobj, "Deferred frees", 0); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { dump_full_bpobj(&dp->dp_free_bpobj, "Pool snapshot frees", 0); } if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); dump_full_bpobj(&dp->dp_obsolete_bpobj, "Pool obsolete blocks", 0); } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { dump_bptree(spa->spa_meta_objset, dp->dp_bptree_obj, "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); } (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); if (rc == 0 && !dump_opt['L']) rc = dump_mos_leaks(spa); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { uint64_t refcount; if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) { ASSERT0(dataset_feature_count[f]); continue; } (void) feature_get_refcount(spa, &spa_feature_table[f], &refcount); if (dataset_feature_count[f] != refcount) { (void) printf("%s feature refcount mismatch: " "%lld datasets != %lld refcount\n", spa_feature_table[f].fi_uname, (longlong_t)dataset_feature_count[f], (longlong_t)refcount); rc = 2; } else { (void) printf("Verified %s feature refcount " "of %llu is correct\n", spa_feature_table[f].fi_uname, (longlong_t)refcount); } } if (rc == 0) { rc = verify_device_removal_feature_counts(spa); } } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); if (rc == 0) rc = verify_spacemap_refcounts(spa); if (dump_opt['s']) show_pool_stats(spa); if (dump_opt['h']) dump_history(spa); if (rc == 0) rc = verify_checkpoint(spa); if (rc != 0) { dump_debug_buffer(); exit(rc); } } #define ZDB_FLAG_CHECKSUM 0x0001 #define ZDB_FLAG_DECOMPRESS 0x0002 #define ZDB_FLAG_BSWAP 0x0004 #define ZDB_FLAG_GBH 0x0008 #define ZDB_FLAG_INDIRECT 0x0010 #define ZDB_FLAG_PHYS 0x0020 #define ZDB_FLAG_RAW 0x0040 #define ZDB_FLAG_PRINT_BLKPTR 0x0080 static int flagbits[256]; static void zdb_print_blkptr(blkptr_t *bp, int flags) { char blkbuf[BP_SPRINTF_LEN]; if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static void zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) { int i; for (i = 0; i < nbps; i++) zdb_print_blkptr(&bp[i], flags); } static void zdb_dump_gbh(void *buf, int flags) { zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); } static void zdb_dump_block_raw(void *buf, uint64_t size, int flags) { if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array(buf, size); (void) write(1, buf, size); } static void zdb_dump_block(char *label, void *buf, uint64_t size, int flags) { uint64_t *d = (uint64_t *)buf; unsigned nwords = size / sizeof (uint64_t); int do_bswap = !!(flags & ZDB_FLAG_BSWAP); unsigned i, j; const char *hdr; char *c; if (do_bswap) hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; else hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); for (i = 0; i < nwords; i += 2) { (void) printf("%06llx: %016llx %016llx ", (u_longlong_t)(i * sizeof (uint64_t)), (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); c = (char *)&d[i]; for (j = 0; j < 2 * sizeof (uint64_t); j++) (void) printf("%c", isprint(c[j]) ? c[j] : '.'); (void) printf("\n"); } } /* * There are two acceptable formats: * leaf_name - For example: c1t0d0 or /tmp/ztest.0a * child[.child]* - For example: 0.1.1 * * The second form can be used to specify arbitrary vdevs anywhere * in the heirarchy. For example, in a pool with a mirror of * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . */ static vdev_t * zdb_vdev_lookup(vdev_t *vdev, const char *path) { char *s, *p, *q; unsigned i; if (vdev == NULL) return (NULL); /* First, assume the x.x.x.x format */ i = strtoul(path, &s, 10); if (s == path || (s && *s != '.' && *s != '\0')) goto name; if (i >= vdev->vdev_children) return (NULL); vdev = vdev->vdev_child[i]; if (*s == '\0') return (vdev); return (zdb_vdev_lookup(vdev, s+1)); name: for (i = 0; i < vdev->vdev_children; i++) { vdev_t *vc = vdev->vdev_child[i]; if (vc->vdev_path == NULL) { vc = zdb_vdev_lookup(vc, path); if (vc == NULL) continue; else return (vc); } p = strrchr(vc->vdev_path, '/'); p = p ? p + 1 : vc->vdev_path; q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; if (strcmp(vc->vdev_path, path) == 0) return (vc); if (strcmp(p, path) == 0) return (vc); if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) return (vc); } return (NULL); } /* ARGSUSED */ static int random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused) { return (random_get_pseudo_bytes(buf, len)); } /* * Read a block from a pool and print it out. The syntax of the * block descriptor is: * * pool:vdev_specifier:offset:size[:flags] * * pool - The name of the pool you wish to read from * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) * offset - offset, in hex, in bytes * size - Amount of data to read, in hex, in bytes * flags - A string of characters specifying options * b: Decode a blkptr at given offset within block * *c: Calculate and display checksums * d: Decompress data before dumping * e: Byteswap data before dumping * g: Display data as a gang block header * i: Display as an indirect block * p: Do I/O to physical offset * r: Dump raw data to stdout * * * = not yet implemented */ static void zdb_read_block(char *thing, spa_t *spa) { blkptr_t blk, *bp = &blk; dva_t *dva = bp->blk_dva; int flags = 0; uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; abd_t *pabd; void *lbuf, *buf; const char *s, *vdev; char *p, *dup, *flagstr; int i, error; dup = strdup(thing); s = strtok(dup, ":"); vdev = s ? s : ""; s = strtok(NULL, ":"); offset = strtoull(s ? s : "", NULL, 16); s = strtok(NULL, ":"); size = strtoull(s ? s : "", NULL, 16); s = strtok(NULL, ":"); if (s) flagstr = strdup(s); else flagstr = strdup(""); s = NULL; if (size == 0) s = "size must not be zero"; if (!IS_P2ALIGNED(size, DEV_BSIZE)) s = "size must be a multiple of sector size"; if (!IS_P2ALIGNED(offset, DEV_BSIZE)) s = "offset must be a multiple of sector size"; if (s) { (void) printf("Invalid block specifier: %s - %s\n", thing, s); free(dup); return; } for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { for (i = 0; flagstr[i]; i++) { int bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { (void) printf("***Invalid flag: %c\n", flagstr[i]); continue; } flags |= bit; /* If it's not something with an argument, keep going */ if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_PRINT_BLKPTR)) == 0) continue; p = &flagstr[i + 1]; if (bit == ZDB_FLAG_PRINT_BLKPTR) blkptr_offset = strtoull(p, &p, 16); if (*p != ':' && *p != '\0') { (void) printf("***Invalid flag arg: '%s'\n", s); free(dup); return; } } } free(flagstr); vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { (void) printf("***Invalid vdev: %s\n", vdev); free(dup); return; } else { if (vd->vdev_path) (void) fprintf(stderr, "Found vdev: %s\n", vd->vdev_path); else (void) fprintf(stderr, "Found vdev type: %s\n", vd->vdev_ops->vdev_op_type); } psize = size; lsize = size; pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); DVA_SET_VDEV(&dva[0], vd->vdev_id); DVA_SET_OFFSET(&dva[0], offset); DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, lsize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, 0); if (vd == vd->vdev_top) { /* * Treat this as a normal block read. */ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); if (error) { (void) printf("Read of %s failed, error: %d\n", thing, error); goto out; } if (flags & ZDB_FLAG_DECOMPRESS) { /* * We don't know how the data was compressed, so just try * every decompress function at every inflated blocksize. */ enum zio_compress c; void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); abd_copy_to_buf(pbuf2, pabd, psize); VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize, random_get_pseudo_bytes_cb, NULL)); VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, SPA_MAXBLOCKSIZE - psize)); for (lsize = SPA_MAXBLOCKSIZE; lsize > psize; lsize -= SPA_MINBLOCKSIZE) { for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) { if (zio_decompress_data(c, pabd, lbuf, psize, lsize) == 0 && zio_decompress_data_buf(c, pbuf2, lbuf2, psize, lsize) == 0 && bcmp(lbuf, lbuf2, lsize) == 0) break; } if (c != ZIO_COMPRESS_FUNCTIONS) break; lsize -= SPA_MINBLOCKSIZE; } umem_free(pbuf2, SPA_MAXBLOCKSIZE); umem_free(lbuf2, SPA_MAXBLOCKSIZE); if (lsize <= psize) { (void) printf("Decompress of %s failed\n", thing); goto out; } buf = lbuf; size = lsize; } else { buf = abd_to_buf(pabd); size = psize; } if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); else if (flags & ZDB_FLAG_RAW) zdb_dump_block_raw(buf, size, flags); else if (flags & ZDB_FLAG_INDIRECT) zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) zdb_dump_gbh(buf, flags); else zdb_dump_block(thing, buf, size, flags); out: abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); } static void zdb_embedded_block(char *thing) { blkptr_t bp; unsigned long long *words = (void *)&bp; char *buf; int err; bzero(&bp, sizeof (bp)); err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", words + 0, words + 1, words + 2, words + 3, words + 4, words + 5, words + 6, words + 7, words + 8, words + 9, words + 10, words + 11, words + 12, words + 13, words + 14, words + 15); if (err != 16) { (void) fprintf(stderr, "invalid input format\n"); exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); exit(1); } err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { (void) fprintf(stderr, "decode failed: %u\n", err); exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); free(buf); } int main(int argc, char **argv) { int c; struct rlimit rl = { 1024, 1024 }; spa_t *spa = NULL; objset_t *os = NULL; int dump_all = 1; int verbose = 0; int error = 0; char **searchdirs = NULL; int nsearch = 0; char *target, *target_pool; nvlist_t *policy = NULL; uint64_t max_txg = UINT64_MAX; int flags = ZFS_IMPORT_MISSING_LOG; int rewind = ZPOOL_NEVER_REWIND; char *spa_config_path_env; boolean_t target_is_spa = B_TRUE; nvlist_t *cfg = NULL; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); dprintf_setup(&argc, argv); /* * If there is an environment variable SPA_CONFIG_PATH it overrides * default spa_config_path setting. If -U flag is specified it will * override this environment variable settings once again. */ spa_config_path_env = getenv("SPA_CONFIG_PATH"); if (spa_config_path_env != NULL) spa_config_path = spa_config_path_env; while ((c = getopt(argc, argv, "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) { switch (c) { case 'b': case 'c': case 'C': case 'd': case 'D': case 'E': case 'G': case 'h': case 'i': case 'l': case 'm': case 'M': case 'O': case 'R': case 's': case 'S': case 'u': dump_opt[c]++; dump_all = 0; break; case 'A': case 'e': case 'F': case 'k': case 'L': case 'P': case 'q': case 'X': dump_opt[c]++; break; /* NB: Sort single match options below. */ case 'I': max_inflight = strtoull(optarg, NULL, 0); if (max_inflight == 0) { (void) fprintf(stderr, "maximum number " "of inflight I/Os must be greater " "than 0\n"); usage(); } break; case 'o': error = set_global_var(optarg); if (error != 0) usage(); break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); } else { char **tmp = umem_alloc((nsearch + 1) * sizeof (char *), UMEM_NOFAIL); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 't': max_txg = strtoull(optarg, NULL, 0); if (max_txg < TXG_INITIAL) { (void) fprintf(stderr, "incorrect txg " "specified: %s\n", optarg); usage(); } break; case 'U': spa_config_path = optarg; if (spa_config_path[0] != '/') { (void) fprintf(stderr, "cachefile must be an absolute path " "(i.e. start with a slash)\n"); usage(); } break; case 'v': verbose++; break; case 'V': flags = ZFS_IMPORT_VERBATIM; break; case 'x': vn_dumpdir = optarg; break; default: usage(); break; } } if (!dump_opt['e'] && searchdirs != NULL) { (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } /* * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; /* * "zdb -c" uses checksum-verifying scrub i/os which are async reads. * "zdb -b" uses traversal prefetch which uses async reads. * For good performance, let several of them be active at once. */ zfs_vdev_async_read_max_active = 10; /* * Disable reference tracking for better performance. */ reference_tracking_enable = B_FALSE; /* * Do not fail spa_load when spa_load_verify fails. This is needed * to load non-idle pools. */ spa_load_verify_dryrun = B_TRUE; kernel_init(FREAD); g_zfs = libzfs_init(); ASSERT(g_zfs != NULL); if (dump_all) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { if (dump_all && strchr("AeEFklLOPRSX", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); zfs_recover = (dump_opt['A'] > 1); argc -= optind; argv += optind; if (argc < 2 && dump_opt['R']) usage(); if (dump_opt['E']) { if (argc != 1) usage(); zdb_embedded_block(argv[0]); return (0); } if (argc < 1) { if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); return (0); } usage(); } if (dump_opt['l']) return (dump_label(argv[0])); if (dump_opt['O']) { if (argc != 2) usage(); dump_opt['v'] = verbose + 3; return (dump_path(argv[0], argv[1])); } if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) fatal("internal error: %s", strerror(ENOMEM)); error = 0; target = argv[0]; if (strpbrk(target, "/@") != NULL) { size_t targetlen; target_pool = strdup(target); *strpbrk(target_pool, "/@") = '\0'; target_is_spa = B_FALSE; targetlen = strlen(target); if (targetlen && target[targetlen - 1] == '/') target[targetlen - 1] = '\0'; } else { target_pool = target; } if (dump_opt['e']) { importargs_t args = { 0 }; args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_TRUE; error = zpool_tryimport(g_zfs, target_pool, &cfg, &args); if (error == 0) { if (nvlist_add_nvlist(cfg, ZPOOL_LOAD_POLICY, policy) != 0) { fatal("can't open '%s': %s", target, strerror(ENOMEM)); } if (dump_opt['C'] > 1) { (void) printf("\nConfiguration for import:\n"); dump_nvlist(cfg, 8); } /* * Disable the activity check to allow examination of * active pools. */ error = spa_import(target_pool, cfg, NULL, flags | ZFS_IMPORT_SKIP_MMP); } } char *checkpoint_pool = NULL; char *checkpoint_target = NULL; if (dump_opt['k']) { checkpoint_pool = import_checkpointed_state(target, cfg, &checkpoint_target); if (checkpoint_target != NULL) target = checkpoint_target; } if (error == 0) { if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { ASSERT(checkpoint_pool != NULL); ASSERT(checkpoint_target == NULL); error = spa_open(checkpoint_pool, &spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but " "spa_open() failed with error %d\n", checkpoint_pool, error); } } else if (target_is_spa || dump_opt['R']) { zdb_set_skip_mmp(target); error = spa_open_rewind(target, &spa, FTAG, policy, NULL); if (error) { /* * If we're missing the log device then * try opening the pool after clearing the * log state. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL && spa->spa_log_state == SPA_LOG_MISSING) { spa->spa_log_state = SPA_LOG_CLEAR; error = 0; } mutex_exit(&spa_namespace_lock); if (!error) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); } } } else { zdb_set_skip_mmp(target); error = open_objset(target, DMU_OST_ANY, FTAG, &os); } } nvlist_free(policy); if (error) fatal("can't open '%s': %s", target, strerror(error)); argv++; argc--; if (!dump_opt['R']) { if (argc > 0) { zopt_objects = argc; zopt_object = calloc(zopt_objects, sizeof (uint64_t)); for (unsigned i = 0; i < zopt_objects; i++) { errno = 0; zopt_object[i] = strtoull(argv[i], NULL, 0); if (zopt_object[i] == 0 && errno != 0) fatal("bad number %s: %s", argv[i], strerror(errno)); } } if (os != NULL) { dump_dir(os); } else if (zopt_objects > 0 && !dump_opt['m']) { dump_dir(spa->spa_meta_objset); } else { dump_zpool(spa); } } else { flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; flagbits['c'] = ZDB_FLAG_CHECKSUM; flagbits['d'] = ZDB_FLAG_DECOMPRESS; flagbits['e'] = ZDB_FLAG_BSWAP; flagbits['g'] = ZDB_FLAG_GBH; flagbits['i'] = ZDB_FLAG_INDIRECT; flagbits['p'] = ZDB_FLAG_PHYS; flagbits['r'] = ZDB_FLAG_RAW; for (int i = 0; i < argc; i++) zdb_read_block(argv[i], spa); } if (dump_opt['k']) { free(checkpoint_pool); if (!target_is_spa) free(checkpoint_target); } if (os != NULL) close_objset(os, FTAG); else spa_close(spa, FTAG); fuid_table_destroy(); dump_debug_buffer(); libzfs_fini(g_zfs); kernel_fini(); return (error); } Index: vendor/illumos/dist/man/man1m/zdb.1m =================================================================== --- vendor/illumos/dist/man/man1m/zdb.1m (revision 354382) +++ vendor/illumos/dist/man/man1m/zdb.1m (revision 354383) @@ -1,409 +1,409 @@ .\" .\" This file and its contents are supplied under the terms of the .\" Common Development and Distribution License ("CDDL"), version 1.0. .\" You may only use this file in accordance with the terms of version .\" 1.0 of the CDDL. .\" .\" A full copy of the text of the CDDL should have accompanied this .\" source. A copy of the CDDL is also available via the Internet at .\" http://www.illumos.org/license/CDDL. .\" .\" .\" Copyright 2012, Richard Lowe. -.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" .Dd April 14, 2017 .Dt ZDB 1M .Os .Sh NAME .Nm zdb .Nd display zpool debugging and consistency information .Sh SYNOPSIS .Nm .Op Fl AbcdDFGhikLMPsvX .Op Fl e Oo Fl V Oc Op Fl p Ar path ... .Op Fl I Ar inflight I/Os .Oo Fl o Ar var Ns = Ns Ar value Oc Ns ... .Op Fl t Ar txg .Op Fl U Ar cache .Op Fl x Ar dumpdir .Op Ar poolname Op Ar object ... .Nm .Op Fl AdiPv .Op Fl e Oo Fl V Oc Op Fl p Ar path ... .Op Fl U Ar cache .Ar dataset Op Ar object ... .Nm .Fl C .Op Fl A .Op Fl U Ar cache .Nm .Fl E .Op Fl A .Ar word0 Ns \&: Ns Ar word1 Ns :...: Ns Ar word15 .Nm .Fl l .Op Fl Aqu .Ar device .Nm .Fl m .Op Fl AFLPX .Op Fl e Oo Fl V Oc Op Fl p Ar path ... .Op Fl t Ar txg .Op Fl U Ar cache .Ar poolname Op Ar vdev Op Ar metaslab ... .Nm .Fl O .Ar dataset path .Nm .Fl R .Op Fl A .Op Fl e Oo Fl V Oc Op Fl p Ar path ... .Op Fl U Ar cache .Ar poolname vdev Ns \&: Ns Ar offset Ns \&: Ns Ar size Ns Op : Ns Ar flags .Nm .Fl S .Op Fl AP .Op Fl e Oo Fl V Oc Op Fl p Ar path ... .Op Fl U Ar cache .Ar poolname .Sh DESCRIPTION The .Nm utility displays information about a ZFS pool useful for debugging and performs some amount of consistency checking. It is a not a general purpose tool and options .Pq and facilities may change. This is neither a .Xr fsck 1M nor an .Xr fsdb 1M utility. .Pp The output of this command in general reflects the on-disk structure of a ZFS pool, and is inherently unstable. The precise output of most invocations is not documented, a knowledge of ZFS internals is assumed. .Pp If the .Ar dataset argument does not contain any .Qq Sy / or .Qq Sy @ characters, it is interpreted as a pool name. The root dataset can be specified as .Ar pool Ns / .Pq pool name followed by a slash . .Pp When operating on an imported and active pool it is possible, though unlikely, that zdb may interpret inconsistent pool data and behave erratically. .Sh OPTIONS Display options: .Bl -tag -width Ds .It Fl b Display statistics regarding the number, size .Pq logical, physical and allocated and deduplication of blocks. .It Fl c Verify the checksum of all metadata blocks while printing block statistics .Po see .Fl b .Pc . .Pp If specified multiple times, verify the checksums of all blocks. .It Fl C Display information about the configuration. If specified with no other options, instead display information about the cache file .Pq Pa /etc/zfs/zpool.cache . To specify the cache file to display, see .Fl U . .Pp If specified multiple times, and a pool name is also specified display both the cached configuration and the on-disk configuration. If specified multiple times with .Fl e also display the configuration that would be used were the pool to be imported. .It Fl d Display information about datasets. Specified once, displays basic dataset information: ID, create transaction, size, and object count. .Pp If specified multiple times provides greater and greater verbosity. .Pp If object IDs are specified, display information about those specific objects only. .It Fl D Display deduplication statistics, including the deduplication ratio .Pq Sy dedup , compression ratio .Pq Sy compress , inflation due to the zfs copies property .Pq Sy copies , and an overall effective ratio .Pq Sy dedup No * Sy compress No / Sy copies . .It Fl DD Display a histogram of deduplication statistics, showing the allocated .Pq physically present on disk and referenced .Pq logically referenced in the pool block counts and sizes by reference count. .It Fl DDD Display the statistics independently for each deduplication table. .It Fl DDDD Dump the contents of the deduplication tables describing duplicate blocks. .It Fl DDDDD Also dump the contents of the deduplication tables describing unique blocks. .It Fl E Ar word0 Ns \&: Ns Ar word1 Ns :...: Ns Ar word15 Decode and display block from an embedded block pointer specified by the .Ar word arguments. .It Fl h Display pool history similar to .Nm zpool Cm history , but include internal changes, transaction, and dataset information. .It Fl i Display information about intent log .Pq ZIL entries relating to each dataset. If specified multiple times, display counts of each intent log transaction type. .It Fl k Examine the checkpointed state of the pool. Note, the on disk format of the pool is not reverted to the checkpointed state. .It Fl l Ar device Read the vdev labels from the specified device. .Nm Fl l will return 0 if valid label was found, 1 if error occurred, and 2 if no valid labels were found. .Pp If the .Fl q option is also specified, don't print the labels. .Pp If the .Fl u option is also specified, also display the uberblocks on this device. .It Fl L -Disable leak tracing and the loading of space maps. +Disable leak detection and the loading of space maps. By default, .Nm verifies that all non-free blocks are referenced, which can be very expensive. .It Fl m Display the offset, spacemap, and free space of each metaslab. .It Fl mm Also display information about the on-disk free space histogram associated with each metaslab. .It Fl mmm Display the maximum contiguous free space, the in-core free space histogram, and the percentage of free space in each space map. .It Fl mmmm Display every spacemap record. .It Fl M Display the offset, spacemap, and free space of each metaslab. .It Fl MM Also display information about the maximum contiguous free space and the percentage of free space in each space map. .It Fl MMM Display every spacemap record. .It Fl O Ar dataset path Look up the specified .Ar path inside of the .Ar dataset and display its metadata and indirect blocks. Specified .Ar path must be relative to the root of .Ar dataset . This option can be combined with .Fl v for increasing verbosity. .It Xo .Fl R Ar poolname vdev Ns \&: Ns Ar offset Ns \&: Ns Ar size Ns Op : Ns Ar flags .Xc Read and display a block from the specified device. By default the block is displayed as a hex dump, but see the description of the .Sy r flag, below. .Pp The block is specified in terms of a colon-separated tuple .Ar vdev .Pq an integer vdev identifier .Ar offset .Pq the offset within the vdev .Ar size .Pq the size of the block to read and, optionally, .Ar flags .Pq a set of flags, described below . .Pp .Bl -tag -compact -width "b offset" .It Sy b Ar offset Print block pointer .It Sy d Decompress the block .It Sy e Byte swap the block .It Sy g Dump gang block header .It Sy i Dump indirect block .It Sy r Dump raw uninterpreted block data .El .It Fl s Report statistics on .Nm zdb I/O. Display operation counts, bandwidth, and error counts of I/O to the pool from .Nm . .It Fl S Simulate the effects of deduplication, constructing a DDT and then display that DDT as with .Fl DD . .It Fl u Display the current uberblock. .El .Pp Other options: .Bl -tag -width Ds .It Fl A Do not abort should any assertion fail. .It Fl AA Enable panic recovery, certain errors which would otherwise be fatal are demoted to warnings. .It Fl AAA Do not abort if asserts fail and also enable panic recovery. .It Fl e Op Fl p Ar path ... Operate on an exported pool, not present in .Pa /etc/zfs/zpool.cache . The .Fl p flag specifies the path under which devices are to be searched. .It Fl x Ar dumpdir All blocks accessed will be copied to files in the specified directory. The blocks will be placed in sparse files whose name is the same as that of the file or device read. .Nm can be then run on the generated files. Note that the .Fl bbc flags are sufficient to access .Pq and thus copy all metadata on the pool. .It Fl F Attempt to make an unreadable pool readable by trying progressively older transactions. .It Fl G Dump the contents of the zfs_dbgmsg buffer before exiting .Nm . zfs_dbgmsg is a buffer used by ZFS to dump advanced debug information. .It Fl I Ar inflight I/Os Limit the number of outstanding checksum I/Os to the specified value. The default value is 200. This option affects the performance of the .Fl c option. .It Fl o Ar var Ns = Ns Ar value ... Set the given global libzpool variable to the provided value. The value must be an unsigned 32-bit integer. Currently only little-endian systems are supported to avoid accidentally setting the high 32 bits of 64-bit variables. .It Fl P Print numbers in an unscaled form more amenable to parsing, eg. 1000000 rather than 1M. .It Fl t Ar transaction Specify the highest transaction to use when searching for uberblocks. See also the .Fl u and .Fl l options for a means to see the available uberblocks and their associated transaction numbers. .It Fl U Ar cachefile Use a cache file other than .Pa /etc/zfs/zpool.cache . .It Fl v Enable verbosity. Specify multiple times for increased verbosity. .It Fl V Attempt verbatim import. This mimics the behavior of the kernel when loading a pool from a cachefile. Only usable with .Fl e . .It Fl X Attempt .Qq extreme transaction rewind, that is attempt the same recovery as .Fl F but read transactions otherwise deemed too old. .El .Pp Specifying a display option more than once enables verbosity for only that option, with more occurrences enabling more verbosity. .Pp If no options are specified, all information about the named pool will be displayed at default verbosity. .Sh EXAMPLES .Bl -tag -width Ds .It Xo .Sy Example 1 Display the configuration of imported pool .Pa rpool .Xc .Bd -literal # zdb -C rpool MOS Configuration: version: 28 name: 'rpool' ... .Ed .It Xo .Sy Example 2 Display basic dataset information about .Pa rpool .Xc .Bd -literal # zdb -d rpool Dataset mos [META], ID 0, cr_txg 4, 26.9M, 1051 objects Dataset rpool/swap [ZVOL], ID 59, cr_txg 356, 486M, 2 objects ... .Ed .It Xo .Sy Example 3 Display basic information about object 0 in .Pa rpool/export/home .Xc .Bd -literal # zdb -d rpool/export/home 0 Dataset rpool/export/home [ZPL], ID 137, cr_txg 1546, 32K, 8 objects Object lvl iblk dblk dsize lsize %full type 0 7 16K 16K 15.0K 16K 25.00 DMU dnode .Ed .It Xo .Sy Example 4 Display the predicted effect of enabling deduplication on .Pa rpool .Xc .Bd -literal # zdb -S rpool Simulated DDT histogram: bucket allocated referenced ______ ______________________________ ______________________________ refcnt blocks LSIZE PSIZE DSIZE blocks LSIZE PSIZE DSIZE ------ ------ ----- ----- ----- ------ ----- ----- ----- 1 694K 27.1G 15.0G 15.0G 694K 27.1G 15.0G 15.0G 2 35.0K 1.33G 699M 699M 74.7K 2.79G 1.45G 1.45G ... dedup = 1.11, compress = 1.80, copies = 1.00, dedup * compress / copies = 2.00 .Ed .El .Sh SEE ALSO .Xr zfs 1M , .Xr zpool 1M Index: vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c (revision 354383) @@ -1,4222 +1,4570 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* * Since we can touch multiple metaslabs (and their respective space maps) * with each transaction group, we benefit from having a smaller space map * block size since it allows us to issue more I/O operations scattered * around the disk. */ int zfs_metaslab_sm_blksz = (1 << 12); /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ int zfs_condense_pct = 200; /* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksize), so a metaslab might use the * same number of blocks after condensing. Since the goal of condensing is to * reduce the number of IOPs required to read the space map, we only want to * condense when we can be sure we will reduce the number of blocks used by the * space map. Unfortunately, we cannot precisely compute whether or not this is * the case in metaslab_should_condense since we are holding ms_lock. Instead, * we apply the following heuristic: do not condense a spacemap unless the * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold * blocks. */ int zfs_metaslab_condense_block_threshold = 4; /* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of * free space. Metaslab groups that have more free space than * zfs_mg_noalloc_threshold are always eligible for allocations. Once * a metaslab group's free space is less than or equal to the * zfs_mg_noalloc_threshold the allocator will avoid allocating to that * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. * Once all groups in the pool reach zfs_mg_noalloc_threshold then all * groups are allowed to accept allocations. Gang blocks are always * eligible to allocate on any metaslab group. The default value of 0 means * no metaslab group will be excluded based on this criterion. */ int zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their * fragmenation metric (measured as a percentage) is less than or equal to * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold * then it will be skipped unless all metaslab groups within the metaslab * class have also crossed this threshold. */ int zfs_mg_fragmentation_threshold = 85; /* * Allow metaslabs to keep their active state as long as their fragmentation * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An * active metaslab that exceeds this threshold will no longer keep its active * status allowing better metaslabs to be selected. */ int zfs_metaslab_fragmentation_threshold = 70; /* * When set will load all metaslabs when pool is first opened. */ int metaslab_debug_load = 0; /* * When set will prevent metaslabs from being unloaded. */ int metaslab_debug_unload = 0; /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ int metaslab_df_free_pct = 4; /* * A metaslab is considered "free" if it contains a contiguous * segment which is greater than metaslab_min_alloc_size. */ uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; /* * Percentage of all cpus that can be used by the metaslab taskq. */ int metaslab_load_pct = 50; /* * Determines how many txgs a metaslab may remain loaded without having any * allocations from it. As long as a metaslab continues to be used we will * keep it loaded. */ int metaslab_unload_delay = TXG_SIZE * 2; /* * Max number of metaslabs per group to preload. */ int metaslab_preload_limit = SPA_DVAS_PER_BP; /* * Enable/disable preloading of metaslab. */ boolean_t metaslab_preload_enabled = B_TRUE; /* * Enable/disable fragmentation weighting on metaslabs. */ boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; /* * Enable/disable lba weighting (i.e. outer tracks are given preference). */ boolean_t metaslab_lba_weighting_enabled = B_TRUE; /* * Enable/disable metaslab group biasing. */ boolean_t metaslab_bias_enabled = B_TRUE; /* * Enable/disable remapping of indirect DVAs to their concrete vdevs. */ boolean_t zfs_remap_blkptr_enable = B_TRUE; /* * Enable/disable segment-based metaslab selection. */ boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; /* * When using segment-based metaslab selection, we will continue * allocating from the active metaslab until we have exhausted * zfs_metaslab_switch_threshold of its buckets. */ int zfs_metaslab_switch_threshold = 2; /* * Internal switch to enable/disable the metaslab allocation tracing * facility. */ boolean_t metaslab_trace_enabled = B_TRUE; /* * Maximum entries that the metaslab allocation tracing facility will keep * in a given list when running in non-debug mode. We limit the number * of entries in non-debug mode to prevent us from using up too much memory. * The limit should be sufficiently large that we don't expect any allocation * to every exceed this value. In debug mode, the system will panic if this * limit is ever reached allowing for further investigation. */ uint64_t metaslab_trace_max_entries = 5000; static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); kmem_cache_t *metaslab_alloc_trace_cache; /* * ========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) { metaslab_class_t *mc; mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); mc->mc_spa = spa; mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * sizeof (zfs_refcount_t), KM_SLEEP); mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * sizeof (uint64_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); return (mc); } void metaslab_class_destroy(metaslab_class_t *mc) { ASSERT(mc->mc_rotor == NULL); ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) zfs_refcount_destroy(&mc->mc_alloc_slots[i]); kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * sizeof (zfs_refcount_t)); kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * sizeof (uint64_t)); mutex_destroy(&mc->mc_lock); kmem_free(mc, sizeof (metaslab_class_t)); } int metaslab_class_validate(metaslab_class_t *mc) { metaslab_group_t *mg; vdev_t *vd; /* * Must hold one of the spa_config locks. */ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); if ((mg = mc->mc_rotor) == NULL) return (0); do { vd = mg->mg_vd; ASSERT(vd->vdev_mg != NULL); ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); } while ((mg = mg->mg_next) != mc->mc_rotor); return (0); } static void metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { atomic_add_64(&mc->mc_alloc, alloc_delta); atomic_add_64(&mc->mc_deferred, defer_delta); atomic_add_64(&mc->mc_space, space_delta); atomic_add_64(&mc->mc_dspace, dspace_delta); } uint64_t metaslab_class_get_alloc(metaslab_class_t *mc) { return (mc->mc_alloc); } uint64_t metaslab_class_get_deferred(metaslab_class_t *mc) { return (mc->mc_deferred); } uint64_t metaslab_class_get_space(metaslab_class_t *mc) { return (mc->mc_space); } uint64_t metaslab_class_get_dspace(metaslab_class_t *mc) { return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } void metaslab_class_histogram_verify(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t *mc_hist; int i; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; /* * Skip any holes, uninitialized top-levels, or * vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } /* * Calculate the metaslab class's fragmentation metric. The metric * is weighted based on the space contribution of each metaslab group. * The return value will be a number between 0 and 100 (inclusive), or * ZFS_FRAG_INVALID if the metric has not been set. See comment above the * zfs_frag_table for more information about the metric. */ uint64_t metaslab_class_fragmentation(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t fragmentation = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; /* * Skip any holes, uninitialized top-levels, * or vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * If a metaslab group does not contain a fragmentation * metric then just bail out. */ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (ZFS_FRAG_INVALID); } /* * Determine how much this metaslab_group is contributing * to the overall pool fragmentation metric. */ fragmentation += mg->mg_fragmentation * metaslab_group_get_space(mg); } fragmentation /= metaslab_class_get_space(mc); ASSERT3U(fragmentation, <=, 100); spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (fragmentation); } /* * Calculate the amount of expandable space that is available in * this metaslab class. If a device is expanded then its expandable * space will be the amount of allocatable space that is currently not * part of this metaslab class. */ uint64_t metaslab_class_expandable_space(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t space = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { uint64_t tspace; vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * Calculate if we have enough space to add additional * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. * Adjust by efi system partition size. */ tspace = tvd->vdev_max_asize - tvd->vdev_asize; if (tspace > mc->mc_spa->spa_bootsize) { tspace -= mc->mc_spa->spa_bootsize; } space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); } static int metaslab_compare(const void *x1, const void *x2) { const metaslab_t *m1 = (const metaslab_t *)x1; const metaslab_t *m2 = (const metaslab_t *)x2; int sort1 = 0; int sort2 = 0; if (m1->ms_allocator != -1 && m1->ms_primary) sort1 = 1; else if (m1->ms_allocator != -1 && !m1->ms_primary) sort1 = 2; if (m2->ms_allocator != -1 && m2->ms_primary) sort2 = 1; else if (m2->ms_allocator != -1 && !m2->ms_primary) sort2 = 2; /* * Sort inactive metaslabs first, then primaries, then secondaries. When * selecting a metaslab to allocate from, an allocator first tries its * primary, then secondary active metaslab. If it doesn't have active * metaslabs, or can't allocate from them, it searches for an inactive * metaslab to activate. If it can't find a suitable one, it will steal * a primary or secondary metaslab from another allocator. */ if (sort1 < sort2) return (-1); if (sort1 > sort2) return (1); int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); return (AVL_CMP(m1->ms_start, m2->ms_start)); } +uint64_t +metaslab_allocated_space(metaslab_t *msp) +{ + return (msp->ms_allocated_space); +} + /* * Verify that the space accounting on disk matches the in-core range_trees. */ -void +static void metaslab_verify_space(metaslab_t *msp, uint64_t txg) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t allocated = 0; + uint64_t allocating = 0; uint64_t sm_free_space, msp_free_space; ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_condensing); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can only verify the metaslab space when we're called - * from syncing context with a loaded metaslab that has an allocated - * space map. Calling this in non-syncing context does not - * provide a consistent view of the metaslab since we're performing - * allocations in the future. + * from syncing context with a loaded metaslab that has an + * allocated space map. Calling this in non-syncing context + * does not provide a consistent view of the metaslab since + * we're performing allocations in the future. */ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || !msp->ms_loaded) return; - sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) - - space_map_alloc_delta(msp->ms_sm); + /* + * Even though the smp_alloc field can get negative (e.g. + * see vdev_checkpoint_sm), that should never be the case + * when it come's to a metaslab's space map. + */ + ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); + sm_free_space = msp->ms_size - metaslab_allocated_space(msp); + /* - * Account for future allocations since we would have already - * deducted that space from the ms_freetree. + * Account for future allocations since we would have + * already deducted that space from the ms_allocatable. */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - allocated += + allocating += range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } - msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + + ASSERT3U(msp->ms_deferspace, ==, + range_tree_space(msp->ms_defer[0]) + + range_tree_space(msp->ms_defer[1])); + + msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + msp->ms_deferspace + range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); } /* * ========================================================================== * Metaslab groups * ========================================================================== */ /* * Update the allocatable flag and the metaslab group's capacity. * The allocatable flag is set to true if the capacity is below * the zfs_mg_noalloc_threshold or has a fragmentation value that is * greater than zfs_mg_fragmentation_threshold. If a metaslab group * transitions from allocatable to non-allocatable or vice versa then the * metaslab group's class is updated to reflect the transition. */ static void metaslab_group_alloc_update(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; metaslab_class_t *mc = mg->mg_class; vdev_stat_t *vs = &vd->vdev_stat; boolean_t was_allocatable; boolean_t was_initialized; ASSERT(vd == vd->vdev_top); ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, SCL_ALLOC); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; was_initialized = mg->mg_initialized; mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); mutex_enter(&mc->mc_lock); /* * If the metaslab group was just added then it won't * have any space until we finish syncing out this txg. * At that point we will consider it initialized and available * for allocations. We also don't consider non-activated * metaslab groups (e.g. vdevs that are in the middle of being removed) * to be initialized, because they can't be used for allocation. */ mg->mg_initialized = metaslab_group_initialized(mg); if (!was_initialized && mg->mg_initialized) { mc->mc_groups++; } else if (was_initialized && !mg->mg_initialized) { ASSERT3U(mc->mc_groups, >, 0); mc->mc_groups--; } if (mg->mg_initialized) mg->mg_no_free_space = B_FALSE; /* * A metaslab group is considered allocatable if it has plenty * of free space or is not heavily fragmented. We only take * fragmentation into account if the metaslab group has a valid * fragmentation metric (i.e. a value between 0 and 100). */ mg->mg_allocatable = (mg->mg_activation_count > 0 && mg->mg_free_capacity > zfs_mg_noalloc_threshold && (mg->mg_fragmentation == ZFS_FRAG_INVALID || mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); /* * The mc_alloc_groups maintains a count of the number of * groups in this metaslab class that are still above the * zfs_mg_noalloc_threshold. This is used by the allocating * threads to determine if they should avoid allocations to * a given group. The allocator will avoid allocations to a group * if that group has reached or is below the zfs_mg_noalloc_threshold * and there are still other groups that are above the threshold. * When a group transitions from allocatable to non-allocatable or * vice versa we update the metaslab class to reflect that change. * When the mc_alloc_groups value drops to 0 that means that all * groups have reached the zfs_mg_noalloc_threshold making all groups * eligible for allocations. This effectively means that all devices * are balanced again. */ if (was_allocatable && !mg->mg_allocatable) mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL); mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; mg->mg_initialized = B_FALSE; mg->mg_no_free_space = B_TRUE; mg->mg_allocators = allocators; mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (zfs_refcount_t), KM_SLEEP); mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * sizeof (uint64_t), KM_SLEEP); for (int i = 0; i < allocators; i++) { zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); mg->mg_cur_max_alloc_queue_depth[i] = 0; } mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); return (mg); } void metaslab_group_destroy(metaslab_group_t *mg) { ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); /* * We may have gone below zero with the activation count * either because we never activated in the first place or * because we're done, and possibly removing the vdev. */ ASSERT(mg->mg_activation_count <= 0); taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); kmem_free(mg->mg_secondaries, mg->mg_allocators * sizeof (metaslab_t *)); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_initialize_lock); cv_destroy(&mg->mg_ms_initialize_cv); for (int i = 0; i < mg->mg_allocators; i++) { zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]); mg->mg_cur_max_alloc_queue_depth[i] = 0; } kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * sizeof (zfs_refcount_t)); kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * sizeof (uint64_t)); kmem_free(mg, sizeof (metaslab_group_t)); } void metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; metaslab_group_t *mgprev, *mgnext; ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); ASSERT(mc->mc_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count <= 0); if (++mg->mg_activation_count <= 0) return; mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_rotor) == NULL) { mg->mg_prev = mg; mg->mg_next = mg; } else { mgnext = mgprev->mg_next; mg->mg_prev = mgprev; mg->mg_next = mgnext; mgprev->mg_next = mg; mgnext->mg_prev = mg; } mc->mc_rotor = mg; } /* * Passivate a metaslab group and remove it from the allocation rotor. * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating * a metaslab group. This function will momentarily drop spa_config_locks * that are lower than the SCL_ALLOC lock (see comment below). */ void metaslab_group_passivate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { ASSERT(mc->mc_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count < 0); return; } /* * The spa_config_lock is an array of rwlocks, ordered as * follows (from highest to lowest): * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > * SCL_ZIO > SCL_FREE > SCL_VDEV * (For more information about the spa_config_lock see spa_misc.c) * The higher the lock, the broader its coverage. When we passivate * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO * config locks. However, the metaslab group's taskq might be trying * to preload metaslabs so we must drop the SCL_ZIO lock and any * lower locks to allow the I/O to complete. At a minimum, * we continue to hold the SCL_ALLOC lock, which prevents any future * allocations from taking place and any changes to the vdev tree. */ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); taskq_wait(mg->mg_taskq); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < mg->mg_allocators; i++) { metaslab_t *msp = mg->mg_primaries[i]; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } msp = mg->mg_secondaries[i]; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } } mgprev = mg->mg_prev; mgnext = mg->mg_next; if (mg == mgnext) { mc->mc_rotor = NULL; } else { mc->mc_rotor = mgnext; mgprev->mg_next = mgnext; mgnext->mg_prev = mgprev; } mg->mg_prev = NULL; mg->mg_next = NULL; } boolean_t metaslab_group_initialized(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; vdev_stat_t *vs = &vd->vdev_stat; return (vs->vs_space != 0 && mg->mg_activation_count > 0); } uint64_t metaslab_group_get_space(metaslab_group_t *mg) { return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); } void metaslab_group_histogram_verify(metaslab_group_t *mg) { uint64_t *mg_hist; vdev_t *vd = mg->mg_vd; uint64_t ashift = vd->vdev_ashift; int i; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; + ASSERT(msp != NULL); /* skip if not active or not a member */ if (msp->ms_sm == NULL || msp->ms_group != mg) continue; for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) mg_hist[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } static void metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mg->mg_lock); } void metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { ASSERT3U(mg->mg_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mg->mg_lock); } static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { ASSERT(msp->ms_group == NULL); mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); mutex_enter(&msp->ms_lock); metaslab_group_histogram_add(mg, msp); mutex_exit(&msp->ms_lock); } static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { mutex_enter(&msp->ms_lock); metaslab_group_histogram_remove(mg, msp); mutex_exit(&msp->ms_lock); mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_group = NULL; mutex_exit(&mg->mg_lock); } static void metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&mg->mg_lock)); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_weight = weight; avl_add(&mg->mg_metaslab_tree, msp); } static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { /* * Although in principle the weight can be any value, in * practice we do not use values in the range [1, 511]. */ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } /* * Calculate the fragmentation for a given metaslab group. We can use * a simple average here since all metaslabs within the group must have * the same size. The return value will be a value between 0 and 100 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this * group have a fragmentation metric. */ uint64_t metaslab_group_fragmentation(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; uint64_t fragmentation = 0; uint64_t valid_ms = 0; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp->ms_fragmentation == ZFS_FRAG_INVALID) continue; if (msp->ms_group != mg) continue; valid_ms++; fragmentation += msp->ms_fragmentation; } if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) return (ZFS_FRAG_INVALID); fragmentation /= valid_ms; ASSERT3U(fragmentation, <=, 100); return (fragmentation); } /* * Determine if a given metaslab group should skip allocations. A metaslab * group should avoid allocations if its free capacity is less than the * zfs_mg_noalloc_threshold or its fragmentation metric is greater than * zfs_mg_fragmentation_threshold and there is at least one metaslab group * that can still handle allocations. If the allocation throttle is enabled * then we skip allocations to devices that have reached their maximum * allocation queue depth unless the selected metaslab group is the only * eligible group remaining. */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, uint64_t psize, int allocator) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; /* * We can only consider skipping this metaslab group if it's * in the normal metaslab class and there are other metaslab * groups to select from. Otherwise, we always consider it eligible * for allocations. */ if ((mc != spa_normal_class(spa) && mc != spa_special_class(spa) && mc != spa_dedup_class(spa)) || mc->mc_groups <= 1) return (B_TRUE); /* * If the metaslab group's mg_allocatable flag is set (see comments * in metaslab_group_alloc_update() for more information) and * the allocation throttle is disabled then allow allocations to this * device. However, if the allocation throttle is enabled then * check if we have reached our allocation limit (mg_alloc_queue_depth) * to determine if we should allow allocations to this metaslab group. * If all metaslab groups are no longer considered allocatable * (mc_alloc_groups == 0) or we're trying to allocate the smallest * gang block size then we allow allocations on this metaslab group * regardless of the mg_allocatable or throttle settings. */ if (mg->mg_allocatable) { metaslab_group_t *mgp; int64_t qdepth; uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; if (!mc->mc_alloc_throttle_enabled) return (B_TRUE); /* * If this metaslab group does not have any free space, then * there is no point in looking further. */ if (mg->mg_no_free_space) return (B_FALSE); qdepth = zfs_refcount_count( &mg->mg_alloc_queue_depth[allocator]); /* * If this metaslab group is below its qmax or it's * the only allocatable metasable group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) return (B_TRUE); ASSERT3U(mc->mc_alloc_groups, >, 1); /* * Since this metaslab group is at or over its qmax, we * need to determine if there are metaslab groups after this * one that might be able to handle this allocation. This is * racy since we can't hold the locks for all metaslab * groups at the same time when we make this check. */ for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; qdepth = zfs_refcount_count( &mgp->mg_alloc_queue_depth[allocator]); /* * If there is another metaslab group that * might be able to handle the allocation, then * we return false so that we skip this group. */ if (qdepth < qmax && !mgp->mg_no_free_space) return (B_FALSE); } /* * We didn't find another group to handle the allocation * so we can't skip this metaslab group even though * we are at or over our qmax. */ return (B_TRUE); } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { return (B_TRUE); } return (B_FALSE); } /* * ========================================================================== * Range tree callbacks * ========================================================================== */ /* * Comparison function for the private size-ordered tree. Tree is sorted * by size, larger sizes at the end of the tree. */ static int metaslab_rangesize_compare(const void *x1, const void *x2) { const range_seg_t *r1 = x1; const range_seg_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = AVL_CMP(rs_size1, rs_size2); if (likely(cmp)) return (cmp); return (AVL_CMP(r1->rs_start, r2->rs_start)); } /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered range_tree_t and an array of uint64_t's. */ static void metaslab_rt_create(range_tree_t *rt, void *arg) { metaslab_t *msp = arg; ASSERT3P(rt->rt_arg, ==, msp); ASSERT(msp->ms_allocatable == NULL); avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); } /* * Destroy the block allocator specific components. */ static void metaslab_rt_destroy(range_tree_t *rt, void *arg) { metaslab_t *msp = arg; ASSERT3P(rt->rt_arg, ==, msp); ASSERT3P(msp->ms_allocatable, ==, rt); ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size)); avl_destroy(&msp->ms_allocatable_by_size); } static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) { metaslab_t *msp = arg; ASSERT3P(rt->rt_arg, ==, msp); ASSERT3P(msp->ms_allocatable, ==, rt); VERIFY(!msp->ms_condensing); avl_add(&msp->ms_allocatable_by_size, rs); } static void metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { metaslab_t *msp = arg; ASSERT3P(rt->rt_arg, ==, msp); ASSERT3P(msp->ms_allocatable, ==, rt); VERIFY(!msp->ms_condensing); avl_remove(&msp->ms_allocatable_by_size, rs); } static void metaslab_rt_vacate(range_tree_t *rt, void *arg) { metaslab_t *msp = arg; ASSERT3P(rt->rt_arg, ==, msp); ASSERT3P(msp->ms_allocatable, ==, rt); /* * Normally one would walk the tree freeing nodes along the way. * Since the nodes are shared with the range trees we can avoid * walking all nodes and just reinitialize the avl tree. The nodes * will be freed by the range tree, so we don't want to free them here. */ avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); } static range_tree_ops_t metaslab_rt_ops = { metaslab_rt_create, metaslab_rt_destroy, metaslab_rt_add, metaslab_rt_remove, metaslab_rt_vacate }; /* * ========================================================================== * Common allocator routines * ========================================================================== */ /* * Return the maximum contiguous segment within the metaslab. */ uint64_t metaslab_block_maxsize(metaslab_t *msp) { avl_tree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; if (t == NULL || (rs = avl_last(t)) == NULL) return (0ULL); return (rs->rs_end - rs->rs_start); } static range_seg_t * metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) { range_seg_t *rs, rsearch; avl_index_t where; rsearch.rs_start = start; rsearch.rs_end = start + size; rs = avl_find(t, &rsearch, &where); if (rs == NULL) { rs = avl_nearest(t, where, AVL_AFTER); } return (rs); } /* * This is a helper function that can be used by the allocator to find * a suitable block to allocate. This will search the specified AVL * tree looking for a block that matches the specified criteria. */ static uint64_t metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, uint64_t align) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); while (rs != NULL) { uint64_t offset = P2ROUNDUP(rs->rs_start, align); if (offset + size <= rs->rs_end) { *cursor = offset + size; return (offset); } rs = AVL_NEXT(t, rs); } /* * If we know we've searched the whole map (*cursor == 0), give up. * Otherwise, reset the cursor to the beginning and try again. */ if (*cursor == 0) return (-1ULL); *cursor = 0; return (metaslab_block_picker(t, cursor, size, align)); } /* * ========================================================================== * The first-fit block allocator * ========================================================================== */ static uint64_t metaslab_ff_alloc(metaslab_t *msp, uint64_t size) { /* * Find the largest power of 2 block size that evenly divides the * requested size. This is used to try to allocate blocks with similar * alignment from the same area of the metaslab (i.e. same cursor * bucket) but it does not guarantee that other allocations sizes * may exist in the same region. */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; avl_tree_t *t = &msp->ms_allocatable->rt_root; return (metaslab_block_picker(t, cursor, size, align)); } static metaslab_ops_t metaslab_ff_ops = { metaslab_ff_alloc }; /* * ========================================================================== * Dynamic block allocator - * Uses the first fit allocation scheme until space get low and then * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold * and metaslab_df_free_pct to determine when to switch the allocation scheme. * ========================================================================== */ static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size) { /* * Find the largest power of 2 block size that evenly divides the * requested size. This is used to try to allocate blocks with similar * alignment from the same area of the metaslab (i.e. same cursor * bucket) but it does not guarantee that other allocations sizes * may exist in the same region. */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; avl_tree_t *t = &rt->rt_root; uint64_t max_size = metaslab_block_maxsize(msp); int free_pct = range_tree_space(rt) * 100 / msp->ms_size; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); /* * If we're running low on space switch to using the size * sorted AVL tree (best-fit). */ if (max_size < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { t = &msp->ms_allocatable_by_size; *cursor = 0; } return (metaslab_block_picker(t, cursor, size, 1ULL)); } static metaslab_ops_t metaslab_df_ops = { metaslab_df_alloc }; /* * ========================================================================== * Cursor fit block allocator - * Select the largest region in the metaslab, set the cursor to the beginning * of the range and the cursor_end to the end of the range. As allocations * are made advance the cursor. Continue allocating from the cursor until * the range is exhausted and then find a new range. * ========================================================================== */ static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { range_tree_t *rt = msp->ms_allocatable; avl_tree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); ASSERT3U(*cursor_end, >=, *cursor); if ((*cursor + size) > *cursor_end) { range_seg_t *rs; rs = avl_last(&msp->ms_allocatable_by_size); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) return (-1ULL); *cursor = rs->rs_start; *cursor_end = rs->rs_end; } offset = *cursor; *cursor += size; return (offset); } static metaslab_ops_t metaslab_cf_ops = { metaslab_cf_alloc }; /* * ========================================================================== * New dynamic fit allocator - * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift * contiguous blocks. If no region is found then just use the largest segment * that remains. * ========================================================================== */ /* * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) * to request from the allocator. */ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { avl_tree_t *t = &msp->ms_allocatable->rt_root; avl_index_t where; range_seg_t *rs, rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_block_maxsize(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); rsearch.rs_start = *cursor; rsearch.rs_end = *cursor + size; rs = avl_find(t, &rsearch, &where); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { t = &msp->ms_allocatable_by_size; rsearch.rs_start = 0; rsearch.rs_end = MIN(max_size, 1ULL << (hbit + metaslab_ndf_clump_shift)); rs = avl_find(t, &rsearch, &where); if (rs == NULL) rs = avl_nearest(t, where, AVL_AFTER); ASSERT(rs != NULL); } if ((rs->rs_end - rs->rs_start) >= size) { *cursor = rs->rs_start + size; return (rs->rs_start); } return (-1ULL); } static metaslab_ops_t metaslab_ndf_ops = { metaslab_ndf_alloc }; metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; /* * ========================================================================== * Metaslabs * ========================================================================== */ +static void +metaslab_aux_histograms_clear(metaslab_t *msp) +{ + /* + * Auxiliary histograms are only cleared when resetting them, + * which can only happen while the metaslab is loaded. + */ + ASSERT(msp->ms_loaded); + + bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); + for (int t = 0; t < TXG_DEFER_SIZE; t++) + bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); +} + +static void +metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, + range_tree_t *rt) +{ + /* + * This is modeled after space_map_histogram_add(), so refer to that + * function for implementation details. We want this to work like + * the space map histogram, and not the range tree histogram, as we + * are essentially constructing a delta that will be later subtracted + * from the space map histogram. + */ + int idx = 0; + for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + ASSERT3U(i, >=, idx + shift); + histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); + + if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { + ASSERT3U(idx + shift, ==, i); + idx++; + ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); + } + } +} + /* + * Called at every sync pass that the metaslab gets synced. + * + * The reason is that we want our auxiliary histograms to be updated + * wherever the metaslab's space map histogram is updated. This way + * we stay consistent on which parts of the metaslab space map's + * histogram are currently not available for allocations (e.g because + * they are in the defer, freed, and freeing trees). + */ +static void +metaslab_aux_histograms_update(metaslab_t *msp) +{ + space_map_t *sm = msp->ms_sm; + ASSERT(sm != NULL); + + /* + * This is similar to the metaslab's space map histogram updates + * that take place in metaslab_sync(). The only difference is that + * we only care about segments that haven't made it into the + * ms_allocatable tree yet. + */ + if (msp->ms_loaded) { + metaslab_aux_histograms_clear(msp); + + metaslab_aux_histogram_add(msp->ms_synchist, + sm->sm_shift, msp->ms_freed); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + metaslab_aux_histogram_add(msp->ms_deferhist[t], + sm->sm_shift, msp->ms_defer[t]); + } + } + + metaslab_aux_histogram_add(msp->ms_synchist, + sm->sm_shift, msp->ms_freeing); +} + +/* + * Called every time we are done syncing (writing to) the metaslab, + * i.e. at the end of each sync pass. + * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] + */ +static void +metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + space_map_t *sm = msp->ms_sm; + + if (sm == NULL) { + /* + * We came here from metaslab_init() when creating/opening a + * pool, looking at a metaslab that hasn't had any allocations + * yet. + */ + return; + } + + /* + * This is similar to the actions that we take for the ms_freed + * and ms_defer trees in metaslab_sync_done(). + */ + uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; + if (defer_allowed) { + bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], + sizeof (msp->ms_synchist)); + } else { + bzero(msp->ms_deferhist[hist_index], + sizeof (msp->ms_deferhist[hist_index])); + } + bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); +} + +/* + * Ensure that the metaslab's weight and fragmentation are consistent + * with the contents of the histogram (either the range tree's histogram + * or the space map's depending whether the metaslab is loaded). + */ +static void +metaslab_verify_weight_and_frag(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* see comment in metaslab_verify_unflushed_changes() */ + if (msp->ms_group == NULL) + return; + + /* + * Devices being removed always return a weight of 0 and leave + * fragmentation and ms_max_size as is - there is nothing for + * us to verify here. + */ + vdev_t *vd = msp->ms_group->mg_vd; + if (vd->vdev_removing) + return; + + /* + * If the metaslab is dirty it probably means that we've done + * some allocations or frees that have changed our histograms + * and thus the weight. + */ + for (int t = 0; t < TXG_SIZE; t++) { + if (txg_list_member(&vd->vdev_ms_list, msp, t)) + return; + } + + /* + * This verification checks that our in-memory state is consistent + * with what's on disk. If the pool is read-only then there aren't + * any changes and we just have the initially-loaded state. + */ + if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) + return; + + /* some extra verification for in-core tree if you can */ + if (msp->ms_loaded) { + range_tree_stat_verify(msp->ms_allocatable); + VERIFY(space_map_histogram_verify(msp->ms_sm, + msp->ms_allocatable)); + } + + uint64_t weight = msp->ms_weight; + uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; + boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); + uint64_t frag = msp->ms_fragmentation; + uint64_t max_segsize = msp->ms_max_size; + + msp->ms_weight = 0; + msp->ms_fragmentation = 0; + msp->ms_max_size = 0; + + /* + * This function is used for verification purposes. Regardless of + * whether metaslab_weight() thinks this metaslab should be active or + * not, we want to ensure that the actual weight (and therefore the + * value of ms_weight) would be the same if it was to be recalculated + * at this point. + */ + msp->ms_weight = metaslab_weight(msp) | was_active; + + VERIFY3U(max_segsize, ==, msp->ms_max_size); + + /* + * If the weight type changed then there is no point in doing + * verification. Revert fields to their original values. + */ + if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || + (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { + msp->ms_fragmentation = frag; + msp->ms_weight = weight; + return; + } + + VERIFY3U(msp->ms_fragmentation, ==, frag); + VERIFY3U(msp->ms_weight, ==, weight); +} + +/* * Wait for any in-progress metaslab loads to complete. */ static void metaslab_load_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_loading) { ASSERT(!msp->ms_loaded); cv_wait(&msp->ms_load_cv, &msp->ms_lock); } } static int metaslab_load_impl(metaslab_t *msp) { int error = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loading); + ASSERT(!msp->ms_condensing); /* - * Nobody else can manipulate a loading metaslab, so it's now safe - * to drop the lock. This way we don't have to hold the lock while - * reading the spacemap from disk. + * We temporarily drop the lock to unblock other operations while we + * are reading the space map. Therefore, metaslab_sync() and + * metaslab_sync_done() can run at the same time as we do. + * + * metaslab_sync() can append to the space map while we are loading. + * Therefore we load only entries that existed when we started the + * load. Additionally, metaslab_sync_done() has to wait for the load + * to complete because there are potential races like metaslab_load() + * loading parts of the space map that are currently being appended + * by metaslab_sync(). If we didn't, the ms_allocatable would have + * entries that metaslab_sync_done() would try to re-add later. + * + * That's why before dropping the lock we remember the synced length + * of the metaslab and read up to that point of the space map, + * ignoring entries appended by metaslab_sync() that happen after we + * drop the lock. */ + uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); - /* - * If the space map has not been allocated yet, then treat - * all the space in the metaslab as free and add it to ms_allocatable. - */ if (msp->ms_sm != NULL) { - error = space_map_load(msp->ms_sm, msp->ms_allocatable, - SM_FREE); + error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, + SM_FREE, length); } else { + /* + * The space map has not been allocated yet, so treat + * all the space in the metaslab as free and add it to the + * ms_allocatable tree. + */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); } + /* + * We need to grab the ms_sync_lock to prevent metaslab_sync() from + * changing the ms_sm and the metaslab's range trees while we are + * about to use them and populate the ms_allocatable. The ms_lock + * is insufficient for this because metaslab_sync() doesn't hold + * the ms_lock while writing the ms_checkpointing tree to disk. + */ + mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); + ASSERT(!msp->ms_condensing); - if (error != 0) + if (error != 0) { + mutex_exit(&msp->ms_sync_lock); return (error); + } ASSERT3P(msp->ms_group, !=, NULL); msp->ms_loaded = B_TRUE; /* - * If the metaslab already has a spacemap, then we need to - * remove all segments from the defer tree; otherwise, the - * metaslab is completely empty and we can skip this. + * The ms_allocatable contains the segments that exist in the + * ms_defer trees [see ms_synced_length]. Thus we need to remove + * them from ms_allocatable as they will be added again in + * metaslab_sync_done(). */ - if (msp->ms_sm != NULL) { - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defer[t], - range_tree_remove, msp->ms_allocatable); - } + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_walk(msp->ms_defer[t], + range_tree_remove, msp->ms_allocatable); } + + /* + * Call metaslab_recalculate_weight_and_sort() now that the + * metaslab is loaded so we get the metaslab's real weight. + * + * Unless this metaslab was created with older software and + * has not yet been converted to use segment-based weight, we + * expect the new weight to be better or equal to the weight + * that the metaslab had while it was not loaded. This is + * because the old weight does not take into account the + * consolidation of adjacent segments between TXGs. [see + * comment for ms_synchist and ms_deferhist[] for more info] + */ + uint64_t weight = msp->ms_weight; + metaslab_recalculate_weight_and_sort(msp); + if (!WEIGHT_IS_SPACEBASED(weight)) + ASSERT3U(weight, <=, msp->ms_weight); msp->ms_max_size = metaslab_block_maxsize(msp); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + metaslab_verify_space(msp, spa_syncing_txg(spa)); + mutex_exit(&msp->ms_sync_lock); + return (0); } int metaslab_load(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * There may be another thread loading the same metaslab, if that's * the case just wait until the other thread is done and return. */ metaslab_load_wait(msp); if (msp->ms_loaded) return (0); VERIFY(!msp->ms_loading); + ASSERT(!msp->ms_condensing); msp->ms_loading = B_TRUE; int error = metaslab_load_impl(msp); msp->ms_loading = B_FALSE; cv_broadcast(&msp->ms_load_cv); return (error); } void metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); + + metaslab_verify_weight_and_frag(msp); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; + msp->ms_weight &= ~METASLAB_ACTIVE_MASK; msp->ms_max_size = 0; + + /* + * We explicitly recalculate the metaslab's weight based on its space + * map (as it is now not loaded). We want unload metaslabs to always + * have their weights calculated from the space map histograms, while + * loaded ones have it calculated from their in-core range tree + * [see metaslab_load()]. This way, the weight reflects the information + * available in-core, whether it is loaded or not + * + * If ms_group == NULL means that we came here from metaslab_fini(), + * at which point it doesn't make sense for us to do the recalculation + * and the sorting. + */ + if (msp->ms_group != NULL) + metaslab_recalculate_weight_and_sort(msp); } static void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { vdev_space_update(vd, alloc_delta, defer_delta, space_delta); ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, vdev_deflated_space(vd, space_delta)); } int metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; metaslab_t *ms; int error; ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; ms->ms_allocator = -1; ms->ms_new = B_TRUE; /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. + * + * Note: + * When called from vdev_expand(), we can't call into the DMU as + * we are holding the spa_config_lock as a writer and we would + * deadlock [see relevant comment in vdev_metaslab_init()]. in + * that case, the object parameter is zero though, so we won't + * call into the DMU. */ if (object != 0) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, ms->ms_size, vd->vdev_ashift); if (error != 0) { kmem_free(ms, sizeof (metaslab_t)); return (error); } ASSERT(ms->ms_sm != NULL); + ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); + ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } /* - * We create the main range tree here, but we don't create the + * We create the ms_allocatable here, but we don't create the * other range trees until metaslab_sync_done(). This serves * two purposes: it allows metaslab_sync_done() to detect the - * addition of new space; and for debugging, it ensures that we'd - * data fault on any attempt to use this metaslab before it's ready. + * addition of new space; and for debugging, it ensures that + * we'd data fault on any attempt to use this metaslab before + * it's ready. */ ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. * If we're adding space to an existing pool, the new space * does not become available until after this txg has synced. * The metaslab's weight will also be initialized when we sync * out this txg. This ensures that we don't attempt to allocate * from it before we have initialized it completely. */ - if (txg <= TXG_INITIAL) + if (txg <= TXG_INITIAL) { metaslab_sync_done(ms, 0); + metaslab_space_update(vd, mg->mg_class, + metaslab_allocated_space(ms), 0, 0); + } /* * If metaslab_debug_load is set and we're initializing a metaslab * that has an allocated space map object then load the space map * so that we can verify frees. */ if (metaslab_debug_load && ms->ms_sm != NULL) { mutex_enter(&ms->ms_lock); VERIFY0(metaslab_load(ms)); mutex_exit(&ms->ms_lock); } if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); } *msp = ms; return (0); } void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); metaslab_space_update(vd, mg->mg_class, - -space_map_allocated(msp->ms_sm), 0, -msp->ms_size); + -metaslab_allocated_space(msp), 0, -msp->ms_size); space_map_close(msp->ms_sm); metaslab_unload(msp); range_tree_destroy(msp->ms_allocatable); range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); for (int t = 0; t < TXG_SIZE; t++) { range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); range_tree_destroy(msp->ms_checkpointing); + for (int t = 0; t < TXG_SIZE; t++) + ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); + mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } #define FRAGMENTATION_TABLE_SIZE 17 /* * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done * by calculating the space in each bucket of the spacemap histogram and - * multiplying that by the fragmetation metric in this table. Doing + * multiplying that by the fragmentation metric in this table. Doing * this for all buckets and dividing it by the total amount of free * space in this metaslab (i.e. the total free space in all buckets) gives * us the fragmentation metric. This means that a high fragmentation metric * equates to most of the free space being comprised of small segments. * Conversely, if the metric is low, then most of the free space is in * large segments. A 10% change in fragmentation equates to approximately * double the number of segments. * * This table defines 0% fragmented space using 16MB segments. Testing has * shown that segments that are greater than or equal to 16MB do not suffer * from drastic performance problems. Using this value, we derive the rest * of the table. Since the fragmentation value is never stored on disk, it * is possible to change these calculations in the future. */ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 100, /* 512B */ 100, /* 1K */ 98, /* 2K */ 95, /* 4K */ 90, /* 8K */ 80, /* 16K */ 70, /* 32K */ 60, /* 64K */ 50, /* 128K */ 40, /* 256K */ 30, /* 512K */ 20, /* 1M */ 15, /* 2M */ 10, /* 4M */ 5, /* 8M */ 0 /* 16M */ }; /* - * Calclate the metaslab's fragmentation metric. A return value - * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does - * not support this metric. Otherwise, the return value should be in the - * range [0, 100]. + * Calculate the metaslab's fragmentation metric and set ms_fragmentation. + * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not + * been upgraded and does not support this metric. Otherwise, the return + * value should be in the range [0, 100]. */ static void metaslab_set_fragmentation(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t fragmentation = 0; uint64_t total = 0; boolean_t feature_enabled = spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM); if (!feature_enabled) { msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } /* * A null space map means that the entire metaslab is free * and thus is not fragmented. */ if (msp->ms_sm == NULL) { msp->ms_fragmentation = 0; return; } /* * If this metaslab's space map has not been upgraded, flag it * so that we upgrade next time we encounter it. */ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { uint64_t txg = spa_syncing_txg(spa); vdev_t *vd = msp->ms_group->mg_vd; /* * If we've reached the final dirty txg, then we must * be shutting down the pool. We don't want to dirty * any data past this point so skip setting the condense * flag. We can retry this action the next time the pool * is imported. */ if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { msp->ms_condense_wanted = B_TRUE; vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); zfs_dbgmsg("txg %llu, requesting force condense: " "ms_id %llu, vdev_id %llu", txg, msp->ms_id, vd->vdev_id); } msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { uint64_t space = 0; uint8_t shift = msp->ms_sm->sm_shift; int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, FRAGMENTATION_TABLE_SIZE - 1); if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) continue; space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); total += space; ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); fragmentation += space * zfs_frag_table[idx]; } if (total > 0) fragmentation /= total; ASSERT3U(fragmentation, <=, 100); msp->ms_fragmentation = fragmentation; } /* * Compute a weight -- a selection preference value -- for the given metaslab. * This is based on the amount of free space, the level of fragmentation, * the LBA range, and whether the metaslab is loaded. */ static uint64_t metaslab_space_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(!vd->vdev_removing); /* * The baseline weight is the metaslab's free space. */ - space = msp->ms_size - space_map_allocated(msp->ms_sm); + space = msp->ms_size - metaslab_allocated_space(msp); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { /* * Use the fragmentation information to inversely scale * down the baseline weight. We need to ensure that we * don't exclude this metaslab completely when it's 100% * fragmented. To avoid this we reduce the fragmented value * by 1. */ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; /* * If space < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. The fragmentation metric may have * decreased the space to something smaller than * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE * so that we can consume any remaining space. */ if (space > 0 && space < SPA_MINBLOCKSIZE) space = SPA_MINBLOCKSIZE; } weight = space; /* * Modern disks have uniform bit density and constant angular velocity. * Therefore, the outer recording zones are faster (higher bandwidth) * than the inner zones by the ratio of outer to inner track diameter, * which is typically around 2:1. We account for this by assigning * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ if (metaslab_lba_weighting_enabled) { weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; ASSERT(weight >= space && weight <= 2 * space); } /* * If this metaslab is one we're actively using, adjust its * weight to make it preferable to any inactive metaslab so * we'll polish it off. If the fragmentation on this metaslab * has exceed our threshold, then don't mark it active. */ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); } WEIGHT_SET_SPACEBASED(weight); return (weight); } /* * Return the weight of the specified metaslab, according to the segment-based * weighting algorithm. The metaslab must be loaded. This function can * be called within a sync pass since it relies only on the metaslab's * range tree which is always accurate when the metaslab is loaded. */ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp) { uint64_t weight = 0; uint32_t segments = 0; ASSERT(msp->ms_loaded); for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; i--) { uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; segments <<= 1; segments += msp->ms_allocatable->rt_histogram[i]; /* * The range tree provides more precision than the space map * and must be downgraded so that all values fit within the * space map's histogram. This allows us to compare loaded * vs. unloaded metaslabs to determine which metaslab is * considered "best". */ if (i > max_idx) continue; if (segments != 0) { WEIGHT_SET_COUNT(weight, segments); WEIGHT_SET_INDEX(weight, i); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Calculate the weight based on the on-disk histogram. This should only * be called after a sync pass has completely finished since the on-disk * information is updated in metaslab_sync(). */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) { - uint64_t weight = 0; + space_map_t *sm = msp->ms_sm; + ASSERT(!msp->ms_loaded); + ASSERT(sm != NULL); + ASSERT3U(space_map_object(sm), !=, 0); + ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); + /* + * Create a joint histogram from all the segments that have made + * it to the metaslab's space map histogram, that are not yet + * available for allocation because they are still in the freeing + * pipeline (e.g. freeing, freed, and defer trees). Then subtract + * these segments from the space map's histogram to get a more + * accurate weight. + */ + uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) + deferspace_histogram[i] += msp->ms_synchist[i]; + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + deferspace_histogram[i] += msp->ms_deferhist[t][i]; + } + } + + uint64_t weight = 0; for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { - if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) { - WEIGHT_SET_COUNT(weight, - msp->ms_sm->sm_phys->smp_histogram[i]); - WEIGHT_SET_INDEX(weight, i + - msp->ms_sm->sm_shift); + ASSERT3U(sm->sm_phys->smp_histogram[i], >=, + deferspace_histogram[i]); + uint64_t count = + sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; + if (count != 0) { + WEIGHT_SET_COUNT(weight, count); + WEIGHT_SET_INDEX(weight, i + sm->sm_shift); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Compute a segment-based weight for the specified metaslab. The weight * is determined by highest bucket in the histogram. The information * for the highest bucket is encoded into the weight value. */ static uint64_t metaslab_segment_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; uint64_t weight = 0; uint8_t shift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The metaslab is completely free. */ - if (space_map_allocated(msp->ms_sm) == 0) { + if (metaslab_allocated_space(msp) == 0) { int idx = highbit64(msp->ms_size) - 1; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; if (idx < max_idx) { WEIGHT_SET_COUNT(weight, 1ULL); WEIGHT_SET_INDEX(weight, idx); } else { WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); WEIGHT_SET_INDEX(weight, max_idx); } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); return (weight); } ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * If the metaslab is fully allocated then just make the weight 0. */ - if (space_map_allocated(msp->ms_sm) == msp->ms_size) + if (metaslab_allocated_space(msp) == msp->ms_size) return (0); /* * If the metaslab is already loaded, then use the range tree to * determine the weight. Otherwise, we rely on the space map information * to generate the weight. */ if (msp->ms_loaded) { weight = metaslab_weight_from_range_tree(msp); } else { weight = metaslab_weight_from_spacemap(msp); } /* * If the metaslab was active the last time we calculated its weight * then keep it active. We want to consume the entire region that * is associated with this weight. */ if (msp->ms_activation_weight != 0 && weight != 0) WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); return (weight); } /* * Determine if we should attempt to allocate from this metaslab. If the * metaslab has a maximum size then we can quickly determine if the desired * allocation size can be satisfied. Otherwise, if we're using segment-based * weighting then we can determine the maximum allocation that this metaslab * can accommodate based on the index encoded in the weight. If we're using * space-based weights then rely on the entire weight (excluding the weight * type bit). */ boolean_t metaslab_should_allocate(metaslab_t *msp, uint64_t asize) { boolean_t should_allocate; if (msp->ms_max_size != 0) return (msp->ms_max_size >= asize); if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { /* * The metaslab segment weight indicates segments in the * range [2^i, 2^(i+1)), where i is the index in the weight. * Since the asize might be in the middle of the range, we * should attempt the allocation if asize < 2^(i+1). */ should_allocate = (asize < 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); } else { should_allocate = (asize <= (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); } return (should_allocate); } static uint64_t metaslab_weight(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; uint64_t weight; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If this vdev is in the process of being removed, there is nothing * for us to do here. */ if (vd->vdev_removing) return (0); metaslab_set_fragmentation(msp); /* * Update the maximum size if the metaslab is loaded. This will * ensure that we get an accurate maximum size if newly freed space * has been added back into the free tree. */ if (msp->ms_loaded) msp->ms_max_size = metaslab_block_maxsize(msp); + else + ASSERT0(msp->ms_max_size); /* * Segment-based weighting requires space map histogram support. */ if (zfs_metaslab_segment_weight_enabled && spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == sizeof (space_map_phys_t))) { weight = metaslab_segment_weight(msp); } else { weight = metaslab_space_weight(msp); } return (weight); } +void +metaslab_recalculate_weight_and_sort(metaslab_t *msp) +{ + /* note: we preserve the mask (e.g. indication of primary, etc..) */ + uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; + metaslab_group_sort(msp->ms_group, msp, + metaslab_weight(msp) | was_active); +} + static int metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, int allocator, uint64_t activation_weight) { /* * If we're activating for the claim code, we don't want to actually * set the metaslab up for a specific allocator. */ if (activation_weight == METASLAB_WEIGHT_CLAIM) return (0); metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? mg->mg_primaries : mg->mg_secondaries); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); if (arr[allocator] != NULL) { mutex_exit(&mg->mg_lock); return (EEXIST); } arr[allocator] = msp; ASSERT3S(msp->ms_allocator, ==, -1); msp->ms_allocator = allocator; msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); mutex_exit(&mg->mg_lock); return (0); } static int metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { int error = metaslab_load(msp); if (error != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { /* * The metaslab was activated for another allocator * while we were waiting, we should reselect. */ return (EBUSY); } if ((error = metaslab_activate_allocator(msp->ms_group, msp, allocator, activation_weight)) != 0) { return (error); } msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); return (0); } static void metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { metaslab_group_sort(mg, msp, weight); return; } mutex_enter(&mg->mg_lock); ASSERT3P(msp->ms_group, ==, mg); if (msp->ms_primary) { ASSERT3U(0, <=, msp->ms_allocator); ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); mg->mg_primaries[msp->ms_allocator] = NULL; } else { ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); mg->mg_secondaries[msp->ms_allocator] = NULL; } msp->ms_allocator = -1; metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_is_empty(msp->ms_allocatable)); ASSERT0(weight & METASLAB_ACTIVE_MASK); msp->ms_activation_weight = 0; metaslab_passivate_allocator(msp->ms_group, msp, weight); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } /* * Segment-based metaslabs are activated once and remain active until * we either fail an allocation attempt (similar to space-based metaslabs) * or have exhausted the free space in zfs_metaslab_switch_threshold * buckets since the metaslab was activated. This function checks to see * if we've exhaused the zfs_metaslab_switch_threshold buckets in the * metaslab and passivates it proactively. This will allow us to select a * metaslabs with larger contiguous region if any remaining within this * metaslab group. If we're in sync pass > 1, then we continue using this * metaslab so that we don't dirty more block and cause more sync passes. */ void metaslab_segment_may_passivate(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) return; /* * Since we are in the middle of a sync pass, the most accurate * information that is accessible to us is the in-core range tree * histogram; calculate the new weight based on that information. */ uint64_t weight = metaslab_weight_from_range_tree(msp); int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); int current_idx = WEIGHT_GET_INDEX(weight); if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) metaslab_passivate(msp, weight); } static void metaslab_preload(void *arg) { metaslab_t *msp = arg; spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); mutex_enter(&msp->ms_lock); (void) metaslab_load(msp); msp->ms_selected_txg = spa_syncing_txg(spa); mutex_exit(&msp->ms_lock); } static void metaslab_group_preload(metaslab_group_t *mg) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp; avl_tree_t *t = &mg->mg_metaslab_tree; int m = 0; if (spa_shutting_down(spa) || !metaslab_preload_enabled) { taskq_wait(mg->mg_taskq); return; } mutex_enter(&mg->mg_lock); /* * Load the next potential metaslabs */ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { ASSERT3P(msp->ms_group, ==, mg); /* * We preload only the maximum number of metaslabs specified * by metaslab_preload_limit. If a metaslab is being forced * to condense then we preload it too. This will ensure * that force condensing happens in the next txg. */ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { continue; } VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, msp, TQ_SLEEP) != NULL); } mutex_exit(&mg->mg_lock); } /* * Determine if the space map's on-disk footprint is past our tolerance * for inefficiency. We would like to use the following criteria to make * our decision: * * 1. The size of the space map object should not dramatically increase as a * result of writing out the free space range tree. * * 2. The minimal on-disk space map representation is zfs_condense_pct/100 * times the size than the free space range tree representation * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). * * 3. The on-disk size of the space map should actually decrease. * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. * Instead, we apply the heuristic described in the block comment for * zfs_metaslab_condense_block_threshold - we only condense if the space used * is greater than a threshold number of blocks. */ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); /* * Allocations and frees in early passes are generally more space * efficient (in terms of blocks described in space map entries) * than the ones in later passes (e.g. we don't compress after * sync pass 5) and condensing a metaslab multiple times in a txg * could degrade performance. * * Thus we prefer condensing each metaslab at most once every txg at * the earliest sync pass possible. If a metaslab is eligible for * condensing again after being considered for condensing within the * same txg, it will hopefully be dirty in the next txg where it will * be condensed at an earlier pass. */ if (msp->ms_condense_checked_txg == current_txg) return (B_FALSE); msp->ms_condense_checked_txg = current_txg; /* * We always condense metaslabs that are empty and metaslabs for * which a condense request has been made. */ if (avl_is_empty(&msp->ms_allocatable_by_size) || msp->ms_condense_wanted) return (B_TRUE); uint64_t object_size = space_map_length(msp->ms_sm); uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); dmu_object_info_t doi; dmu_object_info_from_db(sm->sm_dbuf, &doi); uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } /* * Condense the on-disk space map representation to its minimized form. * The minimized form consists of a small number of allocations followed by * the entries of the free range tree. */ static void metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) { range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, msp->ms_group->mg_vd->vdev_spa->spa_name, space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_allocatable->rt_root), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; /* * Create an range tree that is 100% allocated. We remove segments * that have been freed in this txg, any deferred frees that exist, * and any allocation in the future. Removing segments should be * a relatively inexpensive operation since we expect these trees to * have a small number of nodes. */ condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, condense_tree); } for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_remove, condense_tree); } /* * We're about to drop the metaslab's lock thus allowing * other consumers to change it's content. Set the * metaslab's ms_condensing flag to ensure that * allocations on this metaslab do not occur while we're * in the middle of committing it to disk. This is only critical * for ms_allocatable as all other range trees use per txg * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); /* * While we would ideally like to create a space map representation * that consists only of allocation records, doing so can be * prohibitively expensive because the in-core free tree can be * large, and therefore computationally expensive to subtract * from the condense_tree. Instead we sync out two trees, a cheap * allocation only tree followed by the in-core free tree. While not * optimal, this is typically close to optimal, and much cheaper to * compute. */ space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; } /* * Write a metaslab to disk in the context of the specified transaction group. */ void metaslab_sync(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; uint64_t object = space_map_object(msp->ms_sm); ASSERT(!vd->vdev_ishole); /* * This metaslab has just been added so there's no work to do now. */ if (msp->ms_freeing == NULL) { ASSERT3P(alloctree, ==, NULL); return; } ASSERT3P(alloctree, !=, NULL); ASSERT3P(msp->ms_freeing, !=, NULL); ASSERT3P(msp->ms_freed, !=, NULL); ASSERT3P(msp->ms_checkpointing, !=, NULL); /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being * forced to condense and it's loaded, we need to let it through. */ if (range_tree_is_empty(alloctree) && range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing) && !(msp->ms_loaded && msp->ms_condense_wanted)) return; VERIFY(txg <= spa_final_dirty_txg(spa)); /* - * The only state that can actually be changing concurrently with - * metaslab_sync() is the metaslab's ms_allocatable. No other - * thread can be modifying this txg's alloc, freeing, + * The only state that can actually be changing concurrently + * with metaslab_sync() is the metaslab's ms_allocatable. No + * other thread can be modifying this txg's alloc, freeing, * freed, or space_map_phys_t. We drop ms_lock whenever we - * could call into the DMU, because the DMU can call down to us - * (e.g. via zio_free()) at any time. + * could call into the DMU, because the DMU can call down to + * us (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state - * concurrently, and it is locked out by the ms_sync_lock. Note - * that the ms_lock is insufficient for this, because it is dropped - * by space_map_write(). + * concurrently, and it is locked out by the ms_sync_lock. + * Note that the ms_lock is insufficient for this, because it + * is dropped by space_map_write(). */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (msp->ms_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); + ASSERT(msp->ms_sm != NULL); + ASSERT0(metaslab_allocated_space(msp)); } if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * We save the space map object as an entry in vdev_top_zap * so it can be retrieved when the pool is reopened after an * export or through zdb. */ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (new_object), 1, &new_object, tx)); } mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * Note: metaslab_condense() clears the space map's histogram. * Therefore we must verify and remove this histogram before * condensing. */ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); if (msp->ms_loaded && metaslab_should_condense(msp)) { metaslab_condense(msp, txg, tx); } else { mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); } + msp->ms_allocated_space += range_tree_space(alloctree); + ASSERT3U(msp->ms_allocated_space, >=, + range_tree_space(msp->ms_freeing)); + msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); + if (!range_tree_is_empty(msp->ms_checkpointing)) { ASSERT(spa_has_checkpoint(spa)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the * ms_lock while writing to the checkpoint space map. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); - space_map_update(vd->vdev_checkpoint_sm); spa->spa_checkpoint_info.sci_dspace += range_tree_space(msp->ms_checkpointing); vd->vdev_stat.vs_checkpoint_space += range_tree_space(msp->ms_checkpointing); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, - -vd->vdev_checkpoint_sm->sm_alloc); + -space_map_allocated(vd->vdev_checkpoint_sm)); range_tree_vacate(msp->ms_checkpointing, NULL, NULL); } if (msp->ms_loaded) { /* * When the space map is loaded, we have an accurate * histogram in the range tree. This gives us an opportunity * to bring the space map's histogram up-to-date so we clear * it first before updating it. */ space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); /* * Since we've cleared the histogram we need to add back * any free space that has already been processed, plus * any deferred space. This allows the on-disk histogram * to accurately reflect all free space even if some space * is not yet available for allocation (i.e. deferred). */ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); /* * Add back any deferred free space that has not been * added back into the in-core free tree yet. This will * ensure that we don't end up with a space map histogram * that is completely empty unless the metaslab is fully * allocated. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } } /* * Always add the free space from this sync pass to the space * map histogram. We want to make sure that the on-disk histogram * accounts for all free space. If the space map is not loaded, * then we will lose some accuracy but will correct it the next * time we load the space map. */ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); + metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); /* * For sync pass 1, we avoid traversing this txg's free range tree - * and instead will just swap the pointers for freeing and - * freed. We can safely do this since the freed_tree is - * guaranteed to be empty on the initial pass. + * and instead will just swap the pointers for freeing and freed. + * We can safely do this since the freed_tree is guaranteed to be + * empty on the initial pass. */ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); + ASSERT0(msp->ms_allocated_this_txg); } else { range_tree_vacate(msp->ms_freeing, range_tree_add, msp->ms_freed); } + msp->ms_allocated_this_txg += range_tree_space(alloctree); range_tree_vacate(alloctree, NULL, NULL); ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); if (object != space_map_object(msp->ms_sm)) { object = space_map_object(msp->ms_sm); dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &object, tx); } mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } /* * Called after a transaction group has completely synced to mark * all of the metaslab's free space as usable. */ void metaslab_sync_done(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; range_tree_t **defer_tree; int64_t alloc_delta, defer_delta; boolean_t defer_allowed = B_TRUE; ASSERT(!vd->vdev_ishole); mutex_enter(&msp->ms_lock); /* * If this metaslab is just becoming available, initialize its * range trees and add its capacity to the vdev. */ if (msp->ms_freed == NULL) { for (int t = 0; t < TXG_SIZE; t++) { ASSERT(msp->ms_allocating[t] == NULL); msp->ms_allocating[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_freeing, ==, NULL); msp->ms_freeing = range_tree_create(NULL, NULL); ASSERT3P(msp->ms_freed, ==, NULL); msp->ms_freed = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ASSERT(msp->ms_defer[t] == NULL); msp->ms_defer[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_checkpointing, ==, NULL); msp->ms_checkpointing = range_tree_create(NULL, NULL); metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { defer_allowed = B_FALSE; } defer_delta = 0; - alloc_delta = space_map_alloc_delta(msp->ms_sm); + alloc_delta = msp->ms_allocated_this_txg - + range_tree_space(msp->ms_freed); if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); } metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0); /* * If there's a metaslab_load() in progress, wait for it to complete * so that we have a consistent view of the in-core space map. */ metaslab_load_wait(msp); /* * Move the frees from the defer_tree back to the free * range tree (if it's loaded). Swap the freed_tree and * the defer_tree -- this is safe to do because we've * just emptied out the defer_tree. */ range_tree_vacate(*defer_tree, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { range_tree_swap(&msp->ms_freed, defer_tree); } else { range_tree_vacate(msp->ms_freed, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); } - space_map_update(msp->ms_sm); + msp->ms_synced_length = space_map_length(msp->ms_sm); + msp->ms_deferspace += defer_delta; ASSERT3S(msp->ms_deferspace, >=, 0); ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); if (msp->ms_deferspace != 0) { /* * Keep syncing this metaslab until all deferred frees * are back in circulation. */ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } + metaslab_aux_histograms_update_done(msp, defer_allowed); if (msp->ms_new) { msp->ms_new = B_FALSE; mutex_enter(&mg->mg_lock); mg->mg_ms_ready++; mutex_exit(&mg->mg_lock); } + /* - * Calculate the new weights before unloading any metaslabs. - * This will give us the most accurate weighting. + * Re-sort metaslab within its group now that we've adjusted + * its allocatable space. */ - metaslab_group_sort(mg, msp, metaslab_weight(msp) | - (msp->ms_weight & METASLAB_ACTIVE_MASK)); + metaslab_recalculate_weight_and_sort(msp); /* * If the metaslab is loaded and we've not tried to load or allocate * from it in 'metaslab_unload_delay' txgs, then unload it. */ if (msp->ms_loaded && msp->ms_initializing == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } if (msp->ms_allocator != -1) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } if (!metaslab_debug_unload) metaslab_unload(msp); } ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); + msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); } void metaslab_sync_reassess(metaslab_group_t *mg) { spa_t *spa = mg->mg_class->mc_spa; spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); metaslab_group_alloc_update(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); /* * Preload the next potential metaslabs but only on active * metaslab groups. We can get into a state where the metaslab * is no longer active since we dirty metaslabs as we remove a * a device, thus potentially making the metaslab group eligible * for preloading. */ if (mg->mg_activation_count > 0) { metaslab_group_preload(mg); } spa_config_exit(spa, SCL_ALLOC, FTAG); } /* * When writing a ditto block (i.e. more than one DVA for a given BP) on * the same vdev as an existing DVA of this BP, then try to allocate it * on a different metaslab than existing DVAs (i.e. a unique metaslab). */ static boolean_t metaslab_is_unique(metaslab_t *msp, dva_t *dva) { uint64_t dva_ms_id; if (DVA_GET_ASIZE(dva) == 0) return (B_TRUE); if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (B_TRUE); dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; return (msp->ms_id != dva_ms_id); } /* * ========================================================================== * Metaslab allocation tracing facility * ========================================================================== */ kstat_t *metaslab_trace_ksp; kstat_named_t metaslab_trace_over_limit; void metaslab_alloc_trace_init(void) { ASSERT(metaslab_alloc_trace_cache == NULL); metaslab_alloc_trace_cache = kmem_cache_create( "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); if (metaslab_trace_ksp != NULL) { metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; kstat_named_init(&metaslab_trace_over_limit, "metaslab_trace_over_limit", KSTAT_DATA_UINT64); kstat_install(metaslab_trace_ksp); } } void metaslab_alloc_trace_fini(void) { if (metaslab_trace_ksp != NULL) { kstat_delete(metaslab_trace_ksp); metaslab_trace_ksp = NULL; } kmem_cache_destroy(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = NULL; } /* * Add an allocation trace element to the allocation tracing list. */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, int allocator) { if (!metaslab_trace_enabled) return; /* * When the tracing list reaches its maximum we remove * the second element in the list before adding a new one. * By removing the second element we preserve the original * entry as a clue to what allocations steps have already been * performed. */ if (zal->zal_size == metaslab_trace_max_entries) { metaslab_alloc_trace_t *mat_next; #ifdef DEBUG panic("too many entries in allocation list"); #endif atomic_inc_64(&metaslab_trace_over_limit.value.ui64); zal->zal_size--; mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); list_remove(&zal->zal_list, mat_next); kmem_cache_free(metaslab_alloc_trace_cache, mat_next); } metaslab_alloc_trace_t *mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); list_link_init(&mat->mat_list_node); mat->mat_mg = mg; mat->mat_msp = msp; mat->mat_size = psize; mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; /* * The list is part of the zio so locking is not required. Only * a single thread will perform allocations for a given zio. */ list_insert_tail(&zal->zal_list, mat); zal->zal_size++; ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); } void metaslab_trace_init(zio_alloc_list_t *zal) { list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), offsetof(metaslab_alloc_trace_t, mat_list_node)); zal->zal_size = 0; } void metaslab_trace_fini(zio_alloc_list_t *zal) { metaslab_alloc_trace_t *mat; while ((mat = list_remove_head(&zal->zal_list)) != NULL) kmem_cache_free(metaslab_alloc_trace_cache, mat); list_destroy(&zal->zal_list); zal->zal_size = 0; } /* * ========================================================================== * Metaslab block operations * ========================================================================== */ static void metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); } static void metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) { uint64_t max = mg->mg_max_alloc_queue_depth; uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; while (cur < max) { if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], cur, cur + 1) == cur) { atomic_inc_64( &mg->mg_class->mc_alloc_max_slots[allocator]); return; } cur = mg->mg_cur_max_alloc_queue_depth[allocator]; } } void metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); if (io_complete) metaslab_group_increment_qdepth(mg, allocator); } void metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, int allocator) { #ifdef ZFS_DEBUG const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); for (int d = 0; d < ndvas; d++) { uint64_t vdev = DVA_GET_VDEV(&dva[d]); metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; VERIFY(zfs_refcount_not_held( &mg->mg_alloc_queue_depth[allocator], tag)); } #endif } static uint64_t metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) { uint64_t start; range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_initializing); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); range_tree_remove(rt, start, size); if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); /* Track the last successful allocation */ msp->ms_alloc_txg = txg; metaslab_verify_space(msp, txg); } /* * Now that we've attempted the allocation we need to update the * metaslab's maximum block size since it may have changed. */ msp->ms_max_size = metaslab_block_maxsize(msp); return (start); } /* * Find the metaslab with the highest weight that is less than what we've * already tried. In the common case, this means that we will examine each * metaslab at most once. Note that concurrent callers could reorder metaslabs * by activation/passivation once we have dropped the mg_lock. If a metaslab is * activated by another thread, and we fail to allocate from the metaslab we * have selected, we may not try the newly-activated metaslab, and instead * activate another metaslab. This is not optimal, but generally does not cause * any problems (a possible exception being if every metaslab is completely full * except for the the newly-activated metaslab which we fail to examine). */ static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; metaslab_t *msp = avl_find(t, search, &idx); if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; if (!metaslab_should_allocate(msp, asize)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; } /* * If the selected metaslab is condensing or being * initialized, skip it. */ if (msp->ms_condensing || msp->ms_initializing > 0) continue; *was_active = msp->ms_allocator != -1; /* * If we're activating as primary, this is our first allocation * from this disk, so we don't need to check how close we are. * If the metaslab under consideration was already active, * we're getting desperate enough to steal another allocator's * metaslab, so we still don't care about distances. */ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) break; for (i = 0; i < d; i++) { if (want_unique && !metaslab_is_unique(msp, &dva[i])) break; /* try another metaslab */ } if (i == d) break; } if (msp != NULL) { search->ms_weight = msp->ms_weight; search->ms_start = msp->ms_start + 1; search->ms_allocator = msp->ms_allocator; search->ms_primary = msp->ms_primary; } return (msp); } /* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, int allocator) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; uint64_t activation_weight; activation_weight = METASLAB_WEIGHT_PRIMARY; for (int i = 0; i < d; i++) { if (activation_weight == METASLAB_WEIGHT_PRIMARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_CLAIM; break; } } /* * If we don't have enough metaslabs active to fill the entire array, we * just use the 0th slot. */ if (mg->mg_ms_ready < mg->mg_allocators * 3) allocator = 0; ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; /* * At the end of the metaslab tree are the already-active metaslabs, * first the primaries, then the secondaries. When we resume searching * through the tree, we need to consider ms_allocator and ms_primary so * we start in the location right after where we left off, and don't * accidentally loop forever considering the same metaslabs. */ search->ms_allocator = -1; search->ms_primary = B_TRUE; for (;;) { boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); if (activation_weight == METASLAB_WEIGHT_PRIMARY && mg->mg_primaries[allocator] != NULL) { msp = mg->mg_primaries[allocator]; was_active = B_TRUE; } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && mg->mg_secondaries[allocator] != NULL) { msp = mg->mg_secondaries[allocator]; was_active = B_TRUE; } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, want_unique, asize, allocator, zal, search, &was_active); } mutex_exit(&mg->mg_lock); if (msp == NULL) { kmem_free(search, sizeof (*search)); return (-1ULL); } mutex_enter(&msp->ms_lock); /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. We check the * active status first to see if we need to reselect * a new metaslab. */ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { mutex_exit(&msp->ms_lock); continue; } /* * If the metaslab is freshly activated for an allocator that * isn't the one we're allocating from, or if it's a primary and * we're seeking a secondary (or vice versa), we go back and * select a new metaslab. */ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && (msp->ms_allocator != -1) && (msp->ms_allocator != allocator || ((activation_weight == METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { mutex_exit(&msp->ms_lock); continue; } if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && activation_weight != METASLAB_WEIGHT_CLAIM) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } if (metaslab_activate(msp, allocator, activation_weight) != 0) { mutex_exit(&msp->ms_lock); continue; } msp->ms_selected_txg = txg; /* * Now that we have the lock, recheck to see if we should * continue to use this metaslab for this allocation. The * the metaslab is now loaded so metaslab_should_allocate() can * accurately determine if the allocation attempt should * proceed. */ if (!metaslab_should_allocate(msp, asize)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); goto next; } /* * If this metaslab is currently condensing then pick again as * we can't manipulate this metaslab until it's committed * to disk. If this metaslab is being initialized, we shouldn't * allocate from it since the allocated region might be * overwritten after allocation. */ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); continue; } else if (msp->ms_initializing > 0) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_INITIALIZING, allocator); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); continue; } offset = metaslab_block_alloc(msp, asize, txg); metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ metaslab_segment_may_passivate(msp); break; } next: ASSERT(msp->ms_loaded); /* * We were unable to allocate from this metaslab so determine * a new weight for this metaslab. Now that we have loaded * the metaslab we can provide a better hint to the metaslab * selector. * * For space-based metaslabs, we use the maximum block size. * This information is only available when the metaslab * is loaded and is more accurate than the generic free * space weight that was calculated by metaslab_weight(). * This information allows us to quickly compare the maximum * available allocation in the metaslab to the allocation * size being requested. * * For segment-based metaslabs, determine the new weight * based on the highest bucket in the range tree. We * explicitly use the loaded segment weight (i.e. the range * tree histogram) since it contains the space that is * currently available for allocation and is accurate * even within a sync pass. */ if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { uint64_t weight = metaslab_block_maxsize(msp); WEIGHT_SET_SPACEBASED(weight); metaslab_passivate(msp, weight); } else { metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); } /* * We have just failed an allocation attempt, check * that metaslab_should_allocate() agrees. Otherwise, * we may end up in an infinite loop retrying the same * metaslab. */ ASSERT(!metaslab_should_allocate(msp, asize)); mutex_exit(&msp->ms_lock); } mutex_exit(&msp->ms_lock); kmem_free(search, sizeof (*search)); return (offset); } static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, int allocator) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, dva, d, allocator); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { mg->mg_failed_allocations++; metaslab_trace_add(zal, mg, NULL, asize, d, TRACE_GROUP_FAILURE, allocator); if (asize == SPA_GANGBLOCKSIZE) { /* * This metaslab group was unable to allocate * the minimum gang block size so it must be out of * space. We must notify the allocation throttle * to start skipping allocation attempts to this * metaslab group until more space becomes available. * Note: this failure cannot be caused by the * allocation throttle since the allocation throttle * is only responsible for skipping devices and * not failing block allocations. */ mg->mg_no_free_space = B_TRUE; } } mg->mg_allocations++; mutex_exit(&mg->mg_lock); return (offset); } /* * Allocate a block for the specified i/o. */ int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal, int allocator) { metaslab_group_t *mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; ASSERT(!DVA_IS_VALID(&dva[d])); /* * For testing, make some blocks above a certain size be gang blocks. * This will also test spilling from special to normal. */ if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); } /* * Start at the rotor and loop through all mgs until we find something. * Note that there's no locking on mc_rotor or mc_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * * If we are doing ditto or log blocks, try to spread them across * consecutive vdevs. If we're forced to reuse a vdev before we've * allocated all of our ditto blocks, then try and spread them out on * that vdev as much as possible. If it turns out to not be possible, * gradually lower our standards until anything becomes acceptable. * Also, allocating on consecutive vdevs (as opposed to random vdevs) * gives us hope of containing our fault domains to something we're * able to reason about. Otherwise, any two top-level vdev failures * will guarantee the loss of data. With consecutive allocation, * only two adjacent top-level vdev failures will result in data loss. * * If we are doing gang blocks (hintdva is non-NULL), try to keep * ourselves on the same vdev as our gang block header. That * way, we can hope for locality in vdev_cache, plus it makes our * fault domains something tractable. */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); /* * It's possible the vdev we're using as the hint no * longer exists or its mg has been closed (e.g. by * device removal). Consult the rotor when * all else fails. */ if (vd != NULL && vd->vdev_mg != NULL) { mg = vd->vdev_mg; if (flags & METASLAB_HINTBP_AVOID && mg->mg_next != NULL) mg = mg->mg_next; } else { mg = mc->mc_rotor; } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; } else { ASSERT(mc->mc_rotor != NULL); mg = mc->mc_rotor; } /* * If the hint put us into the wrong metaslab class, or into a * metaslab group that has been passivated, just follow the rotor. */ if (mg->mg_class != mc || mg->mg_activation_count <= 0) mg = mc->mc_rotor; rotor = mg; top: do { boolean_t allocatable; ASSERT(mg->mg_activation_count == 1); vd = mg->mg_vd; /* * Don't allocate from faulted devices. */ if (try_hard) { spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); allocatable = vdev_allocatable(vd); spa_config_exit(spa, SCL_ZIO, FTAG); } else { allocatable = vdev_allocatable(vd); } /* * Determine if the selected metaslab group is eligible * for allocations. If we're ganging then don't allow * this metaslab group to skip allocations since that would * inadvertently return ENOSPC and suspend the pool * even though space is still available. */ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, psize, allocator); } if (!allocatable) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_NOT_ALLOCATABLE, allocator); goto next; } ASSERT(mg->mg_initialized); /* * Avoid writing single-copy data to a failing, * non-redundant vdev, unless we've already tried all * other vdevs. */ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_VDEV_ERROR, allocator); goto next; } ASSERT(mg->mg_class == mc); uint64_t asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); /* * If we don't need to try hard, then require that the * block be on an different metaslab from any other DVAs * in this BP (unique=true). If we are trying hard, then * allow any metaslab to be used (unique=false). */ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, !try_hard, dva, d, allocator); if (offset != -1ULL) { /* * If we've just selected this metaslab group, * figure out whether the corresponding vdev is * over- or under-used relative to the pool, * and set an allocation bias to even it out. */ if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vu, cu; vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); /* * Calculate how much more or less we should * try to allocate from this device during * this iteration around the rotor. * For example, if a device is 80% full * and the pool is 20% full then we should * reduce allocations by 60% on this device. * * mg_bias = (20 - 80) * 512K / 100 = -307K * * This reduces allocations by 307K for this * iteration. */ mg->mg_bias = ((cu - vu) * (int64_t)mg->mg_aliquot) / 100; } else if (!metaslab_bias_enabled) { mg->mg_bias = 0; } if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mc->mc_rotor = mg->mg_next; mc->mc_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_OFFSET(&dva[d], offset); DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); DVA_SET_ASIZE(&dva[d], asize); return (0); } next: mc->mc_rotor = mg->mg_next; mc->mc_aliquot = 0; } while ((mg = mg->mg_next) != rotor); /* * If we haven't tried hard, do so now. */ if (!try_hard) { try_hard = B_TRUE; goto top; } bzero(&dva[d], sizeof (dva_t)); metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); } void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, boolean_t checkpoint) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; ASSERT(vdev_is_concrete(vd)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); metaslab_check_free_impl(vd, offset, asize); mutex_enter(&msp->ms_lock); if (range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing)) { vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); } if (checkpoint) { ASSERT(spa_has_checkpoint(spa)); range_tree_add(msp->ms_checkpointing, offset, asize); } else { range_tree_add(msp->ms_freeing, offset, asize); } mutex_exit(&msp->ms_lock); } /* ARGSUSED */ void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { boolean_t *checkpoint = arg; ASSERT3P(checkpoint, !=, NULL); if (vd->vdev_ops->vdev_op_remap != NULL) vdev_indirect_mark_obsolete(vd, offset, size); else metaslab_free_impl(vd, offset, size, *checkpoint); } static void metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, boolean_t checkpoint) { spa_t *spa = vd->vdev_spa; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) return; if (spa->spa_vdev_removal != NULL && spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && vdev_is_concrete(vd)) { /* * Note: we check if the vdev is concrete because when * we complete the removal, we first change the vdev to be * an indirect vdev (in open context), and then (in syncing * context) clear spa_vdev_removal. */ free_from_removing_vdev(vd, offset, size); } else if (vd->vdev_ops->vdev_op_remap != NULL) { vdev_indirect_mark_obsolete(vd, offset, size); vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_free_impl_cb, &checkpoint); } else { metaslab_free_concrete(vd, offset, size, checkpoint); } } typedef struct remap_blkptr_cb_arg { blkptr_t *rbca_bp; spa_remap_cb_t rbca_cb; vdev_t *rbca_remap_vd; uint64_t rbca_remap_offset; void *rbca_cb_arg; } remap_blkptr_cb_arg_t; void remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { remap_blkptr_cb_arg_t *rbca = arg; blkptr_t *bp = rbca->rbca_bp; /* We can not remap split blocks. */ if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) return; ASSERT0(inner_offset); if (rbca->rbca_cb != NULL) { /* * At this point we know that we are not handling split * blocks and we invoke the callback on the previous * vdev which must be indirect. */ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); /* set up remap_blkptr_cb_arg for the next call */ rbca->rbca_remap_vd = vd; rbca->rbca_remap_offset = offset; } /* * The phys birth time is that of dva[0]. This ensures that we know * when each dva was written, so that resilver can determine which * blocks need to be scrubbed (i.e. those written during the time * the vdev was offline). It also ensures that the key used in * the ARC hash table is unique (i.e. dva[0] + phys_birth). If * we didn't change the phys_birth, a lookup in the ARC for a * remapped BP could find the data that was previously stored at * this vdev + offset. */ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(&bp->blk_dva[0])); vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); } /* * If the block pointer contains any indirect DVAs, modify them to refer to * concrete DVAs. Note that this will sometimes not be possible, leaving * the indirect DVA in place. This happens if the indirect DVA spans multiple * segments in the mapping (i.e. it is a "split block"). * * If the BP was remapped, calls the callback on the original dva (note the * callback can be called multiple times if the original indirect DVA refers * to another indirect DVA, etc). * * Returns TRUE if the BP was remapped. */ boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) { remap_blkptr_cb_arg_t rbca; if (!zfs_remap_blkptr_enable) return (B_FALSE); if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) return (B_FALSE); /* * Dedup BP's can not be remapped, because ddt_phys_select() depends * on DVA[0] being the same in the BP as in the DDT (dedup table). */ if (BP_GET_DEDUP(bp)) return (B_FALSE); /* * Gang blocks can not be remapped, because * zio_checksum_gang_verifier() depends on the DVA[0] that's in * the BP used to read the gang block header (GBH) being the same * as the DVA[0] that we allocated for the GBH. */ if (BP_IS_GANG(bp)) return (B_FALSE); /* * Embedded BP's have no DVA to remap. */ if (BP_GET_NDVAS(bp) < 1) return (B_FALSE); /* * Note: we only remap dva[0]. If we remapped other dvas, we * would no longer know what their phys birth txg is. */ dva_t *dva = &bp->blk_dva[0]; uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd->vdev_ops->vdev_op_remap == NULL) return (B_FALSE); rbca.rbca_bp = bp; rbca.rbca_cb = callback; rbca.rbca_remap_vd = vd; rbca.rbca_remap_offset = offset; rbca.rbca_cb_arg = arg; /* * remap_blkptr_cb() will be called in order for each level of * indirection, until a concrete vdev is reached or a split block is * encountered. old_vd and old_offset are updated within the callback * as we go from the one indirect vdev to the next one (either concrete * or indirect again) in that order. */ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); /* Check if the DVA wasn't remapped because it is a split block */ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) return (B_FALSE); return (B_TRUE); } /* * Undo the allocation of a DVA which happened in the given transaction group. */ void metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { metaslab_t *msp; vdev_t *vd; uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (txg > spa_freeze_txg(spa)) return; if ((vd = vdev_lookup_top(spa, vdev)) == NULL || (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", (u_longlong_t)vdev, (u_longlong_t)offset); ASSERT(0); return; } ASSERT(!vd->vdev_removing); ASSERT(vdev_is_concrete(vd)); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); if (DVA_GET_GANG(dva)) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } /* * Free the block represented by the given DVA. */ void metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, vdev); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (DVA_GET_GANG(dva)) { size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); } metaslab_free_impl(vd, offset, size, checkpoint); } /* * Reserve some allocation slots. The reservation system must be called * before we call into the allocator. If there aren't any available slots * then the I/O will be throttled until an I/O completes and its slots are * freed up. The function returns true if it was successful in placing * the reservation. */ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio, int flags) { uint64_t available_slots = 0; boolean_t slot_reserved = B_FALSE; uint64_t max = mc->mc_alloc_max_slots[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); uint64_t reserved_slots = zfs_refcount_count(&mc->mc_alloc_slots[allocator]); if (reserved_slots < max) available_slots = max - reserved_slots; if (slots <= available_slots || GANG_ALLOCATION(flags) || flags & METASLAB_MUST_RESERVE) { /* * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. */ for (int d = 0; d < slots; d++) { reserved_slots = zfs_refcount_add(&mc->mc_alloc_slots[allocator], zio); } zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; slot_reserved = B_TRUE; } mutex_exit(&mc->mc_lock); return (slot_reserved); } void metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio) { ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); for (int d = 0; d < slots; d++) { (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], zio); } mutex_exit(&mc->mc_lock); } static int metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; int error = 0; if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) return (ENXIO); ASSERT3P(vd->vdev_ms, !=, NULL); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); /* * No need to fail in that case; someone else has activated the * metaslab, but that doesn't preclude us from using it. */ if (error == EBUSY) error = 0; if (error == 0 && !range_tree_contains(msp->ms_allocatable, offset, size)) error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } VERIFY(!msp->ms_condensing); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, msp->ms_size); range_tree_remove(msp->ms_allocatable, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], offset, size); } mutex_exit(&msp->ms_lock); return (0); } typedef struct metaslab_claim_cb_arg_t { uint64_t mcca_txg; int mcca_error; } metaslab_claim_cb_arg_t; /* ARGSUSED */ static void metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { metaslab_claim_cb_arg_t *mcca_arg = arg; if (mcca_arg->mcca_error == 0) { mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, size, mcca_arg->mcca_txg); } } int metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { if (vd->vdev_ops->vdev_op_remap != NULL) { metaslab_claim_cb_arg_t arg; /* * Only zdb(1M) can claim on indirect vdevs. This is used * to detect leaks of mapped space (that are not accounted * for in the obsolete counts, spacemap, or bpobj). */ ASSERT(!spa_writeable(vd->vdev_spa)); arg.mcca_error = 0; arg.mcca_txg = txg; vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_claim_impl_cb, &arg); if (arg.mcca_error == 0) { arg.mcca_error = metaslab_claim_concrete(vd, offset, size, txg); } return (arg.mcca_error); } else { return (metaslab_claim_concrete(vd, offset, size, txg)); } } /* * Intent log support: upon opening the pool after a crash, notify the SPA * of blocks that the intent log has allocated for immediate write, but * which are still considered free by the SPA because the last transaction * group didn't commit yet. */ static int metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { return (SET_ERROR(ENXIO)); } ASSERT(DVA_IS_VALID(dva)); if (DVA_GET_GANG(dva)) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); return (metaslab_claim_impl(vd, offset, size, txg)); } int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, zio_t *zio, int allocator) { dva_t *dva = bp->blk_dva; - dva_t *hintdva = hintbp->blk_dva; + dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; int error = 0; ASSERT(bp->blk_birth == 0); ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); if (mc->mc_rotor == NULL) { /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); return (SET_ERROR(ENOSPC)); } ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, !=, NULL); for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator, B_FALSE); bzero(&dva[d], sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); } else { /* * Update the metaslab group's queue depth * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); spa_config_exit(spa, SCL_ALLOC, FTAG); BP_SET_BIRTH(bp, txg, txg); return (0); } void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that * the blocks that we free that are part of the checkpoint won't be * reused until the checkpoint is discarded or we revert to it. * * The checkpoint flag is passed down the metaslab_free code path * and is set whenever we want to add a block to the checkpoint's * accounting. That is, we "checkpoint" blocks that existed at the * time the checkpoint was created and are therefore referenced by * the checkpointed uberblock. * * Note that, we don't checkpoint any blocks if the current * syncing txg <= spa_checkpoint_txg. We want these frees to sync * normally as they will be referenced by the checkpointed uberblock. */ boolean_t checkpoint = B_FALSE; if (bp->blk_birth <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint * there is no way it was created in the current txg. */ ASSERT(!now); ASSERT3U(spa_syncing_txg(spa), ==, txg); checkpoint = B_TRUE; } spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { if (now) { metaslab_unalloc_dva(spa, &dva[d], txg); } else { ASSERT3U(txg, ==, spa_syncing_txg(spa)); metaslab_free_dva(spa, &dva[d], checkpoint); } } spa_config_exit(spa, SCL_FREE, FTAG); } int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); int error = 0; ASSERT(!BP_IS_HOLE(bp)); if (txg != 0) { /* * First do a dry run to make sure all DVAs are claimable, * so we don't have to unwind from partial failures below. */ if ((error = metaslab_claim(spa, bp, 0)) != 0) return (error); } spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { error = metaslab_claim_dva(spa, &dva[d], txg); if (error != 0) break; } spa_config_exit(spa, SCL_ALLOC, FTAG); ASSERT(error == 0 || txg == 0); return (error); } /* ARGSUSED */ static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { if (vd->vdev_ops == &vdev_indirect_ops) return; metaslab_check_free_impl(vd, offset, size); } static void metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; if (vd->vdev_ops->vdev_op_remap != NULL) { vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_check_free_impl_cb, NULL); return; } ASSERT(vdev_is_concrete(vd)); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); - if (msp->ms_loaded) - range_tree_verify(msp->ms_allocatable, offset, size); + if (msp->ms_loaded) { + range_tree_verify_not_present(msp->ms_allocatable, + offset, size); + } - range_tree_verify(msp->ms_freeing, offset, size); - range_tree_verify(msp->ms_checkpointing, offset, size); - range_tree_verify(msp->ms_freed, offset, size); + range_tree_verify_not_present(msp->ms_freeing, offset, size); + range_tree_verify_not_present(msp->ms_checkpointing, offset, size); + range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) - range_tree_verify(msp->ms_defer[j], offset, size); + range_tree_verify_not_present(msp->ms_defer[j], offset, size); mutex_exit(&msp->ms_lock); } void metaslab_check_free(spa_t *spa, const blkptr_t *bp) { if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); if (DVA_GET_GANG(&bp->blk_dva[i])) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); ASSERT3P(vd, !=, NULL); metaslab_check_free_impl(vd, offset, size); } spa_config_exit(spa, SCL_VDEV, FTAG); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c (revision 354383) @@ -1,421 +1,419 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #include #include #include #include #include #include kmem_cache_t *range_seg_cache; void range_tree_init(void) { ASSERT(range_seg_cache == NULL); range_seg_cache = kmem_cache_create("range_seg_cache", sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void range_tree_fini(void) { kmem_cache_destroy(range_seg_cache); range_seg_cache = NULL; } void range_tree_stat_verify(range_tree_t *rt) { range_seg_t *rs; uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; int i; for (rs = avl_first(&rt->rt_root); rs != NULL; rs = AVL_NEXT(&rt->rt_root, rs)) { uint64_t size = rs->rs_end - rs->rs_start; int idx = highbit64(size) - 1; hist[idx]++; ASSERT3U(hist[idx], !=, 0); } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { if (hist[i] != rt->rt_histogram[i]) { zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu", i, hist, hist[i], rt->rt_histogram[i]); } VERIFY3U(hist[i], ==, rt->rt_histogram[i]); } } static void range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) { uint64_t size = rs->rs_end - rs->rs_start; int idx = highbit64(size) - 1; ASSERT(size != 0); ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); rt->rt_histogram[idx]++; ASSERT3U(rt->rt_histogram[idx], !=, 0); } static void range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) { uint64_t size = rs->rs_end - rs->rs_start; int idx = highbit64(size) - 1; ASSERT(size != 0); ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); ASSERT3U(rt->rt_histogram[idx], !=, 0); rt->rt_histogram[idx]--; } /* * NOTE: caller is responsible for all locking. */ static int range_tree_seg_compare(const void *x1, const void *x2) { const range_seg_t *r1 = (const range_seg_t *)x1; const range_seg_t *r2 = (const range_seg_t *)x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } range_tree_t * range_tree_create(range_tree_ops_t *ops, void *arg) { range_tree_t *rt; rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); avl_create(&rt->rt_root, range_tree_seg_compare, sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); rt->rt_ops = ops; rt->rt_arg = arg; if (rt->rt_ops != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); return (rt); } void range_tree_destroy(range_tree_t *rt) { VERIFY0(rt->rt_space); if (rt->rt_ops != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); avl_destroy(&rt->rt_root); kmem_free(rt, sizeof (*rt)); } void range_tree_add(void *arg, uint64_t start, uint64_t size) { range_tree_t *rt = arg; avl_index_t where; range_seg_t rsearch, *rs_before, *rs_after, *rs; uint64_t end = start + size; boolean_t merge_before, merge_after; VERIFY(size != 0); rsearch.rs_start = start; rsearch.rs_end = end; rs = avl_find(&rt->rt_root, &rsearch, &where); if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) { zfs_panic_recover("zfs: allocating allocated segment" "(offset=%llu size=%llu)\n", (longlong_t)start, (longlong_t)size); return; } /* Make sure we don't overlap with either of our neighbors */ VERIFY3P(rs, ==, NULL); rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); merge_before = (rs_before != NULL && rs_before->rs_end == start); merge_after = (rs_after != NULL && rs_after->rs_start == end); if (merge_before && merge_after) { avl_remove(&rt->rt_root, rs_before); if (rt->rt_ops != NULL) { rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); } range_tree_stat_decr(rt, rs_before); range_tree_stat_decr(rt, rs_after); rs_after->rs_start = rs_before->rs_start; kmem_cache_free(range_seg_cache, rs_before); rs = rs_after; } else if (merge_before) { if (rt->rt_ops != NULL) rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); range_tree_stat_decr(rt, rs_before); rs_before->rs_end = end; rs = rs_before; } else if (merge_after) { if (rt->rt_ops != NULL) rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); range_tree_stat_decr(rt, rs_after); rs_after->rs_start = start; rs = rs_after; } else { rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); rs->rs_start = start; rs->rs_end = end; avl_insert(&rt->rt_root, rs, where); } if (rt->rt_ops != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); range_tree_stat_incr(rt, rs); rt->rt_space += size; } void range_tree_remove(void *arg, uint64_t start, uint64_t size) { range_tree_t *rt = arg; avl_index_t where; range_seg_t rsearch, *rs, *newseg; uint64_t end = start + size; boolean_t left_over, right_over; VERIFY3U(size, !=, 0); VERIFY3U(size, <=, rt->rt_space); rsearch.rs_start = start; rsearch.rs_end = end; rs = avl_find(&rt->rt_root, &rsearch, &where); /* Make sure we completely overlap with someone */ if (rs == NULL) { zfs_panic_recover("zfs: freeing free segment " "(offset=%llu size=%llu)", (longlong_t)start, (longlong_t)size); return; } VERIFY3U(rs->rs_start, <=, start); VERIFY3U(rs->rs_end, >=, end); left_over = (rs->rs_start != start); right_over = (rs->rs_end != end); range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); newseg->rs_start = end; newseg->rs_end = rs->rs_end; range_tree_stat_incr(rt, newseg); rs->rs_end = start; avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); if (rt->rt_ops != NULL) rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); } else if (left_over) { rs->rs_end = start; } else if (right_over) { rs->rs_start = end; } else { avl_remove(&rt->rt_root, rs); kmem_cache_free(range_seg_cache, rs); rs = NULL; } if (rs != NULL) { range_tree_stat_incr(rt, rs); if (rt->rt_ops != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } rt->rt_space -= size; } static range_seg_t * range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t rsearch; uint64_t end = start + size; VERIFY(size != 0); rsearch.rs_start = start; rsearch.rs_end = end; return (avl_find(&rt->rt_root, &rsearch, NULL)); } static range_seg_t * range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t *rs = range_tree_find_impl(rt, start, size); if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size) return (rs); return (NULL); } void -range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) +range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size) { - range_seg_t *rs; - - rs = range_tree_find(rt, off, size); + range_seg_t *rs = range_tree_find(rt, off, size); if (rs != NULL) - panic("freeing free block; rs=%p", (void *)rs); + panic("segment already in tree; rs=%p", (void *)rs); } boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) { return (range_tree_find(rt, start, size) != NULL); } /* * Ensure that this range is not in the tree, regardless of whether * it is currently in the tree. */ void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t *rs; if (size == 0) return; while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { uint64_t free_start = MAX(rs->rs_start, start); uint64_t free_end = MIN(rs->rs_end, start + size); range_tree_remove(rt, free_start, free_end - free_start); } } void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) { range_tree_t *rt; ASSERT0(range_tree_space(*rtdst)); ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); rt = *rtsrc; *rtsrc = *rtdst; *rtdst = rt; } void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) { range_seg_t *rs; void *cookie = NULL; if (rt->rt_ops != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { if (func != NULL) func(arg, rs->rs_start, rs->rs_end - rs->rs_start); kmem_cache_free(range_seg_cache, rs); } bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); rt->rt_space = 0; } void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { range_seg_t *rs; for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) func(arg, rs->rs_start, rs->rs_end - rs->rs_start); } uint64_t range_tree_space(range_tree_t *rt) { return (rt->rt_space); } boolean_t range_tree_is_empty(range_tree_t *rt) { ASSERT(rt != NULL); return (range_tree_space(rt) == 0); } uint64_t range_tree_min(range_tree_t *rt) { range_seg_t *rs = avl_first(&rt->rt_root); return (rs != NULL ? rs->rs_start : 0); } uint64_t range_tree_max(range_tree_t *rt) { range_seg_t *rs = avl_last(&rt->rt_root); return (rs != NULL ? rs->rs_end : 0); } uint64_t range_tree_span(range_tree_t *rt) { return (range_tree_max(rt) - range_tree_min(rt)); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/spa_checkpoint.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/spa_checkpoint.c (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa_checkpoint.c (revision 354383) @@ -1,623 +1,623 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2017 by Delphix. All rights reserved. */ /* * Storage Pool Checkpoint * * A storage pool checkpoint can be thought of as a pool-wide snapshot or * a stable version of extreme rewind that guarantees no blocks from the * checkpointed state will have been overwritten. It remembers the entire * state of the storage pool (e.g. snapshots, dataset names, etc..) from the * point that it was taken and the user can rewind back to that point even if * they applied destructive operations on their datasets or even enabled new * zpool on-disk features. If a pool has a checkpoint that is no longer * needed, the user can discard it. * * == On disk data structures used == * * - The pool has a new feature flag and a new entry in the MOS. The feature * flag is set to active when we create the checkpoint and remains active * until the checkpoint is fully discarded. The entry in the MOS config * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that * references the state of the pool when we take the checkpoint. The entry * remains populated until we start discarding the checkpoint or we rewind * back to it. * * - Each vdev contains a vdev-wide space map while the pool has a checkpoint, * which persists until the checkpoint is fully discarded. The space map * contains entries that have been freed in the current state of the pool * but we want to keep around in case we decide to rewind to the checkpoint. * [see vdev_checkpoint_sm] * * - Each metaslab's ms_sm space map behaves the same as without the * checkpoint, with the only exception being the scenario when we free * blocks that belong to the checkpoint. In this case, these blocks remain * ALLOCATED in the metaslab's space map and they are added as FREE in the * vdev's checkpoint space map. * * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that * the uberblock was checkpointed. For normal uberblocks this field is 0. * * == Overview of operations == * * - To create a checkpoint, we first wait for the current TXG to be synced, * so we can use the most recently synced uberblock (spa_ubsync) as the * checkpointed uberblock. Then we use an early synctask to place that * uberblock in MOS config, increment the feature flag for the checkpoint * (marking it active), and setting spa_checkpoint_txg (see its use below) * to the TXG of the checkpointed uberblock. We use an early synctask for * the aforementioned operations to ensure that no blocks were dirtied * between the current TXG and the TXG of the checkpointed uberblock * (e.g the previous txg). * * - When a checkpoint exists, we need to ensure that the blocks that * belong to the checkpoint are freed but never reused. This means that * these blocks should never end up in the ms_allocatable or the ms_freeing * trees of a metaslab. Therefore, whenever there is a checkpoint the new * ms_checkpointing tree is used in addition to the aforementioned ones. * * Whenever a block is freed and we find out that it is referenced by the * checkpoint (we find out by comparing its birth to spa_checkpoint_txg), * we place it in the ms_checkpointing tree instead of the ms_freeingtree. * This way, we divide the blocks that are being freed into checkpointed * and not-checkpointed blocks. * * In order to persist these frees, we write the extents from the * ms_freeingtree to the ms_sm as usual, and the extents from the * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these * checkpointed extents will remain allocated in the metaslab's ms_sm space * map, and therefore won't be reused [see metaslab_sync()]. In addition, * when we discard the checkpoint, we can find the entries that have * actually been freed in vdev_checkpoint_sm. * [see spa_checkpoint_discard_thread_sync()] * * - To discard the checkpoint we use an early synctask to delete the * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0, * and wakeup the discarding zthr thread (an open-context async thread). * We use an early synctask to ensure that the operation happens before any * new data end up in the checkpoint's data structures. * * Once the synctask is done and the discarding zthr is awake, we discard * the checkpointed data over multiple TXGs by having the zthr prefetching * entries from vdev_checkpoint_sm and then starting a synctask that places * them as free blocks in to their respective ms_allocatable and ms_sm * structures. * [see spa_checkpoint_discard_thread()] * * When there are no entries left in the vdev_checkpoint_sm of all * top-level vdevs, a final synctask runs that decrements the feature flag. * * - To rewind to the checkpoint, we first use the current uberblock and * open the MOS so we can access the checkpointed uberblock from the MOS * config. After we retrieve the checkpointed uberblock, we use it as the * current uberblock for the pool by writing it to disk with an updated * TXG, opening its version of the MOS, and moving on as usual from there. * [see spa_ld_checkpoint_rewind()] * * An important note on rewinding to the checkpoint has to do with how we * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL * blocks that have not been claimed by the time we took the checkpoint * as they should no longer be valid. * [see comment in zil_claim()] * * == Miscellaneous information == * * - In the hypothetical event that we take a checkpoint, remove a vdev, * and attempt to rewind, the rewind would fail as the checkpointed * uberblock would reference data in the removed device. For this reason * and others of similar nature, we disallow the following operations that * can change the config: - * vdev removal and attach/detach, mirror splitting, and pool reguid. + * vdev removal and attach/detach, mirror splitting, and pool reguid. * * - As most of the checkpoint logic is implemented in the SPA and doesn't * distinguish datasets when it comes to space accounting, having a * checkpoint can potentially break the boundaries set by dataset * reservations. */ #include #include #include #include #include #include #include #include #include #include /* * The following parameter limits the amount of memory to be used for the * prefetching of the checkpoint space map done on each vdev while * discarding the checkpoint. * * The reason it exists is because top-level vdevs with long checkpoint * space maps can potentially take up a lot of memory depending on the * amount of checkpointed data that has been freed within them while * the pool had a checkpoint. */ uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024; int spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) { if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); bzero(pcs, sizeof (pool_checkpoint_stat_t)); int error = zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); ASSERT(error == 0 || error == ENOENT); if (error == ENOENT) pcs->pcs_state = CS_CHECKPOINT_DISCARDING; else pcs->pcs_state = CS_CHECKPOINT_EXISTS; pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace; pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp; return (0); } static void spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = arg; spa->spa_checkpoint_info.sci_timestamp = 0; spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); spa_history_log_internal(spa, "spa discard checkpoint", tx, "finished discarding checkpointed state from the pool"); } typedef struct spa_checkpoint_discard_sync_callback_arg { vdev_t *sdc_vd; uint64_t sdc_txg; uint64_t sdc_entry_limit; } spa_checkpoint_discard_sync_callback_arg_t; static int spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) { spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; vdev_t *vd = sdc->sdc_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; if (sdc->sdc_entry_limit == 0) return (EINTR); /* * Since the space map is not condensed, we know that * none of its entries is crossing the boundaries of * its respective metaslab. * * That said, there is no fundamental requirement that * the checkpoint's space map entries should not cross * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. */ VERIFY3U(sme->sme_type, ==, SM_FREE); VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * At this point we should not be processing any * other frees concurrently, so the lock is technically * unnecessary. We use the lock anyway though to * potentially save ourselves from future headaches. */ mutex_enter(&ms->ms_lock); if (range_tree_is_empty(ms->ms_freeing)) vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, sme->sme_run); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run); vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run; vd->vdev_stat.vs_checkpoint_space -= sme->sme_run; sdc->sdc_entry_limit--; return (0); } static void spa_checkpoint_accounting_verify(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t ckpoint_sm_space_sum = 0; uint64_t vs_ckpoint_space_sum = 0; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (vd->vdev_checkpoint_sm != NULL) { ckpoint_sm_space_sum += - -vd->vdev_checkpoint_sm->sm_alloc; + -space_map_allocated(vd->vdev_checkpoint_sm); vs_ckpoint_space_sum += vd->vdev_stat.vs_checkpoint_space; ASSERT3U(ckpoint_sm_space_sum, ==, vs_ckpoint_space_sum); } else { ASSERT0(vd->vdev_stat.vs_checkpoint_space); } } ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum); } static void spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) { vdev_t *vd = arg; int error; /* * The space map callback is applied only to non-debug entries. * Because the number of debug entries is less or equal to the * number of non-debug entries, we want to ensure that we only * read what we prefetched from open-context. * * Thus, we set the maximum entries that the space map callback * will be applied to be half the entries that could fit in the * imposed memory limit. * * Note that since this is a conservative estimate we also * assume the worst case scenario in our computation where each * entry is two-word. */ uint64_t max_entry_limit = (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1; /* * Iterate from the end of the space map towards the beginning, * placing its entries on ms_freeing and removing them from the * space map. The iteration stops if one of the following * conditions is true: * * 1] We reached the beginning of the space map. At this point * the space map should be completely empty and * space_map_incremental_destroy should have returned 0. * The next step would be to free and close the space map * and remove its entry from its vdev's top zap. This allows * spa_checkpoint_discard_thread() to move on to the next vdev. * * 2] We reached the memory limit (amount of memory used to hold * space map entries in memory) and space_map_incremental_destroy * returned EINTR. This means that there are entries remaining * in the space map that will be cleared in a future invocation * of this function by spa_checkpoint_discard_thread(). */ spa_checkpoint_discard_sync_callback_arg_t sdc; sdc.sdc_vd = vd; sdc.sdc_txg = tx->tx_txg; sdc.sdc_entry_limit = max_entry_limit; uint64_t words_before = space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, spa_checkpoint_discard_sync_callback, &sdc, tx); uint64_t words_after = space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); #ifdef DEBUG spa_checkpoint_accounting_verify(vd->vdev_spa); #endif zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " "deleted %llu words - %llu words are left", tx->tx_txg, vd->vdev_id, (words_before - words_after), words_after); if (error != EINTR) { if (error != 0) { zfs_panic_recover("zfs: error %d was returned " "while incrementally destroying the checkpoint " "space map of vdev %llu\n", error, vd->vdev_id); } ASSERT0(words_after); - ASSERT0(vd->vdev_checkpoint_sm->sm_alloc); + ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm)); ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); space_map_free(vd->vdev_checkpoint_sm, tx); space_map_close(vd->vdev_checkpoint_sm); vd->vdev_checkpoint_sm = NULL; VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); } } static boolean_t spa_checkpoint_discard_is_done(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(!spa_has_checkpoint(spa)); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)); for (uint64_t c = 0; c < rvd->vdev_children; c++) { if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL) return (B_FALSE); ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space); } return (B_TRUE); } /* ARGSUSED */ boolean_t spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) { spa_t *spa = arg; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (B_FALSE); if (spa_has_checkpoint(spa)) return (B_FALSE); return (B_TRUE); } void spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; while (vd->vdev_checkpoint_sm != NULL) { space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm; int numbufs; dmu_buf_t **dbp; if (zthr_iscancelled(zthr)) return; ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); uint64_t size = MIN(space_map_length(checkpoint_sm), zfs_spa_discard_memory_limit); uint64_t offset = space_map_length(checkpoint_sm) - size; /* * Ensure that the part of the space map that will * be destroyed by the synctask, is prefetched in * memory before the synctask runs. */ int error = dmu_buf_hold_array_by_bonus( checkpoint_sm->sm_dbuf, offset, size, B_TRUE, FTAG, &numbufs, &dbp); if (error != 0) { zfs_panic_recover("zfs: error %d was returned " "while prefetching checkpoint space map " "entries of vdev %llu\n", error, vd->vdev_id); } VERIFY0(dsl_sync_task(spa->spa_name, NULL, spa_checkpoint_discard_thread_sync, vd, 0, ZFS_SPACE_CHECK_NONE)); dmu_buf_rele_array(dbp, numbufs, FTAG); } } VERIFY(spa_checkpoint_discard_is_done(spa)); VERIFY0(spa->spa_checkpoint_info.sci_dspace); VERIFY0(dsl_sync_task(spa->spa_name, NULL, spa_checkpoint_discard_complete_sync, spa, 0, ZFS_SPACE_CHECK_NONE)); } /* ARGSUSED */ static int spa_checkpoint_check(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (SET_ERROR(ENOTSUP)); if (!spa_top_vdevs_spacemap_addressable(spa)) return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG)); if (spa->spa_vdev_removal != NULL) return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); if (spa->spa_checkpoint_txg != 0) return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); return (0); } /* ARGSUSED */ static void spa_checkpoint_sync(void *arg, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; uberblock_t checkpoint = spa->spa_ubsync; /* * At this point, there should not be a checkpoint in the MOS. */ ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT); ASSERT0(spa->spa_checkpoint_info.sci_timestamp); ASSERT0(spa->spa_checkpoint_info.sci_dspace); /* * Since the checkpointed uberblock is the one that just got synced * (we use spa_ubsync), its txg must be equal to the txg number of * the txg we are syncing, minus 1. */ ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1); /* * Once the checkpoint is in place, we need to ensure that none of * its blocks will be marked for reuse after it has been freed. * When there is a checkpoint and a block is freed, we compare its * birth txg to the txg of the checkpointed uberblock to see if the * block is part of the checkpoint or not. Therefore, we have to set * spa_checkpoint_txg before any frees happen in this txg (which is * why this is done as an early_synctask as explained in the comment * in spa_checkpoint()). */ spa->spa_checkpoint_txg = checkpoint.ub_txg; spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; checkpoint.ub_checkpoint_txg = checkpoint.ub_txg; VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint, tx)); /* * Increment the feature refcount and thus activate the feature. * Note that the feature will be deactivated when we've * completely discarded all checkpointed state (both vdev * space maps and uberblock). */ spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); spa_history_log_internal(spa, "spa checkpoint", tx, "checkpointed uberblock txg=%llu", checkpoint.ub_txg); } /* * Create a checkpoint for the pool. */ int spa_checkpoint(const char *pool) { int error; spa_t *spa; error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); mutex_enter(&spa->spa_vdev_top_lock); /* * Wait for current syncing txg to finish so the latest synced * uberblock (spa_ubsync) has all the changes that we expect * to see if we were to revert later to the checkpoint. In other * words we want the checkpointed uberblock to include/reference * all the changes that were pending at the time that we issued * the checkpoint command. */ txg_wait_synced(spa_get_dsl(spa), 0); /* * As the checkpointed uberblock references blocks from the previous * txg (spa_ubsync) we want to ensure that are not freeing any of * these blocks in the same txg that the following synctask will * run. Thus, we run it as an early synctask, so the dirty changes * that are synced to disk afterwards during zios and other synctasks * do not reuse checkpointed blocks. */ error = dsl_early_sync_task(pool, spa_checkpoint_check, spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL); mutex_exit(&spa->spa_vdev_top_lock); spa_close(spa, FTAG); return (error); } /* ARGSUSED */ static int spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); if (spa->spa_checkpoint_txg == 0) return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); VERIFY0(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT)); return (0); } /* ARGSUSED */ static void spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, tx)); spa->spa_checkpoint_txg = 0; zthr_wakeup(spa->spa_checkpoint_discard_zthr); spa_history_log_internal(spa, "spa discard checkpoint", tx, "started discarding checkpointed state from the pool"); } /* * Discard the checkpoint from a pool. */ int spa_checkpoint_discard(const char *pool) { /* * Similarly to spa_checkpoint(), we want our synctask to run * before any pending dirty data are written to disk so they * won't end up in the checkpoint's data structures (e.g. * ms_checkpointing and vdev_checkpoint_sm) and re-create any * space maps that the discarding open-context thread has * deleted. * [see spa_discard_checkpoint_sync and spa_discard_checkpoint_thread] */ return (dsl_early_sync_task(pool, spa_checkpoint_discard_check, spa_checkpoint_discard_sync, NULL, 0, ZFS_SPACE_CHECK_DISCARD_CHECKPOINT)); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c (revision 354383) @@ -1,1096 +1,1068 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * Note on space map block size: * * The data for a given space map can be kept on blocks of any size. * Larger blocks entail fewer I/O operations, but they also cause the * DMU to keep more data in-core, and also to waste more I/O bandwidth * when only a few blocks have changed since the last transaction group. */ /* * Enabled whenever we want to stress test the use of double-word * space map entries. */ boolean_t zfs_force_some_double_word_sm_entries = B_FALSE; /* * Override the default indirect block size of 128K, instead using 16K for * spacemaps (2^14 bytes). This dramatically reduces write inflation since * appending to a spacemap typically has to write one data block (4KB) and one * or two indirect blocks (16K-32K, rather than 128K). */ int space_map_ibs = 14; boolean_t sm_entry_is_debug(uint64_t e) { return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX); } boolean_t sm_entry_is_single_word(uint64_t e) { uint8_t prefix = SM_PREFIX_DECODE(e); return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX); } boolean_t sm_entry_is_double_word(uint64_t e) { return (SM_PREFIX_DECODE(e) == SM2_PREFIX); } /* * Iterate through the space map, invoking the callback on each (non-debug) - * space map entry. + * space map entry. Stop after reading 'end' bytes of the space map. */ int -space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) +space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) { - uint64_t sm_len = space_map_length(sm); - ASSERT3U(sm->sm_blksz, !=, 0); + uint64_t blksz = sm->sm_blksz; - dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len, + ASSERT3U(blksz, !=, 0); + ASSERT3U(end, <=, space_map_length(sm)); + ASSERT0(P2PHASE(end, sizeof (uint64_t))); + + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end, ZIO_PRIORITY_SYNC_READ); - uint64_t blksz = sm->sm_blksz; int error = 0; - for (uint64_t block_base = 0; block_base < sm_len && error == 0; + for (uint64_t block_base = 0; block_base < end && error == 0; block_base += blksz) { dmu_buf_t *db; error = dmu_buf_hold(sm->sm_os, space_map_object(sm), block_base, FTAG, &db, DMU_READ_PREFETCH); if (error != 0) return (error); uint64_t *block_start = db->db_data; - uint64_t block_length = MIN(sm_len - block_base, blksz); + uint64_t block_length = MIN(end - block_base, blksz); uint64_t *block_end = block_start + (block_length / sizeof (uint64_t)); VERIFY0(P2PHASE(block_length, sizeof (uint64_t))); VERIFY3U(block_length, !=, 0); ASSERT3U(blksz, ==, db->db_size); for (uint64_t *block_cursor = block_start; block_cursor < block_end && error == 0; block_cursor++) { uint64_t e = *block_cursor; if (sm_entry_is_debug(e)) /* Skip debug entries */ continue; uint64_t raw_offset, raw_run, vdev_id; maptype_t type; if (sm_entry_is_single_word(e)) { type = SM_TYPE_DECODE(e); vdev_id = SM_NO_VDEVID; raw_offset = SM_OFFSET_DECODE(e); raw_run = SM_RUN_DECODE(e); } else { /* it is a two-word entry */ ASSERT(sm_entry_is_double_word(e)); raw_run = SM2_RUN_DECODE(e); vdev_id = SM2_VDEV_DECODE(e); /* move on to the second word */ block_cursor++; e = *block_cursor; VERIFY3P(block_cursor, <=, block_end); type = SM2_TYPE_DECODE(e); raw_offset = SM2_OFFSET_DECODE(e); } uint64_t entry_offset = (raw_offset << sm->sm_shift) + sm->sm_start; uint64_t entry_run = raw_run << sm->sm_shift; VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); ASSERT3U(entry_offset, >=, sm->sm_start); ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size); ASSERT3U(entry_run, <=, sm->sm_size); ASSERT3U(entry_offset + entry_run, <=, sm->sm_start + sm->sm_size); space_map_entry_t sme = { .sme_type = type, .sme_vdev = vdev_id, .sme_offset = entry_offset, .sme_run = entry_run }; error = callback(&sme, arg); } dmu_buf_rele(db, FTAG); } return (error); } /* * Reads the entries from the last block of the space map into * buf in reverse order. Populates nwords with number of words * in the last block. * * Refer to block comment within space_map_incremental_destroy() * to understand why this function is needed. */ static int space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, uint64_t bufsz, uint64_t *nwords) { int error = 0; dmu_buf_t *db; /* * Find the offset of the last word in the space map and use * that to read the last block of the space map with * dmu_buf_hold(). */ uint64_t last_word_offset = - sm->sm_phys->smp_objsize - sizeof (uint64_t); + sm->sm_phys->smp_length - sizeof (uint64_t); error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset, FTAG, &db, DMU_READ_NO_PREFETCH); if (error != 0) return (error); ASSERT3U(sm->sm_object, ==, db->db_object); ASSERT3U(sm->sm_blksz, ==, db->db_size); ASSERT3U(bufsz, >=, db->db_size); ASSERT(nwords != NULL); uint64_t *words = db->db_data; *nwords = - (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t)); uint64_t n = *nwords; uint64_t j = n - 1; for (uint64_t i = 0; i < n; i++) { uint64_t entry = words[i]; if (sm_entry_is_double_word(entry)) { /* * Since we are populating the buffer backwards * we have to be extra careful and add the two * words of the double-word entry in the right * order. */ ASSERT3U(j, >, 0); buf[j - 1] = entry; i++; ASSERT3U(i, <, n); entry = words[i]; buf[j] = entry; j -= 2; } else { ASSERT(sm_entry_is_debug(entry) || sm_entry_is_single_word(entry)); buf[j] = entry; j--; } } /* * Assert that we wrote backwards all the * way to the beginning of the buffer. */ ASSERT3S(j, ==, -1); dmu_buf_rele(db, FTAG); return (error); } /* * Note: This function performs destructive actions - specifically * it deletes entries from the end of the space map. Thus, callers * should ensure that they are holding the appropriate locks for * the space map that they provide. */ int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, dmu_tx_t *tx) { uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); uint64_t *buf = zio_buf_alloc(bufsz); dmu_buf_will_dirty(sm->sm_dbuf, tx); /* * Ideally we would want to iterate from the beginning of the * space map to the end in incremental steps. The issue with this * approach is that we don't have any field on-disk that points * us where to start between each step. We could try zeroing out * entries that we've destroyed, but this doesn't work either as * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]). * * As a result, we destroy its entries incrementally starting from * the end after applying the callback to each of them. * * The problem with this approach is that we cannot literally * iterate through the words in the space map backwards as we * can't distinguish two-word space map entries from their second * word. Thus we do the following: * * 1] We get all the entries from the last block of the space map * and put them into a buffer in reverse order. This way the * last entry comes first in the buffer, the second to last is * second, etc. * 2] We iterate through the entries in the buffer and we apply * the callback to each one. As we move from entry to entry we * we decrease the size of the space map, deleting effectively * each entry. * 3] If there are no more entries in the space map or the callback * returns a value other than 0, we stop iterating over the * space map. If there are entries remaining and the callback * returned 0, we go back to step [1]. */ int error = 0; while (space_map_length(sm) > 0 && error == 0) { uint64_t nwords = 0; error = space_map_reversed_last_block_entries(sm, buf, bufsz, &nwords); if (error != 0) break; ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t)); for (uint64_t i = 0; i < nwords; i++) { uint64_t e = buf[i]; if (sm_entry_is_debug(e)) { - sm->sm_phys->smp_objsize -= sizeof (uint64_t); - space_map_update(sm); + sm->sm_phys->smp_length -= sizeof (uint64_t); continue; } int words = 1; uint64_t raw_offset, raw_run, vdev_id; maptype_t type; if (sm_entry_is_single_word(e)) { type = SM_TYPE_DECODE(e); vdev_id = SM_NO_VDEVID; raw_offset = SM_OFFSET_DECODE(e); raw_run = SM_RUN_DECODE(e); } else { ASSERT(sm_entry_is_double_word(e)); words = 2; raw_run = SM2_RUN_DECODE(e); vdev_id = SM2_VDEV_DECODE(e); /* move to the second word */ i++; e = buf[i]; ASSERT3P(i, <=, nwords); type = SM2_TYPE_DECODE(e); raw_offset = SM2_OFFSET_DECODE(e); } uint64_t entry_offset = (raw_offset << sm->sm_shift) + sm->sm_start; uint64_t entry_run = raw_run << sm->sm_shift; VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); VERIFY3U(entry_offset, >=, sm->sm_start); VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size); VERIFY3U(entry_run, <=, sm->sm_size); VERIFY3U(entry_offset + entry_run, <=, sm->sm_start + sm->sm_size); space_map_entry_t sme = { .sme_type = type, .sme_vdev = vdev_id, .sme_offset = entry_offset, .sme_run = entry_run }; error = callback(&sme, arg); if (error != 0) break; if (type == SM_ALLOC) sm->sm_phys->smp_alloc -= entry_run; else sm->sm_phys->smp_alloc += entry_run; - sm->sm_phys->smp_objsize -= words * sizeof (uint64_t); - space_map_update(sm); + sm->sm_phys->smp_length -= words * sizeof (uint64_t); } } if (space_map_length(sm) == 0) { ASSERT0(error); - ASSERT0(sm->sm_phys->smp_objsize); - ASSERT0(sm->sm_alloc); + ASSERT0(space_map_allocated(sm)); } zio_buf_free(buf, bufsz); return (error); } typedef struct space_map_load_arg { space_map_t *smla_sm; range_tree_t *smla_rt; maptype_t smla_type; } space_map_load_arg_t; static int space_map_load_callback(space_map_entry_t *sme, void *arg) { space_map_load_arg_t *smla = arg; if (sme->sme_type == smla->smla_type) { VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=, smla->smla_sm->sm_size); range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run); } else { range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run); } return (0); } /* - * Load the space map disk into the specified range tree. Segments of maptype - * are added to the range tree, other segment types are removed. + * Load the spacemap into the rangetree, like space_map_load. But only + * read the first 'length' bytes of the spacemap. */ int -space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) +space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t length) { - uint64_t space; - int err; space_map_load_arg_t smla; VERIFY0(range_tree_space(rt)); - space = space_map_allocated(sm); - if (maptype == SM_FREE) { + if (maptype == SM_FREE) range_tree_add(rt, sm->sm_start, sm->sm_size); - space = sm->sm_size - space; - } smla.smla_rt = rt; smla.smla_sm = sm; smla.smla_type = maptype; - err = space_map_iterate(sm, space_map_load_callback, &smla); + int err = space_map_iterate(sm, length, + space_map_load_callback, &smla); - if (err == 0) { - VERIFY3U(range_tree_space(rt), ==, space); - } else { + if (err != 0) range_tree_vacate(rt, NULL, NULL); - } return (err); } +/* + * Load the space map disk into the specified range tree. Segments of maptype + * are added to the range tree, other segment types are removed. + */ +int +space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) +{ + return (space_map_load_length(sm, rt, maptype, space_map_length(sm))); +} + void space_map_histogram_clear(space_map_t *sm) { if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) return; bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram)); } boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt) { /* * Verify that the in-core range tree does not have any * ranges smaller than our sm_shift size. */ for (int i = 0; i < sm->sm_shift; i++) { if (rt->rt_histogram[i] != 0) return (B_FALSE); } return (B_TRUE); } void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) { int idx = 0; ASSERT(dmu_tx_is_syncing(tx)); VERIFY3U(space_map_object(sm), !=, 0); if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) return; dmu_buf_will_dirty(sm->sm_dbuf, tx); ASSERT(space_map_histogram_verify(sm, rt)); /* * Transfer the content of the range tree histogram to the space * map histogram. The space map histogram contains 32 buckets ranging * between 2^sm_shift to 2^(32+sm_shift-1). The range tree, * however, can represent ranges from 2^0 to 2^63. Since the space * map only cares about allocatable blocks (minimum of sm_shift) we * can safely ignore all ranges in the range tree smaller than sm_shift. */ for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { /* * Since the largest histogram bucket in the space map is * 2^(32+sm_shift-1), we need to normalize the values in * the range tree for any bucket larger than that size. For * example given an sm_shift of 9, ranges larger than 2^40 * would get normalized as if they were 1TB ranges. Assume * the range tree had a count of 5 in the 2^44 (16TB) bucket, * the calculation below would normalize this to 5 * 2^4 (16). */ ASSERT3U(i, >=, idx + sm->sm_shift); sm->sm_phys->smp_histogram[idx] += rt->rt_histogram[i] << (i - idx - sm->sm_shift); /* * Increment the space map's index as long as we haven't * reached the maximum bucket size. Accumulate all ranges * larger than the max bucket size into the last bucket. */ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { ASSERT3U(idx + sm->sm_shift, ==, i); idx++; ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); } } } static void space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) { dmu_buf_will_dirty(sm->sm_dbuf, tx); uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | SM_DEBUG_ACTION_ENCODE(maptype) | SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) | SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize, + dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length, sizeof (dentry), &dentry, tx); - sm->sm_phys->smp_objsize += sizeof (dentry); + sm->sm_phys->smp_length += sizeof (dentry); } /* * Writes one or more entries given a segment. * * Note: The function may release the dbuf from the pointer initially * passed to it, and return a different dbuf. Also, the space map's * dbuf must be dirty for the changes in sm_phys to take effect. */ static void space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx) { ASSERT3U(words, !=, 0); ASSERT3U(words, <=, 2); /* ensure the vdev_id can be represented by the space map */ ASSERT3U(vdev_id, <=, SM_NO_VDEVID); /* * if this is a single word entry, ensure that no vdev was * specified. */ IMPLY(words == 1, vdev_id == SM_NO_VDEVID); dmu_buf_t *db = *dbp; ASSERT3U(db->db_size, ==, sm->sm_blksz); uint64_t *block_base = db->db_data; uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t)); uint64_t *block_cursor = block_base + - (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); ASSERT3P(block_cursor, <=, block_end); uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX; ASSERT3U(rs->rs_start, >=, sm->sm_start); ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size); ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size); ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size); while (size != 0) { ASSERT3P(block_cursor, <=, block_end); /* * If we are at the end of this block, flush it and start * writing again from the beginning. */ if (block_cursor == block_end) { dmu_buf_rele(db, tag); - uint64_t next_word_offset = sm->sm_phys->smp_objsize; + uint64_t next_word_offset = sm->sm_phys->smp_length; VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), next_word_offset, tag, &db, DMU_READ_PREFETCH)); dmu_buf_will_dirty(db, tx); /* update caller's dbuf */ *dbp = db; ASSERT3U(db->db_size, ==, sm->sm_blksz); block_base = db->db_data; block_cursor = block_base; block_end = block_base + (db->db_size / sizeof (uint64_t)); } /* * If we are writing a two-word entry and we only have one * word left on this block, just pad it with an empty debug * entry and write the two-word entry in the next block. */ uint64_t *next_entry = block_cursor + 1; if (next_entry == block_end && words > 1) { ASSERT3U(words, ==, 2); *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | SM_DEBUG_ACTION_ENCODE(0) | SM_DEBUG_SYNCPASS_ENCODE(0) | SM_DEBUG_TXG_ENCODE(0); block_cursor++; - sm->sm_phys->smp_objsize += sizeof (uint64_t); + sm->sm_phys->smp_length += sizeof (uint64_t); ASSERT3P(block_cursor, ==, block_end); continue; } uint64_t run_len = MIN(size, run_max); switch (words) { case 1: *block_cursor = SM_OFFSET_ENCODE(start) | SM_TYPE_ENCODE(maptype) | SM_RUN_ENCODE(run_len); block_cursor++; break; case 2: /* write the first word of the entry */ *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) | SM2_RUN_ENCODE(run_len) | SM2_VDEV_ENCODE(vdev_id); block_cursor++; /* move on to the second word of the entry */ ASSERT3P(block_cursor, <, block_end); *block_cursor = SM2_TYPE_ENCODE(maptype) | SM2_OFFSET_ENCODE(start); block_cursor++; break; default: panic("%d-word space map entries are not supported", words); break; } - sm->sm_phys->smp_objsize += words * sizeof (uint64_t); + sm->sm_phys->smp_length += words * sizeof (uint64_t); start += run_len; size -= run_len; } ASSERT0(size); } /* * Note: The space map's dbuf must be dirty for the changes in sm_phys to * take effect. */ static void space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, uint64_t vdev_id, dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; dmu_buf_t *db; space_map_write_intro_debug(sm, maptype, tx); #ifdef DEBUG /* * We do this right after we write the intro debug entry * because the estimate does not take it into account. */ - uint64_t initial_objsize = sm->sm_phys->smp_objsize; + uint64_t initial_objsize = sm->sm_phys->smp_length; uint64_t estimated_growth = space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID); uint64_t estimated_final_objsize = initial_objsize + estimated_growth; #endif /* * Find the offset right after the last word in the space map * and use that to get a hold of the last block, so we can * start appending to it. */ - uint64_t next_word_offset = sm->sm_phys->smp_objsize; + uint64_t next_word_offset = sm->sm_phys->smp_length; VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), next_word_offset, FTAG, &db, DMU_READ_PREFETCH)); ASSERT3U(db->db_size, ==, sm->sm_blksz); dmu_buf_will_dirty(db, tx); avl_tree_t *t = &rt->rt_root; for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift; uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift; uint8_t words = 1; /* * We only write two-word entries when both of the following * are true: * * [1] The feature is enabled. * [2] The offset or run is too big for a single-word entry, * or the vdev_id is set (meaning not equal to * SM_NO_VDEVID). * * Note that for purposes of testing we've added the case that * we write two-word entries occasionally when the feature is * enabled and zfs_force_some_double_word_sm_entries has been * set. */ if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) && (offset >= (1ULL << SM_OFFSET_BITS) || length > SM_RUN_MAX || vdev_id != SM_NO_VDEVID || (zfs_force_some_double_word_sm_entries && spa_get_random(100) == 0))) words = 2; space_map_write_seg(sm, rs, maptype, vdev_id, words, &db, FTAG, tx); } dmu_buf_rele(db, FTAG); #ifdef DEBUG /* * We expect our estimation to be based on the worst case * scenario [see comment in space_map_estimate_optimal_size()]. * Therefore we expect the actual objsize to be equal or less * than whatever we estimated it to be. */ - ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize); + ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length); #endif } /* * Note: This function manipulates the state of the given space map but * does not hold any locks implicitly. Thus the caller is responsible * for synchronizing writes to the space map. */ void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, uint64_t vdev_id, dmu_tx_t *tx) { objset_t *os = sm->sm_os; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); VERIFY3U(space_map_object(sm), !=, 0); dmu_buf_will_dirty(sm->sm_dbuf, tx); /* * This field is no longer necessary since the in-core space map * now contains the object number but is maintained for backwards * compatibility. */ sm->sm_phys->smp_object = sm->sm_object; if (range_tree_is_empty(rt)) { VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object); return; } if (maptype == SM_ALLOC) sm->sm_phys->smp_alloc += range_tree_space(rt); else sm->sm_phys->smp_alloc -= range_tree_space(rt); uint64_t nodes = avl_numnodes(&rt->rt_root); uint64_t rt_space = range_tree_space(rt); space_map_write_impl(sm, rt, maptype, vdev_id, tx); /* * Ensure that the space_map's accounting wasn't changed * while we were in the middle of writing it out. */ VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); VERIFY3U(range_tree_space(rt), ==, rt_space); } static int space_map_open_impl(space_map_t *sm) { int error; u_longlong_t blocks; error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf); if (error) return (error); dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks); sm->sm_phys = sm->sm_dbuf->db_data; return (0); } int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, uint64_t start, uint64_t size, uint8_t shift) { space_map_t *sm; int error; ASSERT(*smp == NULL); ASSERT(os != NULL); ASSERT(object != 0); sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP); sm->sm_start = start; sm->sm_size = size; sm->sm_shift = shift; sm->sm_os = os; sm->sm_object = object; error = space_map_open_impl(sm); if (error != 0) { space_map_close(sm); return (error); } *smp = sm; return (0); } void space_map_close(space_map_t *sm) { if (sm == NULL) return; if (sm->sm_dbuf != NULL) dmu_buf_rele(sm->sm_dbuf, sm); sm->sm_dbuf = NULL; sm->sm_phys = NULL; kmem_free(sm, sizeof (*sm)); } void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) { objset_t *os = sm->sm_os; spa_t *spa = dmu_objset_spa(os); dmu_object_info_t doi; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(dmu_tx_is_syncing(tx)); VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa)); dmu_object_info_from_db(sm->sm_dbuf, &doi); /* * If the space map has the wrong bonus size (because * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or * the wrong block size (because space_map_blksz has changed), * free and re-allocate its object with the updated sizes. * * Otherwise, just truncate the current object. */ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && doi.doi_bonus_size != sizeof (space_map_phys_t)) || doi.doi_data_block_size != blocksize || doi.doi_metadata_block_size != 1 << space_map_ibs) { zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating " "object[%llu]: old bonus %u, old blocksz %u", dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, doi.doi_bonus_size, doi.doi_data_block_size); space_map_free(sm, tx); dmu_buf_rele(sm->sm_dbuf, sm); sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx); VERIFY0(space_map_open_impl(sm)); } else { VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx)); /* * If the spacemap is reallocated, its histogram * will be reset. Do the same in the common case so that * bugs related to the uncommon case do not go unnoticed. */ bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram)); } dmu_buf_will_dirty(sm->sm_dbuf, tx); - sm->sm_phys->smp_objsize = 0; + sm->sm_phys->smp_length = 0; sm->sm_phys->smp_alloc = 0; } -/* - * Update the in-core space_map allocation and length values. - */ -void -space_map_update(space_map_t *sm) -{ - if (sm == NULL) - return; - - sm->sm_alloc = sm->sm_phys->smp_alloc; - sm->sm_length = sm->sm_phys->smp_objsize; -} - uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(os); uint64_t object; int bonuslen; if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); bonuslen = sizeof (space_map_phys_t); ASSERT3U(bonuslen, <=, dmu_bonus_max()); } else { bonuslen = SPACE_MAP_SIZE_V0; } object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize, space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); return (object); } void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(os); if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { dmu_object_info_t doi; VERIFY0(dmu_object_info(os, smobj, &doi)); if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) { spa_feature_decr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); } } VERIFY0(dmu_object_free(os, smobj, tx)); } void space_map_free(space_map_t *sm, dmu_tx_t *tx) { if (sm == NULL) return; space_map_free_obj(sm->sm_os, space_map_object(sm), tx); sm->sm_object = 0; } /* * Given a range tree, it makes a worst-case estimate of how much * space would the tree's segments take if they were written to * the given space map. */ uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, uint64_t vdev_id) { spa_t *spa = dmu_objset_spa(sm->sm_os); uint64_t shift = sm->sm_shift; uint64_t *histogram = rt->rt_histogram; uint64_t entries_for_seg = 0; /* * In order to get a quick estimate of the optimal size that this * range tree would have on-disk as a space map, we iterate through * its histogram buckets instead of iterating through its nodes. * * Note that this is a highest-bound/worst-case estimate for the * following reasons: * * 1] We assume that we always add a debug padding for each block * we write and we also assume that we start at the last word * of a block attempting to write a two-word entry. * 2] Rounding up errors due to the way segments are distributed * in the buckets of the range tree's histogram. * 3] The activation of zfs_force_some_double_word_sm_entries * (tunable) when testing. * * = Math and Rounding Errors = * * rt_histogram[i] bucket of a range tree represents the number * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given * that, we want to divide the buckets into groups: Buckets that * can be represented using a single-word entry, ones that can * be represented with a double-word entry, and ones that can * only be represented with multiple two-word entries. * * [Note that if the new encoding feature is not enabled there * are only two groups: single-word entry buckets and multiple * single-word entry buckets. The information below assumes * two-word entries enabled, but it can easily applied when * the feature is not enabled] * * To find the highest bucket that can be represented with a * single-word entry we look at the maximum run that such entry * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that * the run of a space map entry is shifted by sm_shift, thus we * add it to the exponent]. This way, excluding the value of the * maximum run that can be represented by a single-word entry, * all runs that are smaller exist in buckets 0 to * SM_RUN_BITS + shift - 1. * * To find the highest bucket that can be represented with a * double-word entry, we follow the same approach. Finally, any * bucket higher than that are represented with multiple two-word * entries. To be more specific, if the highest bucket whose * segments can be represented with a single two-word entry is X, * then bucket X+1 will need 2 two-word entries for each of its * segments, X+2 will need 4, X+3 will need 8, ...etc. * * With all of the above we make our estimation based on bucket * groups. There is a rounding error though. As we mentioned in * the example with the one-word entry, the maximum run that can * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of * that length fall into the next bucket (and bucket group) where * we start counting two-word entries and this is one more reason * why the estimated size may end up being bigger than the actual * size written. */ uint64_t size = 0; uint64_t idx = 0; if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) || (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) { /* * If we are trying to force some double word entries just * assume the worst-case of every single word entry being * written as a double word entry. */ uint64_t entry_size = (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) && zfs_force_some_double_word_sm_entries) ? (2 * sizeof (uint64_t)) : sizeof (uint64_t); uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1; for (; idx <= single_entry_max_bucket; idx++) size += histogram[idx] * entry_size; if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) { for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { ASSERT3U(idx, >=, single_entry_max_bucket); entries_for_seg = 1ULL << (idx - single_entry_max_bucket); size += histogram[idx] * entries_for_seg * entry_size; } return (size); } } ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)); uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1; for (; idx <= double_entry_max_bucket; idx++) size += histogram[idx] * 2 * sizeof (uint64_t); for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { ASSERT3U(idx, >=, double_entry_max_bucket); entries_for_seg = 1ULL << (idx - double_entry_max_bucket); size += histogram[idx] * entries_for_seg * 2 * sizeof (uint64_t); } /* * Assume the worst case where we start with the padding at the end * of the current block and we add an extra padding entry at the end * of all subsequent blocks. */ size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t); return (size); } uint64_t space_map_object(space_map_t *sm) { return (sm != NULL ? sm->sm_object : 0); } -/* - * Returns the already synced, on-disk allocated space. - */ -uint64_t +int64_t space_map_allocated(space_map_t *sm) { - return (sm != NULL ? sm->sm_alloc : 0); + return (sm != NULL ? sm->sm_phys->smp_alloc : 0); } -/* - * Returns the already synced, on-disk length; - */ uint64_t space_map_length(space_map_t *sm) { - return (sm != NULL ? sm->sm_length : 0); -} - -/* - * Returns the allocated space that is currently syncing. - */ -int64_t -space_map_alloc_delta(space_map_t *sm) -{ - if (sm == NULL) - return (0); - ASSERT(sm->sm_dbuf != NULL); - return (sm->sm_phys->smp_alloc - space_map_allocated(sm)); + return (sm != NULL ? sm->sm_phys->smp_length : 0); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h (revision 354383) @@ -1,123 +1,126 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #ifndef _SYS_METASLAB_H #define _SYS_METASLAB_H #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct metaslab_ops { uint64_t (*msop_alloc)(metaslab_t *, uint64_t); } metaslab_ops_t; extern metaslab_ops_t *zfs_metaslab_ops; int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, metaslab_t **); void metaslab_fini(metaslab_t *); int metaslab_load(metaslab_t *); void metaslab_unload(metaslab_t *); +uint64_t metaslab_allocated_space(metaslab_t *); + void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); uint64_t metaslab_block_maxsize(metaslab_t *); /* * metaslab alloc flags */ #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 #define METASLAB_GANG_HEADER 0x2 #define METASLAB_GANG_CHILD 0x4 #define METASLAB_ASYNC_ALLOC 0x8 #define METASLAB_DONT_THROTTLE 0x10 #define METASLAB_MUST_RESERVE 0x20 #define METASLAB_FASTWRITE 0x40 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, int); int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *); void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_alloc_trace_init(void); void metaslab_alloc_trace_fini(void); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *); void metaslab_class_destroy(metaslab_class_t *); int metaslab_class_validate(metaslab_class_t *); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, zio_t *, int); void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); uint64_t metaslab_class_get_deferred(metaslab_class_t *); metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); void metaslab_group_passivate(metaslab_group_t *); boolean_t metaslab_group_initialized(metaslab_group_t *); uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, boolean_t); void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int); +void metaslab_recalculate_weight_and_sort(metaslab_t *); #ifdef __cplusplus } #endif #endif /* _SYS_METASLAB_H */ Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h (revision 354383) @@ -1,421 +1,500 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2011, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H #define _SYS_METASLAB_IMPL_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Metaslab allocation tracing record. */ typedef struct metaslab_alloc_trace { list_node_t mat_list_node; metaslab_group_t *mat_mg; metaslab_t *mat_msp; uint64_t mat_size; uint64_t mat_weight; uint32_t mat_dva_id; uint64_t mat_offset; int mat_allocator; } metaslab_alloc_trace_t; /* * Used by the metaslab allocation tracing facility to indicate * error conditions. These errors are stored to the offset member * of the metaslab_alloc_trace_t record and displayed by mdb. */ typedef enum trace_alloc_type { TRACE_ALLOC_FAILURE = -1ULL, TRACE_TOO_SMALL = -2ULL, TRACE_FORCE_GANG = -3ULL, TRACE_NOT_ALLOCATABLE = -4ULL, TRACE_GROUP_FAILURE = -5ULL, TRACE_ENOSPC = -6ULL, TRACE_CONDENSING = -7ULL, TRACE_VDEV_ERROR = -8ULL, TRACE_INITIALIZING = -9ULL } trace_alloc_type_t; #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_WEIGHT_CLAIM (1ULL << 61) #define METASLAB_WEIGHT_TYPE (1ULL << 60) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ METASLAB_WEIGHT_CLAIM) /* * The metaslab weight is used to encode the amount of free space in a * metaslab, such that the "best" metaslab appears first when sorting the * metaslabs by weight. The weight (and therefore the "best" metaslab) can * be determined in two different ways: by computing a weighted sum of all * the free space in the metaslab (a space based weight) or by counting only * the free segments of the largest size (a segment based weight). We prefer * the segment based weight because it reflects how the free space is * comprised, but we cannot always use it -- legacy pools do not have the * space map histogram information necessary to determine the largest * contiguous regions. Pools that have the space map histogram determine * the segment weight by looking at each bucket in the histogram and * determining the free space whose size in bytes is in the range: * [2^i, 2^(i+1)) * We then encode the largest index, i, that contains regions into the * segment-weighted value. * * Space-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * |PSC1| weighted-free space | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation * C - indicates activation for claimed block zio * space - the fragmentation-weighted space * * Segment-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * |PSC0| idx| count of segments in region | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation * C - indicates activation for claimed block zio * idx - index for the highest bucket in the histogram * count - number of segments in the specified bucket */ #define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) #define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) #define WEIGHT_IS_SPACEBASED(weight) \ ((weight) == 0 || BF64_GET((weight), 60, 1)) #define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) /* * These macros are only applicable to segment-based weighting. */ #define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) #define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) #define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) #define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) /* * A metaslab class encompasses a category of allocatable top-level vdevs. * Each top-level vdev is associated with a metaslab group which defines * the allocatable region for that vdev. Examples of these categories include * "normal" for data block allocations (i.e. main pool allocations) or "log" * for allocations designated for intent log devices (i.e. slog devices). * When a block allocation is requested from the SPA it is associated with a * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging * to the class can be used to satisfy that request. Allocations are done * by traversing the metaslab groups that are linked off of the mc_rotor field. * This rotor points to the next metaslab group where allocations will be * attempted. Allocating a block is a 3 step process -- select the metaslab * group, select the metaslab, and then allocate the block. The metaslab * class defines the low-level block allocator that will be used as the * final step in allocation. These allocators are pluggable allowing each class * to use a block allocator that best suits that class. */ struct metaslab_class { kmutex_t mc_lock; spa_t *mc_spa; metaslab_group_t *mc_rotor; metaslab_ops_t *mc_ops; uint64_t mc_aliquot; /* * Track the number of metaslab groups that have been initialized * and can accept allocations. An initialized metaslab group is * one has been completely added to the config (i.e. we have * updated the MOS config and the space has been added to the pool). */ uint64_t mc_groups; /* * Toggle to enable/disable the allocation throttle. */ boolean_t mc_alloc_throttle_enabled; /* * The allocation throttle works on a reservation system. Whenever * an asynchronous zio wants to perform an allocation it must * first reserve the number of blocks that it wants to allocate. * If there aren't sufficient slots available for the pending zio * then that I/O is throttled until more slots free up. The current * number of reserved allocations is maintained by the mc_alloc_slots * refcount. The mc_alloc_max_slots value determines the maximum * number of allocations that the system allows. Gang blocks are * allowed to reserve slots even if we've reached the maximum * number of allocations allowed. */ uint64_t *mc_alloc_max_slots; zfs_refcount_t *mc_alloc_slots; uint64_t mc_alloc_groups; /* # of allocatable groups */ uint64_t mc_alloc; /* total allocated space */ uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; }; /* * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) * of a top-level vdev. They are linked togther to form a circular linked * list and can belong to only one metaslab class. Metaslab groups may become * ineligible for allocations for a number of reasons such as limited free * space, fragmentation, or going offline. When this happens the allocator will * simply find the next metaslab group in the linked list and attempt * to allocate from that group instead. */ struct metaslab_group { kmutex_t mg_lock; metaslab_t **mg_primaries; metaslab_t **mg_secondaries; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; boolean_t mg_allocatable; /* can we allocate? */ uint64_t mg_ms_ready; /* * A metaslab group is considered to be initialized only after * we have updated the MOS config and added the space to the pool. * We only allow allocation attempts to a metaslab group if it * has been initialized. */ boolean_t mg_initialized; uint64_t mg_free_capacity; /* percentage free */ int64_t mg_bias; int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; taskq_t *mg_taskq; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; /* * In order for the allocation throttle to function properly, we cannot * have too many IOs going to each disk by default; the throttle * operates by allocating more work to disks that finish quickly, so * allocating larger chunks to each disk reduces its effectiveness. * However, if the number of IOs going to each allocator is too small, * we will not perform proper aggregation at the vdev_queue layer, * also resulting in decreased performance. Therefore, we will use a * ramp-up strategy. * * Each allocator in each metaslab group has a current queue depth * (mg_alloc_queue_depth[allocator]) and a current max queue depth * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group * has an absolute max queue depth (mg_max_alloc_queue_depth). We * add IOs to an allocator until the mg_alloc_queue_depth for that * allocator hits the cur_max. Every time an IO completes for a given * allocator on a given metaslab group, we increment its cur_max until * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to * help protect against disks that decrease in performance over time. * * It's possible for an allocator to handle more allocations than * its max. This can occur when gang blocks are required or when other * groups are unable to handle their share of allocations. */ uint64_t mg_max_alloc_queue_depth; uint64_t *mg_cur_max_alloc_queue_depth; zfs_refcount_t *mg_alloc_queue_depth; int mg_allocators; /* * A metalab group that can no longer allocate the minimum block * size will set mg_no_free_space. Once a metaslab group is out * of space then its share of work must be distributed to other * groups. */ boolean_t mg_no_free_space; uint64_t mg_allocations; uint64_t mg_failed_allocations; uint64_t mg_fragmentation; uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; int mg_ms_initializing; boolean_t mg_initialize_updating; kmutex_t mg_ms_initialize_lock; kcondvar_t mg_ms_initialize_cv; }; /* * This value defines the number of elements in the ms_lbas array. The value * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. * This is the equivalent of highbit(UINT64_MAX). */ #define MAX_LBAS 64 /* * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. As blocks are * allocated, the allocated segment are removed from the ms_allocatable and * added to a per txg allocation tree (ms_allocating). As blocks are * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context * where it is safe to update the on-disk space maps. An additional set * of in-core trees is maintained to track deferred frees * (ms_defer). Once a block is freed it will move from the * ms_freed to the ms_defer tree. A deferred free means that a block * has been freed but cannot be used by the pool until TXG_DEFER_SIZE * transactions groups later. For example, a block that is freed in txg * 50 will not be available for reallocation until txg 52 (50 + * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback. * A pool could be safely rolled back TXG_DEFERS_SIZE transactions * groups and ensure that no block has been reallocated. * * The simplified transition diagram looks like this: * * * ALLOCATE * | * V * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map) * ^ * | ms_freeing <--- FREE * | | * | v * | ms_freed * | | * +-------- ms_defer[2] <-------+-------> (write to space map) * * * Each metaslab's space is tracked in a single space map in the MOS, * which is only updated in syncing context. Each time we sync a txg, * we append the allocs and frees from that txg to the space map. The * pool space is only updated once all metaslabs have finished syncing. * * To load the in-core free tree we read the space map from disk. This * object contains a series of alloc and free records that are combined * to make up the list of all free segments in this metaslab. These * segments are represented in-core by the ms_allocatable and are stored * in an AVL tree. * * As the space map grows (as a result of the appends) it will * eventually become space-inefficient. When the metaslab's in-core * free tree is zfs_condense_pct/100 times the size of the minimal * on-disk representation, we rewrite it in its minimized form. If a * metaslab needs to condense then we must set the ms_condensing flag to * ensure that allocations are not performed on the metaslab that is * being written. */ struct metaslab { + /* + * This is the main lock of the metaslab and its purpose is to + * coordinate our allocations and frees [e.g metaslab_block_alloc(), + * metaslab_free_concrete(), ..etc] with our various syncing + * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. + * + * The lock is also used during some miscellaneous operations like + * using the metaslab's histogram for the metaslab group's histogram + * aggregation, or marking the metaslab for initialization. + */ kmutex_t ms_lock; + + /* + * Acquired together with the ms_lock whenever we expect to + * write to metaslab data on-disk (i.e flushing entries to + * the metaslab's space map). It helps coordinate readers of + * the metaslab's space map [see spa_vdev_remove_thread()] + * with writers [see metaslab_sync()]. + * + * Note that metaslab_load(), even though a reader, uses + * a completely different mechanism to deal with the reading + * of the metaslab's space map based on ms_synced_length. That + * said, the function still uses the ms_sync_lock after it + * has read the ms_sm [see relevant comment in metaslab_load() + * as to why]. + */ kmutex_t ms_sync_lock; + kcondvar_t ms_load_cv; space_map_t *ms_sm; uint64_t ms_id; uint64_t ms_start; uint64_t ms_size; uint64_t ms_fragmentation; range_tree_t *ms_allocating[TXG_SIZE]; range_tree_t *ms_allocatable; + uint64_t ms_allocated_this_txg; /* * The following range trees are accessed only from syncing context. * ms_free*tree only have entries while syncing, and are empty * between syncs. */ range_tree_t *ms_freeing; /* to free this syncing txg */ range_tree_t *ms_freed; /* already freed this syncing txg */ range_tree_t *ms_defer[TXG_DEFER_SIZE]; range_tree_t *ms_checkpointing; /* to add to the checkpoint */ boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; uint64_t ms_condense_checked_txg; uint64_t ms_initializing; /* leaves initializing this ms */ /* * We must always hold the ms_lock when modifying ms_loaded * and ms_loading. */ boolean_t ms_loaded; boolean_t ms_loading; + /* + * The following histograms count entries that are in the + * metaslab's space map (and its histogram) but are not in + * ms_allocatable yet, because they are in ms_freed, ms_freeing, + * or ms_defer[]. + * + * When the metaslab is not loaded, its ms_weight needs to + * reflect what is allocatable (i.e. what will be part of + * ms_allocatable if it is loaded). The weight is computed from + * the spacemap histogram, but that includes ranges that are + * not yet allocatable (because they are in ms_freed, + * ms_freeing, or ms_defer[]). Therefore, when calculating the + * weight, we need to remove those ranges. + * + * The ranges in the ms_freed and ms_defer[] range trees are all + * present in the spacemap. However, the spacemap may have + * multiple entries to represent a contiguous range, because it + * is written across multiple sync passes, but the changes of + * all sync passes are consolidated into the range trees. + * Adjacent ranges that are freed in different sync passes of + * one txg will be represented separately (as 2 or more entries) + * in the space map (and its histogram), but these adjacent + * ranges will be consolidated (represented as one entry) in the + * ms_freed/ms_defer[] range trees (and their histograms). + * + * When calculating the weight, we can not simply subtract the + * range trees' histograms from the spacemap's histogram, + * because the range trees' histograms may have entries in + * higher buckets than the spacemap, due to consolidation. + * Instead we must subtract the exact entries that were added to + * the spacemap's histogram. ms_synchist and ms_deferhist[] + * represent these exact entries, so we can subtract them from + * the spacemap's histogram when calculating ms_weight. + * + * ms_synchist represents the same ranges as ms_freeing + + * ms_freed, but without consolidation across sync passes. + * + * ms_deferhist[i] represents the same ranges as ms_defer[i], + * but without consolidation across sync passes. + */ + uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE]; + uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE]; + + /* + * Tracks the exact amount of allocated space of this metaslab + * (and specifically the metaslab's space map) up to the most + * recently completed sync pass [see usage in metaslab_sync()]. + */ + uint64_t ms_allocated_space; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ uint64_t ms_activation_weight; /* activation weight */ /* * Track of whenever a metaslab is selected for loading or allocation. * We use this value to determine how long the metaslab should * stay cached. */ uint64_t ms_selected_txg; uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ /* * -1 if it's not active in an allocator, otherwise set to the allocator * this metaslab is active for. */ int ms_allocator; boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ /* * The metaslab block allocators can optionally use a size-ordered * range tree and/or an array of LBAs. Not all allocators use * this functionality. The ms_allocatable_by_size should always * contain the same number of segments as the ms_allocatable. The * only difference is that the ms_allocatable_by_size is ordered by * segment sizes. */ avl_tree_t ms_allocatable_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ + + /* updated every time we are done syncing the metaslab's space map */ + uint64_t ms_synced_length; boolean_t ms_new; }; #ifdef __cplusplus } #endif #endif /* _SYS_METASLAB_IMPL_H */ Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/range_tree.h =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/range_tree.h (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/range_tree.h (revision 354383) @@ -1,104 +1,105 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_RANGE_TREE_H #define _SYS_RANGE_TREE_H #include #include #ifdef __cplusplus extern "C" { #endif #define RANGE_TREE_HISTOGRAM_SIZE 64 typedef struct range_tree_ops range_tree_ops_t; /* * Note: the range_tree may not be accessed concurrently; consumers * must provide external locking if required. */ typedef struct range_tree { avl_tree_t rt_root; /* offset-ordered segment AVL tree */ uint64_t rt_space; /* sum of all segments in the map */ range_tree_ops_t *rt_ops; void *rt_arg; /* * The rt_histogram maintains a histogram of ranges. Each bucket, * rt_histogram[i], contains the number of ranges whose size is: * 2^i <= size of range in bytes < 2^(i+1) */ uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; } range_tree_t; typedef struct range_seg { avl_node_t rs_node; /* AVL node */ avl_node_t rs_pp_node; /* AVL picker-private node */ uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ } range_seg_t; struct range_tree_ops { void (*rtop_create)(range_tree_t *rt, void *arg); void (*rtop_destroy)(range_tree_t *rt, void *arg); void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg); void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg); void (*rtop_vacate)(range_tree_t *rt, void *arg); }; typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); void range_tree_init(void); void range_tree_fini(void); range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +void range_tree_verify_not_present(range_tree_t *rt, + uint64_t start, uint64_t size); uint64_t range_tree_space(range_tree_t *rt); boolean_t range_tree_is_empty(range_tree_t *rt); -void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); void range_tree_stat_verify(range_tree_t *rt); uint64_t range_tree_min(range_tree_t *rt); uint64_t range_tree_max(range_tree_t *rt); uint64_t range_tree_span(range_tree_t *rt); void range_tree_add(void *arg, uint64_t start, uint64_t size); void range_tree_remove(void *arg, uint64_t start, uint64_t size); void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg); #ifdef __cplusplus } #endif #endif /* _SYS_RANGE_TREE_H */ Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h (revision 354383) @@ -1,225 +1,230 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_SPACE_MAP_H #define _SYS_SPACE_MAP_H #include #include #include #ifdef __cplusplus extern "C" { #endif /* * The size of the space map object has increased to include a histogram. * The SPACE_MAP_SIZE_V0 designates the original size and is used to * maintain backward compatibility. */ #define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t)) #define SPACE_MAP_HISTOGRAM_SIZE 32 /* * The space_map_phys is the on-disk representation of the space map. * Consumers of space maps should never reference any of the members of this * structure directly. These members may only be updated in syncing context. * * Note the smp_object is no longer used but remains in the structure * for backward compatibility. */ typedef struct space_map_phys { - uint64_t smp_object; /* on-disk space map object */ - uint64_t smp_objsize; /* size of the object */ - int64_t smp_alloc; /* space allocated from the map */ - uint64_t smp_pad[5]; /* reserved */ + /* object number: not needed but kept for backwards compatibility */ + uint64_t smp_object; + /* length of the object in bytes */ + uint64_t smp_length; + + /* space allocated from the map */ + int64_t smp_alloc; + + /* reserved */ + uint64_t smp_pad[5]; + /* * The smp_histogram maintains a histogram of free regions. Each * bucket, smp_histogram[i], contains the number of free regions * whose size is: * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) */ uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE]; } space_map_phys_t; /* * The space map object defines a region of space, its size, how much is * allocated, and the on-disk object that stores this information. * Consumers of space maps may only access the members of this structure. * * Note: the space_map may not be accessed concurrently; consumers * must provide external locking if required. */ typedef struct space_map { uint64_t sm_start; /* start of map */ uint64_t sm_size; /* size of map */ uint8_t sm_shift; /* unit shift */ - uint64_t sm_length; /* synced length */ - int64_t sm_alloc; /* synced space allocated */ objset_t *sm_os; /* objset for this map */ uint64_t sm_object; /* object id for this map */ uint32_t sm_blksz; /* block size for space map */ dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */ space_map_phys_t *sm_phys; /* on-disk space map */ } space_map_t; /* * debug entry * * 2 2 10 50 * +-----+-----+------------+----------------------------------+ * | 1 0 | act | syncpass | txg (lower bits) | * +-----+-----+------------+----------------------------------+ * 63 62 61 60 59 50 49 0 * * * one-word entry * * 1 47 1 15 * +-----------------------------------------------------------+ * | 0 | offset (sm_shift units) | type | run | * +-----------------------------------------------------------+ * 63 62 16 15 14 0 * * * two-word entry * * 2 2 36 24 * +-----+-----+---------------------------+-------------------+ * | 1 1 | pad | run | vdev | * +-----+-----+---------------------------+-------------------+ * 63 62 61 60 59 24 23 0 * * 1 63 * +------+----------------------------------------------------+ * | type | offset | * +------+----------------------------------------------------+ * 63 62 0 * * Note that a two-word entry will not strandle a block boundary. * If necessary, the last word of a block will be padded with a * debug entry (with act = syncpass = txg = 0). */ typedef enum { SM_ALLOC, SM_FREE } maptype_t; typedef struct space_map_entry { maptype_t sme_type; uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ uint64_t sme_run; /* max is 2^36; units of sm_shift */ } space_map_entry_t; #define SM_NO_VDEVID (1 << SPA_VDEVBITS) /* one-word entry constants */ #define SM_DEBUG_PREFIX 2 #define SM_OFFSET_BITS 47 #define SM_RUN_BITS 15 /* two-word entry constants */ #define SM2_PREFIX 3 #define SM2_OFFSET_BITS 63 #define SM2_RUN_BITS 36 #define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2) #define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2) #define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2) #define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2) #define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) #define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) #define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) #define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) #define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS) #define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS) #define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) #define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) #define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1) #define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS) #define SM_RUN_MAX SM_RUN_DECODE(~0ULL) #define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL) #define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1) #define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS) #define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS) #define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS) #define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1) #define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1) #define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS) #define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS) #define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL) #define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL) boolean_t sm_entry_is_debug(uint64_t e); boolean_t sm_entry_is_single_word(uint64_t e); boolean_t sm_entry_is_double_word(uint64_t e); typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); -int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg); +int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t length); +int space_map_iterate(space_map_t *sm, uint64_t length, + sm_cb_t callback, void *arg); int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, dmu_tx_t *tx); +boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt); void space_map_histogram_clear(space_map_t *sm); void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx); -void space_map_update(space_map_t *sm); - uint64_t space_map_object(space_map_t *sm); -uint64_t space_map_allocated(space_map_t *sm); +int64_t space_map_allocated(space_map_t *sm); uint64_t space_map_length(space_map_t *sm); void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, uint64_t vdev_id, dmu_tx_t *tx); uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, uint64_t vdev_id); void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx); uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx); void space_map_free(space_map_t *sm, dmu_tx_t *tx); void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx); int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, uint64_t start, uint64_t size, uint8_t shift); void space_map_close(space_map_t *sm); - -int64_t space_map_alloc_delta(space_map_t *sm); #ifdef __cplusplus } #endif #endif /* _SYS_SPACE_MAP_H */ Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h (revision 354383) @@ -1,524 +1,513 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #ifndef _SYS_VDEV_IMPL_H #define _SYS_VDEV_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Virtual device descriptors. * * All storage pool operations go through the virtual device framework, * which provides data replication and I/O scheduling. */ /* * Forward declarations that lots of things need. */ typedef struct vdev_queue vdev_queue_t; typedef struct vdev_cache vdev_cache_t; typedef struct vdev_cache_entry vdev_cache_entry_t; struct abd; extern int zfs_vdev_queue_depth_pct; extern int zfs_vdev_def_queue_depth; extern uint32_t zfs_vdev_async_write_max_active; /* * Virtual device operations */ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg); typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, vdev_remap_cb_t callback, void *arg); /* * Given a target vdev, translates the logical range "in" to the physical * range "res" */ typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, range_seg_t *res); typedef struct vdev_ops { vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; vdev_asize_func_t *vdev_op_asize; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; /* * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. * Used when initializing vdevs. Isn't used by leaf ops. */ vdev_xlation_func_t *vdev_op_xlate; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; /* * Virtual device properties */ struct vdev_cache_entry { struct abd *ve_abd; uint64_t ve_offset; uint64_t ve_lastused; avl_node_t ve_offset_node; avl_node_t ve_lastused_node; uint32_t ve_hits; uint16_t ve_missed_update; zio_t *ve_fill_io; }; struct vdev_cache { avl_tree_t vc_offset_tree; avl_tree_t vc_lastused_tree; kmutex_t vc_lock; }; typedef struct vdev_queue_class { uint32_t vqc_active; /* * Sorted by offset or timestamp, depending on if the queue is * LBA-ordered vs FIFO. */ avl_tree_t vqc_queued_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; avl_tree_t vq_active_tree; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; hrtime_t vq_io_complete_ts; /* time last i/o completed */ kmutex_t vq_lock; }; typedef enum vdev_alloc_bias { VDEV_BIAS_NONE, VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ } vdev_alloc_bias_t; /* * On-disk indirect vdev state. * * An indirect vdev is described exclusively in the MOS config of a pool. * The config for an indirect vdev includes several fields, which are * accessed in memory by a vdev_indirect_config_t. */ typedef struct vdev_indirect_config { /* * Object (in MOS) which contains the indirect mapping. This object * contains an array of vdev_indirect_mapping_entry_phys_t ordered by * vimep_src. The bonus buffer for this object is a * vdev_indirect_mapping_phys_t. This object is allocated when a vdev * removal is initiated. * * Note that this object can be empty if none of the data on the vdev * has been copied yet. */ uint64_t vic_mapping_object; /* * Object (in MOS) which contains the birth times for the mapping * entries. This object contains an array of * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus * buffer for this object is a vdev_indirect_birth_phys_t. This object * is allocated when a vdev removal is initiated. * * Note that this object can be empty if none of the vdev has yet been * copied. */ uint64_t vic_births_object; /* * This is the vdev ID which was removed previous to this vdev, or * UINT64_MAX if there are no previously removed vdevs. */ uint64_t vic_prev_indirect_vdev; } vdev_indirect_config_t; /* * Virtual device descriptor */ struct vdev { /* * Common to all vdev types. */ uint64_t vdev_id; /* child number in vdev parent */ uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ uint64_t vdev_orig_guid; /* orig. guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_max_asize; /* max acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ vdev_ops_t *vdev_ops; /* vdev operations */ spa_t *vdev_spa; /* spa for this vdev */ void *vdev_tsd; /* type-specific data */ vnode_t *vdev_name_vp; /* vnode for pathname */ vnode_t *vdev_devid_vp; /* vnode for devid */ vdev_t *vdev_top; /* top-level vdev */ vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ vdev_stat_t vdev_stat; /* virtual device statistics */ boolean_t vdev_expanding; /* expand the vdev? */ boolean_t vdev_reopening; /* reopen in progress? */ int vdev_open_error; /* error on last open */ kthread_t *vdev_open_thread; /* thread opening children */ uint64_t vdev_crtxg; /* txg when top-level was added */ /* * Top-level vdev state. */ uint64_t vdev_ms_array; /* metaslab array object */ uint64_t vdev_ms_shift; /* metaslab size shift */ uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_probe_wanted; /* async probe wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_removing; /* device is being removed? */ boolean_t vdev_ishole; /* is a hole in the namespace */ - kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ boolean_t vdev_initialize_exit_wanted; vdev_initializing_state_t vdev_initialize_state; kthread_t *vdev_initialize_thread; /* Protects vdev_initialize_thread and vdev_initialize_state. */ kmutex_t vdev_initialize_lock; kcondvar_t vdev_initialize_cv; uint64_t vdev_initialize_offset[TXG_SIZE]; uint64_t vdev_initialize_last_offset; range_tree_t *vdev_initialize_tree; /* valid while initializing */ uint64_t vdev_initialize_bytes_est; uint64_t vdev_initialize_bytes_done; time_t vdev_initialize_action_time; /* start and end time */ /* for limiting outstanding I/Os */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; /* * Values stored in the config for an indirect or removing vdev. */ vdev_indirect_config_t vdev_indirect_config; /* * The vdev_indirect_rwlock protects the vdev_indirect_mapping * pointer from changing on indirect vdevs (when it is condensed). * Note that removing (not yet indirect) vdevs have different * access patterns (the mapping is not accessed from open context, * e.g. from zio_read) and locking strategy (e.g. svr_lock). */ krwlock_t vdev_indirect_rwlock; vdev_indirect_mapping_t *vdev_indirect_mapping; vdev_indirect_births_t *vdev_indirect_births; /* * In memory data structures used to manage the obsolete sm, for * indirect or removing vdevs. * * The vdev_obsolete_segments is the in-core record of the segments * that are no longer referenced anywhere in the pool (due to * being freed or remapped and not referenced by any snapshots). * During a sync, segments are added to vdev_obsolete_segments * via vdev_indirect_mark_obsolete(); at the end of each sync * pass, this is appended to vdev_obsolete_sm via * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock * protects against concurrent modifications of vdev_obsolete_segments * from multiple zio threads. */ kmutex_t vdev_obsolete_lock; range_tree_t *vdev_obsolete_segments; space_map_t *vdev_obsolete_sm; - - /* - * The queue depth parameters determine how many async writes are - * still pending (i.e. allocated but not yet issued to disk) per - * top-level (vdev_async_write_queue_depth) and the maximum allowed - * (vdev_max_async_write_queue_depth). These values only apply to - * top-level vdevs. - */ - uint64_t vdev_async_write_queue_depth; - uint64_t vdev_max_async_write_queue_depth; /* * Leaf vdev state. */ range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */ space_map_t *vdev_dtl_sm; /* dirty time log space map */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_dtl_object; /* DTL object */ uint64_t vdev_psize; /* physical device capacity */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ uint64_t vdev_offline; /* persistent offline state */ uint64_t vdev_faulted; /* persistent faulted state */ uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ char *vdev_fru; /* physical FRU location */ uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_unspare; /* unspare when resilvering done */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ boolean_t vdev_splitting; /* split or repair in progress */ boolean_t vdev_delayed_close; /* delayed device close? */ boolean_t vdev_tmpoffline; /* device taken offline temporarily? */ boolean_t vdev_detached; /* device detached? */ boolean_t vdev_cant_read; /* vdev is failing all reads */ boolean_t vdev_cant_write; /* vdev is failing all writes */ boolean_t vdev_isspare; /* was a hot spare */ boolean_t vdev_isl2cache; /* was a l2cache device */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ uint64_t vdev_leaf_zap; hrtime_t vdev_mmp_pending; /* 0 if write finished */ uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ list_node_t vdev_leaf_node; /* leaf vdev list */ /* * For DTrace to work in userland (libzpool) context, these fields must * remain at the end of the structure. DTrace will use the kernel's * CTF definition for 'struct vdev', and since the size of a kmutex_t is * larger in userland, the offsets for the rest of the fields would be * incorrect. */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ kmutex_t vdev_stat_lock; /* vdev_stat */ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ }; #define VDEV_RAIDZ_MAXPARITY 3 #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_pad2) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) /* * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock * ring when MMP is enabled. */ #define MMP_BLOCKS_PER_LABEL 1 /* The largest uberblock we support is 8k. */ #define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ MAX_UBERBLOCK_SHIFT) #define VDEV_UBERBLOCK_COUNT(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_OFFSET(vd, n) \ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) typedef struct vdev_phys { char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; zio_eck_t vp_zbt; } vdev_phys_t; typedef struct vdev_label { char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ /* * vdev_dirty() flags */ #define VDD_METASLAB 0x01 #define VDD_DTL 0x02 /* Offset of embedded boot loader region on each label */ #define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) /* * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. */ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ /* * Size of label regions at the start and end of each leaf device. */ #define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 #define VDEV_BEST_LABEL VDEV_LABELS #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_L2CACHE 3 #define VDEV_ALLOC_ROOTPOOL 4 #define VDEV_ALLOC_SPLIT 5 #define VDEV_ALLOC_ATTACH 6 /* * Allocate or free a vdev */ extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops); extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, vdev_t *parent, uint_t id, int alloctype); extern void vdev_free(vdev_t *vd); /* * Add or remove children and parents */ extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); extern void vdev_compact_children(vdev_t *pvd); extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ extern boolean_t vdev_log_state_valid(vdev_t *vd); extern int vdev_load(vdev_t *vd); extern int vdev_dtl_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg); /* * Available vdev types. */ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *out); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); /* * Global variables */ extern int vdev_standard_sm_blksz; /* zdb uses this tunable, so it must be declared here to make lint happy. */ extern int zfs_vdev_cache_size; /* * Functions from vdev_indirect.c */ extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); extern boolean_t vdev_indirect_should_condense(vdev_t *vd); extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); extern int vdev_obsolete_sm_object(vdev_t *vd); extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd); /* * Other miscellaneous functions */ int vdev_checkpoint_sm_object(vdev_t *vd); /* * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. */ typedef struct vdev_buf { buf_t vb_buf; /* buffer that describes the io */ zio_t *vb_io; /* pointer back to the original zio_t */ } vdev_buf_t; #ifdef __cplusplus } #endif #endif /* _SYS_VDEV_IMPL_H */ Index: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c (revision 354383) @@ -1,4338 +1,4285 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual device management. */ static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* maximum scrub/resilver I/O queue per leaf vdev */ int zfs_scrub_limit = 10; /* default target for number of metaslabs per top-level vdev */ int zfs_vdev_default_ms_count = 200; /* minimum number of metaslabs per top-level vdev */ int zfs_vdev_min_ms_count = 16; /* practical upper limit of total metaslabs per top-level vdev */ int zfs_vdev_ms_count_limit = 1ULL << 17; /* lower limit for metaslab size (512M) */ int zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ int zfs_vdev_max_ms_shift = 34; boolean_t vdev_validate_skip = B_FALSE; /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. */ int vdev_dtl_sm_blksz = (1 << 12); /* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. */ int vdev_standard_sm_blksz = (1 << 17); int zfs_ashift_min; /*PRINTFLIKE2*/ void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); if (vd->vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, **opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Derive the enumerated alloction bias from string input. * String origin is either the per-vdev zap or zpool(1M). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } /* ARGSUSED */ void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) { res->rs_start = in->rs_start; res->rs_end = in->rs_end; } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); /* * The allocatable space for a raidz vdev is N * sizeof(smallest child), * so each child must provide at least 1/Nth of its asize. */ if (pvd->vdev_ops == &vdev_raidz_ops) return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / pvd->vdev_children); return (pvd->vdev_min_asize); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { return (vdev_count_leaves_impl(spa->spa_root_vdev)); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; spa_t *spa = cvd->vdev_spa; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_zalloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { bcopy(pvd->vdev_child, newchild, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); list_link_init(&vd->vdev_leaf_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); vdev_cache_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; vdev_indirect_config_t *vic; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); /* * Set the nparity property for RAID-Z vdevs. */ nparity = -1ULL; if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * device. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) return (SET_ERROR(ENOTSUP)); if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) return (SET_ERROR(ENOTSUP)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ nparity = 1; } } else { nparity = 0; } ASSERT(nparity != -1ULL); /* * If creating a top-level vdev, check for allocation classes input */ if (top_level && alloctype == VDEV_ALLOC_ADD) { char *bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } } vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be * valid in the current context. Local vdevs will * remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); ASSERT(vd->vdev_initialize_thread == NULL); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. */ vdev_queue_fini(vd); vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); - mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_removing); tvd->vdev_removing = svd->vdev_removing; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_removing = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } static void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); /* * The spa ashift values currently only reflect the * general vdev classes. Class destination is late * binding so ashift checking had to wait until now */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; } } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } #ifndef _KERNEL /* * To accomodate zdb_leak_init() fake indirect * metaslabs, we allocate a metaslab group for * indirect vdevs which normally don't have one. */ if (vd->vdev_mg == NULL) { ASSERT0(vdev_is_concrete(vd)); vdev_metaslab_group_create(vd); } #endif error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is being removed we don't activate * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { - uint64_t count = vd->vdev_ms_count; + metaslab_group_t *mg = vd->vdev_mg; + metaslab_group_passivate(mg); - metaslab_group_passivate(vd->vdev_mg); + uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - if (msp != NULL) metaslab_fini(msp); } kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; + + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + ASSERT0(mg->mg_histogram[i]); } ASSERT0(vd->vdev_ms_count); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, 0, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } boolean_t vdev_uses_zvols(vdev_t *vd) { if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } void vdev_open_children(vdev_t *vd) { taskq_t *tq; int children = vd->vdev_children; /* * in order to handle pools on top of zvols, do the opens * in a single thread so that the same thread holds the * spa_namespace_lock */ if (vdev_uses_zvols(vd)) { for (int c = 0; c < children; c++) vd->vdev_child[c]->vdev_open_error = vdev_open(vd->vdev_child[c]); return; } tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); for (int c = 0; c < children; c++) VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], TQ_SLEEP) != NULL); taskq_destroy(tq); } /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the "typical" blocksize. * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, * otherwise it would inconsistently account for existing bp's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, ENXIO); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For testing purposes, a higher ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); vd->vdev_ashift = MAX(zfs_ashift_min, vd->vdev_ashift); } else { /* * Detect if the alignment requirement has increased. * We don't want to make the pool unavailable, just * issue a warning instead. */ if (ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { cmn_err(CE_WARN, "Disk, '%s', has a block alignment that is " "larger than the pool's alignment\n", vd->vdev_path); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the min and max ashift values for normal data devices. * * DJB - TBD these should perhaps be tracked per allocation class * (e.g. spa_min_ashift is used to round up post compression buffers) */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_alloc_bias == VDEV_BIAS_NONE && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; } /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a scrub, * since this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && vdev_resilver_needed(vd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); return (0); } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; if (vdev_validate_skip) return (0); for (uint64_t c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c]) != 0) return (SET_ERROR(EBADF)); /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. */ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path, svd->vdev_path); spa_strfree(dvd->vdev_path); dvd->vdev_path = spa_strdup(svd->vdev_path); } } else if (svd->vdev_path != NULL) { dvd->vdev_path = spa_strdup(svd->vdev_path); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *pvd = vd->vdev_parent; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache && !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : ENXIO); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * Ignore the DTLs and try all devices. This avoids a recursive * mutex enter on the vdev_dtl_lock, and also makes us try hard * when loading the pool (relying on the checksum to ensure that * we get the right data -- note that we while loading, we are * only reading the MOS, which is always checksummed). */ if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_start - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_end); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd) { spa_t *spa = vd->vdev_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_txg == 0 || range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); /* * When a resilver is initiated the scan will assign the scn_max_txg * value to the highest txg value that exists in all DTLs. If this * device's max DTL is not part of this scan (i.e. it is not in * the range (scn_min_txg, scn_max_txg] then it is not eligible * for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); return (B_TRUE); } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; mutex_enter(&vd->vdev_dtl_lock); /* * If we've completed a scan cleanly then determine * if this vdev should remove any DTLs. We only want to * excise regions on vdevs that were available during * the entire duration of this scan. */ if (scrub_txg != 0 && (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) && vdev_dtl_should_excise(vd)) { /* * We completed a scrub up to scrub_txg. If we * did it without rebooting, then the scrub dtl * will be valid, so excise the old region and * fold in the scrub dtl. Otherwise, leave the * dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering and no longer has any * DTLs then reset its resilvering flag. */ if (vd->vdev_resilver_txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) vd->vdev_resilver_txg = 0; mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ else if (vd->vdev_nparity != 0) minref = vd->vdev_nparity + 1; /* RAID-Z */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); mutex_enter(&vd->vdev_dtl_lock); - - /* - * Now that we've opened the space_map we need to update - * the in-core DTL. - */ - space_map_update(vd->vdev_dtl_sm); - error = space_map_load(vd->vdev_dtl_sm, vd->vdev_dtl[DTL_MISSING], SM_ALLOC); mutex_exit(&vd->vdev_dtl_lock); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = range_tree_create(NULL, NULL); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); - - mutex_enter(&vd->vdev_dtl_lock); - space_map_update(vd->vdev_dtl_sm); - mutex_exit(&vd->vdev_dtl_lock); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); if (!required && zio_injection_enabled) required = !!zio_handle_device_injection(vd, NULL, ECHILD); return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. * Returns the spacemap object, or 0 if it wasn't in the ZAP * or the ZAP doesn't exist yet. */ int vdev_checkpoint_sm_object(vdev_t *vd) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { return (0); } uint64_t sm_obj = 0; int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj); ASSERT(err == 0 || err == ENOENT); return (sm_obj); } int vdev_load(vdev_t *vd) { int error = 0; /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { error = vdev_load(vd->vdev_child[c]); if (error != 0) { return (error); } } vdev_set_deflate_ratio(vd); /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str) == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); - } else if ((error = vdev_metaslab_init(vd, 0)) != 0) { + } + + error = vdev_metaslab_init(vd, 0); + if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd); if (checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); - if ((error = space_map_open(&vd->vdev_checkpoint_sm, + error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, - vd->vdev_ashift))) { + vd->vdev_ashift); + if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); - space_map_update(vd->vdev_checkpoint_sm); /* * Since the checkpoint_sm contains free entries - * exclusively we can use sm_alloc to indicate the - * culmulative checkpointed space that has been freed. + * exclusively we can use space_map_allocated() to + * indicate the cumulative checkpointed space that + * has been freed. */ vd->vdev_stat.vs_checkpoint_space = - -vd->vdev_checkpoint_sm->sm_alloc; + -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } } /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); if (obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } - space_map_update(vd->vdev_obsolete_sm); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); - if (vd->vdev_ms != NULL) { - metaslab_group_t *mg = vd->vdev_mg; - - metaslab_group_histogram_verify(mg); - metaslab_class_histogram_verify(mg->mg_class); - - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - - if (msp == NULL || msp->ms_sm == NULL) - continue; - - mutex_enter(&msp->ms_lock); - /* - * If the metaslab was not loaded when the vdev - * was removed then the histogram accounting may - * not be accurate. Update the histogram information - * here so that we ensure that the metaslab group - * and metaslab class are up-to-date. - */ - metaslab_group_histogram_remove(mg, msp); - - VERIFY0(space_map_allocated(msp->ms_sm)); - space_map_close(msp->ms_sm); - msp->ms_sm = NULL; - mutex_exit(&msp->ms_lock); - } - - if (vd->vdev_checkpoint_sm != NULL) { - ASSERT(spa_has_checkpoint(spa)); - space_map_close(vd->vdev_checkpoint_sm); - vd->vdev_checkpoint_sm = NULL; - } - - metaslab_group_histogram_verify(mg); - metaslab_class_histogram_verify(mg->mg_class); - - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) - ASSERT0(mg->mg_histogram[i]); - } - dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) metaslab_sync_reassess(vd->vdev_mg); } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; - dmu_tx_t *tx; + ASSERT3U(txg, ==, spa->spa_syncing_txg); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (range_tree_space(vd->vdev_obsolete_segments) > 0) { - dmu_tx_t *tx; - ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vdev_indirect_sync_obsolete(vd, tx); - dmu_tx_commit(tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); + dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); - dmu_tx_commit(tx); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); + dmu_tx_commit(tx); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { - ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc, - !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect vdev. */ if (!vdev_is_concrete(vd)) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in reponse to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) spa_async_request(spa, SPA_ASYNC_RESILVER); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd = vd->vdev_top; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report intializing progress. Since we don't have the * initializing locks held, this is only an estimate (although a * fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; } /* * Report expandable space on top-level, non-auxillary devices only. * The expandable space is reported in terms of metaslab sized units * since that determines how much space the pool can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize - spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift); } if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (vd == rvd) { for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *cvd = rvd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; for (int t = 0; t < ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } } mutex_exit(&vd->vdev_stat_lock); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_phys_t *scn_phys = &spa->spa_dsl_pool->dp_scan->scn_phys; uint64_t *processed = &scn_phys->scn_processed; /* XXX cleanup? */ if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } vs->vs_ops[type]++; vs->vs_bytes[type] += psize; mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); if (spa->spa_load_state == SPA_LOAD_NONE && type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * childrens', thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); /* * If we have brought this vdev back into service, we need * to notify fmd so that it can gracefully repair any outstanding * cases due to a missing device. We do this in all cases, even those * that probably don't correlate to a repaired fault. This is sure to * catch all cases, and we let the zfs-retire agent sort it out. If * this is a transient state it's OK, as the retire agent will * double-check the state of the vdev before repairing it. */ if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && vd->vdev_prevstate != state) zfs_post_state_change(spa, vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } zfs_ereport_post(class, spa, vd, NULL, save_state, 0); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. * In addition, only a single top-level vdev is allowed. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && vd->vdev_children > 1) { return (B_FALSE); } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { return (B_FALSE); } } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; vdev_remove_child(pvd, vd); vdev_compact_children(pvd); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime we panic the system. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) { vdev_dbgmsg(vd, "SLOW IO: zio timestamp " "%lluns, delta %lluns, last io %lluns", fio->io_timestamp, (u_longlong_t)delta, vq->vq_io_complete_ts); fm_panic("I/O to pool '%s' appears to be " "hung.", spa_name(spa)); } } mutex_exit(&vq->vq_lock); } } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c (revision 354383) @@ -1,1830 +1,1827 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2014, 2017 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * An indirect vdev corresponds to a vdev that has been removed. Since * we cannot rewrite block pointers of snapshots, etc., we keep a * mapping from old location on the removed device to the new location * on another device in the pool and use this mapping whenever we need * to access the DVA. Unfortunately, this mapping did not respect * logical block boundaries when it was first created, and so a DVA on * this indirect vdev may be "split" into multiple sections that each * map to a different location. As a consequence, not all DVAs can be * translated to an equivalent new DVA. Instead we must provide a * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * * - i/os to this vdev use the callback to determine where the * data is now located, and issue child i/os for each segment's new * location. * * - frees and claims to this vdev use the callback to free or claim * each mapped segment. (Note that we don't actually need to claim * log blocks on indirect vdevs, because we don't allocate to * removing vdevs. However, zdb uses zio_claim() for its leak * detection.) */ /* * "Big theory statement" for how we mark blocks obsolete. * * When a block on an indirect vdev is freed or remapped, a section of * that vdev's mapping may no longer be referenced (aka "obsolete"). We * keep track of how much of each mapping entry is obsolete. When * an entry becomes completely obsolete, we can remove it, thus reducing * the memory used by the mapping. The complete picture of obsolescence * is given by the following data structures, described below: * - the entry-specific obsolete count * - the vdev-specific obsolete spacemap * - the pool-specific obsolete bpobj * * == On disk data structures used == * * We track the obsolete space for the pool using several objects. Each * of these objects is created on demand and freed when no longer * needed, and is assumed to be empty if it does not exist. * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. * * - Each vic_mapping_object (associated with an indirect vdev) can * have a vimp_counts_object. This is an array of uint32_t's * with the same number of entries as the vic_mapping_object. When * the mapping is condensed, entries from the vic_obsolete_sm_object * (see below) are folded into the counts. Therefore, each * obsolete_counts entry tells us the number of bytes in the * corresponding mapping entry that were not referenced when the * mapping was last condensed. * * - Each indirect or removing vdev can have a vic_obsolete_sm_object. * This is a space map containing an alloc entry for every DVA that * has been obsoleted since the last time this indirect vdev was * condensed. We use this object in order to improve performance * when marking a DVA as obsolete. Instead of modifying an arbitrary * offset of the vimp_counts_object, we only need to append an entry * to the end of this object. When a DVA becomes obsolete, it is * added to the obsolete space map. This happens when the DVA is * freed, remapped and not referenced by a snapshot, or the last * snapshot referencing it is destroyed. * * - Each dataset can have a ds_remap_deadlist object. This is a * deadlist object containing all blocks that were remapped in this * dataset but referenced in a previous snapshot. Blocks can *only* * appear on this list if they were remapped (dsl_dataset_block_remapped); * blocks that were killed in a head dataset are put on the normal * ds_deadlist and marked obsolete when they are freed. * * - The pool can have a dp_obsolete_bpobj. This is a list of blocks * in the pool that need to be marked obsolete. When a snapshot is * destroyed, we move some of the ds_remap_deadlist to the obsolete * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then * asynchronously process the obsolete bpobj, moving its entries to * the specific vdevs' obsolete space maps. * * == Summary of how we mark blocks as obsolete == * * - When freeing a block: if any DVA is on an indirect vdev, append to * vic_obsolete_sm_object. * - When remapping a block, add dva to ds_remap_deadlist (if prev snap * references; otherwise append to vic_obsolete_sm_object). * - When freeing a snapshot: move parts of ds_remap_deadlist to * dp_obsolete_bpobj (same algorithm as ds_deadlist). * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to * individual vdev's vic_obsolete_sm_object. */ /* * "Big theory statement" for how we condense indirect vdevs. * * Condensing an indirect vdev's mapping is the process of determining * the precise counts of obsolete space for each mapping entry (by * integrating the obsolete spacemap into the obsolete counts) and * writing out a new mapping that contains only referenced entries. * * We condense a vdev when we expect the mapping to shrink (see * vdev_indirect_should_condense()), but only perform one condense at a * time to limit the memory usage. In addition, we use a separate * open-context thread (spa_condense_indirect_thread) to incrementally * create the new mapping object in a way that minimizes the impact on * the rest of the system. * * == Generating a new mapping == * * To generate a new mapping, we follow these steps: * * 1. Save the old obsolete space map and create a new mapping object * (see spa_condense_indirect_start_sync()). This initializes the * spa_condensing_indirect_phys with the "previous obsolete space map", * which is now read only. Newly obsolete DVAs will be added to a * new (initially empty) obsolete space map, and will not be * considered as part of this condense operation. * * 2. Construct in memory the precise counts of obsolete space for each * mapping entry, by incorporating the obsolete space map into the * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) * * 3. Iterate through each mapping entry, writing to the new mapping any * entries that are not completely obsolete (i.e. which don't have * obsolete count == mapping length). (See * spa_condense_indirect_generate_new_mapping().) * * 4. Destroy the old mapping object and switch over to the new one * (spa_condense_indirect_complete_sync). * * == Restarting from failure == * * To restart the condense when we import/open the pool, we must start * at the 2nd step above: reconstruct the precise counts in memory, * based on the space map + counts. Then in the 3rd step, we start * iterating where we left off: at vimp_max_offset of the new mapping * object. */ boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE; /* * Condense if at least this percent of the bytes in the mapping is * obsolete. With the default of 25%, the amount of space mapped * will be reduced to 1% of its original size after at most 16 * condenses. Higher values will condense less often (causing less * i/o); lower values will reduce the mapping size more quickly. */ int zfs_indirect_condense_obsolete_pct = 25; /* * Condense if the obsolete space map takes up more than this amount of * space on disk (logically). This limits the amount of disk space * consumed by the obsolete space map; the default of 1GB is small enough * that we typically don't mind "wasting" it. */ uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; /* * Don't bother condensing if the mapping uses less than this amount of * memory. The default of 128KB is considered a "trivial" amount of * memory and not worth reducing. */ uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; /* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a condense (which might otherwise * complete too quickly). If used to reduce the performance impact of * condensing in production, a maximum value of 1 should be sufficient. */ int zfs_condense_indirect_commit_entry_delay_ticks = 0; /* * If an indirect split block contains more than this many possible unique * combinations when being reconstructed, consider it too computationally * expensive to check them all. Instead, try at most 100 randomly-selected * combinations each time the block is accessed. This allows all segment * copies to participate fairly in the reconstruction when all combinations * cannot be checked and prevents repeated use of one bad copy. */ int zfs_reconstruct_indirect_combinations_max = 256; /* * Enable to simulate damaged segments and validate reconstruction. * Used by ztest */ unsigned long zfs_reconstruct_indirect_damage_fraction = 0; /* * The indirect_child_t represents the vdev that we will read from, when we * need to read all copies of the data (e.g. for scrub or reconstruction). * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, * ic_vdev is a child of the mirror. */ typedef struct indirect_child { abd_t *ic_data; vdev_t *ic_vdev; /* * ic_duplicate is NULL when the ic_data contents are unique, when it * is determined to be a duplicate it references the primary child. */ struct indirect_child *ic_duplicate; list_node_t ic_node; /* node on is_unique_child */ } indirect_child_t; /* * The indirect_split_t represents one mapped segment of an i/o to the * indirect vdev. For non-split (contiguously-mapped) blocks, there will be * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. * For split blocks, there will be several of these. */ typedef struct indirect_split { list_node_t is_node; /* link on iv_splits */ /* * is_split_offset is the offset into the i/o. * This is the sum of the previous splits' is_size's. */ uint64_t is_split_offset; vdev_t *is_vdev; /* top-level vdev */ uint64_t is_target_offset; /* offset on is_vdev */ uint64_t is_size; int is_children; /* number of entries in is_child[] */ int is_unique_children; /* number of entries in is_unique_child */ list_t is_unique_child; /* * is_good_child is the child that we are currently using to * attempt reconstruction. */ indirect_child_t *is_good_child; indirect_child_t is_child[1]; /* variable-length */ } indirect_split_t; /* * The indirect_vsd_t is associated with each i/o to the indirect vdev. * It is the "Vdev-Specific Data" in the zio_t's io_vsd. */ typedef struct indirect_vsd { boolean_t iv_split_block; boolean_t iv_reconstruct; uint64_t iv_unique_combinations; uint64_t iv_attempts; uint64_t iv_attempts_max; list_t iv_splits; /* list of indirect_split_t's */ } indirect_vsd_t; static void vdev_indirect_map_free(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; while ((is = list_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } list_remove(&iv->iv_splits, is); indirect_child_t *ic; while ((ic = list_head(&is->is_unique_child)) != NULL) list_remove(&is->is_unique_child, ic); list_destroy(&is->is_unique_child); kmem_free(is, offsetof(indirect_split_t, is_child[is->is_children])); } kmem_free(iv, sizeof (*iv)); } static const zio_vsd_ops_t vdev_indirect_vsd_ops = { vdev_indirect_map_free, zio_vsd_default_cksum_report }; /* * Mark the given offset and size as being obsolete. */ void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(size > 0); VERIFY(vdev_indirect_mapping_entry_for_offset( vd->vdev_indirect_mapping, offset) != NULL); if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { mutex_enter(&vd->vdev_obsolete_lock); range_tree_add(vd->vdev_obsolete_segments, offset, size); mutex_exit(&vd->vdev_obsolete_lock); vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); } } /* * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This * wrapper is provided because the DMU does not know about vdev_t's and * cannot directly call vdev_indirect_mark_obsolete. */ void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, uint64_t size, dmu_tx_t *tx) { vdev_t *vd = vdev_lookup_top(spa, vdev_id); ASSERT(dmu_tx_is_syncing(tx)); /* The DMU can only remap indirect vdevs. */ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); vdev_indirect_mark_obsolete(vd, offset, size); } static spa_condensing_indirect_t * spa_condensing_indirect_create(spa_t *spa) { spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); objset_t *mos = spa->spa_meta_objset; for (int i = 0; i < TXG_SIZE; i++) { list_create(&sci->sci_new_mapping_entries[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); } sci->sci_new_mapping = vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); return (sci); } static void spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) { for (int i = 0; i < TXG_SIZE; i++) list_destroy(&sci->sci_new_mapping_entries[i]); if (sci->sci_new_mapping != NULL) vdev_indirect_mapping_close(sci->sci_new_mapping); kmem_free(sci, sizeof (*sci)); } boolean_t vdev_indirect_should_condense(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); if (!zfs_condense_indirect_vdevs_enable) return (B_FALSE); /* * We can only condense one indirect vdev at a time. */ if (spa->spa_condensing_indirect != NULL) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); /* * The mapping object size must not change while we are * condensing, so we can only condense indirect vdevs * (not vdevs that are still in the middle of being removed). */ if (vd->vdev_ops != &vdev_indirect_ops) return (B_FALSE); /* * If nothing new has been marked obsolete, there is no * point in condensing. */ if (vd->vdev_obsolete_sm == NULL) { ASSERT0(vdev_obsolete_sm_object(vd)); return (B_FALSE); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(vdev_obsolete_sm_object(vd), ==, space_map_object(vd->vdev_obsolete_sm)); uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); uint64_t mapping_size = vdev_indirect_mapping_size(vim); uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); ASSERT3U(bytes_obsolete, <=, bytes_mapped); /* * If a high percentage of the bytes that are mapped have become * obsolete, condense (unless the mapping is already small enough). * This has a good chance of reducing the amount of memory used * by the mapping. */ if (bytes_obsolete * 100 / bytes_mapped >= zfs_indirect_condense_obsolete_pct && mapping_size > zfs_condense_min_mapping_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete " "spacemap covers %d%% of %lluMB mapping", (u_longlong_t)vd->vdev_id, (int)(bytes_obsolete * 100 / bytes_mapped), (u_longlong_t)bytes_mapped / 1024 / 1024); return (B_TRUE); } /* * If the obsolete space map takes up too much space on disk, * condense in order to free up this disk space. */ if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete sm " "length %lluMB >= max size %lluMB", (u_longlong_t)vd->vdev_id, (u_longlong_t)obsolete_sm_size / 1024 / 1024, (u_longlong_t)zfs_condense_max_obsolete_bytes / 1024 / 1024); return (B_TRUE); } return (B_FALSE); } /* * This sync task completes (finishes) a condense, deleting the old * mapping and replacing it with the new one. */ static void spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_meta_objset; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); uint64_t new_count = vdev_indirect_mapping_num_entries(sci->sci_new_mapping); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3P(sci, ==, spa->spa_condensing_indirect); for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } ASSERT(vic->vic_mapping_object != 0); ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); /* * Reset vdev_indirect_mapping to refer to the new object. */ rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = sci->sci_new_mapping; rw_exit(&vd->vdev_indirect_rwlock); sci->sci_new_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = scip->scip_next_mapping_object; scip->scip_next_mapping_object = 0; space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); scip->scip_prev_obsolete_sm_object = 0; scip->scip_vdev = 0; VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, tx)); spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " "new mapping object %llu has %llu entries " "(was %llu entries)", vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, new_count, old_count); vdev_config_dirty(spa->spa_root_vdev); } /* * This sync task appends entries to the new mapping object. */ static void spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = dmu_tx_pool(tx)->dp_spa; ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(sci, ==, spa->spa_condensing_indirect); vdev_indirect_mapping_add_entries(sci->sci_new_mapping, &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); } /* * Open-context function to add one entry to the new mapping. The new * entry will be remembered and written from syncing context. */ static void spa_condense_indirect_commit_entry(spa_t *spa, vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) { spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; /* * If we are the first entry committed this txg, kick off the sync * task to write to the MOS on our behalf. */ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { dsl_sync_task_nowait(dmu_tx_pool(tx), spa_condense_indirect_commit_sync, sci, 0, ZFS_SPACE_CHECK_NONE, tx); } vdev_indirect_mapping_entry_t *vime = kmem_alloc(sizeof (*vime), KM_SLEEP); vime->vime_mapping = *vimep; vime->vime_obsolete_count = count; list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); dmu_tx_commit(tx); } static void spa_condense_indirect_generate_new_mapping(vdev_t *vd, uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr) { spa_t *spa = vd->vdev_spa; uint64_t mapi = start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_num_entries = vdev_indirect_mapping_num_entries(old_mapping); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); zfs_dbgmsg("starting condense of vdev %llu from index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); while (mapi < old_num_entries) { if (zthr_iscancelled(zthr)) { zfs_dbgmsg("pausing condense of vdev %llu " "at index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); break; } vdev_indirect_mapping_entry_phys_t *entry = &old_mapping->vim_entries[mapi]; uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); ASSERT3U(obsolete_counts[mapi], <=, entry_size); if (obsolete_counts[mapi] < entry_size) { spa_condense_indirect_commit_entry(spa, entry, obsolete_counts[mapi]); /* * This delay may be requested for testing, debugging, * or performance reasons. */ delay(zfs_condense_indirect_commit_entry_delay_ticks); } mapi++; } } /* ARGSUSED */ static boolean_t spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) { spa_t *spa = arg; return (spa->spa_condensing_indirect != NULL); } /* ARGSUSED */ static void spa_condense_indirect_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *vd; ASSERT3P(spa->spa_condensing_indirect, !=, NULL); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint32_t *counts; uint64_t start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; space_map_t *prev_obsolete_sm = NULL; ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); for (int i = 0; i < TXG_SIZE; i++) { /* * The list must start out empty in order for the * _commit_sync() sync task to be properly registered * on the first call to _commit_entry(); so it's wise * to double check and ensure we actually are starting * with empty lists. */ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(prev_obsolete_sm); counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); if (prev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, counts, prev_obsolete_sm); } space_map_close(prev_obsolete_sm); /* * Generate new mapping. Determine what index to continue from * based on the max offset that we've already written in the * new mapping. */ uint64_t max_offset = vdev_indirect_mapping_max_offset(sci->sci_new_mapping); if (max_offset == 0) { /* We haven't written anything to the new mapping yet. */ start_index = 0; } else { /* * Pick up from where we left off. _entry_for_offset() * returns a pointer into the vim_entries array. If * max_offset is greater than any of the mappings * contained in the table NULL will be returned and * that indicates we've exhausted our iteration of the * old_mapping. */ vdev_indirect_mapping_entry_phys_t *entry = vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, max_offset); if (entry == NULL) { /* * We've already written the whole new mapping. * This special value will cause us to skip the * generate_new_mapping step and just do the sync * task to complete the condense. */ start_index = UINT64_MAX; } else { start_index = entry - old_mapping->vim_entries; ASSERT3U(start_index, <, vdev_indirect_mapping_num_entries(old_mapping)); } } spa_condense_indirect_generate_new_mapping(vd, counts, start_index, zthr); vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); /* * If the zthr has received a cancellation signal while running * in generate_new_mapping() or at any point after that, then bail * early. We don't want to complete the condense if the spa is * shutting down. */ if (zthr_iscancelled(zthr)) return; VERIFY0(dsl_sync_task(spa_name(spa), NULL, spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* * Sync task to begin the condensing process. */ void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; ASSERT0(scip->scip_next_mapping_object); ASSERT0(scip->scip_prev_obsolete_sm_object); ASSERT0(scip->scip_vdev); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); ASSERT(obsolete_sm_obj != 0); scip->scip_vdev = vd->vdev_id; scip->scip_next_mapping_object = vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; /* * We don't need to allocate a new space map object, since * vdev_indirect_sync_obsolete will allocate one when needed. */ space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (*scip) / sizeof (uint64_t), scip, tx)); ASSERT3P(spa->spa_condensing_indirect, ==, NULL); spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " "posm=%llu nm=%llu", vd->vdev_id, dmu_tx_get_txg(tx), (u_longlong_t)scip->scip_prev_obsolete_sm_object, (u_longlong_t)scip->scip_next_mapping_object); zthr_wakeup(spa->spa_condense_zthr); } /* * Sync to the given vdev's obsolete space map any segments that are no longer * referenced as of the given txg. * * If the obsolete space map doesn't exist yet, create and open it. */ void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT3U(vic->vic_mapping_object, !=, 0); ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); if (vdev_obsolete_sm_object(vd) == 0) { uint64_t obsolete_sm_object = space_map_alloc(spa->spa_meta_objset, vdev_standard_sm_blksz, tx); ASSERT(vd->vdev_top_zap != 0); VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0); spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(space_map_open(&vd->vdev_obsolete_sm, spa->spa_meta_objset, obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(vd->vdev_obsolete_sm); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(vdev_obsolete_sm_object(vd), ==, space_map_object(vd->vdev_obsolete_sm)); space_map_write(vd->vdev_obsolete_sm, vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); - space_map_update(vd->vdev_obsolete_sm); range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } int spa_condense_init(spa_t *spa) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), &spa->spa_condensing_indirect_phys); if (error == 0) { if (spa_writeable(spa)) { spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); } return (0); } else if (error == ENOENT) { return (0); } else { return (error); } } void spa_condense_fini(spa_t *spa) { if (spa->spa_condensing_indirect != NULL) { spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; } } void spa_start_indirect_condensing_thread(spa_t *spa) { ASSERT3P(spa->spa_condense_zthr, ==, NULL); spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check, spa_condense_indirect_thread, spa); } /* * Gets the obsolete spacemap object from the vdev's ZAP. * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't * exist yet. */ int vdev_obsolete_sm_object(vdev_t *vd) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { return (0); } uint64_t sm_obj = 0; int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj); ASSERT(err == 0 || err == ENOENT); return (sm_obj); } boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { return (B_FALSE); } uint64_t val = 0; int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); ASSERT(err == 0 || err == ENOENT); return (val != 0); } /* ARGSUSED */ static void vdev_indirect_close(vdev_t *vd) { } /* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift) { *psize = *max_psize = vd->vdev_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; *ashift = vd->vdev_ashift; return (0); } typedef struct remap_segment { vdev_t *rs_vd; uint64_t rs_offset; uint64_t rs_asize; uint64_t rs_split_offset; list_node_t rs_node; } remap_segment_t; remap_segment_t * rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) { remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); rs->rs_vd = vd; rs->rs_offset = offset; rs->rs_asize = asize; rs->rs_split_offset = split_offset; return (rs); } /* * Given an indirect vdev and an extent on that vdev, it duplicates the * physical entries of the indirect mapping that correspond to the extent * to a new array and returns a pointer to it. In addition, copied_entries * is populated with the number of mapping entries that were duplicated. * * Note that the function assumes that the caller holds vdev_indirect_rwlock. * This ensures that the mapping won't change due to condensing as we * copy over its contents. * * Finally, since we are doing an allocation, it is up to the caller to * free the array allocated in this function. */ vdev_indirect_mapping_entry_phys_t * vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t *copied_entries) { vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t entries = 0; ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); vdev_indirect_mapping_entry_phys_t *first_mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); ASSERT3P(first_mapping, !=, NULL); vdev_indirect_mapping_entry_phys_t *m = first_mapping; while (asize > 0) { uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(asize, size - inner_offset); offset += inner_size; asize -= inner_size; entries++; m++; } size_t copy_length = entries * sizeof (*first_mapping); duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); bcopy(first_mapping, duplicate_mappings, copy_length); *copied_entries = entries; return (duplicate_mappings); } /* * Goes through the relevant indirect mappings until it hits a concrete vdev * and issues the callback. On the way to the concrete vdev, if any other * indirect vdevs are encountered, then the callback will also be called on * each of those indirect vdevs. For example, if the segment is mapped to * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is * mapped to segment B on concrete vdev 2, then the callback will be called on * both vdev 1 and vdev 2. * * While the callback passed to vdev_indirect_remap() is called on every vdev * the function encounters, certain callbacks only care about concrete vdevs. * These types of callbacks should return immediately and explicitly when they * are called on an indirect vdev. * * Because there is a possibility that a DVA section in the indirect device * has been split into multiple sections in our mapping, we keep track * of the relevant contiguous segments of the new location (remap_segment_t) * in a stack. This way we can call the callback for each of the new sections * created by a single section of the indirect device. Note though, that in * this scenario the callbacks in each split block won't occur in-order in * terms of offset, so callers should not make any assumptions about that. * * For callbacks that don't handle split blocks and immediately return when * they encounter them (as is the case for remap_blkptr_cb), the caller can * assume that its callback will be applied from the first indirect vdev * encountered to the last one and then the concrete vdev, in that order. */ static void vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) { list_t stack; spa_t *spa = vd->vdev_spa; list_create(&stack, sizeof (remap_segment_t), offsetof(remap_segment_t, rs_node)); for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); rs != NULL; rs = list_remove_head(&stack)) { vdev_t *v = rs->rs_vd; uint64_t num_entries = 0; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); ASSERT(rs->rs_asize > 0); /* * Note: As this function can be called from open context * (e.g. zio_read()), we need the following rwlock to * prevent the mapping from being changed by condensing. * * So we grab the lock and we make a copy of the entries * that are relevant to the extent that we are working on. * Once that is done, we drop the lock and iterate over * our copy of the mapping. Once we are done with the with * the remap segment and we free it, we also free our copy * of the indirect mapping entries that are relevant to it. * * This way we don't need to wait until the function is * finished with a segment, to condense it. In addition, we * don't need a recursive rwlock for the case that a call to * vdev_indirect_remap() needs to call itself (through the * codepath of its callback) for the same vdev in the middle * of its execution. */ rw_enter(&v->vdev_indirect_rwlock, RW_READER); vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping; ASSERT3P(vim, !=, NULL); vdev_indirect_mapping_entry_phys_t *mapping = vdev_indirect_mapping_duplicate_adjacent_entries(v, rs->rs_offset, rs->rs_asize, &num_entries); ASSERT3P(mapping, !=, NULL); ASSERT3U(num_entries, >, 0); rw_exit(&v->vdev_indirect_rwlock); for (uint64_t i = 0; i < num_entries; i++) { /* * Note: the vdev_indirect_mapping can not change * while we are running. It only changes while the * removal is in progress, and then only from syncing * context. While a removal is in progress, this * function is only called for frees, which also only * happen from syncing context. */ vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; ASSERT3P(m, !=, NULL); ASSERT3U(rs->rs_asize, >, 0); uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); ASSERT3U(rs->rs_offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(rs->rs_offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); ASSERT3U(dst_vdev, !=, v->vdev_id); uint64_t inner_offset = rs->rs_offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(rs->rs_asize, size - inner_offset); vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); ASSERT3P(dst_v, !=, NULL); if (dst_v->vdev_ops == &vdev_indirect_ops) { list_insert_head(&stack, rs_alloc(dst_v, dst_offset + inner_offset, inner_size, rs->rs_split_offset)); } if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { /* * Note: This clause exists only solely for * testing purposes. We use it to ensure that * split blocks work and that the callbacks * using them yield the same result if issued * in reverse order. */ uint64_t inner_half = inner_size / 2; func(rs->rs_split_offset + inner_half, dst_v, dst_offset + inner_offset + inner_half, inner_half, arg); func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_half, arg); } else { func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_size, arg); } rs->rs_offset += inner_size; rs->rs_asize -= inner_size; rs->rs_split_offset += inner_size; } VERIFY0(rs->rs_asize); kmem_free(mapping, num_entries * sizeof (*mapping)); kmem_free(rs, sizeof (remap_segment_t)); } list_destroy(&stack); } static void vdev_indirect_child_io_done(zio_t *zio) { zio_t *pio = zio->io_private; mutex_enter(&pio->io_lock); pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); abd_put(zio->io_abd); } /* * This is a callback for vdev_indirect_remap() which allocates an * indirect_split_t for each split segment and adds it to iv_splits. */ static void vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { zio_t *zio = arg; indirect_vsd_t *iv = zio->io_vsd; ASSERT3P(vd, !=, NULL); if (vd->vdev_ops == &vdev_indirect_ops) return; int n = 1; if (vd->vdev_ops == &vdev_mirror_ops) n = vd->vdev_children; indirect_split_t *is = kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); is->is_children = n; is->is_size = size; is->is_split_offset = split_offset; is->is_target_offset = offset; is->is_vdev = vd; list_create(&is->is_unique_child, sizeof (indirect_child_t), offsetof(indirect_child_t, ic_node)); /* * Note that we only consider multiple copies of the data for * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even * though they use the same ops as mirror, because there's only one * "good" copy under the replacing/spare. */ if (vd->vdev_ops == &vdev_mirror_ops) { for (int i = 0; i < n; i++) { is->is_child[i].ic_vdev = vd->vdev_child[i]; list_link_init(&is->is_child[i].ic_node); } } else { is->is_child[0].ic_vdev = vd; } list_insert_tail(&iv->iv_splits, is); } static void vdev_indirect_read_split_done(zio_t *zio) { indirect_child_t *ic = zio->io_private; if (zio->io_error != 0) { /* * Clear ic_data to indicate that we do not have data for this * child. */ abd_free(ic->ic_data); ic->ic_data = NULL; } } /* * Issue reads for all copies (mirror children) of all splits. */ static void vdev_indirect_read_all(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (!vdev_readable(ic->ic_vdev)) continue; /* * Note, we may read from a child whose DTL * indicates that the data may not be present here. * While this might result in a few i/os that will * likely return incorrect data, it simplifies the * code since we can treat scrub and resilver * identically. (The incorrect data will be * detected and ignored when we verify the * checksum.) */ ic->ic_data = abd_alloc_sametype(zio->io_abd, is->is_size); ic->ic_duplicate = NULL; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, ic->ic_data, is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_read_split_done, ic)); } } iv->iv_reconstruct = B_TRUE; } static void vdev_indirect_io_start(zio_t *zio) { spa_t *spa = zio->io_spa; indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); list_create(&iv->iv_splits, sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); zio->io_vsd = iv; zio->io_vsd_ops = &vdev_indirect_vsd_ops; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (zio->io_type != ZIO_TYPE_READ) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); /* * Note: this code can handle other kinds of writes, * but we don't expect them. */ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); } vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, vdev_indirect_gather_splits, zio); indirect_split_t *first = list_head(&iv->iv_splits); if (first->is_size == zio->io_size) { /* * This is not a split block; we are pointing to the entire * data, which will checksum the same as the original data. * Pass the BP down so that the child i/o can verify the * checksum, and try a different location if available * (e.g. on a mirror). * * While this special case could be handled the same as the * general (split block) case, doing it this way ensures * that the vast majority of blocks on indirect vdevs * (which are not split) are handled identically to blocks * on non-indirect vdevs. This allows us to be less strict * about performance in the general (but rare) case. */ ASSERT0(first->is_split_offset); ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, first->is_vdev, first->is_target_offset, abd_get_offset(zio->io_abd, 0), zio->io_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } else { iv->iv_split_block = B_TRUE; if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { /* * Read all copies. Note that for simplicity, * we don't bother consulting the DTL in the * resilver case. */ vdev_indirect_read_all(zio); } else { /* * Read one copy of each split segment, from the * top-level vdev. Since we don't know the * checksum of each split individually, the child * zio can't ensure that we get the right data. * E.g. if it's a mirror, it will just read from a * random (healthy) leaf vdev. We have to verify * the checksum in vdev_indirect_io_done(). */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { zio_nowait(zio_vdev_child_io(zio, NULL, is->is_vdev, is->is_target_offset, abd_get_offset(zio->io_abd, is->is_split_offset), is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } } } zio_execute(zio); } /* * Report a checksum error for a child. */ static void vdev_indirect_checksum_error(zio_t *zio, indirect_split_t *is, indirect_child_t *ic) { vdev_t *vd = ic->ic_vdev; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); zio_bad_cksum_t zbc = { 0 }; void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size); abd_t *good_abd = is->is_good_child->ic_data; void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size); zfs_ereport_post_checksum(zio->io_spa, vd, zio, is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc); abd_return_buf(ic->ic_data, bad_buf, is->is_size); abd_return_buf(good_abd, good_buf, is->is_size); } /* * Issue repair i/os for any incorrect copies. We do this by comparing * each split segment's correct data (is_good_child's ic_data) with each * other copy of the data. If they differ, then we overwrite the bad data * with the good copy. Note that we do this without regard for the DTL's, * which simplifies this code and also issues the optimal number of writes * (based on which copies actually read bad data, as opposed to which we * think might be wrong). For the same reason, we always use * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). */ static void vdev_indirect_repair(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; enum zio_flag flags = ZIO_FLAG_IO_REPAIR; if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) flags |= ZIO_FLAG_SELF_HEAL; if (!spa_writeable(zio->io_spa)) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; if (ic->ic_duplicate == is->is_good_child) continue; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, is->is_good_child->ic_data, is->is_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, NULL, NULL)); vdev_indirect_checksum_error(zio, is, ic); } } } /* * Report checksum errors on all children that we read from. */ static void vdev_indirect_all_checksum_errors(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data == NULL) continue; vdev_t *vd = ic->ic_vdev; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); zfs_ereport_post_checksum(zio->io_spa, vd, zio, is->is_target_offset, is->is_size, NULL, NULL, NULL); } } } /* * Copy data from all the splits to a main zio then validate the checksum. * If then checksum is successfully validated return success. */ static int vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio) { zio_bad_cksum_t zbc; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { ASSERT3P(is->is_good_child->ic_data, !=, NULL); ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); abd_copy_off(zio->io_abd, is->is_good_child->ic_data, is->is_split_offset, 0, is->is_size); } return (zio_checksum_error(zio, &zbc)); } /* * There are relatively few possible combinations making it feasible to * deterministically check them all. We do this by setting the good_child * to the next unique split version. If we reach the end of the list then * "carry over" to the next unique split version (like counting in base * is_unique_children, but each digit can have a different base). */ static int vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio) { boolean_t more = B_TRUE; iv->iv_attempts = 0; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) is->is_good_child = list_head(&is->is_unique_child); while (more == B_TRUE) { iv->iv_attempts++; more = B_FALSE; if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_good_child = list_next(&is->is_unique_child, is->is_good_child); if (is->is_good_child != NULL) { more = B_TRUE; break; } is->is_good_child = list_head(&is->is_unique_child); } } ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations); return (SET_ERROR(ECKSUM)); } /* * There are too many combinations to try all of them in a reasonable amount * of time. So try a fixed number of random combinations from the unique * split versions, after which we'll consider the block unrecoverable. */ static int vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio) { iv->iv_attempts = 0; while (iv->iv_attempts < iv->iv_attempts_max) { iv->iv_attempts++; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic = list_head(&is->is_unique_child); int children = is->is_unique_children; for (int i = spa_get_random(children); i > 0; i--) ic = list_next(&is->is_unique_child, ic); ASSERT3P(ic, !=, NULL); is->is_good_child = ic; } if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); } return (SET_ERROR(ECKSUM)); } /* * This is a validation function for reconstruction. It randomly selects * a good combination, if one can be found, and then it intentionally * damages all other segment copes by zeroing them. This forces the * reconstruction algorithm to locate the one remaining known good copy. */ static int vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) { /* Presume all the copies are unique for initial selection. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (ic->ic_data != NULL) { is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic); } } } /* * Set each is_good_child to a randomly-selected child which * is known to contain validated data. */ int error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error) goto out; /* * Damage all but the known good copy by zeroing it. This will * result in two or less unique copies per indirect_child_t. * Both may need to be checked in order to reconstruct the block. * Set iv->iv_attempts_max such that all unique combinations will * enumerated, but limit the damage to at most 16 indirect splits. */ iv->iv_attempts_max = 1; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; abd_zero(ic->ic_data, ic->ic_data->abd_size); } iv->iv_attempts_max *= 2; if (iv->iv_attempts_max > (1ULL << 16)) { iv->iv_attempts_max = UINT64_MAX; break; } } out: /* Empty the unique children lists so they can be reconstructed. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic; while ((ic = list_head(&is->is_unique_child)) != NULL) list_remove(&is->is_unique_child, ic); is->is_unique_children = 0; } return (error); } /* * This function is called when we have read all copies of the data and need * to try to find a combination of copies that gives us the right checksum. * * If we pointed to any mirror vdevs, this effectively does the job of the * mirror. The mirror vdev code can't do its own job because we don't know * the checksum of each split segment individually. * * We have to try every unique combination of copies of split segments, until * we find one that checksums correctly. Duplicate segment copies are first * identified and latter skipped during reconstruction. This optimization * reduces the search space and ensures that of the remaining combinations * at most one is correct. * * When the total number of combinations is small they can all be checked. * For example, if we have 3 segments in the split, and each points to a * 2-way mirror with unique copies, we will have the following pieces of data: * * | mirror child * split | [0] [1] * ======|===================== * A | data_A_0 data_A_1 * B | data_B_0 data_B_1 * C | data_C_0 data_C_1 * * We will try the following (mirror children)^(number of splits) (2^3=8) * combinations, which is similar to bitwise-little-endian counting in * binary. In general each "digit" corresponds to a split segment, and the * base of each digit is is_children, which can be different for each * digit. * * "low bit" "high bit" * v v * data_A_0 data_B_0 data_C_0 * data_A_1 data_B_0 data_C_0 * data_A_0 data_B_1 data_C_0 * data_A_1 data_B_1 data_C_0 * data_A_0 data_B_0 data_C_1 * data_A_1 data_B_0 data_C_1 * data_A_0 data_B_1 data_C_1 * data_A_1 data_B_1 data_C_1 * * Note that the split segments may be on the same or different top-level * vdevs. In either case, we may need to try lots of combinations (see * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror * has small silent errors on all of its children, we can still reconstruct * the correct data, as long as those errors are at sufficiently-separated * offsets (specifically, separated by the largest block size - default of * 128KB, but up to 16MB). */ static void vdev_indirect_reconstruct_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; boolean_t known_good = B_FALSE; int error; iv->iv_unique_combinations = 1; iv->iv_attempts_max = UINT64_MAX; if (zfs_reconstruct_indirect_combinations_max > 0) iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max; /* * If nonzero, every 1/x blocks will be damaged, in order to validate * reconstruction when there are split segments with damaged copies. * Known_good will TRUE when reconstruction is known to be possible. */ if (zfs_reconstruct_indirect_damage_fraction != 0 && spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0) known_good = (vdev_indirect_splits_damage(iv, zio) == 0); /* * Determine the unique children for a split segment and add them * to the is_unique_child list. By restricting reconstruction * to these children, only unique combinations will be considered. * This can vastly reduce the search space when there are a large * number of indirect splits. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic_i = &is->is_child[i]; if (ic_i->ic_data == NULL || ic_i->ic_duplicate != NULL) continue; for (int j = i + 1; j < is->is_children; j++) { indirect_child_t *ic_j = &is->is_child[j]; if (ic_j->ic_data == NULL || ic_j->ic_duplicate != NULL) continue; if (abd_cmp(ic_i->ic_data, ic_j->ic_data, is->is_size) == 0) { ic_j->ic_duplicate = ic_i; } } is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic_i); } /* Reconstruction is impossible, no valid children */ EQUIV(list_is_empty(&is->is_unique_child), is->is_unique_children == 0); if (list_is_empty(&is->is_unique_child)) { zio->io_error = EIO; vdev_indirect_all_checksum_errors(zio); zio_checksum_verified(zio); return; } iv->iv_unique_combinations *= is->is_unique_children; } if (iv->iv_unique_combinations <= iv->iv_attempts_max) error = vdev_indirect_splits_enumerate_all(iv, zio); else error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error != 0) { /* All attempted combinations failed. */ ASSERT3B(known_good, ==, B_FALSE); zio->io_error = error; vdev_indirect_all_checksum_errors(zio); } else { /* * The checksum has been successfully validated. Issue * repair I/Os to any copies of splits which don't match * the validated version. */ ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio)); vdev_indirect_repair(zio); zio_checksum_verified(zio); } } static void vdev_indirect_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (iv->iv_reconstruct) { /* * We have read all copies of the data (e.g. from mirrors), * either because this was a scrub/resilver, or because the * one-copy read didn't checksum correctly. */ vdev_indirect_reconstruct_io_done(zio); return; } if (!iv->iv_split_block) { /* * This was not a split block, so we passed the BP down, * and the checksum was handled by the (one) child zio. */ return; } zio_bad_cksum_t zbc; int ret = zio_checksum_error(zio, &zbc); if (ret == 0) { zio_checksum_verified(zio); return; } /* * The checksum didn't match. Read all copies of all splits, and * then we will try to reconstruct. The next time * vdev_indirect_io_done() is called, iv_reconstruct will be set. */ vdev_indirect_read_all(zio); zio_vdev_io_redone(zio); } vdev_ops_t vdev_indirect_ops = { vdev_indirect_open, vdev_indirect_close, vdev_default_asize, vdev_indirect_io_start, vdev_indirect_io_done, NULL, NULL, NULL, vdev_indirect_remap, NULL, VDEV_TYPE_INDIRECT, /* name of this vdev type */ B_FALSE /* leaf vdev */ }; Index: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect_mapping.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect_mapping.c (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect_mapping.c (revision 354383) @@ -1,592 +1,593 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2015, 2017 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include static boolean_t vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) { ASSERT(vim != NULL); ASSERT(vim->vim_object != 0); ASSERT(vim->vim_objset != NULL); ASSERT(vim->vim_phys != NULL); ASSERT(vim->vim_dbuf != NULL); EQUIV(vim->vim_phys->vimp_num_entries > 0, vim->vim_entries != NULL); if (vim->vim_phys->vimp_num_entries > 0) { vdev_indirect_mapping_entry_phys_t *last_entry = &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]; uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry); uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst); ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); } if (vim->vim_havecounts) { ASSERT(vim->vim_phys->vimp_counts_object != 0); } return (B_TRUE); } uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim) { ASSERT(vdev_indirect_mapping_verify(vim)); return (vim->vim_phys->vimp_num_entries); } uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim) { ASSERT(vdev_indirect_mapping_verify(vim)); return (vim->vim_phys->vimp_max_offset); } uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim) { ASSERT(vdev_indirect_mapping_verify(vim)); return (vim->vim_object); } uint64_t vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim) { ASSERT(vdev_indirect_mapping_verify(vim)); return (vim->vim_phys->vimp_bytes_mapped); } /* * The length (in bytes) of the mapping object array in memory and * (logically) on disk. * * Note that unlike most of our accessor functions, * we don't assert that the struct is consistent; therefore it can be * called while there may be concurrent changes, if we don't care about * the value being immediately stale (e.g. from spa_removal_get_stats()). */ uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim) { return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries)); } /* * Compare an offset with an indirect mapping entry; there are three * possible scenarios: * * 1. The offset is "less than" the mapping entry; meaning the * offset is less than the source offset of the mapping entry. In * this case, there is no overlap between the offset and the * mapping entry and -1 will be returned. * * 2. The offset is "greater than" the mapping entry; meaning the * offset is greater than the mapping entry's source offset plus * the entry's size. In this case, there is no overlap between * the offset and the mapping entry and 1 will be returned. * * NOTE: If the offset is actually equal to the entry's offset * plus size, this is considered to be "greater" than the entry, * and this case applies (i.e. 1 will be returned). Thus, the * entry's "range" can be considered to be inclusive at its * start, but exclusive at its end: e.g. [src, src + size). * * 3. The last case to consider is if the offset actually falls * within the mapping entry's range. If this is the case, the * offset is considered to be "equal to" the mapping entry and * 0 will be returned. * * NOTE: If the offset is equal to the entry's source offset, * this case applies and 0 will be returned. If the offset is * equal to the entry's source plus its size, this case does * *not* apply (see "NOTE" above for scenario 2), and 1 will be * returned. */ static int dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem) { const uint64_t *key = v_key; const vdev_indirect_mapping_entry_phys_t *array_elem = v_array_elem; uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem); if (*key < src_offset) { return (-1); } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) { return (0); } else { return (1); } } /* * Returns the mapping entry for the given offset. * * It's possible that the given offset will not be in the mapping table * (i.e. no mapping entries contain this offset), in which case, the * return value value depends on the "next_if_missing" parameter. * * If the offset is not found in the table and "next_if_missing" is * B_FALSE, then NULL will always be returned. The behavior is intended * to allow consumers to get the entry corresponding to the offset * parameter, iff the offset overlaps with an entry in the table. * * If the offset is not found in the table and "next_if_missing" is * B_TRUE, then the entry nearest to the given offset will be returned, * such that the entry's source offset is greater than the offset * passed in (i.e. the "next" mapping entry in the table is returned, if * the offset is missing from the table). If there are no entries whose * source offset is greater than the passed in offset, NULL is returned. */ static vdev_indirect_mapping_entry_phys_t * vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim, uint64_t offset, boolean_t next_if_missing) { ASSERT(vdev_indirect_mapping_verify(vim)); ASSERT(vim->vim_phys->vimp_num_entries > 0); vdev_indirect_mapping_entry_phys_t *entry = NULL; uint64_t last = vim->vim_phys->vimp_num_entries - 1; uint64_t base = 0; /* * We don't define these inside of the while loop because we use * their value in the case that offset isn't in the mapping. */ uint64_t mid; int result; while (last >= base) { mid = base + ((last - base) >> 1); result = dva_mapping_overlap_compare(&offset, &vim->vim_entries[mid]); if (result == 0) { entry = &vim->vim_entries[mid]; break; } else if (result < 0) { last = mid - 1; } else { base = mid + 1; } } if (entry == NULL && next_if_missing) { ASSERT3U(base, ==, last + 1); ASSERT(mid == base || mid == last); ASSERT3S(result, !=, 0); /* * The offset we're looking for isn't actually contained * in the mapping table, thus we need to return the * closest mapping entry that is greater than the * offset. We reuse the result of the last comparison, * comparing the mapping entry at index "mid" and the * offset. The offset is guaranteed to lie between * indices one less than "mid", and one greater than * "mid"; we just need to determine if offset is greater * than, or less than the mapping entry contained at * index "mid". */ uint64_t index; if (result < 0) index = mid; else index = mid + 1; ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries); if (index == vim->vim_phys->vimp_num_entries) { /* * If "index" is past the end of the entries * array, then not only is the offset not in the * mapping table, but it's actually greater than * all entries in the table. In this case, we * can't return a mapping entry greater than the * offset (since none exist), so we return NULL. */ ASSERT3S(dva_mapping_overlap_compare(&offset, &vim->vim_entries[index - 1]), >, 0); return (NULL); } else { /* * Just to be safe, we verify the offset falls * in between the mapping entries at index and * one less than index. Since we know the offset * doesn't overlap an entry, and we're supposed * to return the entry just greater than the * offset, both of the following tests must be * true. */ ASSERT3S(dva_mapping_overlap_compare(&offset, &vim->vim_entries[index]), <, 0); IMPLY(index >= 1, dva_mapping_overlap_compare(&offset, &vim->vim_entries[index - 1]) > 0); return (&vim->vim_entries[index]); } } else { return (entry); } } vdev_indirect_mapping_entry_phys_t * vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, uint64_t offset) { return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, B_FALSE)); } vdev_indirect_mapping_entry_phys_t * vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, uint64_t offset) { return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, B_TRUE)); } void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim) { ASSERT(vdev_indirect_mapping_verify(vim)); if (vim->vim_phys->vimp_num_entries > 0) { uint64_t map_size = vdev_indirect_mapping_size(vim); kmem_free(vim->vim_entries, map_size); vim->vim_entries = NULL; } dmu_buf_rele(vim->vim_dbuf, vim); vim->vim_objset = NULL; vim->vim_object = 0; vim->vim_dbuf = NULL; vim->vim_phys = NULL; kmem_free(vim, sizeof (*vim)); } uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx) { uint64_t object; ASSERT(dmu_tx_is_syncing(tx)); uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0; if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { bonus_size = sizeof (vdev_indirect_mapping_phys_t); } object = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, bonus_size, tx); if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { dmu_buf_t *dbuf; vdev_indirect_mapping_phys_t *vimp; VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); vimp = dbuf->db_data; vimp->vimp_counts_object = dmu_object_alloc(os, DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); dmu_buf_rele(dbuf, FTAG); } return (object); } vdev_indirect_mapping_t * vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object) { vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP); dmu_object_info_t doi; VERIFY0(dmu_object_info(os, mapping_object, &doi)); vim->vim_objset = os; vim->vim_object = mapping_object; VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim, &vim->vim_dbuf)); vim->vim_phys = vim->vim_dbuf->db_data; vim->vim_havecounts = (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0); if (vim->vim_phys->vimp_num_entries > 0) { uint64_t map_size = vdev_indirect_mapping_size(vim); vim->vim_entries = kmem_alloc(map_size, KM_SLEEP); VERIFY0(dmu_read(os, vim->vim_object, 0, map_size, vim->vim_entries, DMU_READ_PREFETCH)); } ASSERT(vdev_indirect_mapping_verify(vim)); return (vim); } void vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx) { vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object); if (vim->vim_havecounts) { VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object, tx)); spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } vdev_indirect_mapping_close(vim); VERIFY0(dmu_object_free(os, object, tx)); } /* * Append the list of vdev_indirect_mapping_entry_t's to the on-disk * mapping object. Also remove the entries from the list and free them. * This also implicitly extends the max_offset of the mapping (to the end * of the last entry). */ void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, list_t *list, dmu_tx_t *tx) { vdev_indirect_mapping_entry_phys_t *mapbuf; uint64_t old_size; uint32_t *countbuf = NULL; vdev_indirect_mapping_entry_phys_t *old_entries; uint64_t old_count; uint64_t entries_written = 0; ASSERT(vdev_indirect_mapping_verify(vim)); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); ASSERT(!list_is_empty(list)); old_size = vdev_indirect_mapping_size(vim); old_entries = vim->vim_entries; old_count = vim->vim_phys->vimp_num_entries; dmu_buf_will_dirty(vim->vim_dbuf, tx); mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); if (vim->vim_havecounts) { countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); ASSERT(spa_feature_is_active(vim->vim_objset->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)); } while (!list_is_empty(list)) { uint64_t i; /* * Write entries from the list to the * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE. */ for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) { vdev_indirect_mapping_entry_t *entry = list_remove_head(list); if (entry == NULL) break; uint64_t size = DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst); uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping); /* * We shouldn't be adding an entry which is fully * obsolete. */ ASSERT3U(entry->vime_obsolete_count, <, size); IMPLY(entry->vime_obsolete_count != 0, vim->vim_havecounts); mapbuf[i] = entry->vime_mapping; if (vim->vim_havecounts) countbuf[i] = entry->vime_obsolete_count; vim->vim_phys->vimp_bytes_mapped += size; ASSERT3U(src_offset, >=, vim->vim_phys->vimp_max_offset); vim->vim_phys->vimp_max_offset = src_offset + size; entries_written++; kmem_free(entry, sizeof (*entry)); } dmu_write(vim->vim_objset, vim->vim_object, vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), i * sizeof (*mapbuf), mapbuf, tx); if (vim->vim_havecounts) { dmu_write(vim->vim_objset, vim->vim_phys->vimp_counts_object, vim->vim_phys->vimp_num_entries * sizeof (*countbuf), i * sizeof (*countbuf), countbuf, tx); } vim->vim_phys->vimp_num_entries += i; } zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE); if (vim->vim_havecounts) zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE); /* * Update the entry array to reflect the new entries. First, copy * over any old entries then read back the new entries we just wrote. */ uint64_t new_size = vdev_indirect_mapping_size(vim); ASSERT3U(new_size, >, old_size); ASSERT3U(new_size - old_size, ==, entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); vim->vim_entries = kmem_alloc(new_size, KM_SLEEP); if (old_size > 0) { bcopy(old_entries, vim->vim_entries, old_size); kmem_free(old_entries, old_size); } VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, new_size - old_size, &vim->vim_entries[old_count], DMU_READ_PREFETCH)); zfs_dbgmsg("txg %llu: wrote %llu entries to " "indirect mapping obj %llu; max offset=0x%llx", (u_longlong_t)dmu_tx_get_txg(tx), (u_longlong_t)entries_written, (u_longlong_t)vim->vim_object, (u_longlong_t)vim->vim_phys->vimp_max_offset); } /* * Increment the relevant counts for the specified offset and length. * The counts array must be obtained from * vdev_indirect_mapping_load_obsolete_counts(). */ void vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim, uint64_t offset, uint64_t length, uint32_t *counts) { vdev_indirect_mapping_entry_phys_t *mapping; uint64_t index; mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); ASSERT(length > 0); ASSERT3P(mapping, !=, NULL); index = mapping - vim->vim_entries; while (length > 0) { ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim)); uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(mapping); VERIFY3U(inner_offset, <, size); uint64_t inner_size = MIN(length, size - inner_offset); VERIFY3U(counts[index] + inner_size, <=, size); counts[index] += inner_size; offset += inner_size; length -= inner_size; mapping++; index++; } } typedef struct load_obsolete_space_map_arg { vdev_indirect_mapping_t *losma_vim; uint32_t *losma_counts; } load_obsolete_space_map_arg_t; static int load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) { load_obsolete_space_map_arg_t *losma = arg; ASSERT3S(sme->sme_type, ==, SM_ALLOC); vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, sme->sme_offset, sme->sme_run, losma->losma_counts); return (0); } /* * Modify the counts (increment them) based on the spacemap. */ void vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, uint32_t *counts, space_map_t *obsolete_space_sm) { load_obsolete_space_map_arg_t losma; losma.losma_counts = counts; losma.losma_vim = vim; VERIFY0(space_map_iterate(obsolete_space_sm, + space_map_length(obsolete_space_sm), load_obsolete_sm_callback, &losma)); } /* * Read the obsolete counts from disk, returning them in an array. */ uint32_t * vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim) { ASSERT(vdev_indirect_mapping_verify(vim)); uint64_t counts_size = vim->vim_phys->vimp_num_entries * sizeof (uint32_t); uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP); if (vim->vim_havecounts) { VERIFY0(dmu_read(vim->vim_objset, vim->vim_phys->vimp_counts_object, 0, counts_size, counts, DMU_READ_PREFETCH)); } else { bzero(counts, counts_size); } return (counts); } extern void vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim, uint32_t *counts) { ASSERT(vdev_indirect_mapping_verify(vim)); kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t)); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_initialize.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_initialize.c (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_initialize.c (revision 354383) @@ -1,781 +1,781 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2016 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * Maximum number of metaslabs per group that can be initialized * simultaneously. */ int max_initialize_ms = 3; /* * Value that is written to disk during initialization. */ uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL; /* maximum number of I/Os outstanding per leaf vdev */ int zfs_initialize_limit = 1; /* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ uint64_t zfs_initialize_chunk_size = 1024 * 1024; static boolean_t vdev_initialize_should_stop(vdev_t *vd) { return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || vd->vdev_detached || vd->vdev_top->vdev_removing); } static void vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) { /* * We pass in the guid instead of the vdev_t since the vdev may * have been freed prior to the sync task being processed. This * happens when a vdev is detached as we call spa_config_vdev_exit(), * stop the intializing thread, schedule the sync task, and free * the vdev. Later when the scheduled sync task is invoked, it would * find that the vdev has been freed. */ uint64_t guid = *(uint64_t *)arg; uint64_t txg = dmu_tx_get_txg(tx); kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) return; uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; vd->vdev_initialize_offset[txg & TXG_MASK] = 0; VERIFY(vd->vdev_leaf_zap != 0); objset_t *mos = vd->vdev_spa->spa_meta_objset; if (last_offset > 0) { vd->vdev_initialize_last_offset = last_offset; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, sizeof (last_offset), 1, &last_offset, tx)); } if (vd->vdev_initialize_action_time > 0) { uint64_t val = (uint64_t)vd->vdev_initialize_action_time; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val), 1, &val, tx)); } uint64_t initialize_state = vd->vdev_initialize_state; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, &initialize_state, tx)); } static void vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) { ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); spa_t *spa = vd->vdev_spa; if (new_state == vd->vdev_initialize_state) return; /* * Copy the vd's guid, this will be freed by the sync task. */ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); *guid = vd->vdev_guid; /* * If we're suspending, then preserving the original start time. */ if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { vd->vdev_initialize_action_time = gethrestime_sec(); } vd->vdev_initialize_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, guid, 2, ZFS_SPACE_CHECK_RESERVED, tx); switch (new_state) { case VDEV_INITIALIZE_ACTIVE: spa_history_log_internal(spa, "initialize", tx, "vdev=%s activated", vd->vdev_path); break; case VDEV_INITIALIZE_SUSPENDED: spa_history_log_internal(spa, "initialize", tx, "vdev=%s suspended", vd->vdev_path); break; case VDEV_INITIALIZE_CANCELED: spa_history_log_internal(spa, "initialize", tx, "vdev=%s canceled", vd->vdev_path); break; case VDEV_INITIALIZE_COMPLETE: spa_history_log_internal(spa, "initialize", tx, "vdev=%s complete", vd->vdev_path); break; default: panic("invalid state %llu", (unsigned long long)new_state); } dmu_tx_commit(tx); } static void vdev_initialize_cb(zio_t *zio) { vdev_t *vd = zio->io_vd; mutex_enter(&vd->vdev_initialize_io_lock); if (zio->io_error == ENXIO && !vdev_writeable(vd)) { /* * The I/O failed because the vdev was unavailable; roll the * last offset back. (This works because spa_sync waits on * spa_txg_zio before it runs sync tasks.) */ uint64_t *off = &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK]; *off = MIN(*off, zio->io_offset); } else { /* * Since initializing is best-effort, we ignore I/O errors and * rely on vdev_probe to determine if the errors are more * critical. */ if (zio->io_error != 0) vd->vdev_stat.vs_initialize_errors++; vd->vdev_initialize_bytes_done += zio->io_orig_size; } ASSERT3U(vd->vdev_initialize_inflight, >, 0); vd->vdev_initialize_inflight--; cv_broadcast(&vd->vdev_initialize_io_cv); mutex_exit(&vd->vdev_initialize_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* Takes care of physical writing and limiting # of concurrent ZIOs. */ static int vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) { spa_t *spa = vd->vdev_spa; /* Limit inflight initializing I/Os */ mutex_enter(&vd->vdev_initialize_io_lock); while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { cv_wait(&vd->vdev_initialize_io_cv, &vd->vdev_initialize_io_lock); } vd->vdev_initialize_inflight++; mutex_exit(&vd->vdev_initialize_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); mutex_enter(&vd->vdev_initialize_lock); if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); *guid = vd->vdev_guid; /* This is the first write of this txg. */ dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, guid, 2, ZFS_SPACE_CHECK_RESERVED, tx); } /* * We know the vdev struct will still be around since all * consumers of vdev_free must stop the initialization first. */ if (vdev_initialize_should_stop(vd)) { mutex_enter(&vd->vdev_initialize_io_lock); ASSERT3U(vd->vdev_initialize_inflight, >, 0); vd->vdev_initialize_inflight--; mutex_exit(&vd->vdev_initialize_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); mutex_exit(&vd->vdev_initialize_lock); dmu_tx_commit(tx); return (SET_ERROR(EINTR)); } mutex_exit(&vd->vdev_initialize_lock); vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); /* vdev_initialize_cb releases SCL_STATE_ALL */ dmu_tx_commit(tx); return (0); } /* * Translate a logical range to the physical range for the specified vdev_t. * This function is initially called with a leaf vdev and will walk each * parent vdev until it reaches a top-level vdev. Once the top-level is * reached the physical range is initialized and the recursive function * begins to unwind. As it unwinds it calls the parent's vdev specific * translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); } else { /* * We've reached the top-level vdev, initialize the * physical range to the logical range and start to * unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical components by calling the * vdev specific translate function. */ range_seg_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } /* * Callback to fill each ABD chunk with zfs_initialize_value. len must be * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD * allocation will guarantee these for us. */ /* ARGSUSED */ static int vdev_initialize_block_fill(void *buf, size_t len, void *unused) { ASSERT0(len % sizeof (uint64_t)); for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; } return (0); } static abd_t * vdev_initialize_block_alloc() { /* Allocate ABD for filler data */ abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE); ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t)); (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size, vdev_initialize_block_fill, NULL); return (data); } static void vdev_initialize_block_free(abd_t *data) { abd_free(data); } static int vdev_initialize_ranges(vdev_t *vd, abd_t *data) { avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; for (range_seg_t *rs = avl_first(rt); rs != NULL; rs = AVL_NEXT(rt, rs)) { uint64_t size = rs->rs_end - rs->rs_start; /* Split range into legally-sized physical chunks */ uint64_t writes_required = ((size - 1) / zfs_initialize_chunk_size) + 1; for (uint64_t w = 0; w < writes_required; w++) { int error; error = vdev_initialize_write(vd, VDEV_LABEL_START_SIZE + rs->rs_start + (w * zfs_initialize_chunk_size), MIN(size - (w * zfs_initialize_chunk_size), zfs_initialize_chunk_size), data); if (error != 0) return (error); } } return (0); } static void vdev_initialize_mg_wait(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); while (mg->mg_initialize_updating) { cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); } } static void vdev_initialize_mg_mark(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); ASSERT(mg->mg_initialize_updating); while (mg->mg_ms_initializing >= max_initialize_ms) { cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); } mg->mg_ms_initializing++; ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms); } /* * Mark the metaslab as being initialized to prevent any allocations * on this metaslab. We must also track how many metaslabs are currently * being initialized within a metaslab group and limit them to prevent * allocation failures from occurring because all metaslabs are being * initialized. */ static void vdev_initialize_ms_mark(metaslab_t *msp) { ASSERT(!MUTEX_HELD(&msp->ms_lock)); metaslab_group_t *mg = msp->ms_group; mutex_enter(&mg->mg_ms_initialize_lock); /* * To keep an accurate count of how many threads are initializing * a specific metaslab group, we only allow one thread to mark * the metaslab group at a time. This ensures that the value of * ms_initializing will be accurate when we decide to mark a metaslab * group as being initialized. To do this we force all other threads * to wait till the metaslab's mg_initialize_updating flag is no * longer set. */ vdev_initialize_mg_wait(mg); mg->mg_initialize_updating = B_TRUE; if (msp->ms_initializing == 0) { vdev_initialize_mg_mark(mg); } mutex_enter(&msp->ms_lock); msp->ms_initializing++; mutex_exit(&msp->ms_lock); mg->mg_initialize_updating = B_FALSE; cv_broadcast(&mg->mg_ms_initialize_cv); mutex_exit(&mg->mg_ms_initialize_lock); } static void vdev_initialize_ms_unmark(metaslab_t *msp) { ASSERT(!MUTEX_HELD(&msp->ms_lock)); metaslab_group_t *mg = msp->ms_group; mutex_enter(&mg->mg_ms_initialize_lock); mutex_enter(&msp->ms_lock); if (--msp->ms_initializing == 0) { mg->mg_ms_initializing--; cv_broadcast(&mg->mg_ms_initialize_cv); } mutex_exit(&msp->ms_lock); mutex_exit(&mg->mg_ms_initialize_lock); } static void vdev_initialize_calculate_progress(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); ASSERT(vd->vdev_leaf_zap != 0); vd->vdev_initialize_bytes_est = 0; vd->vdev_initialize_bytes_done = 0; for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); uint64_t ms_free = msp->ms_size - - space_map_allocated(msp->ms_sm); + metaslab_allocated_space(msp); if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) ms_free /= vd->vdev_top->vdev_children; /* * Convert the metaslab range to a physical range * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ range_seg_t logical_rs, physical_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; vdev_xlate(vd, &logical_rs, &physical_rs); if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; } else if (vd->vdev_initialize_last_offset > physical_rs.rs_end) { vd->vdev_initialize_bytes_done += ms_free; vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; } /* * If we get here, we're in the middle of initializing this * metaslab. Load it and walk the free tree for more accurate * progress estimation. */ VERIFY0(metaslab_load(msp)); for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { logical_rs.rs_start = rs->rs_start; logical_rs.rs_end = rs->rs_end; vdev_xlate(vd, &logical_rs, &physical_rs); uint64_t size = physical_rs.rs_end - physical_rs.rs_start; vd->vdev_initialize_bytes_est += size; if (vd->vdev_initialize_last_offset > physical_rs.rs_end) { vd->vdev_initialize_bytes_done += size; } else if (vd->vdev_initialize_last_offset > physical_rs.rs_start && vd->vdev_initialize_last_offset < physical_rs.rs_end) { vd->vdev_initialize_bytes_done += vd->vdev_initialize_last_offset - physical_rs.rs_start; } } mutex_exit(&msp->ms_lock); } } static void vdev_initialize_load(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); ASSERT(vd->vdev_leaf_zap != 0); if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE || vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) { int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, sizeof (vd->vdev_initialize_last_offset), 1, &vd->vdev_initialize_last_offset); ASSERT(err == 0 || err == ENOENT); } vdev_initialize_calculate_progress(vd); } /* * Convert the logical range into a physcial range and add it to our * avl tree. */ void vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; range_seg_t logical_rs, physical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_xlate(vd, &logical_rs, &physical_rs); IMPLY(vd->vdev_top == vd, logical_rs.rs_start == physical_rs.rs_start); IMPLY(vd->vdev_top == vd, logical_rs.rs_end == physical_rs.rs_end); /* Only add segments that we have not visited yet */ if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) return; /* Pick up where we left off mid-range. */ if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " "(%llu, %llu)", vd->vdev_path, (u_longlong_t)physical_rs.rs_start, (u_longlong_t)physical_rs.rs_end, (u_longlong_t)vd->vdev_initialize_last_offset, (u_longlong_t)physical_rs.rs_end); ASSERT3U(physical_rs.rs_end, >, vd->vdev_initialize_last_offset); physical_rs.rs_start = vd->vdev_initialize_last_offset; } ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); /* * With raidz, it's possible that the logical range does not live on * this leaf vdev. We only add the physical range to this vdev's if it * has a length greater than 0. */ if (physical_rs.rs_end > physical_rs.rs_start) { range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, physical_rs.rs_end - physical_rs.rs_start); } else { ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); } } static void vdev_initialize_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; int error = 0; uint64_t ms_count = 0; ASSERT(vdev_is_concrete(vd)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vd->vdev_initialize_last_offset = 0; vdev_initialize_load(vd); abd_t *deadbeef = vdev_initialize_block_alloc(); vd->vdev_initialize_tree = range_tree_create(NULL, NULL); for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_top->vdev_ms[i]; /* * If we've expanded the top-level vdev or it's our * first pass, calculate our progress. */ if (vd->vdev_top->vdev_ms_count != ms_count) { vdev_initialize_calculate_progress(vd); ms_count = vd->vdev_top->vdev_ms_count; } vdev_initialize_ms_mark(msp); mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, vd); mutex_exit(&msp->ms_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); error = vdev_initialize_ranges(vd, deadbeef); vdev_initialize_ms_unmark(msp); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); if (error != 0) break; } spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_enter(&vd->vdev_initialize_io_lock); while (vd->vdev_initialize_inflight > 0) { cv_wait(&vd->vdev_initialize_io_cv, &vd->vdev_initialize_io_lock); } mutex_exit(&vd->vdev_initialize_io_lock); range_tree_destroy(vd->vdev_initialize_tree); vdev_initialize_block_free(deadbeef); vd->vdev_initialize_tree = NULL; mutex_enter(&vd->vdev_initialize_lock); if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) { vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); } ASSERT(vd->vdev_initialize_thread != NULL || vd->vdev_initialize_inflight == 0); /* * Drop the vdev_initialize_lock while we sync out the * txg since it's possible that a device might be trying to * come online and must check to see if it needs to restart an * initialization. That thread will be holding the spa_config_lock * which would prevent the txg_wait_synced from completing. */ mutex_exit(&vd->vdev_initialize_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&vd->vdev_initialize_lock); vd->vdev_initialize_thread = NULL; cv_broadcast(&vd->vdev_initialize_cv); mutex_exit(&vd->vdev_initialize_lock); } /* * Initiates a device. Caller must hold vdev_initialize_lock. * Device must be a leaf and not already be initializing. */ void vdev_initialize(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); vd->vdev_initialize_thread = thread_create(NULL, 0, vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); } /* * Stop initializng a device, with the resultant initialing state being * tgt_state. Blocks until the initializing thread has exited. * Caller must hold vdev_initialize_lock and must not be writing to the spa * config, as the initializing thread may try to enter the config as a reader * before exiting. */ void vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state) { spa_t *spa = vd->vdev_spa; ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER)); ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); /* * Allow cancel requests to proceed even if the initialize thread * has stopped. */ if (vd->vdev_initialize_thread == NULL && tgt_state != VDEV_INITIALIZE_CANCELED) { return; } vdev_initialize_change_state(vd, tgt_state); vd->vdev_initialize_exit_wanted = B_TRUE; while (vd->vdev_initialize_thread != NULL) cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); ASSERT3P(vd->vdev_initialize_thread, ==, NULL); vd->vdev_initialize_exit_wanted = B_FALSE; } static void vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state) { if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop(vd, tgt_state); mutex_exit(&vd->vdev_initialize_lock); return; } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state); } } /* * Convenience function to stop initializing of a vdev tree and set all * initialize thread pointers to NULL. */ void vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) { vdev_initialize_stop_all_impl(vd, tgt_state); if (vd->vdev_spa->spa_sync_on) { /* Make sure that our state has been synced to disk */ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); } } void vdev_initialize_restart(vdev_t *vd) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { mutex_enter(&vd->vdev_initialize_lock); uint64_t initialize_state = VDEV_INITIALIZE_NONE; int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, &initialize_state); ASSERT(err == 0 || err == ENOENT); vd->vdev_initialize_state = initialize_state; uint64_t timestamp = 0; err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (timestamp), 1, ×tamp); ASSERT(err == 0 || err == ENOENT); vd->vdev_initialize_action_time = (time_t)timestamp; if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || vd->vdev_offline) { /* load progress for reporting, but don't resume */ vdev_initialize_load(vd); } else if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) { vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_initialize_restart(vd->vdev_child[i]); } } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c (revision 354382) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c (revision 354383) @@ -1,2185 +1,2155 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This file contains the necessary logic to remove vdevs from a * storage pool. Currently, the only devices that can be removed * are log, cache, and spare devices; and top level vdevs from a pool * w/o raidz. (Note that members of a mirror can also be removed * by the detach operation.) * * Log vdevs are removed by evacuating them and then turning the vdev * into a hole vdev while holding spa config locks. * * Top level vdevs are removed and converted into an indirect vdev via * a multi-step process: * * - Disable allocations from this device (spa_vdev_remove_top). * * - From a new thread (spa_vdev_remove_thread), copy data from * the removing vdev to a different vdev. The copy happens in open * context (spa_vdev_copy_impl) and issues a sync task * (vdev_mapping_sync) so the sync thread can update the partial * indirect mappings in core and on disk. * * - If a free happens during a removal, it is freed from the * removing vdev, and if it has already been copied, from the new * location as well (free_from_removing_vdev). * * - After the removal is completed, the copy thread converts the vdev * into an indirect vdev (vdev_remove_complete) before instructing * the sync thread to destroy the space maps and finish the removal * (spa_finish_removal). */ typedef struct vdev_copy_arg { metaslab_t *vca_msp; uint64_t vca_outstanding_bytes; kcondvar_t vca_cv; kmutex_t vca_lock; } vdev_copy_arg_t; /* * The maximum amount of memory we can use for outstanding i/o while * doing a device removal. This determines how much i/o we can have * in flight concurrently. */ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; /* * The largest contiguous segment that we will attempt to allocate when * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If * there is a performance problem with attempting to allocate large blocks, * consider decreasing this. * * Note: we will issue I/Os of up to this size. The mpt driver does not * respond well to I/Os larger than 1MB, so we set this to 1MB. (When * mpt processes an I/O larger than 1MB, it needs to do an allocation of * 2 physically contiguous pages; if this allocation fails, mpt will drop * the I/O and hang the device.) */ int zfs_remove_max_segment = 1024 * 1024; /* * Allow a remap segment to span free chunks of at most this size. The main * impact of a larger span is that we will read and write larger, more * contiguous chunks, with more "unnecessary" data -- trading off bandwidth * for iops. The value here was chosen to align with * zfs_vdev_read_gap_limit, which is a similar concept when doing regular * reads (but there's no reason it has to be the same). * * Additionally, a higher span will have the following relatively minor * effects: * - the mapping will be smaller, since one entry can cover more allocated * segments * - more of the fragmentation in the removing device will be preserved * - we'll do larger allocations, which may fail and fall back on smaller * allocations */ int vdev_removal_max_span = 32 * 1024; /* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. */ uint64_t zfs_remove_max_bytes_pause = UINT64_MAX; #define VDEV_REMOVAL_ZAP_OBJS "lzap" static void spa_vdev_remove_thread(void *arg); static void spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) { VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REMOVING, sizeof (uint64_t), sizeof (spa->spa_removing_phys) / sizeof (uint64_t), &spa->spa_removing_phys, tx)); } static nvlist_t * spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) { for (int i = 0; i < count; i++) { uint64_t guid = fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); if (guid == target_guid) return (nvpp[i]); } return (NULL); } static void spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; if (count > 1) newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); for (int i = 0, j = 0; i < count; i++) { if (dev[i] == dev_to_remove) continue; VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); } VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); for (int i = 0; i < count - 1; i++) nvlist_free(newdev[i]); if (count > 1) kmem_free(newdev, (count - 1) * sizeof (void *)); } static spa_vdev_removal_t * spa_vdev_removal_create(vdev_t *vd) { spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); svr->svr_allocd_segs = range_tree_create(NULL, NULL); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { svr->svr_frees[i] = range_tree_create(NULL, NULL); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); } return (svr); } void spa_vdev_removal_destroy(spa_vdev_removal_t *svr) { for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(svr->svr_bytes_done[i]); ASSERT0(svr->svr_max_offset_to_sync[i]); range_tree_destroy(svr->svr_frees[i]); list_destroy(&svr->svr_new_segments[i]); } range_tree_destroy(svr->svr_allocd_segs); mutex_destroy(&svr->svr_lock); cv_destroy(&svr->svr_cv); kmem_free(svr, sizeof (*svr)); } /* * This is called as a synctask in the txg in which we will mark this vdev * as removing (in the config stored in the MOS). * * It begins the evacuation of a toplevel vdev by: * - initializing the spa_removing_phys which tracks this removal * - computing the amount of space to remove for accounting purposes * - dirtying all dbufs in the spa_config_object * - creating the spa_vdev_removal * - starting the spa_vdev_remove_thread */ static void vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; spa_vdev_removal_t *svr = NULL; uint64_t txg = dmu_tx_get_txg(tx); ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); svr = spa_vdev_removal_create(vd); ASSERT(vd->vdev_removing); ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { /* * By activating the OBSOLETE_COUNTS feature, we prevent * the pool from being downgraded and ensure that the * refcounts are precise. */ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); uint64_t one = 1; VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, &one, tx)); ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0); } vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); vd->vdev_indirect_mapping = vdev_indirect_mapping_open(mos, vic->vic_mapping_object); vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); vd->vdev_indirect_births = vdev_indirect_births_open(mos, vic->vic_births_object); spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; spa->spa_removing_phys.sr_start_time = gethrestime_sec(); spa->spa_removing_phys.sr_end_time = 0; spa->spa_removing_phys.sr_state = DSS_SCANNING; spa->spa_removing_phys.sr_to_copy = 0; spa->spa_removing_phys.sr_copied = 0; /* * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because * there may be space in the defer tree, which is free, but still * counted in vs_alloc. */ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { metaslab_t *ms = vd->vdev_ms[i]; if (ms->ms_sm == NULL) continue; - /* - * Sync tasks happen before metaslab_sync(), therefore - * smp_alloc and sm_alloc must be the same. - */ - ASSERT3U(space_map_allocated(ms->ms_sm), ==, - ms->ms_sm->sm_phys->smp_alloc); - spa->spa_removing_phys.sr_to_copy += - space_map_allocated(ms->ms_sm); + metaslab_allocated_space(ms); /* * Space which we are freeing this txg does not need to * be copied. */ spa->spa_removing_phys.sr_to_copy -= range_tree_space(ms->ms_freeing); ASSERT0(range_tree_space(ms->ms_freed)); for (int t = 0; t < TXG_SIZE; t++) ASSERT0(range_tree_space(ms->ms_allocating[t])); } /* * Sync tasks are called before metaslab_sync(), so there should * be no already-synced metaslabs in the TXG_CLEAN list. */ ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); spa_sync_removing_state(spa, tx); /* * All blocks that we need to read the most recent mapping must be * stored on concrete vdevs. Therefore, we must dirty anything that * is read before spa_remove_init(). Specifically, the * spa_config_object. (Note that although we already modified the * spa_config_object in spa_sync_removing_state, that may not have * modified all blocks of the object.) */ dmu_object_info_t doi; VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { dmu_buf_t *dbuf; VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, offset, FTAG, &dbuf, 0)); dmu_buf_will_dirty(dbuf, tx); offset += dbuf->db_size; dmu_buf_rele(dbuf, FTAG); } /* * Now that we've allocated the im_object, dirty the vdev to ensure * that the object gets written to the config on disk. */ vdev_config_dirty(vd); zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu " "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), vic->vic_mapping_object); spa_history_log_internal(spa, "vdev remove started", tx, "%s vdev %llu %s", spa_name(spa), vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); /* * Setting spa_vdev_removal causes subsequent frees to call * free_from_removing_vdev(). Note that we don't need any locking * because we are the sync thread, and metaslab_free_impl() is only * called from syncing context (potentially from a zio taskq thread, * but in any case only when there are outstanding free i/os, which * there are not). */ ASSERT3P(spa->spa_vdev_removal, ==, NULL); spa->spa_vdev_removal = svr; svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* * When we are opening a pool, we must read the mapping for each * indirect vdev in order from most recently removed to least * recently removed. We do this because the blocks for the mapping * of older indirect vdevs may be stored on more recently removed vdevs. * In order to read each indirect mapping object, we must have * initialized all more recently removed vdevs. */ int spa_remove_init(spa_t *spa) { int error; error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REMOVING, sizeof (uint64_t), sizeof (spa->spa_removing_phys) / sizeof (uint64_t), &spa->spa_removing_phys); if (error == ENOENT) { spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; return (0); } else if (error != 0) { return (error); } if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { /* * We are currently removing a vdev. Create and * initialize a spa_vdev_removal_t from the bonus * buffer of the removing vdevs vdev_im_object, and * initialize its partial mapping. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_removing_phys.sr_removing_vdev); if (vd == NULL) { spa_config_exit(spa, SCL_STATE, FTAG); return (EINVAL); } vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT(vdev_is_concrete(vd)); spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); ASSERT(vd->vdev_removing); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); spa_config_exit(spa, SCL_STATE, FTAG); spa->spa_vdev_removal = svr; } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); uint64_t indirect_vdev_id = spa->spa_removing_phys.sr_prev_indirect_vdev; while (indirect_vdev_id != UINT64_MAX) { vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); indirect_vdev_id = vic->vic_prev_indirect_vdev; } spa_config_exit(spa, SCL_STATE, FTAG); /* * Now that we've loaded all the indirect mappings, we can allow * reads from other blocks (e.g. via predictive prefetch). */ spa->spa_indirect_vdevs_loaded = B_TRUE; return (0); } void spa_restart_removal(spa_t *spa) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; if (svr == NULL) return; /* * In general when this function is called there is no * removal thread running. The only scenario where this * is not true is during spa_import() where this function * is called twice [once from spa_import_impl() and * spa_async_resume()]. Thus, in the scenario where we * import a pool that has an ongoing removal we don't * want to spawn a second thread. */ if (svr->svr_thread != NULL) return; if (!spa_writeable(spa)) return; zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* * Process freeing from a device which is in the middle of being removed. * We must handle this carefully so that we attempt to copy freed data, * and we correctly free already-copied data. */ void free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t txg = spa_syncing_txg(spa); uint64_t max_offset_yet = 0; ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, vdev_indirect_mapping_object(vim)); ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); mutex_enter(&svr->svr_lock); /* * Remove the segment from the removing vdev's spacemap. This * ensures that we will not attempt to copy this space (if the * removal thread has not yet visited it), and also ensures * that we know what is actually allocated on the new vdevs * (needed if we cancel the removal). * * Note: we must do the metaslab_free_concrete() with the svr_lock * held, so that the remove_thread can not load this metaslab and then * visit this offset between the time that we metaslab_free_concrete() * and when we check to see if it has been visited. * * Note: The checkpoint flag is set to false as having/taking * a checkpoint and removing a device can't happen at the same * time. */ ASSERT(!spa_has_checkpoint(spa)); metaslab_free_concrete(vd, offset, size, B_FALSE); uint64_t synced_size = 0; uint64_t synced_offset = 0; uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); if (offset < max_offset_synced) { /* * The mapping for this offset is already on disk. * Free from the new location. * * Note that we use svr_max_synced_offset because it is * updated atomically with respect to the in-core mapping. * By contrast, vim_max_offset is not. * * This block may be split between a synced entry and an * in-flight or unvisited entry. Only process the synced * portion of it here. */ synced_size = MIN(size, max_offset_synced - offset); synced_offset = offset; ASSERT3U(max_offset_yet, <=, max_offset_synced); max_offset_yet = max_offset_synced; DTRACE_PROBE3(remove__free__synced, spa_t *, spa, uint64_t, offset, uint64_t, synced_size); size -= synced_size; offset += synced_size; } /* * Look at all in-flight txgs starting from the currently syncing one * and see if a section of this free is being copied. By starting from * this txg and iterating forward, we might find that this region * was copied in two different txgs and handle it appropriately. */ for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { int txgoff = (txg + i) & TXG_MASK; if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { /* * The mapping for this offset is in flight, and * will be synced in txg+i. */ uint64_t inflight_size = MIN(size, svr->svr_max_offset_to_sync[txgoff] - offset); DTRACE_PROBE4(remove__free__inflight, spa_t *, spa, uint64_t, offset, uint64_t, inflight_size, uint64_t, txg + i); /* * We copy data in order of increasing offset. * Therefore the max_offset_to_sync[] must increase * (or be zero, indicating that nothing is being * copied in that txg). */ if (svr->svr_max_offset_to_sync[txgoff] != 0) { ASSERT3U(svr->svr_max_offset_to_sync[txgoff], >=, max_offset_yet); max_offset_yet = svr->svr_max_offset_to_sync[txgoff]; } /* * We've already committed to copying this segment: * we have allocated space elsewhere in the pool for * it and have an IO outstanding to copy the data. We * cannot free the space before the copy has * completed, or else the copy IO might overwrite any * new data. To free that space, we record the * segment in the appropriate svr_frees tree and free * the mapped space later, in the txg where we have * completed the copy and synced the mapping (see * vdev_mapping_sync). */ range_tree_add(svr->svr_frees[txgoff], offset, inflight_size); size -= inflight_size; offset += inflight_size; /* * This space is already accounted for as being * done, because it is being copied in txg+i. * However, if i!=0, then it is being copied in * a future txg. If we crash after this txg * syncs but before txg+i syncs, then the space * will be free. Therefore we must account * for the space being done in *this* txg * (when it is freed) rather than the future txg * (when it will be copied). */ ASSERT3U(svr->svr_bytes_done[txgoff], >=, inflight_size); svr->svr_bytes_done[txgoff] -= inflight_size; svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; } } ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); if (size > 0) { /* * The copy thread has not yet visited this offset. Ensure * that it doesn't. */ DTRACE_PROBE3(remove__free__unvisited, spa_t *, spa, uint64_t, offset, uint64_t, size); if (svr->svr_allocd_segs != NULL) range_tree_clear(svr->svr_allocd_segs, offset, size); /* * Since we now do not need to copy this data, for * accounting purposes we have done our job and can count * it as completed. */ svr->svr_bytes_done[txg & TXG_MASK] += size; } mutex_exit(&svr->svr_lock); /* * Now that we have dropped svr_lock, process the synced portion * of this free. */ if (synced_size > 0) { vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); /* * Note: this can only be called from syncing context, * and the vdev_indirect_mapping is only changed from the * sync thread, so we don't need svr_lock while doing * metaslab_free_impl_cb. */ boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, metaslab_free_impl_cb, &checkpoint); } } /* * Stop an active removal and update the spa_removing phys. */ static void spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); /* Ensure the removal thread has completed before we free the svr. */ spa_vdev_remove_suspend(spa); ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); if (state == DSS_FINISHED) { spa_removing_phys_t *srp = &spa->spa_removing_phys; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (srp->sr_prev_indirect_vdev != UINT64_MAX) { vdev_t *pvd = vdev_lookup_top(spa, srp->sr_prev_indirect_vdev); ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); } vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; srp->sr_prev_indirect_vdev = vd->vdev_id; } spa->spa_removing_phys.sr_state = state; spa->spa_removing_phys.sr_end_time = gethrestime_sec(); spa->spa_vdev_removal = NULL; spa_vdev_removal_destroy(svr); spa_sync_removing_state(spa, tx); vdev_config_dirty(spa->spa_root_vdev); } static void free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; vdev_indirect_mark_obsolete(vd, offset, size); boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, offset, size, metaslab_free_impl_cb, &checkpoint); } /* * On behalf of the removal thread, syncs an incremental bit more of * the indirect mapping to disk and updates the in-memory mapping. * Called as a sync task in every txg that the removal thread makes progress. */ static void vdev_mapping_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; uint64_t txg = dmu_tx_get_txg(tx); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT(vic->vic_mapping_object != 0); ASSERT3U(txg, ==, spa_syncing_txg(spa)); vdev_indirect_mapping_add_entries(vim, &svr->svr_new_segments[txg & TXG_MASK], tx); vdev_indirect_births_add_entry(vd->vdev_indirect_births, vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); /* * Free the copied data for anything that was freed while the * mapping entries were in flight. */ mutex_enter(&svr->svr_lock); range_tree_vacate(svr->svr_frees[txg & TXG_MASK], free_mapped_segment_cb, vd); ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, vdev_indirect_mapping_max_offset(vim)); svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; mutex_exit(&svr->svr_lock); spa_sync_removing_state(spa, tx); } typedef struct vdev_copy_segment_arg { spa_t *vcsa_spa; dva_t *vcsa_dest_dva; uint64_t vcsa_txg; range_tree_t *vcsa_obsolete_segs; } vdev_copy_segment_arg_t; static void unalloc_seg(void *arg, uint64_t start, uint64_t size) { vdev_copy_segment_arg_t *vcsa = arg; spa_t *spa = vcsa->vcsa_spa; blkptr_t bp = { 0 }; BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(&bp, size); BP_SET_PSIZE(&bp, size); BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(&bp, DMU_OT_NONE); BP_SET_LEVEL(&bp, 0); BP_SET_DEDUP(&bp, 0); BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva)); DVA_SET_OFFSET(&bp.blk_dva[0], DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start); DVA_SET_ASIZE(&bp.blk_dva[0], size); zio_free(spa, vcsa->vcsa_txg, &bp); } /* * All reads and writes associated with a call to spa_vdev_copy_segment() * are done. */ static void spa_vdev_copy_segment_done(zio_t *zio) { vdev_copy_segment_arg_t *vcsa = zio->io_private; range_tree_vacate(vcsa->vcsa_obsolete_segs, unalloc_seg, vcsa); range_tree_destroy(vcsa->vcsa_obsolete_segs); kmem_free(vcsa, sizeof (*vcsa)); spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); } /* * The write of the new location is done. */ static void spa_vdev_copy_segment_write_done(zio_t *zio) { vdev_copy_arg_t *vca = zio->io_private; abd_free(zio->io_abd); mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes -= zio->io_size; cv_signal(&vca->vca_cv); mutex_exit(&vca->vca_lock); } /* * The read of the old location is done. The parent zio is the write to * the new location. Allow it to start. */ static void spa_vdev_copy_segment_read_done(zio_t *zio) { zio_nowait(zio_unique_parent(zio)); } /* * If the old and new vdevs are mirrors, we will read both sides of the old * mirror, and write each copy to the corresponding side of the new mirror. * If the old and new vdevs have a different number of children, we will do * this as best as possible. Since we aren't verifying checksums, this * ensures that as long as there's a good copy of the data, we'll have a * good copy after the removal, even if there's silent damage to one side * of the mirror. If we're removing a mirror that has some silent damage, * we'll have exactly the same damage in the new location (assuming that * the new location is also a mirror). * * We accomplish this by creating a tree of zio_t's, with as many writes as * there are "children" of the new vdev (a non-redundant vdev counts as one * child, a 2-way mirror has 2 children, etc). Each write has an associated * read from a child of the old vdev. Typically there will be the same * number of children of the old and new vdevs. However, if there are more * children of the new vdev, some child(ren) of the old vdev will be issued * multiple reads. If there are more children of the old vdev, some copies * will be dropped. * * For example, the tree of zio_t's for a 2-way mirror is: * * null * / \ * write(new vdev, child 0) write(new vdev, child 1) * | | * read(old vdev, child 0) read(old vdev, child 1) * * Child zio's complete before their parents complete. However, zio's * created with zio_vdev_child_io() may be issued before their children * complete. In this case we need to make sure that the children (reads) * complete before the parents (writes) are *issued*. We do this by not * calling zio_nowait() on each write until its corresponding read has * completed. * * The spa_config_lock must be held while zio's created by * zio_vdev_child_io() are in progress, to ensure that the vdev tree does * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" * zio is needed to release the spa_config_lock after all the reads and * writes complete. (Note that we can't grab the config lock for each read, * because it is not reentrant - we could deadlock with a thread waiting * for a write lock.) */ static void spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, vdev_t *source_vd, uint64_t source_offset, vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) { ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes += size; mutex_exit(&vca->vca_lock); abd_t *abd = abd_alloc_for_io(size, B_FALSE); vdev_t *source_child_vd; if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { /* * Source and dest are both mirrors. Copy from the same * child id as we are copying to (wrapping around if there * are more dest children than source children). */ source_child_vd = source_vd->vdev_child[dest_id % source_vd->vdev_children]; } else { source_child_vd = source_vd; } zio_t *write_zio = zio_vdev_child_io(nzio, NULL, dest_child_vd, dest_offset, abd, size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, spa_vdev_copy_segment_write_done, vca); zio_nowait(zio_vdev_child_io(write_zio, NULL, source_child_vd, source_offset, abd, size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, spa_vdev_copy_segment_read_done, vca)); } /* * Allocate a new location for this segment, and create the zio_t's to * read from the old location and write to the new location. */ static int spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, uint64_t maxalloc, uint64_t txg, vdev_copy_arg_t *vca, zio_alloc_list_t *zal) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_entry_t *entry; dva_t dst = { 0 }; uint64_t start = range_tree_min(segs); ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); uint64_t size = range_tree_span(segs); if (range_tree_span(segs) > maxalloc) { /* * We can't allocate all the segments. Prefer to end * the allocation at the end of a segment, thus avoiding * additional split blocks. */ range_seg_t search; avl_index_t where; search.rs_start = start + maxalloc; search.rs_end = search.rs_start; range_seg_t *rs = avl_find(&segs->rt_root, &search, &where); if (rs == NULL) { rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE); } else { rs = AVL_PREV(&segs->rt_root, rs); } if (rs != NULL) { size = rs->rs_end - start; } else { /* * There are no segments that end before maxalloc. * I.e. the first segment is larger than maxalloc, * so we must split it. */ size = maxalloc; } } ASSERT3U(size, <=, maxalloc); /* * An allocation class might not have any remaining vdevs or space */ metaslab_class_t *mc = mg->mg_class; if (mc != spa_normal_class(spa) && mc->mc_groups <= 1) mc = spa_normal_class(spa); int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, zal, 0); if (error == ENOSPC && mc != spa_normal_class(spa)) { error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, &dst, 0, NULL, txg, 0, zal, 0); } if (error != 0) return (error); /* * Determine the ranges that are not actually needed. Offsets are * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). */ range_tree_t *obsolete_segs = range_tree_create(NULL, NULL); range_seg_t *rs = avl_first(&segs->rt_root); ASSERT3U(rs->rs_start, ==, start); uint64_t prev_seg_end = rs->rs_end; while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) { if (rs->rs_start >= start + size) { break; } else { range_tree_add(obsolete_segs, prev_seg_end - start, rs->rs_start - prev_seg_end); } prev_seg_end = rs->rs_end; } /* We don't end in the middle of an obsolete range */ ASSERT3U(start + size, <=, prev_seg_end); range_tree_clear(segs, start, size); /* * We can't have any padding of the allocated size, otherwise we will * misunderstand what's allocated, and the size of the mapping. * The caller ensures this will be true by passing in a size that is * aligned to the worst (highest) ashift in the pool. */ ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); entry->vime_mapping.vimep_dst = dst; if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { entry->vime_obsolete_count = range_tree_space(obsolete_segs); } vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP); vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; vcsa->vcsa_obsolete_segs = obsolete_segs; vcsa->vcsa_spa = spa; vcsa->vcsa_txg = txg; /* * See comment before spa_vdev_copy_one_child(). */ spa_config_enter(spa, SCL_STATE, spa, RW_READER); zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, spa_vdev_copy_segment_done, vcsa, 0); vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); if (dest_vd->vdev_ops == &vdev_mirror_ops) { for (int i = 0; i < dest_vd->vdev_children; i++) { vdev_t *child = dest_vd->vdev_child[i]; spa_vdev_copy_one_child(vca, nzio, vd, start, child, DVA_GET_OFFSET(&dst), i, size); } } else { spa_vdev_copy_one_child(vca, nzio, vd, start, dest_vd, DVA_GET_OFFSET(&dst), -1, size); } zio_nowait(nzio); list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); vdev_dirty(vd, 0, NULL, txg); return (0); } /* * Complete the removal of a toplevel vdev. This is called as a * synctask in the same txg that we will sync out the new config (to the * MOS object) which indicates that this vdev is indirect. */ static void vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(svr->svr_bytes_done[i]); } ASSERT3U(spa->spa_removing_phys.sr_copied, ==, spa->spa_removing_phys.sr_to_copy); vdev_destroy_spacemaps(vd, tx); /* destroy leaf zaps, if any */ ASSERT3P(svr->svr_zaplist, !=, NULL); for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); pair != NULL; pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); } fnvlist_free(svr->svr_zaplist); spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); /* vd->vdev_path is not available here */ spa_history_log_internal(spa, "vdev remove completed", tx, "%s vdev %llu", spa_name(spa), vd->vdev_id); } static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); if (vd->vdev_leaf_zap != 0) { char zkey[32]; (void) snprintf(zkey, sizeof (zkey), "%s-%"PRIu64, VDEV_REMOVAL_ZAP_OBJS, vd->vdev_leaf_zap); fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); } for (uint64_t id = 0; id < vd->vdev_children; id++) { vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); } } static void vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) { vdev_t *ivd; dmu_tx_t *tx; spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; /* * First, build a list of leaf zaps to be destroyed. * This is passed to the sync context thread, * which does the actual unlinking. */ svr->svr_zaplist = fnvlist_alloc(); vdev_remove_enlist_zaps(vd, svr->svr_zaplist); ivd = vdev_add_parent(vd, &vdev_indirect_ops); ivd->vdev_removing = 0; vd->vdev_leaf_zap = 0; vdev_remove_child(ivd, vd); vdev_compact_children(ivd); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, 0, ZFS_SPACE_CHECK_NONE, tx); dmu_tx_commit(tx); /* * Indicate that this thread has exited. * After this, we can not use svr. */ mutex_enter(&svr->svr_lock); svr->svr_thread = NULL; cv_broadcast(&svr->svr_cv); mutex_exit(&svr->svr_lock); } /* * Complete the removal of a toplevel vdev. This is called in open * context by the removal thread after we have copied all vdev's data. */ static void vdev_remove_complete(spa_t *spa) { uint64_t txg; /* * Wait for any deferred frees to be synced before we call * vdev_metaslab_fini() */ txg_wait_synced(spa->spa_dsl_pool, 0); txg = spa_vdev_enter(spa); vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); ASSERT3P(vd->vdev_initialize_thread, ==, NULL); sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", vd->vdev_id, txg); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); vdev_remove_replace_with_indirect(vd, txg); /* * We now release the locks, allowing spa_sync to run and finish the * removal via vdev_remove_complete_sync in syncing context. * * Note that we hold on to the vdev_t that has been replaced. Since * it isn't part of the vdev tree any longer, it can't be concurrently * manipulated, even while we don't have the config lock. */ (void) spa_vdev_exit(spa, NULL, txg, 0); /* * Top ZAP should have been transferred to the indirect vdev in * vdev_remove_replace_with_indirect. */ ASSERT0(vd->vdev_top_zap); /* * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect. */ ASSERT0(vd->vdev_leaf_zap); txg = spa_vdev_enter(spa); (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Request to update the config and the config cachefile. */ vdev_config_dirty(spa->spa_root_vdev); (void) spa_vdev_exit(spa, vd, txg, 0); spa_event_post(ev); } /* * Evacuates a segment of size at most max_alloc from the vdev * via repeated calls to spa_vdev_copy_segment. If an allocation * fails, the pool is probably too fragmented to handle such a * large size, so decrease max_alloc so that the caller will not try * this size again this txg. */ static void spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, uint64_t *max_alloc, dmu_tx_t *tx) { uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = dmu_tx_pool(tx)->dp_spa; mutex_enter(&svr->svr_lock); /* * Determine how big of a chunk to copy. We can allocate up * to max_alloc bytes, and we can span up to vdev_removal_max_span * bytes of unallocated space at a time. "segs" will track the * allocated segments that we are copying. We may also be copying * free segments (of up to vdev_removal_max_span bytes). */ range_tree_t *segs = range_tree_create(NULL, NULL); for (;;) { range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); if (rs == NULL) break; uint64_t seg_length; if (range_tree_is_empty(segs)) { /* need to truncate the first seg based on max_alloc */ seg_length = MIN(rs->rs_end - rs->rs_start, *max_alloc); } else { if (rs->rs_start - range_tree_max(segs) > vdev_removal_max_span) { /* * Including this segment would cause us to * copy a larger unneeded chunk than is allowed. */ break; } else if (rs->rs_end - range_tree_min(segs) > *max_alloc) { /* * This additional segment would extend past * max_alloc. Rather than splitting this * segment, leave it for the next mapping. */ break; } else { seg_length = rs->rs_end - rs->rs_start; } } range_tree_add(segs, rs->rs_start, seg_length); range_tree_remove(svr->svr_allocd_segs, rs->rs_start, seg_length); } if (range_tree_is_empty(segs)) { mutex_exit(&svr->svr_lock); range_tree_destroy(segs); return; } if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, svr, 0, ZFS_SPACE_CHECK_NONE, tx); } svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); /* * Note: this is the amount of *allocated* space * that we are taking care of each txg. */ svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs); mutex_exit(&svr->svr_lock); zio_alloc_list_t zal; metaslab_trace_init(&zal); uint64_t thismax = SPA_MAXBLOCKSIZE; while (!range_tree_is_empty(segs)) { int error = spa_vdev_copy_segment(vd, segs, thismax, txg, vca, &zal); if (error == ENOSPC) { /* * Cut our segment in half, and don't try this * segment size again this txg. Note that the * allocation size must be aligned to the highest * ashift in the pool, so that the allocation will * not be padded out to a multiple of the ashift, * which could cause us to think that this mapping * is larger than we intended. */ ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); uint64_t attempted = MIN(range_tree_span(segs), thismax); thismax = P2ROUNDUP(attempted / 2, 1 << spa->spa_max_ashift); /* * The minimum-size allocation can not fail. */ ASSERT3U(attempted, >, 1 << spa->spa_max_ashift); *max_alloc = attempted - (1 << spa->spa_max_ashift); } else { ASSERT0(error); /* * We've performed an allocation, so reset the * alloc trace list. */ metaslab_trace_fini(&zal); metaslab_trace_init(&zal); } } metaslab_trace_fini(&zal); range_tree_destroy(segs); } /* * The removal thread operates in open context. It iterates over all * allocated space in the vdev, by loading each metaslab's spacemap. * For each contiguous segment of allocated space (capping the segment * size at SPA_MAXBLOCKSIZE), we: * - Allocate space for it on another vdev. * - Create a new mapping from the old location to the new location * (as a record in svr_new_segments). * - Initiate a logical read zio to get the data off the removing disk. * - In the read zio's done callback, initiate a logical write zio to * write it to the new vdev. * Note that all of this will take effect when a particular TXG syncs. * The sync thread ensures that all the phys reads and writes for the syncing * TXG have completed (see spa_txg_zio) and writes the new mappings to disk * (see vdev_mapping_sync()). */ static void spa_vdev_remove_thread(void *arg) { spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; uint64_t max_alloc = zfs_remove_max_segment; uint64_t last_txg = 0; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_removing); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT(vim != NULL); mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); vca.vca_outstanding_bytes = 0; mutex_enter(&svr->svr_lock); /* * Start from vim_max_offset so we pick up where we left off * if we are restarting the removal after opening the pool. */ uint64_t msi; for (msi = start_offset >> vd->vdev_ms_shift; msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; ASSERT3U(msi, <=, vd->vdev_ms_count); ASSERT0(range_tree_space(svr->svr_allocd_segs)); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * Assert nothing in flight -- ms_*tree is empty. */ for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(range_tree_space(msp->ms_allocating[i])); } /* * If the metaslab has ever been allocated from (ms_sm!=NULL), * read the allocated segments from the space map object * into svr_allocd_segs. Since we do this while holding * svr_lock and ms_sync_lock, concurrent frees (which * would have modified the space map) will wait for us * to finish loading the spacemap, and then take the * appropriate action (see free_from_removing_vdev()). */ if (msp->ms_sm != NULL) { - space_map_t *sm = NULL; + VERIFY0(space_map_load(msp->ms_sm, + svr->svr_allocd_segs, SM_ALLOC)); - /* - * We have to open a new space map here, because - * ms_sm's sm_length and sm_alloc may not reflect - * what's in the object contents, if we are in between - * metaslab_sync() and metaslab_sync_done(). - */ - VERIFY0(space_map_open(&sm, - spa->spa_dsl_pool->dp_meta_objset, - msp->ms_sm->sm_object, msp->ms_sm->sm_start, - msp->ms_sm->sm_size, msp->ms_sm->sm_shift)); - space_map_update(sm); - VERIFY0(space_map_load(sm, svr->svr_allocd_segs, - SM_ALLOC)); - space_map_close(sm); - range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); /* * When we are resuming from a paused removal (i.e. * when importing a pool with a removal in progress), * discard any state that we have already processed. */ range_tree_clear(svr->svr_allocd_segs, 0, start_offset); } mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); vca.vca_msp = msp; zfs_dbgmsg("copying %llu segments for metaslab %llu", avl_numnodes(&svr->svr_allocd_segs->rt_root), msp->ms_id); while (!svr->svr_thread_exit && !range_tree_is_empty(svr->svr_allocd_segs)) { mutex_exit(&svr->svr_lock); /* * We need to periodically drop the config lock so that * writers can get in. Additionally, we can't wait * for a txg to sync while holding a config lock * (since a waiting writer could cause a 3-way deadlock * with the sync thread, which also gets a config * lock for reader). So we can't hold the config lock * while calling dmu_tx_assign(). */ spa_config_exit(spa, SCL_CONFIG, FTAG); /* * This delay will pause the removal around the point * specified by zfs_remove_max_bytes_pause. We do this * solely from the test suite or during debugging. */ uint64_t bytes_copied = spa->spa_removing_phys.sr_copied; for (int i = 0; i < TXG_SIZE; i++) bytes_copied += svr->svr_bytes_done[i]; while (zfs_remove_max_bytes_pause <= bytes_copied && !svr->svr_thread_exit) delay(hz); mutex_enter(&vca.vca_lock); while (vca.vca_outstanding_bytes > zfs_remove_max_copy_bytes) { cv_wait(&vca.vca_cv, &vca.vca_lock); } mutex_exit(&vca.vca_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); /* * Reacquire the vdev_config lock. The vdev_t * that we're removing may have changed, e.g. due * to a vdev_attach or vdev_detach. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vd = vdev_lookup_top(spa, svr->svr_vdev_id); if (txg != last_txg) max_alloc = zfs_remove_max_segment; last_txg = txg; spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); dmu_tx_commit(tx); mutex_enter(&svr->svr_lock); } } mutex_exit(&svr->svr_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); /* * Wait for all copies to finish before cleaning up the vca. */ txg_wait_synced(spa->spa_dsl_pool, 0); ASSERT0(vca.vca_outstanding_bytes); mutex_destroy(&vca.vca_lock); cv_destroy(&vca.vca_cv); if (svr->svr_thread_exit) { mutex_enter(&svr->svr_lock); range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); svr->svr_thread = NULL; cv_broadcast(&svr->svr_cv); mutex_exit(&svr->svr_lock); } else { ASSERT0(range_tree_space(svr->svr_allocd_segs)); vdev_remove_complete(spa); } } void spa_vdev_remove_suspend(spa_t *spa) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; if (svr == NULL) return; mutex_enter(&svr->svr_lock); svr->svr_thread_exit = B_TRUE; while (svr->svr_thread != NULL) cv_wait(&svr->svr_cv, &svr->svr_lock); svr->svr_thread_exit = B_FALSE; mutex_exit(&svr->svr_lock); } /* ARGSUSED */ static int spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (spa->spa_vdev_removal == NULL) return (ENOTACTIVE); return (0); } /* * Cancel a removal by freeing all entries from the partial mapping * and marking the vdev as no longer being removing. */ /* ARGSUSED */ static void spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; objset_t *mos = spa->spa_meta_objset; ASSERT3P(svr->svr_thread, ==, NULL); spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); if (vdev_obsolete_counts_are_precise(vd)) { spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); } if (vdev_obsolete_sm_object(vd) != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(vdev_obsolete_sm_object(vd), ==, space_map_object(vd->vdev_obsolete_sm)); space_map_free(vd->vdev_obsolete_sm, tx); VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&svr->svr_new_segments[i])); ASSERT3U(svr->svr_max_offset_to_sync[i], <=, vdev_indirect_mapping_max_offset(vim)); } for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) break; ASSERT0(range_tree_space(svr->svr_allocd_segs)); mutex_enter(&msp->ms_lock); /* * Assert nothing in flight -- ms_*tree is empty. */ for (int i = 0; i < TXG_SIZE; i++) ASSERT0(range_tree_space(msp->ms_allocating[i])); for (int i = 0; i < TXG_DEFER_SIZE; i++) ASSERT0(range_tree_space(msp->ms_defer[i])); ASSERT0(range_tree_space(msp->ms_freed)); if (msp->ms_sm != NULL) { - /* - * Assert that the in-core spacemap has the same - * length as the on-disk one, so we can use the - * existing in-core spacemap to load it from disk. - */ - ASSERT3U(msp->ms_sm->sm_alloc, ==, - msp->ms_sm->sm_phys->smp_alloc); - ASSERT3U(msp->ms_sm->sm_length, ==, - msp->ms_sm->sm_phys->smp_objsize); - mutex_enter(&svr->svr_lock); VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); /* * Clear everything past what has been synced, * because we have not allocated mappings for it yet. */ uint64_t syncd = vdev_indirect_mapping_max_offset(vim); uint64_t sm_end = msp->ms_sm->sm_start + msp->ms_sm->sm_size; if (sm_end > syncd) range_tree_clear(svr->svr_allocd_segs, syncd, sm_end - syncd); mutex_exit(&svr->svr_lock); } mutex_exit(&msp->ms_lock); mutex_enter(&svr->svr_lock); range_tree_vacate(svr->svr_allocd_segs, free_mapped_segment_cb, vd); mutex_exit(&svr->svr_lock); } /* * Note: this must happen after we invoke free_mapped_segment_cb, * because it adds to the obsolete_segments. */ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); ASSERT3U(vic->vic_mapping_object, ==, vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = 0; ASSERT3U(vic->vic_births_object, ==, vdev_indirect_births_object(vd->vdev_indirect_births)); vdev_indirect_births_close(vd->vdev_indirect_births); vd->vdev_indirect_births = NULL; vdev_indirect_births_free(mos, vic->vic_births_object, tx); vic->vic_births_object = 0; /* * We may have processed some frees from the removing vdev in this * txg, thus increasing svr_bytes_done; discard that here to * satisfy the assertions in spa_vdev_removal_destroy(). * Note that future txg's can not have any bytes_done, because * future TXG's are only modified from open context, and we have * already shut down the copying thread. */ svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; spa_finish_removal(spa, DSS_CANCELED, tx); vd->vdev_removing = B_FALSE; vdev_config_dirty(vd); zfs_dbgmsg("canceled device removal for vdev %llu in %llu", vd->vdev_id, dmu_tx_get_txg(tx)); spa_history_log_internal(spa, "vdev remove canceled", tx, "%s vdev %llu %s", spa_name(spa), vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); } int spa_vdev_remove_cancel(spa_t *spa) { spa_vdev_remove_suspend(spa); if (spa->spa_vdev_removal == NULL) return (ENOTACTIVE); uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED); if (error == 0) { spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); vdev_t *vd = vdev_lookup_top(spa, vdid); metaslab_group_activate(vd->vdev_mg); spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); } return (error); } -/* - * Called every sync pass of every txg if there's a svr. - */ void svr_sync(spa_t *spa, dmu_tx_t *tx) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; /* * This check is necessary so that we do not dirty the * DIRECTORY_OBJECT via spa_sync_removing_state() when there * is nothing to do. Dirtying it every time would prevent us * from syncing-to-convergence. */ if (svr->svr_bytes_done[txgoff] == 0) return; /* * Update progress accounting. */ spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; svr->svr_bytes_done[txgoff] = 0; spa_sync_removing_state(spa, tx); } static void vdev_remove_make_hole_and_free(vdev_t *vd) { uint64_t id = vd->vdev_id; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; boolean_t last_vdev = (id == (rvd->vdev_children - 1)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); vdev_free(vd); if (last_vdev) { vdev_compact_children(rvd); } else { vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); vdev_add_child(rvd, vd); } vdev_config_dirty(rvd); /* * Reassess the health of our root vdev. */ vdev_reopen(rvd); } /* * Remove a log device. The config lock is held for the specified TXG. */ static int spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; int error = 0; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Stop allocating from this vdev. */ metaslab_group_passivate(mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* - * Evacuate the device. We don't hold the config lock as writer - * since we need to do I/O but we do keep the + * Evacuate the device. We don't hold the config lock as + * writer since we need to do I/O but we do keep the * spa_namespace_lock held. Once this completes the device * should no longer have any blocks allocated on it. */ - if (vd->vdev_islog) { - if (vd->vdev_stat.vs_alloc != 0) - error = spa_reset_logs(spa); - } + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (vd->vdev_stat.vs_alloc != 0) + error = spa_reset_logs(spa); *txg = spa_vdev_config_enter(spa); if (error != 0) { metaslab_group_activate(mg); return (error); } ASSERT0(vd->vdev_stat.vs_alloc); /* * The evacuation succeeded. Remove any remaining MOS metadata * associated with this vdev, and wait for these changes to sync. */ vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); + vdev_metaslab_fini(vd); + spa_history_log_internal(spa, "vdev remove", NULL, "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); /* Make sure these changes are sync'ed */ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); /* Stop initializing */ (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); *txg = spa_vdev_config_enter(spa); sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* The top ZAP should have been destroyed by vdev_remove_empty. */ ASSERT0(vd->vdev_top_zap); /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */ ASSERT0(vd->vdev_leaf_zap); (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); if (list_link_active(&vd->vdev_state_dirty_node)) vdev_state_clean(vd); if (list_link_active(&vd->vdev_config_dirty_node)) vdev_config_clean(vd); + + ASSERT0(vd->vdev_stat.vs_alloc); /* * Clean up the vdev namespace. */ vdev_remove_make_hole_and_free(vd); if (ev != NULL) spa_event_post(ev); return (0); } static int spa_vdev_remove_top_check(vdev_t *vd) { spa_t *spa = vd->vdev_spa; if (vd != vd->vdev_top) return (SET_ERROR(ENOTSUP)); if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) return (SET_ERROR(ENOTSUP)); /* available space in the pool's normal class */ uint64_t available = dsl_dir_space_available( spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); metaslab_class_t *mc = vd->vdev_mg->mg_class; /* * When removing a vdev from an allocation class that has * remaining vdevs, include available space from the class. */ if (mc != spa_normal_class(spa) && mc->mc_groups > 1) { uint64_t class_avail = metaslab_class_get_space(mc) - metaslab_class_get_alloc(mc); /* add class space, adjusted for overhead */ available += (class_avail * 94) / 100; } /* * There has to be enough free space to remove the * device and leave double the "slop" space (i.e. we * must leave at least 3% of the pool free, in addition to * the normal slop space). */ if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { return (SET_ERROR(ENOSPC)); } /* * There can not be a removal in progress. */ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(EBUSY)); /* * The device must have all its data. */ if (!vdev_dtl_empty(vd, DTL_MISSING) || !vdev_dtl_empty(vd, DTL_OUTAGE)) return (SET_ERROR(EBUSY)); /* * The device must be healthy. */ if (!vdev_readable(vd)) return (SET_ERROR(EIO)); /* * All vdevs in normal class must have the same ashift. */ if (spa->spa_max_ashift != spa->spa_min_ashift) { return (SET_ERROR(EINVAL)); } /* * All vdevs in normal class must have the same ashift * and not be raidz. */ vdev_t *rvd = spa->spa_root_vdev; int num_indirect = 0; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; if (cvd->vdev_ashift != 0 && !cvd->vdev_islog) ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); if (cvd->vdev_ops == &vdev_indirect_ops) num_indirect++; if (!vdev_is_concrete(cvd)) continue; if (cvd->vdev_ops == &vdev_raidz_ops) return (SET_ERROR(EINVAL)); /* * Need the mirror to be mirror of leaf vdevs only */ if (cvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < cvd->vdev_children; cid++) { vdev_t *tmp = cvd->vdev_child[cid]; if (!tmp->vdev_ops->vdev_op_leaf) return (SET_ERROR(EINVAL)); } } } return (0); } /* * Initiate removal of a top-level vdev, reducing the total space in the pool. * The config lock is held for the specified TXG. Once initiated, * evacuation of all allocated space (copying it to other vdevs) happens * in the background (see spa_vdev_remove_thread()), and can be canceled * (see spa_vdev_remove_cancel()). If successful, the vdev will * be transformed to an indirect vdev (see spa_vdev_remove_complete()). */ static int spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) { spa_t *spa = vd->vdev_spa; int error; /* * Check for errors up-front, so that we don't waste time * passivating the metaslab group and clearing the ZIL if there * are errors. */ error = spa_vdev_remove_top_check(vd); if (error != 0) return (error); /* * Stop allocating from this vdev. Note that we must check * that this is not the only device in the pool before * passivating, otherwise we will not be able to make * progress because we can't allocate from any vdevs. * The above check for sufficient free space serves this * purpose. */ metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * We must ensure that no "stubby" log blocks are allocated * on the device to be removed. These blocks could be * written at any time, including while we are in the middle * of copying them. */ error = spa_reset_logs(spa); /* * We stop any initializing that is currently in progress but leave * the state as "active". This will allow the initializing to resume * if the removal is canceled sometime later. */ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); *txg = spa_vdev_config_enter(spa); /* * Things might have changed while the config lock was dropped * (e.g. space usage). Check for errors again. */ if (error == 0) error = spa_vdev_remove_top_check(vd); if (error != 0) { metaslab_group_activate(mg); spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); return (error); } vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); dmu_tx_commit(tx); return (0); } /* * Remove a device from the pool. * * Removing a device from the vdev namespace requires several steps * and can take a significant amount of time. As a result we use * the spa_vdev_config_[enter/exit] functions which allow us to * grab and release the spa_config_lock while still holding the namespace * lock. During each step the configuration is synced out. */ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; nvlist_t **spares, **l2cache, *nv; uint64_t txg = 0; uint_t nspares, nl2cache; int error = 0; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); sysevent_t *ev = NULL; ASSERT(spa_writeable(spa)); if (!locked) txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; if (!locked) return (spa_vdev_exit(spa, NULL, txg, error)); return (error); } vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (spa->spa_spares.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { /* * Only remove the hot spare if it's not currently in use * in this pool. */ if (vd == NULL || unspare) { char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); spa_history_log_internal(spa, "vdev remove", NULL, "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_SPARE, nvstr); if (vd == NULL) vd = spa_lookup_by_guid(spa, guid, B_TRUE); ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares, nv); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } else { error = SET_ERROR(EBUSY); } } else if (spa->spa_l2cache.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); spa_history_log_internal(spa, "vdev remove", NULL, "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr); /* * Cache devices can always be removed. */ vd = spa_lookup_by_guid(spa, guid, B_TRUE); ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } else if (vd != NULL && vd->vdev_islog) { ASSERT(!locked); error = spa_vdev_remove_log(vd, &txg); } else if (vd != NULL) { ASSERT(!locked); error = spa_vdev_remove_top(vd, &txg); } else { /* * There is no vdev of any kind with the specified guid. */ error = SET_ERROR(ENOENT); } if (!locked) error = spa_vdev_exit(spa, NULL, txg, error); if (ev != NULL) { if (error != 0) { spa_event_discard(ev); } else { spa_event_post(ev); } } return (error); } int spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) { prs->prs_state = spa->spa_removing_phys.sr_state; if (prs->prs_state == DSS_NONE) return (SET_ERROR(ENOENT)); prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; prs->prs_start_time = spa->spa_removing_phys.sr_start_time; prs->prs_end_time = spa->spa_removing_phys.sr_end_time; prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; prs->prs_copied = spa->spa_removing_phys.sr_copied; if (spa->spa_vdev_removal != NULL) { for (int i = 0; i < TXG_SIZE; i++) { prs->prs_copied += spa->spa_vdev_removal->svr_bytes_done[i]; } } prs->prs_mapping_memory = 0; uint64_t indirect_vdev_id = spa->spa_removing_phys.sr_prev_indirect_vdev; while (indirect_vdev_id != -1) { vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); indirect_vdev_id = vic->vic_prev_indirect_vdev; } return (0); }