Index: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c =================================================================== --- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c (revision 275810) +++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c (revision 275811) @@ -1,3707 +1,3707 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef ZFS_MAXNAMELEN #undef verify #include #define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ zio_compress_table[(idx)].ci_name : "UNKNOWN") #define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ zio_checksum_table[(idx)].ci_name : "UNKNOWN") #define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \ dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \ dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN") #define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ (((idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA) ? \ DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES)) #ifndef lint extern boolean_t zfs_recover; extern uint64_t zfs_arc_max, zfs_arc_meta_limit; extern int zfs_vdev_async_read_max_active; #else boolean_t zfs_recover; uint64_t zfs_arc_max, zfs_arc_meta_limit; int zfs_vdev_async_read_max_active; #endif const char cmdname[] = "zdb"; uint8_t dump_opt[256]; typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); extern void dump_intent_log(zilog_t *); uint64_t *zopt_object = NULL; int zopt_objects = 0; libzfs_handle_t *g_zfs; uint64_t max_inflight = 1000; /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init() { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void usage(void) { (void) fprintf(stderr, "Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] " "[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n" " %s [-divPA] [-e -p path...] 
[-U config] dataset " "[object...]\n" " %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] " "poolname [vdev [metaslab...]]\n" " %s -R [-A] [-e [-p path...]] poolname " "vdev:offset:size[:flags]\n" " %s -S [-PA] [-e [-p path...]] [-U config] poolname\n" " %s -l [-uA] device\n" " %s -C [-A] [-U config]\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); (void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n"); (void) fprintf(stderr, " If object numbers are specified, only " "those objects are dumped\n\n"); (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -u uberblock\n"); (void) fprintf(stderr, " -d dataset(s)\n"); (void) fprintf(stderr, " -i intent logs\n"); (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); (void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -b block statistics\n"); (void) fprintf(stderr, " -m metaslabs\n"); (void) fprintf(stderr, " -M metaslab groups\n"); (void) fprintf(stderr, " -c checksum all metadata (twice for " "all data) blocks\n"); (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); (void) fprintf(stderr, " -D dedup statistics\n"); (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v verbose (applies to all others)\n"); (void) fprintf(stderr, " -l dump label contents\n"); (void) fprintf(stderr, " -L disable leak tracking (do not " "load spacemaps)\n"); (void) fprintf(stderr, " -R read and display block from a " "device\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A ignore assertions (-A), enable " "panic recovery (-AA) or both (-AAA)\n"); (void) fprintf(stderr, " -F attempt automatic rewind within " "safe range of transaction groups\n"); (void) fprintf(stderr, " -U -- use alternate " "cachefile\n"); (void) fprintf(stderr, " -X attempt extreme rewind (does not " "work with dataset)\n"); (void) fprintf(stderr, " -e pool is exported/destroyed/" "has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -p -- use one or more with " "-e to specify path to vdev dir\n"); (void) fprintf(stderr, " -x -- " "dump all read blocks into specified directory\n"); (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n"); (void) fprintf(stderr, " -I -- " "specify the maximum number of " "checksumming I/Os [default is 200]\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); exit(1); } /* * Called for usage errors that are discovered after a call to spa_open(), * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. */ static void fatal(const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); (void) fprintf(stderr, "%s: ", cmdname); (void) vfprintf(stderr, fmt, ap); va_end(ap); (void) fprintf(stderr, "\n"); exit(1); } /* ARGSUSED */ static void dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) { nvlist_t *nv; size_t nvsize = *(uint64_t *)data; char *packed = umem_alloc(nvsize, UMEM_NOFAIL); VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); umem_free(packed, nvsize); dump_nvlist(nv, 8); nvlist_free(nv); } /* ARGSUSED */ static void dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) { spa_history_phys_t *shp = data; if (shp == NULL) return; (void) printf("\t\tpool_create_len = %llu\n", (u_longlong_t)shp->sh_pool_create_len); (void) printf("\t\tphys_max_off = %llu\n", (u_longlong_t)shp->sh_phys_max_off); (void) printf("\t\tbof = %llu\n", (u_longlong_t)shp->sh_bof); (void) printf("\t\teof = %llu\n", (u_longlong_t)shp->sh_eof); (void) printf("\t\trecords_lost = %llu\n", (u_longlong_t)shp->sh_records_lost); } static void zdb_nicenum(uint64_t num, char *buf) { if (dump_opt['P']) (void) sprintf(buf, "%llu", (longlong_t)num); else nicenum(num, buf); } const char histo_stars[] = "****************************************"; const int histo_width = sizeof (histo_stars) - 1; static void dump_histogram(const uint64_t *histo, int size, int offset) { int i; int minidx = size - 1; int maxidx = 0; uint64_t max = 0; for (i = 0; i < size; i++) { if (histo[i] > max) max = histo[i]; if (histo[i] > 0 && i > maxidx) maxidx = i; if (histo[i] > 0 && i < minidx) minidx = i; } if (max < histo_width) max = histo_width; for (i = minidx; i <= maxidx; i++) { (void) printf("\t\t\t%3u: %6llu %s\n", i + offset, (u_longlong_t)histo[i], &histo_stars[(max - histo[i]) * histo_width / max]); } } static void dump_zap_stats(objset_t *os, uint64_t object) { int error; zap_stats_t zs; error = zap_get_stats(os, object, &zs); if (error) return; if (zs.zs_ptrtbl_len == 0) { ASSERT(zs.zs_num_blocks == 1); (void) printf("\tmicrozap: %llu bytes, %llu entries\n", (u_longlong_t)zs.zs_blocksize, (u_longlong_t)zs.zs_num_entries); return; } (void) printf("\tFat ZAP stats:\n"); (void) printf("\t\tPointer table:\n"); (void) printf("\t\t\t%llu elements\n", (u_longlong_t)zs.zs_ptrtbl_len); (void) printf("\t\t\tzt_blk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_blk); (void) printf("\t\t\tzt_numblks: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_numblks); (void) printf("\t\t\tzt_shift: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_shift); (void) printf("\t\t\tzt_blks_copied: %llu\n", (u_longlong_t)zs.zs_ptrtbl_blks_copied); (void) printf("\t\t\tzt_nextblk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_nextblk); (void) printf("\t\tZAP entries: %llu\n", (u_longlong_t)zs.zs_num_entries); (void) printf("\t\tLeaf blocks: %llu\n", (u_longlong_t)zs.zs_num_leafs); (void) printf("\t\tTotal blocks: %llu\n", (u_longlong_t)zs.zs_num_blocks); (void) printf("\t\tzap_block_type: 0x%llx\n", (u_longlong_t)zs.zs_block_type); (void) printf("\t\tzap_magic: 0x%llx\n", (u_longlong_t)zs.zs_magic); (void) printf("\t\tzap_salt: 0x%llx\n", (u_longlong_t)zs.zs_salt); (void) printf("\t\tLeafs with 2^n pointers:\n"); dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks with n*5 entries:\n"); dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks n/10 full:\n"); dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tEntries with 
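/*
 * A minimal, standalone sketch of the bar-scaling trick used by
 * dump_histogram(): a fixed string of '*' characters is indexed by
 * (max - value) * width / max, so the largest bucket prints the full
 * bar and every other bucket prints a proportional suffix of it.
 * The function name and bucket values here are hypothetical, not zdb's.
 */
#include <stdio.h>
#include <stdint.h>

static void
sketch_histogram(const uint64_t *histo, int size)
{
	static const char stars[] = "****************************************";
	const int width = (int)sizeof (stars) - 1;
	uint64_t max = 0;

	for (int i = 0; i < size; i++)
		if (histo[i] > max)
			max = histo[i];
	if (max < (uint64_t)width)
		max = width;	/* keep bars short for tiny counts */

	for (int i = 0; i < size; i++)
		(void) printf("%3d: %6llu %s\n", i,
		    (unsigned long long)histo[i],
		    &stars[(max - histo[i]) * width / max]);
}

int
main(void)
{
	uint64_t buckets[] = { 2, 10, 40, 25, 5 };

	sketch_histogram(buckets, 5);
	return (0);
}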
n chunks:\n"); dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBuckets with n entries:\n"); dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); } /*ARGSUSED*/ static void dump_none(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) { (void) printf("\tUNKNOWN OBJECT TYPE\n"); } /*ARGSUSED*/ void dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_zap(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; void *prop; int i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } prop = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); (void) zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, prop); if (attr.za_integer_length == 1) { (void) printf("%s", (char *)prop); } else { for (i = 0; i < attr.za_num_integers; i++) { switch (attr.za_integer_length) { case 2: (void) printf("%u ", ((uint16_t *)prop)[i]); break; case 4: (void) printf("%u ", ((uint32_t *)prop)[i]); break; case 8: (void) printf("%lld ", (u_longlong_t)((int64_t *)prop)[i]); break; } } } (void) printf("\n"); umem_free(prop, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) { dump_zap_stats(os, object); /* contents are printed elsewhere, properly decoded */ } /*ARGSUSED*/ static void dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } (void) printf(" %llx : [%d:%d:%d]\n", (u_longlong_t)attr.za_first_integer, (int)ATTR_LENGTH(attr.za_first_integer), (int)ATTR_BSWAP(attr.za_first_integer), (int)ATTR_NUM(attr.za_first_integer)); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; uint16_t *layout_attrs; int i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = [", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } VERIFY(attr.za_integer_length == 2); layout_attrs = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); VERIFY(zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, layout_attrs) == 0); for (i = 0; i != attr.za_num_integers; i++) (void) printf(" %d ", (int)layout_attrs[i]); (void) printf("]\n"); umem_free(layout_attrs, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; const char *typenames[] = { /* 0 */ "not specified", /* 1 */ 
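/*
 * dump_zap() prints ZAP values whose element width may be 2, 4 or 8
 * bytes, switching the printf on za_integer_length. A standalone
 * sketch of that width dispatch over a raw buffer; the data and the
 * helper name are made up for illustration, not a real ZAP.
 */
#include <stdio.h>
#include <stdint.h>

static void
print_ints(const void *prop, unsigned width, unsigned count)
{
	for (unsigned i = 0; i < count; i++) {
		switch (width) {
		case 2:
			(void) printf("%u ", ((const uint16_t *)prop)[i]);
			break;
		case 4:
			(void) printf("%u ", ((const uint32_t *)prop)[i]);
			break;
		case 8:
			(void) printf("%llu ",
			    (unsigned long long)((const uint64_t *)prop)[i]);
			break;
		}
	}
	(void) printf("\n");
}

int
main(void)
{
	uint16_t small[] = { 1, 2, 3 };
	uint64_t big[] = { 1ULL << 40, 42 };

	print_ints(small, 2, 3);
	print_ints(big, 8, 2);
	return (0);
}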
"FIFO", /* 2 */ "Character Device", /* 3 */ "3 (invalid)", /* 4 */ "Directory", /* 5 */ "5 (invalid)", /* 6 */ "Block Device", /* 7 */ "7 (invalid)", /* 8 */ "Regular File", /* 9 */ "9 (invalid)", /* 10 */ "Symbolic Link", /* 11 */ "11 (invalid)", /* 12 */ "Socket", /* 13 */ "Door", /* 14 */ "Event Port", /* 15 */ "15 (invalid)", }; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = %lld (type: %s)\n", attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); } zap_cursor_fini(&zc); } int get_dtl_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_ops->vdev_op_leaf) { space_map_t *sm = vd->vdev_dtl_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) return (1); return (0); } for (int c = 0; c < vd->vdev_children; c++) refcount += get_dtl_refcount(vd->vdev_child[c]); return (refcount); } int get_metaslab_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd && !vd->vdev_removing) { for (int m = 0; m < vd->vdev_ms_count; m++) { space_map_t *sm = vd->vdev_ms[m]->ms_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) refcount++; } } for (int c = 0; c < vd->vdev_children; c++) refcount += get_metaslab_refcount(vd->vdev_child[c]); return (refcount); } static int verify_spacemap_refcounts(spa_t *spa) { uint64_t expected_refcount = 0; uint64_t actual_refcount; (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], &expected_refcount); actual_refcount = get_dtl_refcount(spa->spa_root_vdev); actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " "actual %lld\n", (longlong_t)expected_refcount, (longlong_t)actual_refcount); return (2); } return (0); } static void dump_spacemap(objset_t *os, space_map_t *sm) { uint64_t alloc, offset, entry; char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", "INVALID", "INVALID", "INVALID", "INVALID" }; if (sm == NULL) return; /* * Print out the freelist entries in both encoded and decoded form. */ alloc = 0; for (offset = 0; offset < space_map_length(sm); offset += sizeof (entry)) { uint8_t mapshift = sm->sm_shift; VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (entry), &entry, DMU_READ_PREFETCH)); if (SM_DEBUG_DECODE(entry)) { (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n", (u_longlong_t)(offset / sizeof (entry)), ddata[SM_DEBUG_ACTION_DECODE(entry)], (u_longlong_t)SM_DEBUG_TXG_DECODE(entry), (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry)); } else { (void) printf("\t [%6llu] %c range:" " %010llx-%010llx size: %06llx\n", (u_longlong_t)(offset / sizeof (entry)), SM_TYPE_DECODE(entry) == SM_ALLOC ? 
'A' : 'F', (u_longlong_t)((SM_OFFSET_DECODE(entry) << mapshift) + sm->sm_start), (u_longlong_t)((SM_OFFSET_DECODE(entry) << mapshift) + sm->sm_start + (SM_RUN_DECODE(entry) << mapshift)), (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift)); if (SM_TYPE_DECODE(entry) == SM_ALLOC) alloc += SM_RUN_DECODE(entry) << mapshift; else alloc -= SM_RUN_DECODE(entry) << mapshift; } } if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%llu) INCONSISTENT " "with space map summary (%llu)\n", (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc); } } static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; range_tree_t *rt = msp->ms_tree; avl_tree_t *t = &msp->ms_size_tree; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; zdb_nicenum(metaslab_block_maxsize(msp), maxbuf); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", avl_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; space_map_t *sm = msp->ms_sm; char freebuf[32]; zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, (u_longlong_t)space_map_object(sm), freebuf); if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); metaslab_load_wait(msp); if (!msp->ms_loaded) { VERIFY0(metaslab_load(msp)); range_tree_stat_verify(msp->ms_tree); } dump_metaslab_stats(msp); metaslab_unload(msp); mutex_exit(&msp->ms_lock); } if (dump_opt['m'] > 1 && sm != NULL && spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { /* * The space map histogram represents free space in chunks * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). 
*/ (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", (u_longlong_t)msp->ms_fragmentation); dump_histogram(sm->sm_phys->smp_histogram, SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); mutex_enter(&msp->ms_lock); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); mutex_exit(&msp->ms_lock); } } static void print_vdev_metaslab_header(vdev_t *vd) { (void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n", (u_longlong_t)vd->vdev_id, "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %10s\n", "---------------", "-------------------", "---------------", "-------------"); } static void dump_metaslab_groups(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; metaslab_class_t *mc = spa_normal_class(spa); uint64_t fragmentation; metaslab_class_histogram_verify(mc); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg->mg_class != mc) continue; metaslab_group_histogram_verify(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" "fragmentation", (u_longlong_t)tvd->vdev_id, (u_longlong_t)tvd->vdev_ms_count); if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { (void) printf("%3s\n", "-"); } else { (void) printf("%3llu%%\n", (u_longlong_t)mg->mg_fragmentation); } dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } (void) printf("\tpool %s\tfragmentation", spa_name(spa)); fragmentation = metaslab_class_fragmentation(mc); if (fragmentation == ZFS_FRAG_INVALID) (void) printf("\t%3s\n", "-"); else (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void dump_metaslabs(spa_t *spa) { vdev_t *vd, *rvd = spa->spa_root_vdev; uint64_t m, c = 0, children = rvd->vdev_children; (void) printf("\nMetaslabs:\n"); if (!dump_opt['d'] && zopt_objects > 0) { c = zopt_object[0]; if (c >= children) (void) fatal("bad vdev id: %llu", (u_longlong_t)c); if (zopt_objects > 1) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); for (m = 1; m < zopt_objects; m++) { if (zopt_object[m] < vd->vdev_ms_count) dump_metaslab( vd->vdev_ms[zopt_object[m]]); else (void) fprintf(stderr, "bad metaslab " "number %llu\n", (u_longlong_t)zopt_object[m]); } (void) printf("\n"); return; } children = c + 1; } for (; c < children; c++) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); for (m = 0; m < vd->vdev_ms_count; m++) dump_metaslab(vd->vdev_ms[m]); (void) printf("\n"); } } static void dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) { const ddt_phys_t *ddp = dde->dde_phys; const ddt_key_t *ddk = &dde->dde_key; char *types[4] = { "ditto", "single", "double", "triple" }; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu %s %s\n", (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, types[p], blkbuf); } } static void dump_dedup_ratio(const ddt_stat_t *dds) { double rL, rP, rD, D, dedup, compress, copies; if (dds->dds_blocks == 0) return; rL = (double)dds->dds_ref_lsize; rP = (double)dds->dds_ref_psize; rD = (double)dds->dds_ref_dsize; D = (double)dds->dds_dsize; dedup = rD / D; compress = rL / rP; copies = rD / rP; (void) 
printf("dedup = %.2f, compress = %.2f, copies = %.2f, " "dedup * compress / copies = %.2f\n\n", dedup, compress, copies, dedup * compress / copies); } static void dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) { char name[DDT_NAMELEN]; ddt_entry_t dde; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; int error; error = ddt_object_info(ddt, type, class, &doi); if (error == ENOENT) return; ASSERT(error == 0); error = ddt_object_count(ddt, type, class, &count); ASSERT(error == 0); if (count == 0) return; dspace = doi.doi_physical_blocks_512 << 9; mspace = doi.doi_fill_count * doi.doi_data_block_size; ddt_object_name(ddt, type, class, name); (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", name, (u_longlong_t)count, (u_longlong_t)(dspace / count), (u_longlong_t)(mspace / count)); if (dump_opt['D'] < 3) return; zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); if (dump_opt['D'] < 4) return; if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) return; (void) printf("%s contents:\n\n", name); while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) dump_dde(ddt, &dde, walk); ASSERT(error == ENOENT); (void) printf("\n"); } static void dump_all_ddts(spa_t *spa) { ddt_histogram_t ddh_total = { 0 }; ddt_stat_t dds_total = { 0 }; for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; for (enum ddt_type type = 0; type < DDT_TYPES; type++) { for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { dump_ddt(ddt, type, class); } } } ddt_get_dedup_stats(spa, &dds_total); if (dds_total.dds_blocks == 0) { (void) printf("All DDTs are empty\n"); return; } (void) printf("\n"); if (dump_opt['D'] > 1) { (void) printf("DDT histogram (aggregated over all DDTs):\n"); ddt_get_dedup_histogram(spa, &ddh_total); zpool_dump_ddt(&dds_total, &ddh_total); } dump_dedup_ratio(&dds_total); } static void dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { char *prefix = arg; (void) printf("%s [%llu,%llu) length %llu\n", prefix, (u_longlong_t)start, (u_longlong_t)(start + size), (u_longlong_t)(size)); } static void dump_dtl(vdev_t *vd, int indent) { spa_t *spa = vd->vdev_spa; boolean_t required; char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" }; char prefix[256]; spa_vdev_state_enter(spa, SCL_NONE); required = vdev_dtl_required(vd); (void) spa_vdev_state_exit(spa, NULL, 0); if (indent == 0) (void) printf("\nDirty time logs:\n\n"); (void) printf("\t%*s%s [%s]\n", indent, "", vd->vdev_path ? vd->vdev_path : vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), required ? 
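/*
 * The arithmetic behind dump_dedup_ratio(), as a standalone worked
 * example with made-up figures: references totalling 8 GB logical and
 * 4 GB physical, 4.2 GB allocated across references, stored once in
 * 2.5 GB, give dedup = refD/D, compress = refL/refP, copies = refD/refP
 * and the combined dedup * compress / copies figure printed above.
 */
#include <stdio.h>

int
main(void)
{
	double ref_lsize = 8.0e9;	/* logical size of all references */
	double ref_psize = 4.0e9;	/* physical size of all references */
	double ref_dsize = 4.2e9;	/* allocated size of all references */
	double dsize = 2.5e9;		/* allocated size actually stored */

	double dedup = ref_dsize / dsize;
	double compress = ref_lsize / ref_psize;
	double copies = ref_dsize / ref_psize;

	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
	    "dedup * compress / copies = %.2f\n",
	    dedup, compress, copies, dedup * compress / copies);
	return (0);
}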
"DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { range_tree_t *rt = vd->vdev_dtl[t]; if (range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); mutex_enter(rt->rt_lock); range_tree_walk(rt, dump_dtl_seg, prefix); mutex_exit(rt->rt_lock); if (dump_opt['d'] > 5 && vd->vdev_children == 0) dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } for (int c = 0; c < vd->vdev_children; c++) dump_dtl(vd->vdev_child[c], indent + 4); } /* from spa_history.c: spa_history_create_obj() */ #define HIS_BUF_LEN_DEF (128 << 10) #define HIS_BUF_LEN_MAX (1 << 30) static void dump_history(spa_t *spa) { nvlist_t **events = NULL; char *buf = NULL; uint64_t bufsize = HIS_BUF_LEN_DEF; uint64_t resid, len, off = 0; uint_t num = 0; int error; time_t tsec; struct tm t; char tbuf[30]; char internalstr[MAXPATHLEN]; if ((buf = malloc(bufsize)) == NULL) (void) fprintf(stderr, "Unable to read history: " "out of memory\n"); do { len = bufsize; if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { (void) fprintf(stderr, "Unable to read history: " "error %d\n", error); return; } if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) break; off -= resid; /* * If the history block is too big, double the buffer * size and try again. */ if (resid == len) { free(buf); buf = NULL; bufsize <<= 1; if ((bufsize >= HIS_BUF_LEN_MAX) || ((buf = malloc(bufsize)) == NULL)) { (void) fprintf(stderr, "Unable to read history: " "out of memory\n"); return; } } } while (len != 0); free(buf); (void) printf("\nHistory:\n"); for (int i = 0; i < num; i++) { uint64_t time, txg, ievent; char *cmd, *intstr; boolean_t printed = B_FALSE; if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, &time) != 0) goto next; if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, &cmd) != 0) { if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_INT_EVENT, &ievent) != 0) goto next; verify(nvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG, &txg) == 0); verify(nvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR, &intstr) == 0); if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) goto next; (void) snprintf(internalstr, sizeof (internalstr), "[internal %s txg:%lld] %s", zfs_history_event_names[ievent], txg, intstr); cmd = internalstr; } tsec = time; (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); (void) printf("%s %s\n", tbuf, cmd); printed = B_TRUE; next: if (dump_opt['h'] > 1) { if (!printed) (void) printf("unrecognized record:\n"); dump_nvlist(events[i], 2); } } } /*ARGSUSED*/ static void dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) { } static uint64_t blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { if (dnp == NULL) { ASSERT(zb->zb_level < 0); if (zb->zb_object == 0) return (zb->zb_blkid); return (zb->zb_blkid * BP_GET_LSIZE(bp)); } ASSERT(zb->zb_level >= 0); return ((zb->zb_blkid << (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } static void snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? 
BP_GET_NDVAS(bp) : 1; if (dump_opt['b'] >= 6) { snprintf_blkptr(blkbuf, buflen, bp); return; } if (BP_IS_EMBEDDED(bp)) { (void) sprintf(blkbuf, "EMBEDDED et=%u %llxL/%llxP B=%llu", (int)BPE_GET_ETYPE(bp), (u_longlong_t)BPE_GET_LSIZE(bp), (u_longlong_t)BPE_GET_PSIZE(bp), (u_longlong_t)bp->blk_birth); return; } blkbuf[0] = '\0'; for (int i = 0; i < ndvas; i++) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); if (BP_IS_HOLE(bp)) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "B=%llu", (u_longlong_t)bp->blk_birth); } else { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); } } static void print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; int l; if (!BP_IS_EMBEDDED(bp)) { ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); } (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { (void) printf("L%llx", (u_longlong_t)zb->zb_level); } else { (void) printf(" "); } } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_phys_t *zb) { int err = 0; if (bp->blk_birth == 0) return (0); print_indirect(bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); ASSERT(buf->b_data); /* recursively visit blocks below this */ cbp = buf->b_data; for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = visit_indirect(spa, dnp, cbp, &czb); if (err) break; fill += BP_GET_FILL(cbp); } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); (void) arc_buf_remove_ref(buf, &buf); } return (err); } /*ARGSUSED*/ static void dump_indirect(dnode_t *dn) { dnode_phys_t *dnp = dn->dn_phys; int j; zbookmark_phys_t czb; (void) printf("Indirect blocks:\n"); SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), dn->dn_object, dnp->dn_nlevels - 1, 0); for (j = 0; j < dnp->dn_nblkptr; j++) { czb.zb_blkid = j; (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, &dnp->dn_blkptr[j], &czb); } (void) printf("\n"); } /*ARGSUSED*/ static void dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dir_phys_t *dd = data; time_t crtime; char nice[32]; if (dd == NULL) return; ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); crtime = dd->dd_creation_time; (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\thead_dataset_obj = %llu\n", (u_longlong_t)dd->dd_head_dataset_obj); (void) printf("\t\tparent_dir_obj = %llu\n", (u_longlong_t)dd->dd_parent_obj); (void) printf("\t\torigin_obj = %llu\n", (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", 
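/*
 * A worked instance of the blkid2offset() formula above: with 128K
 * indirect blocks (dn_indblkshift = 17), 128-byte block pointers
 * (SPA_BLKPTRSHIFT = 7, i.e. 1024 pointers per indirect block) and
 * 128K data blocks (dn_datablkszsec = 256 sectors of 512 bytes),
 * level-1 blkid 3 maps to byte offset (3 << 10) * 256 << 9.
 * Standalone arithmetic only; the constants are illustrative.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const int indblkshift = 17, blkptrshift = 7, minblockshift = 9;
	const uint64_t datablkszsec = 256;	/* 128K / 512 */
	const uint64_t blkid = 3, level = 1;

	uint64_t offset = (blkid << (level * (indblkshift - blkptrshift))) *
	    datablkszsec << minblockshift;

	(void) printf("level %llu blkid %llu -> offset 0x%llx (%llu)\n",
	    (unsigned long long)level, (unsigned long long)blkid,
	    (unsigned long long)offset, (unsigned long long)offset);
	return (0);
}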
(u_longlong_t)dd->dd_child_dir_zapobj); zdb_nicenum(dd->dd_used_bytes, nice); (void) printf("\t\tused_bytes = %s\n", nice); zdb_nicenum(dd->dd_compressed_bytes, nice); (void) printf("\t\tcompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_uncompressed_bytes, nice); (void) printf("\t\tuncompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_quota, nice); (void) printf("\t\tquota = %s\n", nice); zdb_nicenum(dd->dd_reserved, nice); (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); (void) printf("\t\tdeleg_zapobj = %llu\n", (u_longlong_t)dd->dd_deleg_zapobj); (void) printf("\t\tflags = %llx\n", (u_longlong_t)dd->dd_flags); #define DO(which) \ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) DO(HEAD); DO(SNAP); DO(CHILD); DO(CHILD_RSRV); DO(REFRSRV); #undef DO } /*ARGSUSED*/ static void dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dataset_phys_t *ds = data; time_t crtime; char used[32], compressed[32], uncompressed[32], unique[32]; char blkbuf[BP_SPRINTF_LEN]; if (ds == NULL) return; ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; zdb_nicenum(ds->ds_referenced_bytes, used); zdb_nicenum(ds->ds_compressed_bytes, compressed); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed); zdb_nicenum(ds->ds_unique_bytes, unique); snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); (void) printf("\t\tprev_snap_obj = %llu\n", (u_longlong_t)ds->ds_prev_snap_obj); (void) printf("\t\tprev_snap_txg = %llu\n", (u_longlong_t)ds->ds_prev_snap_txg); (void) printf("\t\tnext_snap_obj = %llu\n", (u_longlong_t)ds->ds_next_snap_obj); (void) printf("\t\tsnapnames_zapobj = %llu\n", (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); (void) printf("\t\tuserrefs_obj = %llu\n", (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); (void) printf("\t\tdeadlist_obj = %llu\n", (u_longlong_t)ds->ds_deadlist_obj); (void) printf("\t\tused_bytes = %s\n", used); (void) printf("\t\tcompressed_bytes = %s\n", compressed); (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); (void) printf("\t\tunique = %s\n", unique); (void) printf("\t\tfsid_guid = %llu\n", (u_longlong_t)ds->ds_fsid_guid); (void) printf("\t\tguid = %llu\n", (u_longlong_t)ds->ds_guid); (void) printf("\t\tflags = %llx\n", (u_longlong_t)ds->ds_flags); (void) printf("\t\tnext_clones_obj = %llu\n", (u_longlong_t)ds->ds_next_clones_obj); (void) printf("\t\tprops_obj = %llu\n", (u_longlong_t)ds->ds_props_obj); (void) printf("\t\tbp = %s\n", blkbuf); } /* ARGSUSED */ static int dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; if (bp->blk_birth != 0) { snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } return (0); } static void dump_bptree(objset_t *os, uint64_t obj, char *name) { char bytes[32]; bptree_phys_t *bt; dmu_buf_t *db; if (dump_opt['d'] < 3) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); bt = db->db_data; zdb_nicenum(bt->bt_bytes, bytes); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); dmu_buf_rele(db, FTAG); if (dump_opt['d'] < 5) return; (void) printf("\n"); (void) 
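/*
 * The DO(which) macro in dump_dsl_dir() leans on two preprocessor
 * tricks: ## pastes DD_USED_ and the argument into one enumerator
 * name, and #which turns the argument into the string used as the
 * printed label. A standalone sketch of the same pair, with
 * hypothetical counters in place of dd_used_breakdown[]:
 */
#include <stdio.h>

enum { CNT_HEAD, CNT_SNAP, CNT_CHILD, CNT_MAX };
static unsigned long counters[CNT_MAX] = { 11, 22, 33 };

#define	DUMP(which) \
	(void) printf("counters[" #which "] = %lu\n", counters[CNT_ ## which])

int
main(void)
{
	DUMP(HEAD);
	DUMP(SNAP);
	DUMP(CHILD);
	return (0);
}
#undef DUMP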
bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); } /* ARGSUSED */ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); return (0); } static void dump_bpobj(bpobj_t *bpo, char *name, int indent) { char bytes[32]; char comp[32]; char uncomp[32]; if (dump_opt['d'] < 3) return; zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { zdb_nicenum(bpo->bpo_phys->bpo_comp, comp); zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp); (void) printf(" %*s: object %llu, %llu local blkptrs, " "%llu subobjs, %s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, bytes, comp, uncomp); for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } dump_bpobj(&subbpo, "subobj", indent + 1); bpobj_close(&subbpo); } } else { (void) printf(" %*s: object %llu, %llu blkptrs, %s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes); } if (dump_opt['d'] < 5) return; if (indent == 0) { (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); (void) printf("\n"); } } static void dump_deadlist(dsl_deadlist_t *dl) { dsl_deadlist_entry_t *dle; uint64_t unused; char bytes[32]; char comp[32]; char uncomp[32]; if (dump_opt['d'] < 3) return; if (dl->dl_oldfmt) { dump_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); return; } zdb_nicenum(dl->dl_phys->dl_used, bytes); zdb_nicenum(dl->dl_phys->dl_comp, comp); zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp); (void) printf("\n Deadlist: %s (%s/%s comp)\n", bytes, comp, uncomp); if (dump_opt['d'] < 4) return; (void) printf("\n"); /* force the tree to be loaded */ dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); for (dle = avl_first(&dl->dl_tree); dle; dle = AVL_NEXT(&dl->dl_tree, dle)) { if (dump_opt['d'] >= 5) { char buf[128]; (void) snprintf(buf, sizeof (buf), "mintxg %llu -> ", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); dump_bpobj(&dle->dle_bpobj, buf, 0); } else { (void) printf("mintxg %llu -> obj %llu\n", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); } } } static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; static boolean_t sa_loaded; sa_attr_type_t *sa_attr_table; static void fuid_table_destroy() { if (fuid_table_loaded) { zfs_fuid_table_destroy(&idx_tree, &domain_tree); fuid_table_loaded = B_FALSE; } } /* * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. * For CIFS files with FUID the fuid is printed in hex followed by * the domain-rid string. 
*/ static void print_idstr(uint64_t id, const char *id_type) { if (FUID_INDEX(id)) { char *domain; domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); (void) printf("\t%s %llx [%s-%d]\n", id_type, (u_longlong_t)id, domain, (int)FUID_RID(id)); } else { (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); } } static void dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) { uint32_t uid_idx, gid_idx; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); /* Load domain table, if not already loaded */ if (!fuid_table_loaded && (uid_idx || gid_idx)) { uint64_t fuid_obj; /* first find the fuid object. It lives in the master node */ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj) == 0); zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; } print_idstr(uid, "uid"); print_idstr(gid, "gid"); } /*ARGSUSED*/ static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) { char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ sa_handle_t *hdl; uint64_t xattr, rdev, gen; uint64_t uid, gid, mode, fsize, parent, links; uint64_t pflags; uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; time_t z_crtime, z_atime, z_mtime, z_ctime; sa_bulk_attr_t bulk[12]; int idx = 0; int error; if (!sa_loaded) { uint64_t sa_attrs = 0; uint64_t version; VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &version) == 0); if (version >= ZPL_VERSION_SA) { VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_attrs) == 0); } if ((error = sa_setup(os, sa_attrs, zfs_attr_table, ZPL_END, &sa_attr_table)) != 0) { (void) printf("sa_setup failed errno %d, can't " "display znode contents\n", error); return; } sa_loaded = B_TRUE; } if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { (void) printf("Failed to get handle for SA znode\n"); return; } SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, &fsize, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, acctm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, modtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, crtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, chgtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, &pflags, 8); if (sa_bulk_lookup(hdl, bulk, idx)) { (void) sa_handle_destroy(hdl); return; } error = zfs_obj_to_path(os, object, path, sizeof (path)); if (error != 0) { (void) snprintf(path, sizeof (path), "\?\?\?", (u_longlong_t)object); } if (dump_opt['d'] < 3) { (void) printf("\t%s\n", path); (void) sa_handle_destroy(hdl); return; } z_crtime = (time_t)crtm[0]; z_atime = (time_t)acctm[0]; z_mtime = (time_t)modtm[0]; z_ctime = (time_t)chgtm[0]; (void) printf("\tpath %s\n", path); dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); (void) printf("\tctime %s", ctime(&z_ctime)); (void) printf("\tcrtime %s", ctime(&z_crtime)); (void) printf("\tgen %llu\n", (u_longlong_t)gen); (void) printf("\tmode %llo\n", (u_longlong_t)mode); 
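/*
 * print_idstr() branches on FUID_INDEX(id): a plain POSIX id has no
 * domain index and prints in decimal, while a CIFS FUID carries a
 * domain-table index plus a RID. A standalone sketch of that split,
 * assuming the usual encoding of the index in the upper 32 bits and
 * the RID in the lower 32 bits (illustrative only; zdb itself uses the
 * real FUID_INDEX/FUID_RID macros and the loaded domain table).
 */
#include <stdio.h>
#include <stdint.h>

static void
sketch_print_id(uint64_t id, const char *id_type)
{
	uint32_t idx = (uint32_t)(id >> 32);	/* domain-table index */
	uint32_t rid = (uint32_t)id;		/* relative id */

	if (idx != 0)
		(void) printf("%s %llx [domain#%u-%u]\n", id_type,
		    (unsigned long long)id, idx, rid);
	else
		(void) printf("%s %llu\n", id_type, (unsigned long long)id);
}

int
main(void)
{
	sketch_print_id(1001, "uid");				/* POSIX uid */
	sketch_print_id(((uint64_t)3 << 32) | 512, "uid");	/* FUID */
	return (0);
}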
(void) printf("\tsize %llu\n", (u_longlong_t)fsize); (void) printf("\tparent %llu\n", (u_longlong_t)parent); (void) printf("\tlinks %llu\n", (u_longlong_t)links); (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, sizeof (uint64_t)) == 0) (void) printf("\txattr %llu\n", (u_longlong_t)xattr); if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, sizeof (uint64_t)) == 0) (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); sa_handle_destroy(hdl); } /*ARGSUSED*/ static void dump_acl(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) { } static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_none, /* unallocated */ dump_zap, /* object directory */ dump_uint64, /* object array */ dump_none, /* packed nvlist */ dump_packed_nvlist, /* packed nvlist size */ dump_none, /* bplist */ dump_none, /* bplist header */ dump_none, /* SPA space map header */ dump_none, /* SPA space map */ dump_none, /* ZIL intent log */ dump_dnode, /* DMU dnode */ dump_dmu_objset, /* DMU objset */ dump_dsl_dir, /* DSL directory */ dump_zap, /* DSL directory child map */ dump_zap, /* DSL dataset snap map */ dump_zap, /* DSL props */ dump_dsl_dataset, /* DSL dataset */ dump_znode, /* ZFS znode */ dump_acl, /* ZFS V0 ACL */ dump_uint8, /* ZFS plain file */ dump_zpldir, /* ZFS directory */ dump_zap, /* ZFS master node */ dump_zap, /* ZFS delete queue */ dump_uint8, /* zvol object */ dump_zap, /* zvol prop */ dump_uint8, /* other uint8[] */ dump_uint64, /* other uint64[] */ dump_zap, /* other ZAP */ dump_zap, /* persistent error log */ dump_uint8, /* SPA history */ dump_history_offsets, /* SPA history offsets */ dump_zap, /* Pool properties */ dump_zap, /* DSL permissions */ dump_acl, /* ZFS ACL */ dump_uint8, /* ZFS SYSACL */ dump_none, /* FUID nvlist */ dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ dump_zap, /* ZFS user/group used */ dump_zap, /* ZFS user/group quota */ dump_zap, /* snapshot refcount tags */ dump_ddt_zap, /* DDT ZAP object */ dump_zap, /* DDT statistics */ dump_znode, /* SA object */ dump_zap, /* SA Master Node */ dump_sa_attrs, /* SA attribute registration */ dump_sa_layouts, /* SA attribute layouts */ dump_zap, /* DSL scrub translations */ dump_none, /* fake dedup BP */ dump_zap, /* deadlist */ dump_none, /* deadlist hdr */ dump_zap, /* dsl clones */ dump_none, /* bpobj subobjs */ dump_unknown, /* Unknown type, must be last */ }; static void dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) { dmu_buf_t *db = NULL; dmu_object_info_t doi; dnode_t *dn; void *bonus = NULL; size_t bsize = 0; char iblk[32], dblk[32], lsize[32], asize[32], fill[32]; char bonus_size[32]; char aux[50]; int error; if (*print_header) { (void) printf("\n%10s %3s %5s %5s %5s %5s %6s %s\n", "Object", "lvl", "iblk", "dblk", "dsize", "lsize", "%full", "type"); *print_header = 0; } if (object == 0) { dn = DMU_META_DNODE(os); } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); bonus = db->db_data; bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } dmu_object_info_from_dnode(dn, &doi); zdb_nicenum(doi.doi_metadata_block_size, iblk); zdb_nicenum(doi.doi_data_block_size, dblk); zdb_nicenum(doi.doi_max_offset, lsize); zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize); 
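/*
 * object_viewer[] is a dispatch table: one printer per DMU object
 * type, indexed via ZDB_OT_TYPE(), with dump_unknown in the final
 * fall-through slot. A minimal standalone model of the same pattern,
 * using hypothetical types and printers rather than the real DMU ones:
 */
#include <stdio.h>

enum obj_type { OT_NONE, OT_ZAP, OT_ZNODE, OT_NUMTYPES };

static void show_none(int id)    { (void) printf("%d: (empty)\n", id); }
static void show_zap(int id)     { (void) printf("%d: zap entries...\n", id); }
static void show_znode(int id)   { (void) printf("%d: znode attrs...\n", id); }
static void show_unknown(int id) { (void) printf("%d: UNKNOWN TYPE\n", id); }

/* one extra slot for "unknown", mirroring DMU_OT_NUMTYPES + 1 */
static void (*viewer[OT_NUMTYPES + 1])(int) = {
	show_none, show_zap, show_znode, show_unknown,
};

static void
show_object(int id, int type)
{
	int slot = (type >= 0 && type < OT_NUMTYPES) ? type : OT_NUMTYPES;

	viewer[slot](id);
}

int
main(void)
{
	show_object(1, OT_ZAP);
	show_object(2, 42);	/* out of range -> unknown fallback */
	return (0);
}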
zdb_nicenum(doi.doi_bonus_size, bonus_size); (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / doi.doi_max_offset); aux[0] = '\0'; if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); } if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); } (void) printf("%10lld %3u %5s %5s %5s %5s %6s %s%s\n", (u_longlong_t)object, doi.doi_indirection, iblk, dblk, asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { (void) printf("%10s %3s %5s %5s %5s %5s %6s %s\n", "", "", "", "", "", bonus_size, "bonus", ZDB_OT_NAME(doi.doi_bonus_type)); } if (verbosity >= 4) { (void) printf("\tdnode flags: %s%s%s\n", (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? "USED_BYTES " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? "USERUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? "SPILL_BLKPTR" : ""); (void) printf("\tdnode maxblkid: %llu\n", (longlong_t)dn->dn_phys->dn_maxblkid); object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, bonus, bsize); object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); *print_header = 1; } if (verbosity >= 5) dump_indirect(dn); if (verbosity >= 5) { /* * Report the list of segments that comprise the object. */ uint64_t start = 0; uint64_t end; uint64_t blkfill = 1; int minlvl = 1; if (dn->dn_type == DMU_OT_DNODE) { minlvl = 0; blkfill = DNODES_PER_BLOCK; } for (;;) { char segsize[32]; error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) break; end = start; error = dnode_next_offset(dn, DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); zdb_nicenum(end - start, segsize); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start, (u_longlong_t)end, segsize); if (error) break; start = end; } } if (db != NULL) dmu_buf_rele(db, FTAG); } static char *objset_types[DMU_OST_NUMTYPES] = { "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; static void dump_dir(objset_t *os) { dmu_objset_stats_t dds; uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[32]; char blkbuf[BP_SPRINTF_LEN + 20]; char osname[MAXNAMELEN]; char *type = "UNKNOWN"; int verbosity = dump_opt['d']; int print_header = 1; int i, error; dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; usedobjs = BP_GET_FILL(os->os_rootbp); refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); zdb_nicenum(refdbytes, numbuf); if (verbosity >= 4) { (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); (void) snprintf_blkptr(blkbuf + strlen(blkbuf), sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } dmu_objset_name(os, osname); (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " "%s, %llu objects%s\n", osname, type, (u_longlong_t)dmu_objset_id(os), (u_longlong_t)dds.dds_creation_txg, numbuf, (u_longlong_t)usedobjs, blkbuf); if (zopt_objects != 0) { for (i = 0; i < zopt_objects; 
i++) dump_object(os, zopt_object[i], verbosity, &print_header); (void) printf("\n"); return; } if (dump_opt['i'] != 0 || verbosity >= 2) dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) dump_deadlist(&dmu_objset_ds(os)->ds_deadlist); if (verbosity < 2) return; if (BP_IS_HOLE(os->os_rootbp)) return; dump_object(os, 0, verbosity, &print_header); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); } object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { dump_object(os, object, verbosity, &print_header); object_count++; } ASSERT3U(object_count, ==, usedobjs); (void) printf("\n"); if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); } } static void dump_uberblock(uberblock_t *ub, const char *header, const char *footer) { time_t timestamp = ub->ub_timestamp; (void) printf(header ? header : ""); (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); if (dump_opt['u'] >= 3) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf(footer ? footer : ""); } static void dump_config(spa_t *spa) { dmu_buf_t *db; size_t nvsize = 0; int error = 0; error = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object, FTAG, &db); if (error == 0) { nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); dump_packed_nvlist(spa->spa_meta_objset, spa->spa_config_object, (void *)&nvsize, 1); } else { (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", (u_longlong_t)spa->spa_config_object, error); } } static void dump_cachefile(const char *cachefile) { int fd; struct stat64 statbuf; char *buf; nvlist_t *config; if ((fd = open64(cachefile, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", cachefile, strerror(errno)); exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", cachefile, strerror(errno)); exit(1); } if ((buf = malloc(statbuf.st_size)) == NULL) { (void) fprintf(stderr, "failed to allocate %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) fprintf(stderr, "failed to read %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { (void) fprintf(stderr, "failed to unpack nvlist\n"); exit(1); } free(buf); dump_nvlist(config, 0); nvlist_free(config); } #define ZDB_MAX_UB_HEADER_SIZE 32 static void dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift) { vdev_t vd; vdev_t *vdp = &vd; char header[ZDB_MAX_UB_HEADER_SIZE]; vd.vdev_ashift = ashift; vdp->vdev_top = vdp; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i); uberblock_t *ub = (void *)((char *)lbl + uoff); if (uberblock_verify(ub)) continue; (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, "Uberblock[%d]\n", i); dump_uberblock(ub, header, ""); } } static void dump_label(const char *dev) { int fd; vdev_label_t label; char 
*path, *buf = label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist); struct stat64 statbuf; uint64_t psize, ashift; int len = strlen(dev) + 1; if (strncmp(dev, "/dev/dsk/", 9) == 0) { len++; path = malloc(len); (void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9); } else { path = strdup(dev); } if ((fd = open64(path, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", path, strerror(errno)); free(path); exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", path, strerror(errno)); free(path); (void) close(fd); exit(1); } if (S_ISBLK(statbuf.st_mode)) { (void) printf("cannot use '%s': character device required\n", path); free(path); (void) close(fd); exit(1); } psize = statbuf.st_size; psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *config = NULL; (void) printf("--------------------------------------------\n"); (void) printf("LABEL %d\n", l); (void) printf("--------------------------------------------\n"); if (pread64(fd, &label, sizeof (label), vdev_label_offset(psize, l, 0)) != sizeof (label)) { (void) printf("failed to read label %d\n", l); continue; } if (nvlist_unpack(buf, buflen, &config, 0) != 0) { (void) printf("failed to unpack label %d\n", l); ashift = SPA_MINBLOCKSHIFT; } else { nvlist_t *vdev_tree = NULL; dump_nvlist(config, 4); if ((nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) ashift = SPA_MINBLOCKSHIFT; nvlist_free(config); } if (dump_opt['u']) dump_label_uberblocks(&label, ashift); } free(path); (void) close(fd); } static uint64_t num_large_blocks; /*ARGSUSED*/ static int dump_one_dir(const char *dsname, void *arg) { int error; objset_t *os; error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os); if (error) { (void) printf("Could not open %s, error %d\n", dsname, error); return (0); } if (dmu_objset_ds(os)->ds_large_blocks) num_large_blocks++; dump_dir(os); dmu_objset_disown(os, FTAG); fuid_table_destroy(); sa_loaded = B_FALSE; return (0); } /* * Block statistics. */ #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { uint64_t zb_asize; uint64_t zb_lsize; uint64_t zb_psize; uint64_t zb_count; uint64_t zb_gangs; uint64_t zb_ditto_samevdev; uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; /* * Extended object types to report deferred frees and dedup auto-ditto blocks. */ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static char *zdb_ot_extname[] = { "deferred free", "dedup ditto", "other", "Total", }; #define ZB_TOTAL DN_MAX_LEVELS typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] [BPE_PAYLOAD_SIZE]; uint64_t zcb_start; uint64_t zcb_lastprint; uint64_t zcb_totalasize; uint64_t zcb_errors[256]; int zcb_readfails; int zcb_haderrors; spa_t *zcb_spa; } zdb_cb_t; static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { uint64_t refcnt = 0; ASSERT(type < ZDB_OT_TOTAL); if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? 
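/*
 * zdb_count_block() folds every block into four buckets at once:
 * (level, type), (level, all-types), (all-levels, type) and
 * (all-levels, all-types), so per-level and per-type totals fall out
 * of a single pass. A standalone sketch of that cross-tabulation with
 * hypothetical levels, types and sizes:
 */
#include <stdio.h>
#include <stdint.h>

#define	NLEVELS	3
#define	NTYPES	2
#define	L_TOTAL	NLEVELS		/* extra row for "all levels" */
#define	T_TOTAL	NTYPES		/* extra column for "all types" */

static uint64_t counts[NLEVELS + 1][NTYPES + 1];

static void
tally(int level, int type, uint64_t asize)
{
	for (int i = 0; i < 4; i++) {
		int l = (i < 2) ? level : L_TOTAL;
		int t = (i & 1) ? type : T_TOTAL;

		counts[l][t] += asize;
	}
}

int
main(void)
{
	tally(0, 1, 4096);
	tally(2, 0, 8192);
	tally(0, 1, 512);

	(void) printf("level 0 / type 1: %llu\n",
	    (unsigned long long)counts[0][1]);
	(void) printf("all levels / all types: %llu\n",
	    (unsigned long long)counts[L_TOTAL][T_TOTAL]);
	return (0);
}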
type : ZDB_OT_TOTAL; int equal; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_count++; /* * The histogram is only big enough to record blocks up to * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, * "other", bucket. */ int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); zb->zb_psize_histogram[idx]++; zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) zb->zb_ditto_samevdev++; break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal != 0) zb->zb_ditto_samevdev++; break; } } if (BP_IS_EMBEDDED(bp)) { zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] [BPE_GET_PSIZE(bp)]++; return; } if (dump_opt['L']) return; if (BP_GET_DEDUP(bp)) { ddt_t *ddt; ddt_entry_t *dde; ddt = ddt_select(zcb->zcb_spa, bp); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_FALSE); if (dde == NULL) { refcnt = 0; } else { ddt_phys_t *ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); refcnt = ddp->ddp_refcnt; if (ddt_phys_total_refcnt(dde) == 0) ddt_remove(ddt, dde); } ddt_exit(ddt); } VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, refcnt ? 0 : spa_first_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } /* ARGSUSED */ static void zdb_blkptr_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; int ioerr = zio->io_error; zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; zio_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { char blkbuf[BP_SPRINTF_LEN]; zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); else blkbuf[0] = '\0'; (void) printf("zdb_blkptr_cb: " "Got error %d reading " "<%llu, %llu, %lld, %llx> %s -- skipping\n", ioerr, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, blkbuf); } mutex_exit(&spa->spa_scrub_lock); } /* ARGSUSED */ static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; dmu_object_type_t type; boolean_t is_metadata; if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("objset %llu object %llu " "level %lld offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (u_longlong_t)blkid2offset(dnp, bp, zb), blkbuf); } if (BP_IS_HOLE(bp)) return (0); type = BP_GET_TYPE(bp); zdb_count_block(zcb, zilog, bp, (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); void *data = zio_data_buf_alloc(size); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. 
*/ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > max_inflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, data, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; /* only call gethrtime() every 100 blocks */ static int iters; if (++iters > 100) iters = 0; else return (0); if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); char buf[10]; uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; int kb_per_sec = 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); int sec_remaining = (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "\r%5s completed (%4dMB/s) " "estimated time remaining: %uhr %02umin %02usec ", buf, kb_per_sec / 1024, sec_remaining / 60 / 60, sec_remaining / 60 % 60, sec_remaining % 60); zcb->zcb_lastprint = now; } return (0); } static void zdb_leak(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); } static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; static void zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) { ddt_bookmark_t ddb = { 0 }; ddt_entry_t dde; int error; while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { blkptr_t blk; ddt_phys_t *ddp = dde.dde_phys; if (ddb.ddb_class == DDT_CLASS_UNIQUE) return; ASSERT(ddt_phys_total_refcnt(&dde) > 1); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddb.ddb_checksum, &dde.dde_key, ddp, &blk); if (p == DDT_PHYS_DITTO) { zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); } else { zcb->zcb_dedup_asize += BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); zcb->zcb_dedup_blocks++; } } if (!dump_opt['L']) { ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; ddt_enter(ddt); VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); ddt_exit(ddt); } } ASSERT(error == ENOENT); } static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; if (!dump_opt['L']) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; mutex_enter(&msp->ms_lock); metaslab_unload(msp); /* * For leak detection, we overload the metaslab * ms_tree to contain allocated segments * instead of free segments. As a result, * we can't use the normal metaslab_load/unload * interfaces. */ if (msp->ms_sm != NULL) { (void) fprintf(stderr, "\rloading space map for " "vdev %llu of %llu, " "metaslab %llu of %llu ...", (longlong_t)c, (longlong_t)rvd->vdev_children, (longlong_t)m, (longlong_t)vd->vdev_ms_count); msp->ms_ops = &zdb_metaslab_ops; /* * We don't want to spend the CPU * manipulating the size-ordered * tree, so clear the range_tree * ops. 
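The progress line printed during traversal derives a throughput figure from bytes accounted so far and elapsed nanoseconds, then projects the time remaining. A short arithmetic sketch of that estimate, keeping the same "+ 1" guards against dividing by zero; the ex_ names and sample sizes are illustrative:

	#include <stdio.h>
	#include <stdint.h>

	#define EX_NANOSEC	1000000000ULL

	/* Estimate seconds remaining from bytes done, total bytes, and elapsed ns. */
	static uint64_t
	ex_eta_seconds(uint64_t bytes_done, uint64_t bytes_total, uint64_t elapsed_ns)
	{
		/* bytes per elapsed millisecond approximates KB/s, as in the original. */
		uint64_t kb_per_sec = 1 + bytes_done / (1 + elapsed_ns / 1000 / 1000);

		return ((bytes_total - bytes_done) / 1024 / kb_per_sec);
	}

	int
	main(void)
	{
		uint64_t done = 3ULL << 30, total = 10ULL << 30;	/* 3 GiB of 10 GiB */
		uint64_t eta = ex_eta_seconds(done, total, 60 * EX_NANOSEC);

		printf("%lluhr %02llumin %02llusec remaining\n",
		    (unsigned long long)(eta / 3600),
		    (unsigned long long)(eta / 60 % 60),
		    (unsigned long long)(eta % 60));
		return (0);
	}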
*/ msp->ms_tree->rt_ops = NULL; VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC)); msp->ms_loaded = B_TRUE; } mutex_exit(&msp->ms_lock); } } (void) fprintf(stderr, "\n"); } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); zdb_ddt_leak_init(spa, zcb); spa_config_exit(spa, SCL_CONFIG, FTAG); } static void zdb_leak_fini(spa_t *spa) { if (!dump_opt['L']) { vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; mutex_enter(&msp->ms_lock); /* * The ms_tree has been overloaded to * contain allocated segments. Now that we * finished traversing all blocks, any * block that remains in the ms_tree * represents an allocated block that we * did not claim during the traversal. * Claimed blocks would have been removed * from the ms_tree. */ range_tree_vacate(msp->ms_tree, zdb_leak, vd); msp->ms_loaded = B_FALSE; mutex_exit(&msp->ms_lock); } } } } /* ARGSUSED */ static int count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zdb_cb_t *zcb = arg; if (dump_opt['b'] >= 5) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("[%s] %s\n", "deferred free", blkbuf); } zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); return (0); } static int dump_block_stats(spa_t *spa) { zdb_cb_t zcb = { 0 }; zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; boolean_t leaks = B_FALSE; (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", dump_opt['c'] ? "checksums " : "", (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", !dump_opt['L'] ? "nothing leaked " : ""); /* * Load all space maps as SM_ALLOC maps, then traverse the pool * claiming each block we discover. If the pool is perfectly * consistent, the space maps will be empty when we're done. * Anything left over is a leak; any block we can't claim (because * it's not part of any space map) is a double allocation, * reference to a freed block, or an unclaimed log block. */ zdb_leak_init(spa, &zcb); /* * If there's a deferred-free bplist, process that first. */ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, count_block_cb, &zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, count_block_cb, &zcb, NULL); } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, &zcb, NULL)); } if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); /* * If we've traversed the data blocks then we need to wait for those * I/Os to complete. We leverage "The Godfather" zio to wait on * all async I/Os to complete. 
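The leak check sketched in zdb_leak_init/zdb_leak_fini is a set difference: seed the per-metaslab tree with every allocated segment, remove each block the traversal successfully claims, and report whatever survives. A toy model of that idea, with a plain array standing in for the metaslab range trees (all names and values here are illustrative):

	#include <stdio.h>
	#include <string.h>

	#define EX_NSEGS 16

	int
	main(void)
	{
		/* 1 = segment marked allocated by the (simulated) space map load. */
		char allocated[EX_NSEGS];
		/* Segments the (simulated) traversal actually claimed. */
		int claimed[] = { 1, 2, 5, 7 };

		memset(allocated, 0, sizeof (allocated));
		allocated[1] = allocated[2] = allocated[5] = allocated[7] = allocated[9] = 1;

		/* Claiming a block removes it from the allocated set. */
		for (size_t i = 0; i < sizeof (claimed) / sizeof (claimed[0]); i++)
			allocated[claimed[i]] = 0;

		/* Anything still marked allocated was never reached: a leak. */
		for (int s = 0; s < EX_NSEGS; s++)
			if (allocated[s])
				printf("leaked segment %d\n", s);
		return (0);
	}

In the real code the "allocated set" is built by loading space maps with SM_ALLOC and the "claim" step is zio_claim(); the toy only shows why anything left over at the end must be a leak.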
*/ if (dump_opt['c']) { for (int i = 0; i < max_ncpus; i++) { (void) zio_wait(spa->spa_async_zio_root[i]); spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } } if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); for (int e = 0; e < 256; e++) { if (zcb.zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb.zcb_errors[e]); } } } /* * Report any leaked segments. */ zdb_leak_fini(spa); tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)); total_found = tzb->zb_asize - zcb.zcb_dedup_asize; if (total_found == total_alloc) { if (!dump_opt['L']) (void) printf("\n\tNo leaks (block sum matches space" " maps exactly)\n"); } else { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", (u_longlong_t)total_found, (u_longlong_t)total_alloc, (dump_opt['L']) ? "unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); leaks = B_TRUE; } if (tzb->zb_count == 0) return (2); (void) printf("\n"); (void) printf("\tbp count: %10llu\n", (u_longlong_t)tzb->zb_count); (void) printf("\tganged count: %10llu\n", (longlong_t)tzb->zb_gangs); (void) printf("\tbp logical: %10llu avg: %6llu\n", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); (void) printf("\tbp physical: %10llu avg:" " %6llu compression: %6.2f\n", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); (void) printf("\tbp allocated: %10llu avg:" " %6llu compression: %6.2f\n", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); (void) printf("\tbp deduped: %10llu ref>1:" " %6llu deduplication: %6.2f\n", (u_longlong_t)zcb.zcb_dedup_asize, (u_longlong_t)zcb.zcb_dedup_blocks, (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb.zcb_embedded_blocks[i] == 0) continue; (void) printf("\n"); (void) printf("\tadditional, non-pointer bps of type %u: " "%10llu\n", i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); if (dump_opt['b'] >= 3) { (void) printf("\t number of (compressed) bytes: " "number of bps\n"); dump_histogram(zcb.zcb_embedded_histogram[i], sizeof (zcb.zcb_embedded_histogram[i]) / sizeof (zcb.zcb_embedded_histogram[i][0]), 0); } } if (tzb->zb_ditto_samevdev != 0) { (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); } if (dump_opt['b'] >= 2) { int l, t, level; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); for (t = 0; t <= ZDB_OT_TOTAL; t++) { char csize[32], lsize[32], psize[32], asize[32]; char avg[32], gang[32]; char *typename; if (t < DMU_OT_NUMTYPES) typename = dmu_ot[t].ot_name; else typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" "\t%5s\t%5s\t%6s\t%s\n", "-", "-", "-", "-", "-", "-", "-", typename); continue; } for (l = ZB_TOTAL - 1; l >= -1; l--) { level = (l == -1 ? 
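The summary lines compute their ratios directly from the running totals: logical over physical gives compression, and dedup savings over allocated size plus one gives the dedup factor. A short sketch of those formulas as printed by the block-stats summary, with made-up sample sizes:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t lsize = 100ULL << 20;		/* logical bytes */
		uint64_t psize = 40ULL << 20;		/* physical (compressed) bytes */
		uint64_t asize = 44ULL << 20;		/* allocated bytes */
		uint64_t dedup_asize = 11ULL << 20;	/* bytes saved by extra dedup refs */

		printf("compression:   %6.2f\n", (double)lsize / psize);
		printf("deduplication: %6.2f\n", (double)dedup_asize / asize + 1.0);
		return (0);
	}

The "+ 1.0" reflects that a pool with no duplicate references still has a dedup factor of 1.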
ZB_TOTAL : l); zb = &zcb.zcb_type[level][t]; if (zb->zb_asize == 0) continue; if (dump_opt['b'] < 3 && level != ZB_TOTAL) continue; if (level == 0 && zb->zb_asize == zcb.zcb_type[ZB_TOTAL][t].zb_asize) continue; zdb_nicenum(zb->zb_count, csize); zdb_nicenum(zb->zb_lsize, lsize); zdb_nicenum(zb->zb_psize, psize); zdb_nicenum(zb->zb_asize, asize); zdb_nicenum(zb->zb_asize / zb->zb_count, avg); zdb_nicenum(zb->zb_gangs, gang); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)zb->zb_lsize / zb->zb_psize, 100.0 * zb->zb_asize / tzb->zb_asize); if (level == ZB_TOTAL) (void) printf("%s\n", typename); else (void) printf(" L%d %s\n", level, typename); if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { (void) printf("\t number of ganged " "blocks: %s\n", gang); } if (dump_opt['b'] >= 4) { (void) printf("psize " "(in 512-byte sectors): " "number of blocks\n"); dump_histogram(zb->zb_psize_histogram, PSIZE_HISTO_SIZE, 0); } } } } (void) printf("\n"); if (leaks) return (2); if (zcb.zcb_haderrors) return (3); return (0); } typedef struct zdb_ddt_entry { ddt_key_t zdde_key; uint64_t zdde_ref_blocks; uint64_t zdde_ref_lsize; uint64_t zdde_ref_psize; uint64_t zdde_ref_dsize; avl_node_t zdde_node; } zdb_ddt_entry_t; /* ARGSUSED */ static int zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { avl_tree_t *t = arg; avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { (void) printf("traversing objset %llu, %llu objects, " "%lu blocks so far\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)BP_GET_FILL(bp), avl_numnodes(t)); } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); zdde = avl_find(t, &zdde_search, &where); if (zdde == NULL) { zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); zdde->zdde_key = zdde_search.zdde_key; avl_insert(t, zdde, where); } zdde->zdde_ref_blocks += 1; zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); zdde->zdde_ref_psize += BP_GET_PSIZE(bp); zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); return (0); } static void dump_simulated_ddt(spa_t *spa) { avl_tree_t t; void *cookie = NULL; zdb_ddt_entry_t *zdde; ddt_histogram_t ddh_total = { 0 }; ddt_stat_t dds_total = { 0 }; avl_create(&t, ddt_entry_compare, sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zdb_ddt_add_cb, &t); spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { ddt_stat_t dds; uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; dds.dds_psize = zdde->zdde_ref_psize / refcnt; dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; dds.dds_ref_blocks = zdde->zdde_ref_blocks; dds.dds_ref_lsize = zdde->zdde_ref_lsize; dds.dds_ref_psize = zdde->zdde_ref_psize; dds.dds_ref_dsize = zdde->zdde_ref_dsize; ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], &dds, 0); umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); ddt_histogram_stat(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); zpool_dump_ddt(&dds_total, &ddh_total); dump_dedup_ratio(&dds_total); } static void 
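The simulated DDT buckets each would-be table entry by the highest set bit of its reference count, so entries with 2-3 refs, 4-7 refs, and so on share a histogram row. A minimal sketch of that bucketing; ex_highbit64 is written out here because only its behavior matters, and the refcount values are illustrative:

	#include <stdio.h>
	#include <stdint.h>

	/* Position of the highest set bit, 1-based; 0 for x == 0 (mirrors highbit64()). */
	static int
	ex_highbit64(uint64_t x)
	{
		int h = 0;

		while (x != 0) {
			h++;
			x >>= 1;
		}
		return (h);
	}

	int
	main(void)
	{
		uint64_t refcnts[] = { 1, 2, 3, 4, 7, 8, 100 };

		for (size_t i = 0; i < sizeof (refcnts) / sizeof (refcnts[0]); i++)
			printf("refcnt %3llu -> histogram row %d\n",
			    (unsigned long long)refcnts[i],
			    ex_highbit64(refcnts[i]) - 1);
		return (0);
	}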
dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; if (dump_opt['S']) { dump_simulated_ddt(spa); return; } if (!dump_opt['e'] && dump_opt['C'] > 1) { (void) printf("\nCached configuration:\n"); dump_nvlist(spa->spa_config, 8); } if (dump_opt['C']) dump_config(spa); if (dump_opt['u']) dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); if (dump_opt['D']) dump_all_ddts(spa); if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa); if (dump_opt['d'] || dump_opt['i']) { uint64_t refcount; dump_dir(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees", 0); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj, "Pool snapshot frees", 0); } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { dump_bptree(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); } (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &refcount); if (num_large_blocks != refcount) { (void) printf("large_blocks feature refcount mismatch: " "expected %lld != actual %lld\n", (longlong_t)num_large_blocks, (longlong_t)refcount); rc = 2; } else { (void) printf("Verified large_blocks feature refcount " "is correct (%llu)\n", (longlong_t)refcount); } } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); if (rc == 0) rc = verify_spacemap_refcounts(spa); if (dump_opt['s']) show_pool_stats(spa); if (dump_opt['h']) dump_history(spa); if (rc != 0) exit(rc); } #define ZDB_FLAG_CHECKSUM 0x0001 #define ZDB_FLAG_DECOMPRESS 0x0002 #define ZDB_FLAG_BSWAP 0x0004 #define ZDB_FLAG_GBH 0x0008 #define ZDB_FLAG_INDIRECT 0x0010 #define ZDB_FLAG_PHYS 0x0020 #define ZDB_FLAG_RAW 0x0040 #define ZDB_FLAG_PRINT_BLKPTR 0x0080 int flagbits[256]; static void zdb_print_blkptr(blkptr_t *bp, int flags) { char blkbuf[BP_SPRINTF_LEN]; if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static void zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) { int i; for (i = 0; i < nbps; i++) zdb_print_blkptr(&bp[i], flags); } static void zdb_dump_gbh(void *buf, int flags) { zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); } static void zdb_dump_block_raw(void *buf, uint64_t size, int flags) { if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array(buf, size); (void) write(1, buf, size); } static void zdb_dump_block(char *label, void *buf, uint64_t size, int flags) { uint64_t *d = (uint64_t *)buf; int nwords = size / sizeof (uint64_t); int do_bswap = !!(flags & ZDB_FLAG_BSWAP); int i, j; char *hdr, *c; if (do_bswap) hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; else hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); for (i = 0; i < nwords; i += 2) { (void) printf("%06llx: %016llx %016llx ", (u_longlong_t)(i * sizeof (uint64_t)), (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); c = (char *)&d[i]; for (j = 0; j < 2 * sizeof (uint64_t); j++) (void) printf("%c", isprint(c[j]) ? 
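zdb_dump_block, defined further below, renders a buffer as rows of two 64-bit words plus their printable ASCII. A compact, self-contained version of that row format (16 bytes per line, dots for non-printable bytes), assuming a little-endian host and omitting the byteswap option; all ex_ names are illustrative:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <ctype.h>

	/* Print a buffer 16 bytes per row: offset, two 64-bit words, ASCII column. */
	static void
	ex_hexdump(const void *buf, size_t size)
	{
		const uint64_t *d = buf;
		const unsigned char *c = buf;
		size_t nwords = size / sizeof (uint64_t);

		for (size_t i = 0; i < nwords; i += 2) {
			printf("%06zx: %016llx %016llx  ",
			    i * sizeof (uint64_t),
			    (unsigned long long)d[i],
			    (i + 1 < nwords) ? (unsigned long long)d[i + 1] : 0ULL);
			for (size_t j = 0; j < 16 && i * 8 + j < size; j++)
				putchar(isprint(c[i * 8 + j]) ? c[i * 8 + j] : '.');
			putchar('\n');
		}
	}

	int
	main(void)
	{
		uint64_t storage[8];

		memset(storage, 0, sizeof (storage));
		strcpy((char *)storage, "zdb hexdump sketch");
		ex_hexdump(storage, sizeof (storage));
		return (0);
	}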
c[j] : '.'); (void) printf("\n"); } } /* * There are two acceptable formats: * leaf_name - For example: c1t0d0 or /tmp/ztest.0a * child[.child]* - For example: 0.1.1 * * The second form can be used to specify arbitrary vdevs anywhere * in the heirarchy. For example, in a pool with a mirror of * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . */ static vdev_t * zdb_vdev_lookup(vdev_t *vdev, char *path) { char *s, *p, *q; int i; if (vdev == NULL) return (NULL); /* First, assume the x.x.x.x format */ i = (int)strtoul(path, &s, 10); if (s == path || (s && *s != '.' && *s != '\0')) goto name; if (i < 0 || i >= vdev->vdev_children) return (NULL); vdev = vdev->vdev_child[i]; if (*s == '\0') return (vdev); return (zdb_vdev_lookup(vdev, s+1)); name: for (i = 0; i < vdev->vdev_children; i++) { vdev_t *vc = vdev->vdev_child[i]; if (vc->vdev_path == NULL) { vc = zdb_vdev_lookup(vc, path); if (vc == NULL) continue; else return (vc); } p = strrchr(vc->vdev_path, '/'); p = p ? p + 1 : vc->vdev_path; q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; if (strcmp(vc->vdev_path, path) == 0) return (vc); if (strcmp(p, path) == 0) return (vc); if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) return (vc); } return (NULL); } /* * Read a block from a pool and print it out. The syntax of the * block descriptor is: * * pool:vdev_specifier:offset:size[:flags] * * pool - The name of the pool you wish to read from * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) * offset - offset, in hex, in bytes * size - Amount of data to read, in hex, in bytes * flags - A string of characters specifying options * b: Decode a blkptr at given offset within block * *c: Calculate and display checksums * d: Decompress data before dumping * e: Byteswap data before dumping * g: Display data as a gang block header * i: Display as an indirect block * p: Do I/O to physical offset * r: Dump raw data to stdout * * * = not yet implemented */ static void zdb_read_block(char *thing, spa_t *spa) { blkptr_t blk, *bp = &blk; dva_t *dva = bp->blk_dva; int flags = 0; uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; void *pbuf, *lbuf, *buf; char *s, *p, *dup, *vdev, *flagstr; int i, error; dup = strdup(thing); s = strtok(dup, ":"); vdev = s ? s : ""; s = strtok(NULL, ":"); offset = strtoull(s ? s : "", NULL, 16); s = strtok(NULL, ":"); size = strtoull(s ? s : "", NULL, 16); s = strtok(NULL, ":"); flagstr = s ? 
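zdb_read_block, described just below, splits its vdev:offset:size[:flags] argument with strtok and parses offset and size as hex before validating sector alignment. A minimal sketch of that tokenizing with the I/O left out; EX_SECTOR and the sample specifier are illustrative:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <stdint.h>

	#define EX_SECTOR 512

	int
	main(void)
	{
		char spec[] = "0.0:400000:2000:d";	/* vdev:offset:size[:flags] */
		char *s, *vdev, *flags;
		uint64_t offset, size;

		vdev = strtok(spec, ":");
		s = strtok(NULL, ":");
		offset = strtoull(s ? s : "", NULL, 16);
		s = strtok(NULL, ":");
		size = strtoull(s ? s : "", NULL, 16);
		flags = strtok(NULL, ":");

		if (size == 0 || (size % EX_SECTOR) != 0 || (offset % EX_SECTOR) != 0) {
			fprintf(stderr, "invalid block specifier\n");
			return (1);
		}
		printf("vdev %s offset 0x%llx size 0x%llx flags %s\n",
		    vdev ? vdev : "", (unsigned long long)offset,
		    (unsigned long long)size, flags ? flags : "");
		return (0);
	}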
s : ""; s = NULL; if (size == 0) s = "size must not be zero"; if (!IS_P2ALIGNED(size, DEV_BSIZE)) s = "size must be a multiple of sector size"; if (!IS_P2ALIGNED(offset, DEV_BSIZE)) s = "offset must be a multiple of sector size"; if (s) { (void) printf("Invalid block specifier: %s - %s\n", thing, s); free(dup); return; } for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { for (i = 0; flagstr[i]; i++) { int bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { (void) printf("***Invalid flag: %c\n", flagstr[i]); continue; } flags |= bit; /* If it's not something with an argument, keep going */ if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_PRINT_BLKPTR)) == 0) continue; p = &flagstr[i + 1]; if (bit == ZDB_FLAG_PRINT_BLKPTR) blkptr_offset = strtoull(p, &p, 16); if (*p != ':' && *p != '\0') { (void) printf("***Invalid flag arg: '%s'\n", s); free(dup); return; } i += p - &flagstr[i + 1]; /* skip over the number */ } } vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { (void) printf("***Invalid vdev: %s\n", vdev); free(dup); return; } else { if (vd->vdev_path) (void) fprintf(stderr, "Found vdev: %s\n", vd->vdev_path); else (void) fprintf(stderr, "Found vdev type: %s\n", vd->vdev_ops->vdev_op_type); } psize = size; lsize = size; pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); DVA_SET_VDEV(&dva[0], vd->vdev_id); DVA_SET_OFFSET(&dva[0], offset); DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, lsize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, 0); if (vd == vd->vdev_top) { /* * Treat this as a normal block read. */ zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); } error = zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); if (error) { (void) printf("Read of %s failed, error: %d\n", thing, error); goto out; } if (flags & ZDB_FLAG_DECOMPRESS) { /* * We don't know how the data was compressed, so just try * every decompress function at every inflated blocksize. 
*/ enum zio_compress c; void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); bcopy(pbuf, pbuf2, psize); VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize, SPA_MAXBLOCKSIZE - psize) == 0); VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, SPA_MAXBLOCKSIZE - psize) == 0); for (lsize = SPA_MAXBLOCKSIZE; lsize > psize; lsize -= SPA_MINBLOCKSIZE) { for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) { if (zio_decompress_data(c, pbuf, lbuf, psize, lsize) == 0 && zio_decompress_data(c, pbuf2, lbuf2, psize, lsize) == 0 && bcmp(lbuf, lbuf2, lsize) == 0) break; } if (c != ZIO_COMPRESS_FUNCTIONS) break; lsize -= SPA_MINBLOCKSIZE; } umem_free(pbuf2, SPA_MAXBLOCKSIZE); umem_free(lbuf2, SPA_MAXBLOCKSIZE); if (lsize <= psize) { (void) printf("Decompress of %s failed\n", thing); goto out; } buf = lbuf; size = lsize; } else { buf = pbuf; size = psize; } if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); else if (flags & ZDB_FLAG_RAW) zdb_dump_block_raw(buf, size, flags); else if (flags & ZDB_FLAG_INDIRECT) zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) zdb_dump_gbh(buf, flags); else zdb_dump_block(thing, buf, size, flags); out: umem_free(pbuf, SPA_MAXBLOCKSIZE); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); } static boolean_t pool_match(nvlist_t *cfg, char *tgt) { uint64_t v, guid = strtoull(tgt, NULL, 0); char *s; if (guid != 0) { if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) return (v == guid); } else { if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) return (strcmp(s, tgt) == 0); } return (B_FALSE); } static char * find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv) { nvlist_t *pools; nvlist_t *match = NULL; char *name = NULL; char *sepp = NULL; char sep; int count = 0; importargs_t args = { 0 }; args.paths = dirc; args.path = dirv; args.can_be_active = B_TRUE; if ((sepp = strpbrk(*target, "/@")) != NULL) { sep = *sepp; *sepp = '\0'; } pools = zpool_search_import(g_zfs, &args); if (pools != NULL) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, configp) == 0); if (pool_match(*configp, *target)) { count++; if (match != NULL) { /* print previously found config */ if (name != NULL) { (void) printf("%s\n", name); dump_nvlist(match, 8); name = NULL; } (void) printf("%s\n", nvpair_name(elem)); dump_nvlist(*configp, 8); } else { match = *configp; name = nvpair_name(elem); } } } } if (count > 1) (void) fatal("\tMatched %d pools - use pool GUID " "instead of pool name or \n" "\tpool name part of a dataset name to select pool", count); if (sepp) *sepp = sep; /* * If pool GUID was specified for pool id, replace it with pool name */ if (name && (strstr(*target, name) != *target)) { int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0); *target = umem_alloc(sz, UMEM_NOFAIL); (void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : ""); } *configp = name ? 
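pool_match, defined a little further on, treats the target as a pool GUID when it parses as a nonzero number and as a pool name otherwise. A small sketch of that dispatch, with a plain struct standing in for the nvlist config; the ex_ names and sample GUID are illustrative:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <stdint.h>

	struct ex_pool {
		const char *name;
		uint64_t guid;
	};

	/* Match by GUID if tgt is a nonzero number, otherwise by name. */
	static int
	ex_pool_match(const struct ex_pool *p, const char *tgt)
	{
		uint64_t guid = strtoull(tgt, NULL, 0);

		if (guid != 0)
			return (p->guid == guid);
		return (strcmp(p->name, tgt) == 0);
	}

	int
	main(void)
	{
		struct ex_pool p = { "tank", 1234567890123456789ULL };

		printf("%d %d %d\n",
		    ex_pool_match(&p, "tank"),
		    ex_pool_match(&p, "1234567890123456789"),
		    ex_pool_match(&p, "dozer"));
		return (0);
	}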
match : NULL; return (name); } int main(int argc, char **argv) { int i, c; struct rlimit rl = { 1024, 1024 }; spa_t *spa = NULL; objset_t *os = NULL; int dump_all = 1; int verbose = 0; int error = 0; char **searchdirs = NULL; int nsearch = 0; char *target; nvlist_t *policy = NULL; uint64_t max_txg = UINT64_MAX; int rewind = ZPOOL_NEVER_REWIND; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); dprintf_setup(&argc, argv); while ((c = getopt(argc, argv, "bcdhilmMI:suCDRSAFLXx:evp:t:U:P")) != -1) { switch (c) { case 'b': case 'c': case 'd': case 'h': case 'i': case 'l': case 'm': case 's': case 'u': case 'C': case 'D': case 'M': case 'R': case 'S': dump_opt[c]++; dump_all = 0; break; case 'A': case 'F': case 'L': case 'X': case 'e': case 'P': dump_opt[c]++; break; case 'I': max_inflight = strtoull(optarg, NULL, 0); if (max_inflight == 0) { (void) fprintf(stderr, "maximum number " "of inflight I/Os must be greater " "than 0\n"); usage(); } break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); } else { char **tmp = umem_alloc((nsearch + 1) * sizeof (char *), UMEM_NOFAIL); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 't': max_txg = strtoull(optarg, NULL, 0); if (max_txg < TXG_INITIAL) { (void) fprintf(stderr, "incorrect txg " "specified: %s\n", optarg); usage(); } break; case 'U': spa_config_path = optarg; break; case 'v': verbose++; break; case 'x': vn_dumpdir = optarg; break; default: usage(); break; } } if (!dump_opt['e'] && searchdirs != NULL) { (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } /* * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; /* * "zdb -c" uses checksum-verifying scrub i/os which are async reads. * "zdb -b" uses traversal prefetch which uses async reads. * For good performance, let several of them be active at once. */ zfs_vdev_async_read_max_active = 10; kernel_init(FREAD); g_zfs = libzfs_init(); ASSERT(g_zfs != NULL); if (dump_all) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { if (dump_all && !strchr("elAFLRSXP", c)) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); zfs_recover = (dump_opt['A'] > 1); argc -= optind; argv += optind; if (argc < 2 && dump_opt['R']) usage(); if (argc < 1) { if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); return (0); } usage(); } if (dump_opt['l']) { dump_label(argv[0]); return (0); } if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | (dump_opt['X'] ? 
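In main(), each -p argument is appended to a growing array of search directories by allocating one more slot, copying the old contents, and freeing the old array. A minimal sketch of that append step using plain libc allocation rather than the umem wrappers; the sample paths are illustrative:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int
	main(void)
	{
		char **dirs = NULL;
		int ndirs = 0;
		const char *args[] = { "/dev/dsk", "/tmp/vdevs", "/var/tmp" };

		for (size_t i = 0; i < sizeof (args) / sizeof (args[0]); i++) {
			/* Grow by one slot per option, as the -p handler does. */
			char **tmp = malloc((ndirs + 1) * sizeof (char *));

			if (tmp == NULL)
				return (1);
			if (dirs != NULL) {
				memcpy(tmp, dirs, ndirs * sizeof (char *));
				free(dirs);
			}
			dirs = tmp;
			dirs[ndirs++] = (char *)args[i];
		}
		for (int i = 0; i < ndirs; i++)
			printf("search dir %d: %s\n", i, dirs[i]);
		free(dirs);
		return (0);
	}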
ZPOOL_EXTREME_REWIND : 0); if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 || nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0) fatal("internal error: %s", strerror(ENOMEM)); error = 0; target = argv[0]; if (dump_opt['e']) { nvlist_t *cfg = NULL; char *name = find_zpool(&target, &cfg, nsearch, searchdirs); error = ENOENT; if (name) { if (dump_opt['C'] > 1) { (void) printf("\nConfiguration for import:\n"); dump_nvlist(cfg, 8); } if (nvlist_add_nvlist(cfg, ZPOOL_REWIND_POLICY, policy) != 0) { fatal("can't open '%s': %s", target, strerror(ENOMEM)); } if ((error = spa_import(name, cfg, NULL, ZFS_IMPORT_MISSING_LOG)) != 0) { error = spa_import(name, cfg, NULL, ZFS_IMPORT_VERBATIM); } } } if (error == 0) { if (strpbrk(target, "/@") == NULL || dump_opt['R']) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); if (error) { /* * If we're missing the log device then * try opening the pool after clearing the * log state. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL && spa->spa_log_state == SPA_LOG_MISSING) { spa->spa_log_state = SPA_LOG_CLEAR; error = 0; } mutex_exit(&spa_namespace_lock); if (!error) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); } } } else { error = dmu_objset_own(target, DMU_OST_ANY, B_TRUE, FTAG, &os); } } nvlist_free(policy); if (error) fatal("can't open '%s': %s", target, strerror(error)); argv++; argc--; if (!dump_opt['R']) { if (argc > 0) { zopt_objects = argc; zopt_object = calloc(zopt_objects, sizeof (uint64_t)); for (i = 0; i < zopt_objects; i++) { errno = 0; zopt_object[i] = strtoull(argv[i], NULL, 0); if (zopt_object[i] == 0 && errno != 0) fatal("bad number %s: %s", argv[i], strerror(errno)); } } if (os != NULL) { dump_dir(os); } else if (zopt_objects > 0 && !dump_opt['m']) { dump_dir(spa->spa_meta_objset); } else { dump_zpool(spa); } } else { flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; flagbits['c'] = ZDB_FLAG_CHECKSUM; flagbits['d'] = ZDB_FLAG_DECOMPRESS; flagbits['e'] = ZDB_FLAG_BSWAP; flagbits['g'] = ZDB_FLAG_GBH; flagbits['i'] = ZDB_FLAG_INDIRECT; flagbits['p'] = ZDB_FLAG_PHYS; flagbits['r'] = ZDB_FLAG_RAW; for (i = 0; i < argc; i++) zdb_read_block(argv[i], spa); } (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG); fuid_table_destroy(); sa_loaded = B_FALSE; libzfs_fini(g_zfs); kernel_fini(); return (0); } Index: head/cddl/contrib/opensolaris =================================================================== --- head/cddl/contrib/opensolaris (revision 275810) +++ head/cddl/contrib/opensolaris (revision 275811) Property changes on: head/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor/illumos/dist:r275783 Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 275810) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 275811) @@ -1,5801 +1,5785 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (rangeing from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal arc algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * * Buffers do not have their own mutexs, rather they rely on the * hash table mutexs for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexs). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. 
It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each arc state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an arc list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * Arc buffers may have an associated eviction callback function. * This function will be invoked prior to removing the buffer (e.g. * in arc_do_user_evicts()). Note however that the data associated * with the buffer may be evicted prior to the callback. The callback * must be made with *no locks held* (to prevent deadlock). Additionally, * the users of callbacks must ensure that their private data is * protected from simultaneous callbacks from arc_clear_callback() * and arc_do_user_evicts(). * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif #include #include #include #include #include #include #include #ifdef illumos #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; int arc_procfd; #endif #endif /* illumos */ static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; #define ARC_REDUCE_DNLC_PERCENT 3 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; typedef enum arc_reclaim_strategy { ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ ARC_RECLAIM_CONS /* Conservative reclaim strategy */ } arc_reclaim_strategy_t; /* * The number of iterations through arc_evict_*() before we * drop & reacquire the lock. */ int arc_evict_iterations = 100; /* number of seconds before growing cache again */ static int arc_grow_retry = 60; /* shift of arc_c for calculating both min and max arc_p */ static int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ static int arc_shrink_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static int arc_min_prefetch_lifespan; /* * If this percent of memory is free, don't throttle. */ int arc_lotsfree_percent = 10; static int arc_dead; extern int zfs_prefetch_disable; /* * The arc has filled available memory and has now warmed up. 
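The lock-ordering rule stated above (take a hash table lock only with a try-lock while an ARC list lock is held) is a standard deadlock-avoidance pattern: never block on the second lock while holding the first; on contention, back off, or in the ARC's case skip the buffer and bump a miss counter. A small pthreads sketch of the general pattern, not the kernel mutex API used here:

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Try to take hash_lock while holding list_lock; on contention, drop and retry. */
	static void
	ex_locked_step(void)
	{
		for (;;) {
			pthread_mutex_lock(&list_lock);
			if (pthread_mutex_trylock(&hash_lock) == 0)
				break;
			/* Contended: release the first lock so the other thread can progress. */
			pthread_mutex_unlock(&list_lock);
		}
		/* ... both locks held here ... */
		pthread_mutex_unlock(&hash_lock);
		pthread_mutex_unlock(&list_lock);
	}

	int
	main(void)
	{
		ex_locked_step();
		printf("done\n");
		return (0);
	}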
*/ static boolean_t arc_warm; uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; uint64_t zfs_arc_meta_min = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; int zfs_disable_dup_eviction = 0; uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ u_int zfs_arc_free_target = 0; static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); #ifdef _KERNEL static void arc_free_target_init(void *unused __unused) { zfs_arc_free_target = vm_pageout_wakeup_thresh; } SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, arc_free_target_init, NULL); TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); SYSCTL_DECL(_vfs_zfs); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, "Minimum ARC size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, &zfs_arc_average_blocksize, 0, "ARC average blocksize"); SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, &arc_shrink_shift, 0, "log2(fraction of arc to reclaim)"); /* * We don't have a tunable for arc_free_target due to the dependency on * pagedaemon initialisation. */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), sysctl_vfs_zfs_arc_free_target, "IU", "Desired number of free pages below which ARC triggers reclaim"); static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) { u_int val; int err; val = zfs_arc_free_target; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < minfree) return (EINVAL); if (val > vm_cnt.v_page_count) return (EINVAL); zfs_arc_free_target = val; return (0); } /* * Must be declared here, before the definition of corresponding kstat * macro which uses the same names will confuse the compiler. */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_meta_limit, "QU", "ARC metadata limit"); #endif /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recentely used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to the buffer, they are * are linked onto a list in one of these arc states. These are * the only buffers that can be evicted or deleted. Within each * state there are multiple lists, one for meta-data and one for * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, * etc.) is tracked separately so that it can be managed more * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies * before they are written to stable storage. By definition, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will aquire a DVA * as they are written and migrate onto the arc_mru list. * * The ARC_l2c_only state is for buffers that are in the second * level ARC but no longer in any of the ARC_m* lists. 
The second * level ARC itself may also contain buffers that are in any of * the ARC_m* states - meaning that a buffer can exist in two * places. The reason for the ARC_l2c_only state is to keep the * buffer header in the hash table, so that reads that hit the * second level ARC benefit from these fast lookups. */ #define ARCS_LOCK_PAD CACHE_LINE_SIZE struct arcs_lock { kmutex_t arcs_lock; #ifdef _KERNEL unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; #endif }; /* * must be power of two for mask use to work * */ #define ARC_BUFC_NUMDATALISTS 16 #define ARC_BUFC_NUMMETADATALISTS 16 #define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) typedef struct arc_state { uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ uint64_t arcs_size; /* total amount of data in this state */ list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */ struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); } arc_state_t; #define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; kstat_named_t arcstat_misses; kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_misses; kstat_named_t arcstat_demand_metadata_hits; kstat_named_t arcstat_demand_metadata_misses; kstat_named_t arcstat_prefetch_data_hits; kstat_named_t arcstat_prefetch_data_misses; kstat_named_t arcstat_prefetch_metadata_hits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_allocated; kstat_named_t arcstat_deleted; kstat_named_t arcstat_stolen; kstat_named_t arcstat_recycle_miss; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. The lock may not necessarily be held * by something using the same buffer, since hash locks are shared * by multiple buffers. */ kstat_named_t arcstat_mutex_miss; /* * Number of buffers skipped because they have I/O in progress, are * indrect prefetch buffers that have not lived long enough, or are * not from the spa we're trying to evict from. 
*/ kstat_named_t arcstat_evict_skip; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; kstat_named_t arcstat_p; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; kstat_named_t arcstat_hdr_size; kstat_named_t arcstat_data_size; kstat_named_t arcstat_other_size; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_hdr_miss; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_l2_compress_successes; kstat_named_t arcstat_l2_compress_zeros; kstat_named_t arcstat_l2_compress_failures; kstat_named_t arcstat_l2_write_trylock_fail; kstat_named_t arcstat_l2_write_passed_headroom; kstat_named_t arcstat_l2_write_spa_mismatch; kstat_named_t arcstat_l2_write_in_l2; kstat_named_t arcstat_l2_write_hdr_io_in_progress; kstat_named_t arcstat_l2_write_not_cacheable; kstat_named_t arcstat_l2_write_full; kstat_named_t arcstat_l2_write_buffer_iter; kstat_named_t arcstat_l2_write_pios; kstat_named_t arcstat_l2_write_buffer_bytes_scanned; kstat_named_t arcstat_l2_write_buffer_list_iter; kstat_named_t arcstat_l2_write_buffer_list_null_iter; kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_duplicate_buffers; kstat_named_t arcstat_duplicate_buffers_size; kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; } arc_stats_t; static arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "allocated", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "stolen", KSTAT_DATA_UINT64 }, { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "p", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { 
"c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "other_size", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_compress_successes", KSTAT_DATA_UINT64 }, { "l2_compress_zeros", KSTAT_DATA_UINT64 }, { "l2_compress_failures", KSTAT_DATA_UINT64 }, { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, { "l2_write_in_l2", KSTAT_DATA_UINT64 }, { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, { "l2_write_full", KSTAT_DATA_UINT64 }, { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, { "l2_write_pios", KSTAT_DATA_UINT64 }, { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "duplicate_buffers", KSTAT_DATA_UINT64 }, { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, { "duplicate_reads", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ atomic_add_64(&arc_stats.stat.value.ui64, (val)) #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } #define ARCSTAT_MAXSTAT(stat) \ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. 
For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ #define L2ARC_IS_VALID_COMPRESS(_c_) \ ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_done_func_t *acb_done; arc_buf_t *acb_buf; zio_t *acb_zio_dummy; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { void *awcb_private; arc_done_func_t *awcb_ready; arc_done_func_t *awcb_physdone; arc_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; uint64_t b_cksum0; kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; void *b_thawed; arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - uint32_t b_flags; + arc_flags_t b_flags; uint32_t b_datacnt; arc_callback_t *b_acb; kcondvar_t b_cv; /* immutable */ arc_buf_contents_t b_type; uint64_t b_size; uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; list_node_t b_arc_node; /* updated atomically */ clock_t b_arc_access; /* self protecting */ refcount_t b_refcnt; l2arc_buf_hdr_t *b_l2hdr; list_node_t b_l2node; }; #ifdef _KERNEL static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = arc_meta_limit; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val <= 0 || val > arc_c_max) return (EINVAL); arc_meta_limit = val; return (0); } #endif static arc_buf_t *arc_eviction_list; static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; -static void arc_get_data_buf(arc_buf_t *buf); -static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); -static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); -#ifdef illumos -static void arc_buf_watch(arc_buf_t *buf); -#endif /* illumos */ -static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); - #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) -/* - * Private ARC flags. These flags are private ARC only flags that will show up - * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can - * be passed in as arc_flags in things like arc_read. However, these flags - * should never be passed and should only be set by ARC code. When adding new - * public flags, make sure not to smash the private ones. 
- */ +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) +#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) +#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) +#define HDR_FREE_IN_PROGRESS(hdr) \ + ((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS) +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2_READING(hdr) \ + ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS && \ + (hdr)->b_l2hdr != NULL) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) -#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ -#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ -#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ -#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ -#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ -#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ -#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ -#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ -#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ -#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ - -#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) -#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) -#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) -#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) -#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) -#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) -#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ - (hdr)->b_l2hdr != NULL) -#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) -#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) -#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) - /* * Other sizes */ #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) /* * Hash table routines */ #define HT_LOCK_PAD CACHE_LINE_SIZE struct ht_lock { kmutex_t ht_lock; #ifdef _KERNEL unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; #endif }; #define BUF_LOCKS 256 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. 
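The buffer hash table defined below protects its chains with a fixed pool of 256 cache-line-padded locks selected by masking the hash index, so unrelated headers rarely contend on the same mutex. A sketch of that lock-striping selection, with pthread mutexes standing in for the kmutex_t array; the ex_ names are illustrative:

	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>

	#define EX_BUF_LOCKS 256	/* power of two so masking works */

	static pthread_mutex_t ex_locks[EX_BUF_LOCKS];

	/* Pick the stripe lock for a given hash index. */
	static pthread_mutex_t *
	ex_hash_lock(uint64_t idx)
	{
		return (&ex_locks[idx & (EX_BUF_LOCKS - 1)]);
	}

	int
	main(void)
	{
		uint64_t idx = 0x1234abcdULL;
		pthread_mutex_t *lk;

		for (int i = 0; i < EX_BUF_LOCKS; i++)
			pthread_mutex_init(&ex_locks[i], NULL);

		lk = ex_hash_lock(idx);
		pthread_mutex_lock(lk);
		printf("index %llu uses stripe %llu\n",
		    (unsigned long long)idx,
		    (unsigned long long)(idx & (EX_BUF_LOCKS - 1)));
		pthread_mutex_unlock(lk);
		return (0);
	}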
*/ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) /* L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, &l2arc_write_max, 0, "max write size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, &l2arc_write_boost, 0, "extra write during warmup"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, &l2arc_headroom, 0, "number of dev writes"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, &l2arc_feed_secs, 0, "interval seconds"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, &l2arc_feed_min_ms, 0, "min interval milliseconds"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, &l2arc_noprefetch, 0, "don't cache prefetch bufs"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, &l2arc_feed_again, 0, "turbo warmup"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, &l2arc_norw, 0, "no reads during writes"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, &ARC_anon.arcs_size, 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, &ARC_mru.arcs_size, 0, "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, &ARC_mfu.arcs_size, 0, "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, "size 
of metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, &ARC_l2c_only.arcs_size, 0, "size of mru state"); /* * L2ARC Internals */ typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ list_t *l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_t *l2rcb_buf; /* read buffer */ spa_t *l2rcb_spa; /* spa */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ /* compression applied to buffer data */ enum zio_compress b_compress; /* real alloc'd buffer size depending on b_compress applied */ int b_asize; /* temporary buffer holder for in-flight compressed data */ void *b_tmp_cdata; }; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; void (*l2df_func)(void *, size_t); list_node_t l2df_list_node; } l2arc_data_free_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static void l2arc_read_done(zio_t *zio); +static void arc_get_data_buf(arc_buf_t *); +static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static int arc_evict_needed(arc_buf_contents_t); +static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); +static void arc_buf_watch(arc_buf_t *); + +static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); +static void l2arc_read_done(zio_t *); static void l2arc_hdr_stat_add(void); static void l2arc_hdr_stat_remove(void); -static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); -static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, - enum zio_compress c); -static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); +static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *); +static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); +static void l2arc_release_cdata_buf(arc_buf_hdr_t *); static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i 
= 0; i < sizeof (dva_t); i++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; crc ^= (spa>>8) ^ birth; return (crc); } #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ (buf)->b_dva.dva_word[1] == 0 && \ (buf)->b_cksum0 == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; hdr->b_cksum0 = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *buf; + arc_buf_hdr_t *hdr; mutex_enter(hash_lock); - for (buf = buf_hash_table.ht_table[idx]; buf != NULL; - buf = buf->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, buf)) { + for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; + hdr = hdr->b_hash_next) { + if (BUF_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; - return (buf); + return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. */ static arc_buf_hdr_t * -buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) +buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *fbuf; + arc_buf_hdr_t *fhdr; uint32_t i; - ASSERT(!DVA_IS_EMPTY(&buf->b_dva)); - ASSERT(buf->b_birth != 0); - ASSERT(!HDR_IN_HASH_TABLE(buf)); + ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); + ASSERT(hdr->b_birth != 0); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); *lockp = hash_lock; mutex_enter(hash_lock); - for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; - fbuf = fbuf->b_hash_next, i++) { - if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) - return (fbuf); + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; + fhdr = fhdr->b_hash_next, i++) { + if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + return (fhdr); } - buf->b_hash_next = buf_hash_table.ht_table[idx]; - buf_hash_table.ht_table[idx] = buf; - buf->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = hdr; + hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } ARCSTAT_BUMP(arcstat_hash_elements); ARCSTAT_MAXSTAT(arcstat_hash_elements); return (NULL); } static void -buf_hash_remove(arc_buf_hdr_t *buf) +buf_hash_remove(arc_buf_hdr_t *hdr) { - arc_buf_hdr_t *fbuf, **bufp; - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + arc_buf_hdr_t *fhdr, **hdrp; + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); - ASSERT(HDR_IN_HASH_TABLE(buf)); + ASSERT(HDR_IN_HASH_TABLE(hdr)); - bufp = &buf_hash_table.ht_table[idx]; - while ((fbuf = *bufp) != buf) { - ASSERT(fbuf != NULL); - bufp = 
&fbuf->b_hash_next; + hdrp = &buf_hash_table.ht_table[idx]; + while ((fhdr = *hdrp) != hdr) { + ASSERT(fhdr != NULL); + hdrp = &fhdr->b_hash_next; } - *bufp = buf->b_hash_next; - buf->b_hash_next = NULL; - buf->b_flags &= ~ARC_IN_HASH_TABLE; + *hdrp = hdr->b_hash_next; + hdr->b_hash_next = NULL; + hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { int i; kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ /* ARGSUSED */ static int hdr_cons(void *vbuf, void *unused, int kmflag) { - arc_buf_hdr_t *buf = vbuf; + arc_buf_hdr_t *hdr = vbuf; - bzero(buf, sizeof (arc_buf_hdr_t)); - refcount_create(&buf->b_refcnt); - cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + bzero(hdr, sizeof (arc_buf_hdr_t)); + refcount_create(&hdr->b_refcnt); + cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int buf_cons(void *vbuf, void *unused, int kmflag) { arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ /* ARGSUSED */ static void hdr_dest(void *vbuf, void *unused) { - arc_buf_hdr_t *buf = vbuf; + arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(buf)); - refcount_destroy(&buf->b_refcnt); - cv_destroy(&buf->b_cv); - mutex_destroy(&buf->b_freeze_lock); + ASSERT(BUF_EMPTY(hdr)); + refcount_destroy(&hdr->b_refcnt); + cv_destroy(&hdr->b_cv); + mutex_destroy(&hdr->b_freeze_lock); arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); } /* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } /* * Reclaim callback -- invoked when memory is low. */ /* ARGSUSED */ static void hdr_recl(void *unused) { dprintf("hdr_recl called\n"); /* * umem calls the reclaim func when we destroy the buf cache, * which is after we do arc_fini(). */ if (!arc_dead) cv_signal(&arc_reclaim_thr_cv); } static void buf_init(void) { uint64_t *ct; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
*/ while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) { mutex_init(&buf_hash_table.ht_locks[i].ht_lock, NULL, MUTEX_DEFAULT, NULL); } } #define ARC_MINTIME (hz>>4) /* 62 ms */ static void arc_cksum_verify(arc_buf_t *buf) { zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum == NULL || - (buf->b_hdr->b_flags & ARC_IO_ERROR)) { + (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) { mutex_exit(&buf->b_hdr->b_freeze_lock); return; } fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&buf->b_hdr->b_freeze_lock); } static int arc_cksum_equal(arc_buf_t *buf) { zio_cksum_t zc; int equal; mutex_enter(&buf->b_hdr->b_freeze_lock); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); mutex_exit(&buf->b_hdr->b_freeze_lock); return (equal); } static void arc_cksum_compute(arc_buf_t *buf, boolean_t force) { if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { mutex_exit(&buf->b_hdr->b_freeze_lock); return; } buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); mutex_exit(&buf->b_hdr->b_freeze_lock); #ifdef illumos arc_buf_watch(buf); #endif /* illumos */ } #ifdef illumos #ifndef _KERNEL typedef struct procctl { long cmd; prwatch_t prwatch; } procctl_t; #endif /* ARGSUSED */ static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { int result; procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; ctl.prwatch.pr_size = 0; ctl.prwatch.pr_wflags = 0; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); } #endif } /* ARGSUSED */ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { int result; procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; ctl.prwatch.pr_size = buf->b_hdr->b_size; ctl.prwatch.pr_wflags = WA_WRITE; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); } #endif } #endif /* illumos */ void arc_buf_thaw(arc_buf_t *buf) { if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_state != arc_anon) panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_thawed) kmem_free(buf->b_hdr->b_thawed, 
1); buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); } mutex_exit(&buf->b_hdr->b_freeze_lock); #ifdef illumos arc_buf_unwatch(buf); #endif /* illumos */ } void arc_buf_freeze(arc_buf_t *buf) { kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); mutex_exit(hash_lock); } static void -get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock) +get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock) { - uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth); + uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth); - if (ab->b_type == ARC_BUFC_METADATA) + if (hdr->b_type == ARC_BUFC_METADATA) buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); else { buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); buf_hashid += ARC_BUFC_NUMMETADATALISTS; } *list = &state->arcs_lists[buf_hashid]; *lock = ARCS_LOCK(state, buf_hashid); } static void -add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { ASSERT(MUTEX_HELD(hash_lock)); - if ((refcount_add(&ab->b_refcnt, tag) == 1) && - (ab->b_state != arc_anon)) { - uint64_t delta = ab->b_size * ab->b_datacnt; - uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; + if ((refcount_add(&hdr->b_refcnt, tag) == 1) && + (hdr->b_state != arc_anon)) { + uint64_t delta = hdr->b_size * hdr->b_datacnt; + uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; list_t *list; kmutex_t *lock; - get_buf_info(ab, ab->b_state, &list, &lock); + get_buf_info(hdr, hdr->b_state, &list, &lock); ASSERT(!MUTEX_HELD(lock)); mutex_enter(lock); - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(list, ab); - if (GHOST_STATE(ab->b_state)) { - ASSERT0(ab->b_datacnt); - ASSERT3P(ab->b_buf, ==, NULL); - delta = ab->b_size; + ASSERT(list_link_active(&hdr->b_arc_node)); + list_remove(list, hdr); + if (GHOST_STATE(hdr->b_state)) { + ASSERT0(hdr->b_datacnt); + ASSERT3P(hdr->b_buf, ==, NULL); + delta = hdr->b_size; } ASSERT(delta > 0); ASSERT3U(*size, >=, delta); atomic_add_64(size, -delta); mutex_exit(lock); /* remove the prefetch flag if we get a reference */ - if (ab->b_flags & ARC_PREFETCH) - ab->b_flags &= ~ARC_PREFETCH; + if (hdr->b_flags & ARC_FLAG_PREFETCH) + hdr->b_flags &= ~ARC_FLAG_PREFETCH; } } static int -remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; - arc_state_t *state = ab->b_state; + arc_state_t *state = hdr->b_state; ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); - if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && + if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[ab->b_type]; + uint64_t *size = &state->arcs_lsize[hdr->b_type]; list_t *list; kmutex_t *lock; - get_buf_info(ab, state, &list, &lock); + get_buf_info(hdr, state, &list, &lock); ASSERT(!MUTEX_HELD(lock)); mutex_enter(lock); - ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(list, ab); - ASSERT(ab->b_datacnt > 0); - atomic_add_64(size, ab->b_size * ab->b_datacnt); + ASSERT(!list_link_active(&hdr->b_arc_node)); + list_insert_head(list, hdr); + ASSERT(hdr->b_datacnt > 0); + atomic_add_64(size, hdr->b_size * hdr->b_datacnt); mutex_exit(lock); } return (cnt); } /* 
* Move the supplied buffer to the indicated state. The mutex * for the buffer must be held by the caller. */ static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, + kmutex_t *hash_lock) { - arc_state_t *old_state = ab->b_state; - int64_t refcnt = refcount_count(&ab->b_refcnt); + arc_state_t *old_state = hdr->b_state; + int64_t refcnt = refcount_count(&hdr->b_refcnt); uint64_t from_delta, to_delta; list_t *list; kmutex_t *lock; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || ab->b_datacnt > 0); - ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); + ASSERT(refcnt == 0 || hdr->b_datacnt > 0); + ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state)); + ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon); - from_delta = to_delta = ab->b_datacnt * ab->b_size; + from_delta = to_delta = hdr->b_datacnt * hdr->b_size; /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon) { int use_mutex; - uint64_t *size = &old_state->arcs_lsize[ab->b_type]; + uint64_t *size = &old_state->arcs_lsize[hdr->b_type]; - get_buf_info(ab, old_state, &list, &lock); + get_buf_info(hdr, old_state, &list, &lock); use_mutex = !MUTEX_HELD(lock); if (use_mutex) mutex_enter(lock); - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(list, ab); + ASSERT(list_link_active(&hdr->b_arc_node)); + list_remove(list, hdr); /* * If prefetching out of the ghost cache, * we will have a non-zero datacnt. */ - if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { + if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) { /* ghost elements have a ghost size */ - ASSERT(ab->b_buf == NULL); - from_delta = ab->b_size; + ASSERT(hdr->b_buf == NULL); + from_delta = hdr->b_size; } ASSERT3U(*size, >=, from_delta); atomic_add_64(size, -from_delta); if (use_mutex) mutex_exit(lock); } if (new_state != arc_anon) { int use_mutex; - uint64_t *size = &new_state->arcs_lsize[ab->b_type]; + uint64_t *size = &new_state->arcs_lsize[hdr->b_type]; - get_buf_info(ab, new_state, &list, &lock); + get_buf_info(hdr, new_state, &list, &lock); use_mutex = !MUTEX_HELD(lock); if (use_mutex) mutex_enter(lock); - list_insert_head(list, ab); + list_insert_head(list, hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT(ab->b_datacnt == 0); - ASSERT(ab->b_buf == NULL); - to_delta = ab->b_size; + ASSERT(hdr->b_datacnt == 0); + ASSERT(hdr->b_buf == NULL); + to_delta = hdr->b_size; } atomic_add_64(size, to_delta); if (use_mutex) mutex_exit(lock); } } - ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) - buf_hash_remove(ab); + ASSERT(!BUF_EMPTY(hdr)); + if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); /* adjust state sizes */ if (to_delta) atomic_add_64(&new_state->arcs_size, to_delta); if (from_delta) { ASSERT3U(old_state->arcs_size, >=, from_delta); atomic_add_64(&old_state->arcs_size, -from_delta); } - ab->b_state = new_state; + hdr->b_state = new_state; /* adjust l2arc hdr stats */ if (new_state == arc_l2c_only) l2arc_hdr_stat_add(); else if (old_state == arc_l2c_only) l2arc_hdr_stat_remove(); } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case 
ARC_SPACE_OTHER: ARCSTAT_INCR(arcstat_other_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: ARCSTAT_INCR(arcstat_l2_hdr_size, space); break; } ARCSTAT_INCR(arcstat_meta_used, space); atomic_add_64(&arc_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_OTHER: ARCSTAT_INCR(arcstat_other_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: ARCSTAT_INCR(arcstat_l2_hdr_size, -space); break; } ASSERT(arc_meta_used >= space); if (arc_meta_max < arc_meta_used) arc_meta_max = arc_meta_used; ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(arc_size >= space); atomic_add_64(&arc_size, -space); } arc_buf_t * arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; arc_buf_t *buf; ASSERT3U(size, >, 0); hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; hdr->b_spa = spa_load_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; hdr->b_buf = buf; arc_get_data_buf(buf); hdr->b_datacnt = 1; hdr->b_flags = 0; ASSERT(refcount_is_zero(&hdr->b_refcnt)); (void) refcount_add(&hdr->b_refcnt, tag); return (buf); } static char *arc_onloan_tag = "onloan"; /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, int size) { arc_buf_t *buf; buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); atomic_add_64(&arc_loaned_bytes, size); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); (void) refcount_add(&hdr->b_refcnt, tag); (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -hdr->b_size); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr; ASSERT(buf->b_data != NULL); hdr = buf->b_hdr; (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_refcnt, tag); buf->b_efunc = NULL; buf->b_private = NULL; atomic_add_64(&arc_loaned_bytes, hdr->b_size); } static arc_buf_t * arc_buf_clone(arc_buf_t *from) { arc_buf_t *buf; arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; ASSERT(hdr->b_state != arc_anon); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = hdr->b_buf; hdr->b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); /* * This buffer already exists in the arc so create a duplicate * copy for the caller. If the buffer is associated with user data * then track the size and number of duplicates. These stats will be * updated as duplicate buffers are created and destroyed. 
*/ if (hdr->b_type == ARC_BUFC_DATA) { ARCSTAT_BUMP(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); } hdr->b_datacnt += 1; return (buf); } void arc_buf_add_ref(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; /* * Check to see if this buffer is evicted. Callers * must verify b_data != NULL to know if the add_ref * was successful. */ mutex_enter(&buf->b_evict_lock); if (buf->b_data == NULL) { mutex_exit(&buf->b_evict_lock); return; } hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits); } static void arc_buf_free_on_write(void *data, size_t size, void (*free_func)(void *, size_t)) { l2arc_data_free_t *df; df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); df->l2df_data = data; df->l2df_size = size; df->l2df_func = free_func; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } /* * Free the arc data buffer. If it is an l2arc write in progress, * the buffer is placed on l2arc_free_on_write to be freed later. */ static void arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) { arc_buf_hdr_t *hdr = buf->b_hdr; if (HDR_L2_WRITING(hdr)) { arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { free_func(buf->b_data, hdr->b_size); } } /* * Free up buf->b_data and if 'remove' is set, then pull the * arc_buf_t off of the arc_buf_hdr_t's list and free it. */ static void arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; ASSERT(MUTEX_HELD(&l2arc_buflist_mtx)); if (l2hdr->b_tmp_cdata == NULL) return; ASSERT(HDR_L2_WRITING(hdr)); arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size, zio_data_buf_free); ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); l2hdr->b_tmp_cdata = NULL; } static void arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) { arc_buf_t **bufp; /* free up data associated with the buf */ if (buf->b_data) { arc_state_t *state = buf->b_hdr->b_state; uint64_t size = buf->b_hdr->b_size; arc_buf_contents_t type = buf->b_hdr->b_type; arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif /* illumos */ if (!recycle) { if (type == ARC_BUFC_METADATA) { arc_buf_data_free(buf, zio_buf_free); arc_space_return(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); arc_buf_data_free(buf, zio_data_buf_free); ARCSTAT_INCR(arcstat_data_size, -size); atomic_add_64(&arc_size, -size); } } if (list_link_active(&buf->b_hdr->b_arc_node)) { uint64_t *cnt = &state->arcs_lsize[type]; ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); ASSERT(state != arc_anon); ASSERT3U(*cnt, >=, size); atomic_add_64(cnt, -size); } ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; /* * If we're destroying a duplicate buffer make sure * that the appropriate statistics are updated.
*/ if (buf->b_hdr->b_datacnt > 1 && buf->b_hdr->b_type == ARC_BUFC_DATA) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); } ASSERT(buf->b_hdr->b_datacnt > 0); buf->b_hdr->b_datacnt -= 1; } /* only remove the buf if requested */ if (!remove) return; /* remove the buf from the hdr list */ for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) continue; *bufp = buf->b_next; buf->b_next = NULL; ASSERT(buf->b_efunc == NULL); /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; if (l2hdr != NULL) { boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); /* * To prevent arc_free() and l2arc_evict() from * attempting to free the same buffer at the same time, * a FREE_IN_PROGRESS flag is given to arc_free() to * give it priority. l2arc_evict() can't destroy this * header while we are waiting on l2arc_buflist_mtx. * * The hdr may be removed from l2ad_buflist before we * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. */ if (!buflist_held) { mutex_enter(&l2arc_buflist_mtx); l2hdr = hdr->b_l2hdr; } if (l2hdr != NULL) { trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr, hdr->b_size, 0); list_remove(l2hdr->b_dev->l2ad_buflist, hdr); arc_buf_l2_cdata_free(hdr); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); vdev_space_update(l2hdr->b_dev->l2ad_vdev, -l2hdr->b_asize, 0, 0); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); if (hdr->b_state == arc_l2c_only) l2arc_hdr_stat_remove(); hdr->b_l2hdr = NULL; } if (!buflist_held) mutex_exit(&l2arc_buflist_mtx); } if (!BUF_EMPTY(hdr)) { ASSERT(!HDR_IN_HASH_TABLE(hdr)); buf_discard_identity(hdr); } while (hdr->b_buf) { arc_buf_t *buf = hdr->b_buf; if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); mutex_enter(&buf->b_evict_lock); ASSERT(buf->b_hdr != NULL); arc_buf_destroy(hdr->b_buf, FALSE, FALSE); hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&buf->b_evict_lock); mutex_exit(&arc_eviction_mtx); } else { arc_buf_destroy(hdr->b_buf, FALSE, TRUE); } } if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } if (hdr->b_thawed) { kmem_free(hdr->b_thawed, 1); hdr->b_thawed = NULL; } ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); ASSERT3P(hdr->b_acb, ==, NULL); kmem_cache_free(hdr_cache, hdr); } void arc_buf_free(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; int hashed = hdr->b_state != arc_anon; ASSERT(buf->b_efunc == NULL); ASSERT(buf->b_data != NULL); if (hashed) { kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_datacnt > 1) { arc_buf_destroy(buf, FALSE, TRUE); } else { ASSERT(buf == hdr->b_buf); ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } mutex_exit(hash_lock); } else if (HDR_IO_IN_PROGRESS(hdr)) { int destroy_hdr; /* * We are in the middle of an async write. Don't destroy * this buffer unless the write completes before we finish * decrementing the reference count. 
*/ mutex_enter(&arc_eviction_mtx); (void) remove_reference(hdr, NULL, tag); ASSERT(refcount_is_zero(&hdr->b_refcnt)); destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); mutex_exit(&arc_eviction_mtx); if (destroy_hdr) arc_hdr_destroy(hdr); } else { if (remove_reference(hdr, NULL, tag) > 0) arc_buf_destroy(buf, FALSE, TRUE); else arc_hdr_destroy(hdr); } } boolean_t arc_buf_remove_ref(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); boolean_t no_callback = (buf->b_efunc == NULL); if (hdr->b_state == arc_anon) { ASSERT(hdr->b_datacnt == 1); arc_buf_free(buf, tag); return (no_callback); } mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT(hdr->b_state != arc_anon); ASSERT(buf->b_data != NULL); (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_datacnt > 1) { if (no_callback) arc_buf_destroy(buf, FALSE, TRUE); } else if (no_callback) { ASSERT(hdr->b_buf == buf && buf->b_next == NULL); ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } ASSERT(no_callback || hdr->b_datacnt > 1 || refcount_is_zero(&hdr->b_refcnt)); mutex_exit(hash_lock); return (no_callback); } int arc_buf_size(arc_buf_t *buf) { return (buf->b_hdr->b_size); } /* * Called from the DMU to determine if the current buffer should be * evicted. In order to ensure proper locking, the eviction must be initiated * from the DMU. Return true if the buffer is associated with user data and * duplicate buffers still exist. */ boolean_t arc_buf_eviction_needed(arc_buf_t *buf) { arc_buf_hdr_t *hdr; boolean_t evict_needed = B_FALSE; if (zfs_disable_dup_eviction) return (B_FALSE); mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(); let that function * perform the eviction. */ ASSERT(buf->b_data == NULL); mutex_exit(&buf->b_evict_lock); return (B_FALSE); } else if (buf->b_data == NULL) { /* * We have already been added to the arc eviction list; * recommend eviction. */ ASSERT3P(hdr, ==, &arc_eviction_hdr); mutex_exit(&buf->b_evict_lock); return (B_TRUE); } if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) evict_needed = B_TRUE; mutex_exit(&buf->b_evict_lock); return (evict_needed); } /* * Evict buffers from list until we've removed the specified number of * bytes. Move the removed buffers to the appropriate evict state. * If the recycle flag is set, then attempt to "recycle" a buffer: * - look for a buffer to evict that is `bytes' long. * - return the data block from this buffer rather than freeing it. * This flag is used by callers that are trying to make space for a * new buffer in a full arc cache. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so may not catch all candidates. * It may also return without evicting as much space as requested. 
*/ static void * arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; int64_t bytes_remaining; - arc_buf_hdr_t *ab, *ab_prev = NULL; + arc_buf_hdr_t *hdr, *hdr_prev = NULL; list_t *evicted_list, *list, *evicted_list_start, *list_start; kmutex_t *lock, *evicted_lock; kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; arc_buf_hdr_t marker = { 0 }; int count = 0; static int evict_metadata_offset, evict_data_offset; int i, idx, offset, list_count, lists; ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; /* * Decide which "type" (data vs metadata) to recycle from. * * If we are over the metadata limit, recycle from metadata. * If we are under the metadata minimum, recycle from data. * Otherwise, recycle from whichever type has the oldest (least * recently accessed) header. This is not yet implemented. */ if (recycle) { arc_buf_contents_t realtype; if (state->arcs_lsize[ARC_BUFC_DATA] == 0) { realtype = ARC_BUFC_METADATA; } else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) { realtype = ARC_BUFC_DATA; } else if (arc_meta_used >= arc_meta_limit) { realtype = ARC_BUFC_METADATA; } else if (arc_meta_used <= arc_meta_min) { realtype = ARC_BUFC_DATA; } else { #ifdef illumos if (data_hdr->b_arc_access < metadata_hdr->b_arc_access) { realtype = ARC_BUFC_DATA; } else { realtype = ARC_BUFC_METADATA; } #else /* TODO */ realtype = type; #endif } if (realtype != type) { /* * If we want to evict from a different list, * we can not recycle, because DATA vs METADATA * buffers are segregated into different kmem * caches (and vmem arenas). */ type = realtype; recycle = B_FALSE; } } if (type == ARC_BUFC_METADATA) { offset = 0; list_count = ARC_BUFC_NUMMETADATALISTS; list_start = &state->arcs_lists[0]; evicted_list_start = &evicted_state->arcs_lists[0]; idx = evict_metadata_offset; } else { offset = ARC_BUFC_NUMMETADATALISTS; list_start = &state->arcs_lists[offset]; evicted_list_start = &evicted_state->arcs_lists[offset]; list_count = ARC_BUFC_NUMDATALISTS; idx = evict_data_offset; } bytes_remaining = evicted_state->arcs_lsize[type]; lists = 0; evict_start: list = &list_start[idx]; evicted_list = &evicted_list_start[idx]; lock = ARCS_LOCK(state, (offset + idx)); evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); mutex_enter(lock); mutex_enter(evicted_lock); - for (ab = list_tail(list); ab; ab = ab_prev) { - ab_prev = list_prev(list, ab); - bytes_remaining -= (ab->b_size * ab->b_datacnt); + for (hdr = list_tail(list); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(list, hdr); + bytes_remaining -= (hdr->b_size * hdr->b_datacnt); /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(ab) || - (spa && ab->b_spa != spa) || - (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && - ddi_get_lbolt() - ab->b_arc_access < + if (HDR_IO_IN_PROGRESS(hdr) || + (spa && hdr->b_spa != spa) || + (hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT) && + ddi_get_lbolt() - hdr->b_arc_access < arc_min_prefetch_lifespan)) { skipped++; continue; } /* "lookahead" for better eviction candidate */ - if (recycle && ab->b_size != bytes && - ab_prev && ab_prev->b_size == bytes) + if (recycle && hdr->b_size != bytes && + hdr_prev && hdr_prev->b_size == bytes) continue; /* ignore markers */ - if (ab->b_spa == 0) + if (hdr->b_spa == 0) continue; /* * It may take a long time to evict all the bufs requested. 
* To avoid blocking all arc activity, periodically drop * the arcs_mtx and give other threads a chance to run * before reacquiring the lock. * * If we are looking for a buffer to recycle, we are in * the hot code path, so don't sleep. */ if (!recycle && count++ > arc_evict_iterations) { - list_insert_after(list, ab, &marker); + list_insert_after(list, hdr, &marker); mutex_exit(evicted_lock); mutex_exit(lock); kpreempt(KPREEMPT_SYNC); mutex_enter(lock); mutex_enter(evicted_lock); - ab_prev = list_prev(list, &marker); + hdr_prev = list_prev(list, &marker); list_remove(list, &marker); count = 0; continue; } - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); have_lock = MUTEX_HELD(hash_lock); if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT0(refcount_count(&ab->b_refcnt)); - ASSERT(ab->b_datacnt > 0); - while (ab->b_buf) { - arc_buf_t *buf = ab->b_buf; + ASSERT0(refcount_count(&hdr->b_refcnt)); + ASSERT(hdr->b_datacnt > 0); + while (hdr->b_buf) { + arc_buf_t *buf = hdr->b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } if (buf->b_data) { - bytes_evicted += ab->b_size; - if (recycle && ab->b_type == type && - ab->b_size == bytes && - !HDR_L2_WRITING(ab)) { + bytes_evicted += hdr->b_size; + if (recycle && hdr->b_type == type && + hdr->b_size == bytes && + !HDR_L2_WRITING(hdr)) { stolen = buf->b_data; recycle = FALSE; } } if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); arc_buf_destroy(buf, buf->b_data == stolen, FALSE); - ab->b_buf = buf->b_next; + hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&arc_eviction_mtx); mutex_exit(&buf->b_evict_lock); } else { mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, buf->b_data == stolen, TRUE); } } - if (ab->b_l2hdr) { + if (hdr->b_l2hdr) { ARCSTAT_INCR(arcstat_evict_l2_cached, - ab->b_size); + hdr->b_size); } else { - if (l2arc_write_eligible(ab->b_spa, ab)) { + if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, - ab->b_size); + hdr->b_size); } else { ARCSTAT_INCR( arcstat_evict_l2_ineligible, - ab->b_size); + hdr->b_size); } } - if (ab->b_datacnt == 0) { - arc_change_state(evicted_state, ab, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(ab)); - ab->b_flags |= ARC_IN_HASH_TABLE; - ab->b_flags &= ~ARC_BUF_AVAILABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + if (hdr->b_datacnt == 0) { + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } if (!have_lock) mutex_exit(hash_lock); if (bytes >= 0 && bytes_evicted >= bytes) break; if (bytes_remaining > 0) { mutex_exit(evicted_lock); mutex_exit(lock); idx = ((idx + 1) & (list_count - 1)); lists++; goto evict_start; } } else { missed += 1; } } mutex_exit(evicted_lock); mutex_exit(lock); idx = ((idx + 1) & (list_count - 1)); lists++; if (bytes_evicted < bytes) { if (lists < list_count) goto evict_start; else dprintf("only evicted %lld bytes from %x", (longlong_t)bytes_evicted, state); } if (type == ARC_BUFC_METADATA) evict_metadata_offset = idx; else evict_data_offset = idx; if (skipped) ARCSTAT_INCR(arcstat_evict_skip, skipped); if (missed) ARCSTAT_INCR(arcstat_mutex_miss, missed); /* * Note: we have just evicted some data into the ghost state, * potentially putting the ghost size over the desired size. 
Rather * than evicting from the ghost list in this hot code path, leave * this chore to the arc_reclaim_thread(). */ if (stolen) ARCSTAT_BUMP(arcstat_stolen); return (stolen); } /* * Remove buffers from list until we've removed the specified number of * bytes. Destroy the buffers that are removed. */ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { - arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t *hdr, *hdr_prev; arc_buf_hdr_t marker = { 0 }; list_t *list, *list_start; kmutex_t *hash_lock, *lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; int count = 0; static int evict_offset; int list_count, idx = evict_offset; int offset, lists = 0; ASSERT(GHOST_STATE(state)); /* * data lists come after metadata lists */ list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; list_count = ARC_BUFC_NUMDATALISTS; offset = ARC_BUFC_NUMMETADATALISTS; evict_start: list = &list_start[idx]; lock = ARCS_LOCK(state, idx + offset); mutex_enter(lock); - for (ab = list_tail(list); ab; ab = ab_prev) { - ab_prev = list_prev(list, ab); - if (ab->b_type > ARC_BUFC_NUMTYPES) - panic("invalid ab=%p", (void *)ab); - if (spa && ab->b_spa != spa) + for (hdr = list_tail(list); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(list, hdr); + if (hdr->b_type > ARC_BUFC_NUMTYPES) + panic("invalid hdr=%p", (void *)hdr); + if (spa && hdr->b_spa != spa) continue; /* ignore markers */ - if (ab->b_spa == 0) + if (hdr->b_spa == 0) continue; - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); /* caller may be trying to modify this buffer, skip it */ if (MUTEX_HELD(hash_lock)) continue; /* * It may take a long time to evict all the bufs requested. * To avoid blocking all arc activity, periodically drop * the arcs_mtx and give other threads a chance to run * before reacquiring the lock. */ if (count++ > arc_evict_iterations) { - list_insert_after(list, ab, &marker); + list_insert_after(list, hdr, &marker); mutex_exit(lock); kpreempt(KPREEMPT_SYNC); mutex_enter(lock); - ab_prev = list_prev(list, &marker); + hdr_prev = list_prev(list, &marker); list_remove(list, &marker); count = 0; continue; } if (mutex_tryenter(hash_lock)) { - ASSERT(!HDR_IO_IN_PROGRESS(ab)); - ASSERT(ab->b_buf == NULL); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(hdr->b_buf == NULL); ARCSTAT_BUMP(arcstat_deleted); - bytes_deleted += ab->b_size; + bytes_deleted += hdr->b_size; - if (ab->b_l2hdr != NULL) { + if (hdr->b_l2hdr != NULL) { /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ - arc_change_state(arc_l2c_only, ab, hash_lock); + arc_change_state(arc_l2c_only, hdr, hash_lock); mutex_exit(hash_lock); } else { - arc_change_state(arc_anon, ab, hash_lock); + arc_change_state(arc_anon, hdr, hash_lock); mutex_exit(hash_lock); - arc_hdr_destroy(ab); + arc_hdr_destroy(hdr); } - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (bytes >= 0 && bytes_deleted >= bytes) break; } else if (bytes < 0) { /* * Insert a list marker and then wait for the * hash lock to become available. Once it's * available, restart from where we left off.
*/ - list_insert_after(list, ab, &marker); + list_insert_after(list, hdr, &marker); mutex_exit(lock); mutex_enter(hash_lock); mutex_exit(hash_lock); mutex_enter(lock); - ab_prev = list_prev(list, &marker); + hdr_prev = list_prev(list, &marker); list_remove(list, &marker); } else { bufs_skipped += 1; } } mutex_exit(lock); idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); lists++; if (lists < list_count) goto evict_start; evict_offset = idx; if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && (bytes < 0 || bytes_deleted < bytes)) { list_start = &state->arcs_lists[0]; list_count = ARC_BUFC_NUMMETADATALISTS; offset = lists = 0; goto evict_start; } if (bufs_skipped) { ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); ASSERT(bytes >= 0); } if (bytes_deleted < bytes) dprintf("only deleted %lld bytes from %p", (longlong_t)bytes_deleted, state); } static void arc_adjust(void) { int64_t adjustment, delta; /* * Adjust MRU size */ adjustment = MIN((int64_t)(arc_size - arc_c), (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p)); if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); } /* * Adjust MFU size */ adjustment = arc_size - arc_c; if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { int64_t delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); } /* * Adjust ghost lists */ adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { delta = MIN(arc_mru_ghost->arcs_size, adjustment); arc_evict_ghost(arc_mru_ghost, 0, delta); } adjustment = arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { delta = MIN(arc_mfu_ghost->arcs_size, adjustment); arc_evict_ghost(arc_mfu_ghost, 0, delta); } } static void arc_do_user_evicts(void) { static arc_buf_t *tmp_arc_eviction_list; /* * Move list over to avoid LOR */ restart: mutex_enter(&arc_eviction_mtx); tmp_arc_eviction_list = arc_eviction_list; arc_eviction_list = NULL; mutex_exit(&arc_eviction_mtx); while (tmp_arc_eviction_list != NULL) { arc_buf_t *buf = tmp_arc_eviction_list; tmp_arc_eviction_list = buf->b_next; mutex_enter(&buf->b_evict_lock); buf->b_hdr = NULL; mutex_exit(&buf->b_evict_lock); if (buf->b_efunc != NULL) VERIFY0(buf->b_efunc(buf->b_private)); buf->b_efunc = NULL; buf->b_private = NULL; kmem_cache_free(buf_cache, buf); } if (arc_eviction_list != NULL) goto restart; } /* * Flush all *evictable* data from the cache for the given spa. * NOTE: this will not touch "active" (i.e. referenced) data. 
*/ void arc_flush(spa_t *spa) { uint64_t guid = 0; if (spa) guid = spa_load_guid(spa); while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } arc_evict_ghost(arc_mru_ghost, guid, -1); arc_evict_ghost(arc_mfu_ghost, guid, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); mutex_exit(&arc_reclaim_thr_lock); ASSERT(spa || arc_eviction_list == NULL); } void arc_shrink(void) { if (arc_c > arc_c_min) { uint64_t to_free; DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, arc_c_min, uint64_t, arc_p, uint64_t, to_free); #ifdef _KERNEL to_free = arc_c >> arc_shrink_shift; #else to_free = arc_c >> arc_shrink_shift; #endif if (arc_c > arc_c_min + to_free) atomic_add_64(&arc_c, -to_free); else arc_c = arc_c_min; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); if (arc_c > arc_size) arc_c = MAX(arc_size, arc_c_min); if (arc_p > arc_c) arc_p = (arc_c >> 1); DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, arc_p); ASSERT(arc_c >= arc_c_min); ASSERT((int64_t)arc_p >= 0); } if (arc_size > arc_c) { DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, uint64_t, arc_c); arc_adjust(); } } static int needfree = 0; static int arc_reclaim_needed(void) { #ifdef _KERNEL if (needfree) { DTRACE_PROBE(arc__reclaim_needfree); return (1); } /* * Cooperate with pagedaemon when it's time for it to scan * and reclaim some pages. */ if (freemem < zfs_arc_free_target) { DTRACE_PROBE2(arc__reclaim_freemem, uint64_t, freemem, uint64_t, zfs_arc_free_target); return (1); } #ifdef sun /* * take 'desfree' extra pages, so we reclaim sooner, rather than later */ extra = desfree; /* * check that we're out of range of the pageout scanner. It starts to * schedule paging if freemem is less than lotsfree and needfree. * lotsfree is the high-water mark for pageout, and needfree is the * number of needed free pages. We add extra pages here to make sure * the scanner doesn't start up while we're freeing memory. */ if (freemem < lotsfree + needfree + extra) return (1); /* * check to make sure that swapfs has enough space so that anon * reservations can still succeed. anon_resvmem() checks that the * availrmem is greater than swapfs_minfree, and the number of reserved * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. */ if (availrmem < swapfs_minfree + swapfs_reserve + extra) return (1); /* * Check that we have enough availrmem that memory locking (e.g., via * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum * stores the number of pages that cannot be locked; when availrmem * drops below pages_pp_maximum, page locking mechanisms such as * page_pp_lock() will fail.) */ if (availrmem <= pages_pp_maximum) return (1); #endif /* sun */ #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) /* * If we're on an i386 platform, it's possible that we'll exhaust the * kernel heap space before we ever run out of available physical * memory. Most checks of the size of the heap_area compare against * tune.t_minarmem, which is the minimum available real memory that we * can have in the system. 
However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ if (vmem_size(heap_arena, VMEM_FREE) < (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) { DTRACE_PROBE2(arc__reclaim_used, uint64_t, vmem_size(heap_arena, VMEM_FREE), uint64_t, (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2); return (1); } #endif #ifdef sun /* * If zio data pages are being allocated out of a separate heap segment, * then enforce that the size of available vmem for this arena remains * above about 1/16th free. * * Note: The 1/16th arena free requirement was put in place * to aggressively evict memory from the arc in order to avoid * memory fragmentation issues. */ if (zio_arena != NULL && vmem_size(zio_arena, VMEM_FREE) < (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) return (1); #endif /* sun */ #else /* _KERNEL */ if (spa_get_random(100) == 0) return (1); #endif /* _KERNEL */ DTRACE_PROBE(arc__reclaim_no); return (0); } extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; extern kmem_cache_t *range_seg_cache; static void __noinline arc_kmem_reap_now(arc_reclaim_strategy_t strat) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; DTRACE_PROBE(arc__kmem_reap_start); #ifdef _KERNEL if (arc_meta_used >= arc_meta_limit) { /* * We are exceeding our meta-data cache limit. * Purge some DNLC entries to release holds on meta-data. */ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); } #if defined(__i386) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif /* * An aggressive reclamation will shrink the cache size as well as * reap free buffers from the arc kmem caches. */ if (strat == ARC_RECLAIM_AGGR) arc_shrink(); for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_cache); kmem_cache_reap_now(range_seg_cache); #ifdef sun /* * Ask the vmem arena to reclaim unused memory from its * quantum caches. */ if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) vmem_qcache_reap(zio_arena); #endif DTRACE_PROBE(arc__kmem_reap_end); } static void arc_reclaim_thread(void *dummy __unused) { clock_t growtime = 0; arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_reclaim_thr_lock); while (arc_thread_exit == 0) { if (arc_reclaim_needed()) { if (arc_no_grow) { if (last_reclaim == ARC_RECLAIM_CONS) { DTRACE_PROBE(arc__reclaim_aggr_no_grow); last_reclaim = ARC_RECLAIM_AGGR; } else { last_reclaim = ARC_RECLAIM_CONS; } } else { arc_no_grow = TRUE; last_reclaim = ARC_RECLAIM_AGGR; DTRACE_PROBE(arc__reclaim_aggr); membar_producer(); } /* reset the growth delay for every reclaim */ growtime = ddi_get_lbolt() + (arc_grow_retry * hz); if (needfree && last_reclaim == ARC_RECLAIM_CONS) { /* * If needfree is TRUE our vm_lowmem hook * was called and in that case we must free some * memory, so switch to aggressive mode. 
*/ arc_no_grow = TRUE; last_reclaim = ARC_RECLAIM_AGGR; } arc_kmem_reap_now(last_reclaim); arc_warm = B_TRUE; } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { arc_no_grow = FALSE; } arc_adjust(); if (arc_eviction_list != NULL) arc_do_user_evicts(); #ifdef _KERNEL if (needfree) { needfree = 0; wakeup(&needfree); } #endif /* block until needed, or one second, whichever is shorter */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock, hz); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); } arc_thread_exit = 0; cv_broadcast(&arc_reclaim_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ thread_exit(); } /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); if (state == arc_l2c_only) return; ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: * - if we just hit in the MRU ghost list, then increase * the target size of the MRU list. * - if we just hit in the MFU ghost list, then increase * the target size of the MFU list by decreasing the * target size of the MRU list. */ if (state == arc_mru_ghost) { mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); if (arc_reclaim_needed()) { cv_signal(&arc_reclaim_thr_cv); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { DTRACE_PROBE1(arc__inc_adapt, int, bytes); atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; else if (state == arc_anon) atomic_add_64(&arc_p, (int64_t)bytes); if (arc_p > arc_c) arc_p = arc_c; } ASSERT((int64_t)arc_p >= 0); } /* * Check if the cache has reached its limits and eviction is required * prior to insert. */ static int arc_evict_needed(arc_buf_contents_t type) { if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) return (1); if (arc_reclaim_needed()) return (1); return (arc_size > arc_c); } /* * The buffer, supplied as the first argument, needs a data block. * So, if we are at cache max, determine which cache should be victimized. * We have the following cases: * * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> * In this situation if we're out of space, but the resident size of the MFU is * under the limit, victimize the MFU cache to satisfy this insertion request. * * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> * Here, we've used up all of the available space for the MRU, so we need to * evict from our own cache instead. Evict from the set of resident MRU * entries. * * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> * c minus p represents the MFU space in the cache, since p is the size of the * cache that is dedicated to the MRU. 
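arc_adapt() above moves the MRU/MFU boundary arc_p on ghost-list hits: an MRU-ghost hit grows arc_p (favoring recency), an MFU-ghost hit shrinks it (favoring frequency), with the step scaled by the ratio of the two ghost-list sizes and capped at 10x. A stand-alone sketch of that adjustment, assuming the stock arc_p_min_shift of 4; ghost sizes are taken as nonzero, as in the code above:

/* User-space model of the arc_p adjustment performed by arc_adapt(). */
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

enum ghost_hit { HIT_MRU_GHOST, HIT_MFU_GHOST };

static uint64_t
adapt_arc_p(uint64_t arc_p, uint64_t arc_c, uint64_t bytes,
    uint64_t mru_ghost_size, uint64_t mfu_ghost_size, enum ghost_hit hit)
{
    uint64_t arc_p_min = arc_c >> 4;    /* assumed arc_p_min_shift = 4 */
    uint64_t mult;

    if (hit == HIT_MRU_GHOST) {
        /* Recently evicted MRU data was wanted again: grow arc_p. */
        mult = (mru_ghost_size >= mfu_ghost_size) ? 1 :
            mfu_ghost_size / mru_ghost_size;
        mult = MIN(mult, 10);           /* avoid wild swings */
        arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
    } else {
        /* Recently evicted MFU data was wanted again: shrink arc_p. */
        uint64_t delta;

        mult = (mfu_ghost_size >= mru_ghost_size) ? 1 :
            mru_ghost_size / mfu_ghost_size;
        mult = MIN(mult, 10);
        delta = MIN(bytes * mult, arc_p);
        arc_p = MAX(arc_p_min, arc_p - delta);
    }
    return (arc_p);
}

int
main(void)
{
    uint64_t arc_c = 1ULL << 30, arc_p = arc_c / 2;

    arc_p = adapt_arc_p(arc_p, arc_c, 128 << 10, 100 << 20, 300 << 20,
        HIT_MRU_GHOST);
    printf("after MRU-ghost hit: arc_p=%ju\n", (uintmax_t)arc_p);
    return (0);
}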
In this situation there's still space on * the MFU side, so the MRU side needs to be victimized. * * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> * MFU's resident set is consuming more space than it has been allotted. In * this situation, we must victimize our own cache, the MFU, for this insertion. */ static void arc_get_data_buf(arc_buf_t *buf) { arc_state_t *state = buf->b_hdr->b_state; uint64_t size = buf->b_hdr->b_size; arc_buf_contents_t type = buf->b_hdr->b_type; arc_adapt(size, state); /* * We have not yet reached cache maximum size, * just allocate a new buffer. */ if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } goto out; } /* * If we are prefetching from the mfu ghost list, this buffer * will end up on the mru list; so steal space from there. */ if (state == arc_mfu_ghost) - state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; + state = buf->b_hdr->b_flags & ARC_FLAG_PREFETCH ? + arc_mru : arc_mfu; else if (state == arc_mru_ghost) state = arc_mru; if (state == arc_mru || state == arc_anon) { uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; state = (arc_mfu->arcs_lsize[type] >= size && arc_p > mru_used) ? arc_mfu : arc_mru; } else { /* MFU cases */ uint64_t mfu_space = arc_c - arc_p; state = (arc_mru->arcs_lsize[type] >= size && mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } ARCSTAT_BUMP(arcstat_recycle_miss); } ASSERT(buf->b_data != NULL); out: /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ if (!GHOST_STATE(buf->b_hdr->b_state)) { arc_buf_hdr_t *hdr = buf->b_hdr; atomic_add_64(&hdr->b_state->arcs_size, size); if (list_link_active(&hdr->b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_refcnt)); atomic_add_64(&hdr->b_state->arcs_lsize[type], size); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (arc_size < arc_c && hdr->b_state == arc_anon && arc_anon->arcs_size + arc_mru->arcs_size > arc_p) arc_p = MIN(arc_c, arc_p + size); } ARCSTAT_BUMP(arcstat_allocated); } /* * This routine is called whenever a buffer is accessed. * NOTE: the hash lock is dropped in this function. */ static void -arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) +arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); - if (buf->b_state == arc_anon) { + if (hdr->b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. 
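The four insertion cases enumerated in the comment above reduce, in arc_get_data_buf(), to a choice between evicting from the MRU or the MFU side. A compact model of that choice, using plain structs in place of arc_state_t (the names and example sizes are illustrative):

/* Model of the victim-state choice made in arc_get_data_buf(). */
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct model_state {
    const char *name;
    uint64_t size;      /* total resident bytes */
    uint64_t evictable; /* evictable bytes of the wanted buffer type */
};

/*
 * Decide whether an insertion of 'size' bytes headed for the MRU side
 * (insert_for_mru) or the MFU side should steal space from MRU or MFU.
 */
static struct model_state *
choose_victim(bool insert_for_mru, uint64_t size, uint64_t arc_c,
    uint64_t arc_p, struct model_state *anon, struct model_state *mru,
    struct model_state *mfu)
{
    if (insert_for_mru) {
        /* Cases 1 vs 2: is the MRU side still under its target p? */
        uint64_t mru_used = anon->size + mru->size;

        return ((mfu->evictable >= size && arc_p > mru_used) ? mfu : mru);
    } else {
        /* Cases 3 vs 4: is the MFU side still under c - p? */
        uint64_t mfu_space = arc_c - arc_p;

        return ((mru->evictable >= size && mfu_space > mfu->size) ?
            mru : mfu);
    }
}

int
main(void)
{
    struct model_state anon = { "anon", 64 << 20, 0 };
    struct model_state mru = { "mru", 200 << 20, 150 << 20 };
    struct model_state mfu = { "mfu", 300 << 20, 250 << 20 };
    struct model_state *victim;

    victim = choose_victim(true, 128 << 10, 600 << 20, 320 << 20,
        &anon, &mru, &mfu);
    printf("evict from %s\n", victim->name);
    return (0);
}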
*/ - ASSERT(buf->b_arc_access == 0); - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); - arc_change_state(arc_mru, buf, hash_lock); + ASSERT(hdr->b_arc_access == 0); + hdr->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mru, hdr, hash_lock); - } else if (buf->b_state == arc_mru) { + } else if (hdr->b_state == arc_mru) { now = ddi_get_lbolt(); /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read * (any subsequent access will bump this into the MFU state). * or * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - if (refcount_count(&buf->b_refcnt) == 0) { - ASSERT(list_link_active(&buf->b_arc_node)); + if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { + if (refcount_count(&hdr->b_refcnt) == 0) { + ASSERT(list_link_active(&hdr->b_arc_node)); } else { - buf->b_flags &= ~ARC_PREFETCH; + hdr->b_flags &= ~ARC_FLAG_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); } - buf->b_arc_access = now; + hdr->b_arc_access = now; return; } /* * This buffer has been "accessed" only once so far, * but it is still in the cache. Move it to the MFU * state. */ - if (now > buf->b_arc_access + ARC_MINTIME) { + if (now > hdr->b_arc_access + ARC_MINTIME) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ - buf->b_arc_access = now; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(arc_mfu, buf, hash_lock); + hdr->b_arc_access = now; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); } ARCSTAT_BUMP(arcstat_mru_hits); - } else if (buf->b_state == arc_mru_ghost) { + } else if (hdr->b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but * was evicted from the cache. Move it to the * MFU state. */ - if (buf->b_flags & ARC_PREFETCH) { + if (hdr->b_flags & ARC_FLAG_PREFETCH) { new_state = arc_mru; - if (refcount_count(&buf->b_refcnt) > 0) - buf->b_flags &= ~ARC_PREFETCH; - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + if (refcount_count(&hdr->b_refcnt) > 0) + hdr->b_flags &= ~ARC_FLAG_PREFETCH; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - buf->b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, buf, hash_lock); + hdr->b_arc_access = ddi_get_lbolt(); + arc_change_state(new_state, hdr, hash_lock); ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (buf->b_state == arc_mfu) { + } else if (hdr->b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. * * NOTE: an add_reference() that occurred when we did * the arc_read() will have kicked this off the list. * If it was a prefetch, we will explicitly move it to * the head of the list now. 
*/ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - ASSERT(refcount_count(&buf->b_refcnt) == 0); - ASSERT(list_link_active(&buf->b_arc_node)); + if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { + ASSERT(refcount_count(&hdr->b_refcnt) == 0); + ASSERT(list_link_active(&hdr->b_arc_node)); } ARCSTAT_BUMP(arcstat_mfu_hits); - buf->b_arc_access = ddi_get_lbolt(); - } else if (buf->b_state == arc_mfu_ghost) { + hdr->b_arc_access = ddi_get_lbolt(); + } else if (hdr->b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has * been evicted from the cache. Move it back to the * MFU state. */ - if (buf->b_flags & ARC_PREFETCH) { + if (hdr->b_flags & ARC_FLAG_PREFETCH) { /* * This is a prefetch access... * move this block back to the MRU state. */ - ASSERT0(refcount_count(&buf->b_refcnt)); + ASSERT0(refcount_count(&hdr->b_refcnt)); new_state = arc_mru; } - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(new_state, buf, hash_lock); + hdr->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(new_state, hdr, hash_lock); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else if (buf->b_state == arc_l2c_only) { + } else if (hdr->b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(arc_mfu, buf, hash_lock); + hdr->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); } else { ASSERT(!"invalid arc state"); } } /* a generic arc_done_func_t which you can use */ /* ARGSUSED */ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) bcopy(buf->b_data, arg, buf->b_hdr->b_size); VERIFY(arc_buf_remove_ref(buf, arg)); } /* a generic arc_done_func_t */ void arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { VERIFY(arc_buf_remove_ref(buf, arg)); *bufp = NULL; } else { *bufp = buf; ASSERT(buf->b_data); } } static void arc_read_done(zio_t *zio) { arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_t *abuf; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; int freeable = FALSE; buf = zio->io_private; hdr = buf->b_hdr; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. 
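Taken together, the branches of arc_access() above form a small state machine: anonymous buffers enter the MRU, a second access at least ARC_MINTIME after the first promotes an MRU buffer to the MFU, ghost hits re-enter MRU or MFU depending on whether the access is a prefetch, and L2-only headers come back as MFU. A condensed model of those transitions (ARC_MINTIME taken as the 125 ms mentioned in the comment; the enum names are illustrative):

/* Simplified model of the state transitions performed by arc_access(). */
#include <stdio.h>
#include <stdbool.h>

typedef enum { S_ANON, S_MRU, S_MRU_GHOST, S_MFU, S_MFU_GHOST, S_L2C_ONLY }
    model_state_t;

#define MODEL_ARC_MINTIME_MS    125     /* per the comment in arc_access() */

static model_state_t
access_transition(model_state_t cur, bool prefetch, long ms_since_last_access)
{
    switch (cur) {
    case S_ANON:
        return (S_MRU);                 /* first insertion */
    case S_MRU:
        if (prefetch)
            return (S_MRU);             /* stays put; only flag handling */
        return (ms_since_last_access > MODEL_ARC_MINTIME_MS ?
            S_MFU : S_MRU);             /* promote on a real second access */
    case S_MRU_GHOST:
        return (prefetch ? S_MRU : S_MFU);
    case S_MFU:
        return (S_MFU);                 /* already frequently used */
    case S_MFU_GHOST:
        return (prefetch ? S_MRU : S_MFU);
    case S_L2C_ONLY:
        return (S_MFU);                 /* came back from the L2ARC */
    }
    return (cur);
}

int
main(void)
{
    printf("MRU + demand read after 200ms -> %d (S_MFU=%d)\n",
        access_transition(S_MRU, false, 200), S_MFU);
    return (0);
}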
*/ if (HDR_IN_HASH_TABLE(hdr)) { ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); } - hdr->b_flags &= ~ARC_L2_EVICTED; - if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) - hdr->b_flags &= ~ARC_L2CACHE; + hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; + if (l2arc_noprefetch && (hdr->b_flags & ARC_FLAG_PREFETCH)) + hdr->b_flags &= ~ARC_FLAG_L2CACHE; /* byteswap if necessary */ callback_list = hdr->b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? byteswap_uint64_array : dmu_ot_byteswap[bswap].ob_func; func(buf->b_data, hdr->b_size); } arc_cksum_compute(buf, B_FALSE); #ifdef illumos arc_buf_watch(buf); #endif /* illumos */ if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already * called arc_access (to prevent any simultaneous readers from * getting confused). */ arc_access(hdr, hash_lock); } /* create copies of the data buffer for the callers */ abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { if (abuf == NULL) { ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); } acb->acb_buf = abuf; abuf = NULL; } } hdr->b_acb = NULL; - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); if (abuf == buf) { ASSERT(buf->b_efunc == NULL); ASSERT(hdr->b_datacnt == 1); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); if (zio->io_error != 0) { - hdr->b_flags |= ARC_IO_ERROR; + hdr->b_flags |= ARC_FLAG_IO_ERROR; if (hdr->b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = refcount_is_zero(&hdr->b_refcnt); } /* * Broadcast before we drop the hash_lock to avoid the possibility * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ cv_broadcast(&hdr->b_cv); if (hash_lock) { mutex_exit(hash_lock); } else { /* * This block was freed while we waited for the read to * complete. It has been removed from the hash table and * moved to the anonymous state (so that it won't show up * in the cache). */ ASSERT3P(hdr->b_state, ==, arc_anon); freeable = refcount_is_zero(&hdr->b_refcnt); } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done) acb->acb_done(zio, acb->acb_buf, acb->acb_private); if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_next; kmem_free(acb, sizeof (arc_callback_t)); } if (freeable) arc_hdr_destroy(hdr); } /* * "Read" the block block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. 
Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, - void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, - const zbookmark_phys_t *zb) + void *private, zio_priority_t priority, int zio_flags, + arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; arc_buf_t *buf = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp) || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); top: if (!BP_IS_EMBEDDED(bp)) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } if (hdr != NULL && hdr->b_datacnt > 0) { - *arc_flags |= ARC_CACHED; + *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { - if (*arc_flags & ARC_WAIT) { + if (*arc_flags & ARC_FLAG_WAIT) { cv_wait(&hdr->b_cv, hash_lock); mutex_exit(hash_lock); goto top; } - ASSERT(*arc_flags & ARC_NOWAIT); + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; hdr->b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); return (0); } mutex_exit(hash_lock); return (0); } ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); if (done) { add_reference(hdr, hash_lock, private); /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. 
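The behaviour described in the comment above, and implemented across the following branches, amounts to a three-way dispatch on the header lookup: cached and idle, cached with a read already in flight, or not cached at all. The sketch below models only that dispatch with plain booleans standing in for the real header flags; it is illustrative, not the kernel control flow:

/* Toy model of the top-level dispatch in arc_read(). */
#include <stdio.h>
#include <stdbool.h>

typedef enum {
    DISPATCH_HIT_CALLBACK,   /* cached: run the done callback right away */
    DISPATCH_WAIT,           /* read in flight, caller asked to wait */
    DISPATCH_QUEUE_CALLBACK, /* read in flight, chain the done callback */
    DISPATCH_RETURN,         /* read in flight, nothing for us to do */
    DISPATCH_ISSUE_READ      /* miss: issue the I/O and fill the cache */
} read_dispatch_t;

static read_dispatch_t
arc_read_dispatch(bool cached, bool io_in_progress, bool caller_waits,
    bool has_done_callback)
{
    if (!cached)
        return (DISPATCH_ISSUE_READ);
    if (io_in_progress) {
        if (caller_waits)
            return (DISPATCH_WAIT);
        return (has_done_callback ?
            DISPATCH_QUEUE_CALLBACK : DISPATCH_RETURN);
    }
    return (DISPATCH_HIT_CALLBACK);
}

int
main(void)
{
    /* Cached block with a read in flight and a callback supplied. */
    printf("%d\n", arc_read_dispatch(true, true, false, true));
    return (0);
}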
*/ buf = hdr->b_buf; ASSERT(buf); ASSERT(buf->b_data); if (HDR_BUF_AVAILABLE(hdr)) { ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; } else { buf = arc_buf_clone(buf); } - } else if (*arc_flags & ARC_PREFETCH && + } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_refcnt) == 0) { - hdr->b_flags |= ARC_PREFETCH; + hdr->b_flags |= ARC_FLAG_PREFETCH; } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags & ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits); if (done) done(NULL, buf, private); } else { uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; enum zio_compress b_compress = ZIO_COMPRESS_OFF; uint64_t b_asize = 0; if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); buf = arc_buf_alloc(spa, size, private, type); hdr = buf->b_hdr; if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); (void) arc_buf_remove_ref(buf, private); goto top; /* restart the IO request */ } + /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) { + if (*arc_flags & ARC_FLAG_PREFETCH) { (void) remove_reference(hdr, hash_lock, private); - hdr->b_flags |= ARC_PREFETCH; + hdr->b_flags |= ARC_FLAG_PREFETCH; } - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags & ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_INDIRECT; + hdr->b_flags |= ARC_FLAG_INDIRECT; } else { /* this block is in the ghost cache */ ASSERT(GHOST_STATE(hdr->b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(refcount_count(&hdr->b_refcnt)); ASSERT(hdr->b_buf == NULL); /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) - hdr->b_flags |= ARC_PREFETCH; + if (*arc_flags & ARC_FLAG_PREFETCH) + hdr->b_flags |= ARC_FLAG_PREFETCH; else add_reference(hdr, hash_lock, private); - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags & ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; hdr->b_buf = buf; ASSERT(hdr->b_datacnt == 0); hdr->b_datacnt = 1; arc_get_data_buf(buf); arc_access(hdr, hash_lock); } ASSERT(!GHOST_STATE(hdr->b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = 
done; acb->acb_private = private; ASSERT(hdr->b_acb == NULL); hdr->b_acb = acb; - hdr->b_flags |= ARC_IO_IN_PROGRESS; + hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; if (hdr->b_l2hdr != NULL && (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr->b_dev->l2ad_writing; addr = hdr->b_l2hdr->b_daddr; b_compress = hdr->b_l2hdr->b_compress; b_asize = hdr->b_l2hdr->b_asize; /* * Lock out device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } if (hash_lock != NULL) mutex_exit(hash_lock); /* * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. */ ASSERT3U(hdr->b_size, ==, size); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); #ifdef _KERNEL curthread->td_ru.ru_inblock++; #endif if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. */ if (hdr->b_l2hdr != NULL && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_buf = buf; cb->l2rcb_spa = spa; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; cb->l2rcb_compress = b_compress; ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + size < vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. 
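The numbered conditions in the comment above collapse into a single boolean test in the code. Restated as a stand-alone predicate over plain values (illustrative only, not the kernel flag macros):

/* The L2ARC read-eligibility test from arc_read(), as a plain predicate. */
#include <stdio.h>
#include <stdbool.h>

static bool
l2arc_read_eligible(bool vdev_cached, bool has_l2hdr, bool l2_writing,
    bool l2_evicted, bool is_prefetch, bool noprefetch_tunable)
{
    return (vdev_cached &&                      /* 1. vdev still around */
        has_l2hdr &&                            /* 2. L2ARC metadata present */
        !l2_writing &&                          /* 3. not being written out */
        !l2_evicted &&                          /* 4. entry not evicted */
        !(noprefetch_tunable && is_prefetch));  /* 5. prefetch policy allows */
}

int
main(void)
{
    /* A demand read of a resident L2ARC entry is served from the L2ARC. */
    printf("%d\n", l2arc_read_eligible(true, true, false, false,
        false, true));
    return (0);
}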
*/ if (b_compress == ZIO_COMPRESS_EMPTY) { rzio = zio_null(pio, spa, vd, l2arc_read_done, cb, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); } else { rzio = zio_read_phys(pio, vd, addr, b_asize, buf->b_data, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); } DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); - if (*arc_flags & ARC_NOWAIT) { + if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); return (0); } - ASSERT(*arc_flags & ARC_WAIT); + ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) return (0); /* l2arc read error; goto zio_read() */ } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); if (l2arc_ndev != 0) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); - if (*arc_flags & ARC_WAIT) + if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); - ASSERT(*arc_flags & ARC_NOWAIT); + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } return (0); } void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); ASSERT(buf->b_hdr->b_state != arc_anon); ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); ASSERT(buf->b_efunc == NULL); ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); buf->b_efunc = func; buf->b_private = private; } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; if (HDR_BUF_AVAILABLE(hdr)) { arc_buf_t *buf = hdr->b_buf; add_reference(hdr, hash_lock, FTAG); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; mutex_exit(hash_lock); arc_release(buf, FTAG); (void) arc_buf_remove_ref(buf, FTAG); } else { mutex_exit(hash_lock); } } /* * Clear the user eviction callback set by arc_set_callback(), first calling * it if it exists. Because the presence of a callback keeps an arc_buf cached * clearing the callback may result in the arc_buf being destroyed. However, * it will not result in the *last* arc_buf being destroyed, hence the data * will remain cached in the ARC. We make a copy of the arc buffer here so * that we can process the callback without holding any locks. * * It's possible that the callback is already in the process of being cleared * by another thread. In this case we can not clear the callback. * * Returns B_TRUE if the callback was successfully called and cleared. */ boolean_t arc_clear_callback(arc_buf_t *buf) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; arc_evict_func_t *efunc = buf->b_efunc; void *private = buf->b_private; list_t *list, *evicted_list; kmutex_t *lock, *evicted_lock; mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). 
*/ ASSERT(buf->b_data == NULL); mutex_exit(&buf->b_evict_lock); return (B_FALSE); } else if (buf->b_data == NULL) { /* * We are on the eviction list; process this buffer now * but let arc_do_user_evicts() do the reaping. */ buf->b_efunc = NULL; mutex_exit(&buf->b_evict_lock); VERIFY0(efunc(private)); return (B_TRUE); } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); buf->b_efunc = NULL; buf->b_private = NULL; if (hdr->b_datacnt > 1) { mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, FALSE, TRUE); } else { ASSERT(buf == hdr->b_buf); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; mutex_exit(&buf->b_evict_lock); } mutex_exit(hash_lock); VERIFY0(efunc(private)); return (B_TRUE); } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock = NULL; l2arc_buf_hdr_t *l2hdr; uint64_t buf_size; /* * It would be nice to assert that if it's DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_refcnt) > 0); if (hdr->b_state == arc_anon) { /* this buffer is already released */ ASSERT(buf->b_efunc == NULL); } else { hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); } l2hdr = hdr->b_l2hdr; if (l2hdr) { mutex_enter(&l2arc_buflist_mtx); arc_buf_l2_cdata_free(hdr); hdr->b_l2hdr = NULL; list_remove(l2hdr->b_dev->l2ad_buflist, hdr); } buf_size = hdr->b_size; /* * Do we have more than one buf? */ if (hdr->b_datacnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; uint32_t flags = hdr->b_flags; ASSERT(hdr->b_buf != buf || buf->b_next != NULL); /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. */ (void) remove_reference(hdr, hash_lock, tag); bufp = &hdr->b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; *bufp = buf->b_next; buf->b_next = NULL; ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); if (refcount_is_zero(&hdr->b_refcnt)) { uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } /* * We're releasing a duplicate user data buffer, update * our statistics accordingly. 
*/ if (hdr->b_type == ARC_BUFC_DATA) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -hdr->b_size); } hdr->b_datacnt -= 1; arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif /* illumos */ mutex_exit(hash_lock); nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); nhdr->b_size = blksz; nhdr->b_spa = spa; nhdr->b_type = type; nhdr->b_buf = buf; nhdr->b_state = arc_anon; nhdr->b_arc_access = 0; - nhdr->b_flags = flags & ARC_L2_WRITING; + nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; nhdr->b_freeze_cksum = NULL; (void) refcount_add(&nhdr->b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); atomic_add_64(&arc_anon->arcs_size, blksz); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_refcnt) == 1); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); if (hdr->b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; if (hash_lock) mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } buf->b_efunc = NULL; buf->b_private = NULL; if (l2hdr) { ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); vdev_space_update(l2hdr->b_dev->l2ad_vdev, -l2hdr->b_asize, 0, 0); trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr, hdr->b_size, 0); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -buf_size); mutex_exit(&l2arc_buflist_mtx); } } int arc_released(arc_buf_t *buf) { int released; mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { int referenced; mutex_enter(&buf->b_evict_lock); referenced = (refcount_count(&buf->b_hdr->b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); callback->awcb_ready(zio, buf, callback->awcb_private); /* * If the IO is already in progress, then this is a re-write * attempt, so we need to thaw and re-compute the cksum. * It is the responsibility of the callback to handle the * accounting for any re-write attempt. */ if (HDR_IO_IN_PROGRESS(hdr)) { mutex_enter(&hdr->b_freeze_lock); if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } mutex_exit(&hdr->b_freeze_lock); } arc_cksum_compute(buf, B_FALSE); - hdr->b_flags |= ARC_IO_IN_PROGRESS; + hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; } /* * The SPA calls this callback for each physical write that happens on behalf * of a logical write. See the comment in dbuf_write_physdone() for details. 
*/ static void arc_write_physdone(zio_t *zio) { arc_write_callback_t *cb = zio->io_private; if (cb->awcb_physdone != NULL) cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(hdr->b_acb == NULL); if (zio->io_error == 0) { if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; } } else { ASSERT(BUF_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!BUF_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT(zio->io_error == 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(refcount_is_zero(&exists->b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(exists); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_datacnt == 1); ASSERT(hdr->b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ if (!exists && hdr->b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; } ASSERT(!refcount_is_zero(&hdr->b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; ASSERT(ready != NULL); ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); + ASSERT((hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) == 0); ASSERT(hdr->b_acb == NULL); if (l2arc) - hdr->b_flags |= ARC_L2CACHE; + hdr->b_flags |= ARC_FLAG_L2CACHE; if (l2arc_compress) - hdr->b_flags |= ARC_L2COMPRESS; + hdr->b_flags |= ARC_FLAG_L2COMPRESS; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, arc_write_ready, arc_write_physdone, arc_write_done, callback, priority, zio_flags, zb); 
return (zio); } static int arc_memory_throttle(uint64_t reserve, uint64_t txg) { #ifdef _KERNEL uint64_t available_memory = ptob(freemem); static uint64_t page_load = 0; static uint64_t last_txg = 0; #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) available_memory = MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); #endif if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) return (0); if (txg > last_txg) { last_txg = txg; page_load = 0; } /* * If we are in pageout, we know that memory is already tight, * the arc is already going to be evicting, so we just want to * continue to let page writes occur as quickly as possible. */ if (curproc == pageproc) { if (page_load > MAX(ptob(minfree), available_memory) / 4) return (SET_ERROR(ERESTART)); /* Note: reserve is inflated, so we deflate */ page_load += reserve / 8; return (0); } else if (page_load > 0 && arc_reclaim_needed()) { /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); return (SET_ERROR(EAGAIN)); } page_load = 0; #endif return (0); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (reserve > arc_c/4 && !arc_no_grow) { arc_c = MIN(arc_c_max, reserve * 4); DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); } if (reserve > arc_c) return (SET_ERROR(ENOMEM)); /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve>>10, arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, reserve>>10, arc_c>>10); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static kmutex_t arc_lowmem_lock; #ifdef _KERNEL static eventhandler_tag arc_event_lowmem = NULL; static void arc_lowmem(void *arg __unused, int howto __unused) { /* Serialize access via arc_lowmem_lock. */ mutex_enter(&arc_lowmem_lock); mutex_enter(&arc_reclaim_thr_lock); needfree = 1; DTRACE_PROBE(arc__needfree); cv_signal(&arc_reclaim_thr_cv); /* * It is unsafe to block here in arbitrary threads, because we can come * here from ARC itself and may hold ARC locks and thus risk a deadlock * with ARC reclaim thread. 
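arc_tempreserve_space() above throttles dirty data with two thresholds: a reservation larger than arc_c fails outright with ENOMEM, and the caller is asked to retry (ERESTART) when the reservation plus existing reservations and anonymous data would push the cache past half full while anonymous data alone already exceeds a quarter of it. A stand-alone restatement of that check (errno-style returns; the loaned-bytes adjustment and memory throttle are not modelled):

/* Model of the dirty-data throttle in arc_tempreserve_space(). */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#ifndef ERESTART
#define ERESTART EAGAIN /* user-space stand-in for the kernel's ERESTART */
#endif

static int
tempreserve_check(uint64_t reserve, uint64_t arc_c, uint64_t arc_tempreserve,
    uint64_t anon_size)
{
    if (reserve > arc_c)
        return (ENOMEM);        /* can never fit in the cache */

    /*
     * Keep the cache under half dirty: if this reservation plus what is
     * already reserved and anonymous would exceed arc_c / 2, and anonymous
     * data alone is past arc_c / 4, ask the caller to retry later.
     */
    if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
        anon_size > arc_c / 4)
        return (ERESTART);

    return (0);                 /* reservation accepted */
}

int
main(void)
{
    uint64_t arc_c = 4ULL << 30;    /* 4 GiB cache target */

    /* 256 MB reservation on top of 1 GiB reserved and 1200 MB anonymous. */
    printf("%d\n", tempreserve_check(256 << 20, arc_c, 1ULL << 30,
        1200ULL << 20));
    return (0);
}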
*/ if (curproc == pageproc) { while (needfree) msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); } mutex_exit(&arc_reclaim_thr_lock); mutex_exit(&arc_lowmem_lock); } #endif void arc_init(void) { int i, prefetch_tunable_set = 0; mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; /* Start out with 1/8 of all memory */ arc_c = kmem_size() / 8; #ifdef sun #ifdef _KERNEL /* * On architectures where the physical memory can be larger * than the addressable space (intel in 32-bit mode), we may * need to limit the cache to 1/8 of VM size. */ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); #endif #endif /* sun */ /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ arc_c_min = MAX(arc_c / 4, 64<<18); /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ if (arc_c * 8 >= 1<<30) arc_c_max = (arc_c * 8) - (1<<30); else arc_c_max = arc_c_min; arc_c_max = MAX(arc_c * 5, arc_c_max); #ifdef _KERNEL /* * Allow the tunables to override our calculations if they are * reasonable (ie. over 16MB) */ if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size()) arc_c_max = zfs_arc_max; if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; #endif arc_c = arc_c_max; arc_p = (arc_c >> 1); /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; /* Allow the tunable to override if it is reasonable */ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; if (zfs_arc_meta_min > 0) { arc_meta_min = zfs_arc_meta_min; } else { arc_meta_min = arc_c_min / 2; } if (zfs_arc_grow_retry > 0) arc_grow_retry = zfs_arc_grow_retry; if (zfs_arc_shrink_shift > 0) arc_shrink_shift = zfs_arc_shrink_shift; if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift; /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; zfs_arc_min = arc_c_min; zfs_arc_max = arc_c_max; arc_anon = &ARC_anon; arc_mru = &ARC_mru; arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; arc_l2c_only = &ARC_l2c_only; arc_size = 0; for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { mutex_init(&arc_anon->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mru->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_mru->arcs_lists[i], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mru_ghost->arcs_lists[i], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mfu->arcs_lists[i], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mfu_ghost->arcs_lists[i], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mfu_ghost->arcs_lists[i], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_l2c_only->arcs_lists[i], sizeof (arc_buf_hdr_t), 
offsetof(arc_buf_hdr_t, b_arc_node)); } buf_init(); arc_thread_exit = 0; arc_eviction_list = NULL; mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; kstat_install(arc_ksp); } (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, minclsyspri); #ifdef _KERNEL arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, EVENTHANDLER_PRI_FIRST); #endif arc_dead = FALSE; arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by /etc/system, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4GB). */ if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = ptob(physmem) * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } #ifdef _KERNEL if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) prefetch_tunable_set = 1; #ifdef __i386__ if (prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default on i386 " "-- to enable,\n"); printf(" add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); zfs_prefetch_disable = 1; } #else if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default if less " "than 4GB of RAM is present;\n" " to enable, add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); zfs_prefetch_disable = 1; } #endif /* Warn about ZFS memory and address space requirements. */ if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " "expect unstable behavior.\n"); } if (kmem_size() < 512 * (1 << 20)) { printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " "expect unstable behavior.\n"); printf(" Consider tuning vm.kmem_size and " "vm.kmem_size_max\n"); printf(" in /boot/loader.conf.\n"); } #endif } void arc_fini(void) { int i; mutex_enter(&arc_reclaim_thr_lock); arc_thread_exit = 1; cv_signal(&arc_reclaim_thr_cv); while (arc_thread_exit != 0) cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); mutex_exit(&arc_reclaim_thr_lock); arc_flush(NULL); arc_dead = TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } mutex_destroy(&arc_eviction_mtx); mutex_destroy(&arc_reclaim_thr_lock); cv_destroy(&arc_reclaim_thr_cv); for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { list_destroy(&arc_mru->arcs_lists[i]); list_destroy(&arc_mru_ghost->arcs_lists[i]); list_destroy(&arc_mfu->arcs_lists[i]); list_destroy(&arc_mfu_ghost->arcs_lists[i]); list_destroy(&arc_l2c_only->arcs_lists[i]); mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); } buf_fini(); ASSERT(arc_loaned_bytes == 0); mutex_destroy(&arc_lowmem_lock); #ifdef _KERNEL if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); #endif } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 
* It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. 
After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. */ static boolean_t -l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ - if (ab->b_spa != spa_guid) { + if (hdr->b_spa != spa_guid) { ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); return (B_FALSE); } - if (ab->b_l2hdr != NULL) { + if (hdr->b_l2hdr != NULL) { ARCSTAT_BUMP(arcstat_l2_write_in_l2); return (B_FALSE); } - if (HDR_IO_IN_PROGRESS(ab)) { + if (HDR_IO_IN_PROGRESS(hdr)) { ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); return (B_FALSE); } - if (!HDR_L2CACHE(ab)) { + if (!HDR_L2CACHE(hdr)) { ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); return (B_FALSE); } return (B_TRUE); } static uint64_t l2arc_write_size(void) { uint64_t size; /* * Make sure our globals have meaningful values in case the user * altered them. 
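l2arc_write_eligible() above rejects a buffer for exactly the four reasons listed in its comment. The same test, restated as a stand-alone predicate over plain values (illustrative names, not the kernel types):

/* The four l2arc_write_eligible() checks, restated over plain values. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool
write_eligible(uint64_t dev_spa_guid, uint64_t buf_spa_guid,
    bool already_in_l2arc, bool io_in_progress, bool l2cache_flag)
{
    if (buf_spa_guid != dev_spa_guid)   /* 1. belongs to a different pool */
        return (false);
    if (already_in_l2arc)               /* 2. already cached on the L2ARC */
        return (false);
    if (io_in_progress)                 /* 3. possibly an incomplete read */
        return (false);
    if (!l2cache_flag)                  /* 4. flagged not eligible (property) */
        return (false);
    return (true);
}

int
main(void)
{
    printf("%d\n", write_eligible(0x1234, 0x1234, false, false, true));
    return (0);
}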
*/ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } static void l2arc_hdr_stat_add(void) { ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); } static void l2arc_hdr_stat_remove(void) { ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; } while (vdev_is_dead(next->l2ad_vdev)); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev)) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write() { list_t *buflist; l2arc_data_free_t *df, *df_prev; mutex_enter(&l2arc_free_on_write_mtx); buflist = l2arc_free_on_write; for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); ASSERT(df->l2df_data != NULL); ASSERT(df->l2df_func != NULL); df->l2df_func(df->l2df_data, df->l2df_size); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. 
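l2arc_write_interval() above speeds the feed thread up when the previous pass wrote more than half of what it wanted and otherwise idles at the normal cadence; the MAX/MIN combination keeps the next wakeup from landing before now or more than one interval past the start of the previous pass. A user-space rendition, assuming stock defaults of l2arc_feed_secs = 1 and l2arc_feed_min_ms = 200, with hz = 1000 for illustration:

/* Model of the L2ARC feed-thread pacing in l2arc_write_interval(). */
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

static const int hz = 1000;                 /* ticks per second, assumed */
static const int l2arc_feed_secs = 1;       /* stock default */
static const int l2arc_feed_min_ms = 200;   /* stock default */
static const int l2arc_feed_again = 1;

static int64_t
write_interval(int64_t began, uint64_t wanted, uint64_t wrote, int64_t now)
{
    int64_t interval;

    /* Busy lists: wake up again quickly.  Stale lists: idle back. */
    if (l2arc_feed_again && wrote > (wanted / 2))
        interval = ((int64_t)hz * l2arc_feed_min_ms) / 1000;
    else
        interval = (int64_t)hz * l2arc_feed_secs;

    /* Never before 'now', never more than one interval past 'began'. */
    return (MAX(now, MIN(now + interval, began + interval)));
}

int
main(void)
{
    /* Previous pass started at tick 0 and wrote 6 MB of the 8 MB wanted. */
    printf("next wakeup at tick %jd\n",
        (intmax_t)write_interval(0, 8 << 20, 6 << 20, 150));
    return (0);
}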
*/ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_dev_t *dev; list_t *buflist; - arc_buf_hdr_t *head, *ab, *ab_prev; + arc_buf_hdr_t *head, *hdr, *hdr_prev; l2arc_buf_hdr_t *abl2; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT(cb != NULL); dev = cb->l2wcb_dev; ASSERT(dev != NULL); head = cb->l2wcb_head; ASSERT(head != NULL); buflist = dev->l2ad_buflist; ASSERT(buflist != NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); mutex_enter(&l2arc_buflist_mtx); /* * All writes completed, or an error was hit. */ - for (ab = list_prev(buflist, head); ab; ab = ab_prev) { - ab_prev = list_prev(buflist, ab); - abl2 = ab->b_l2hdr; + for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(buflist, hdr); + abl2 = hdr->b_l2hdr; /* * Release the temporary compressed buffer as soon as possible. */ if (abl2->b_compress != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(ab); + l2arc_release_cdata_buf(hdr); - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * This buffer misses out. It may be in a stage * of eviction. Its ARC_L2_WRITING flag will be * left set, denying reads to this buffer. */ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); continue; } if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ - list_remove(buflist, ab); + list_remove(buflist, hdr); ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_dropped += abl2->b_asize; - ab->b_l2hdr = NULL; + hdr->b_l2hdr = NULL; trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr, - ab->b_size, 0); + hdr->b_size, 0); kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); - ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); } /* * Allow ARC to begin reads to this L2ARC entry. */ - ab->b_flags &= ~ARC_L2_WRITING; + hdr->b_flags &= ~ARC_FLAG_L2_WRITING; mutex_exit(hash_lock); } atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); kmem_cache_free(hdr_cache, head); mutex_exit(&l2arc_buflist_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; arc_buf_t *buf; kmutex_t *hash_lock; int equal; ASSERT(zio->io_vd != NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; ASSERT(cb != NULL); buf = cb->l2rcb_buf; ASSERT(buf != NULL); hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the buffer was compressed, decompress it first. */ if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); ASSERT(zio->io_data != NULL); /* * Check this survived the L2ARC journey. */ equal = arc_cksum_equal(buf); if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = buf; zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ arc_read_done(zio); } else { mutex_exit(hash_lock); /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. 
*/ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!equal) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, buf->b_data, zio->io_size, arc_read_done, buf, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static list_t * l2arc_list_locked(int list_num, kmutex_t **lock) { list_t *list = NULL; int idx; ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS); if (list_num < ARC_BUFC_NUMMETADATALISTS) { idx = list_num; list = &arc_mfu->arcs_lists[idx]; *lock = ARCS_LOCK(arc_mfu, idx); } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) { idx = list_num - ARC_BUFC_NUMMETADATALISTS; list = &arc_mru->arcs_lists[idx]; *lock = ARCS_LOCK(arc_mru, idx); } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 + ARC_BUFC_NUMDATALISTS)) { idx = list_num - ARC_BUFC_NUMMETADATALISTS; list = &arc_mfu->arcs_lists[idx]; *lock = ARCS_LOCK(arc_mfu, idx); } else { idx = list_num - ARC_BUFC_NUMLISTS; list = &arc_mru->arcs_lists[idx]; *lock = ARCS_LOCK(arc_mru, idx); } ASSERT(!(MUTEX_HELD(*lock))); mutex_enter(*lock); return (list); } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; l2arc_buf_hdr_t *abl2; - arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; int64_t bytes_evicted = 0; buflist = dev->l2ad_buflist; if (buflist == NULL) return; if (!all && dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. */ return; } if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { /* * When nearing the end of the device, evict to the end * before the device write hand jumps to the start. */ taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); top: mutex_enter(&l2arc_buflist_mtx); - for (ab = list_tail(buflist); ab; ab = ab_prev) { - ab_prev = list_prev(buflist, ab); + for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(buflist, hdr); - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&l2arc_buflist_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } - if (HDR_L2_WRITE_HEAD(ab)) { + if (HDR_L2_WRITE_HEAD(hdr)) { /* * We hit a write head node. Leave it for * l2arc_write_done(). 
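The scan order documented above l2arc_list_locked() boils down to: metadata before data, and within each class, MFU before MRU. A toy sketch of that four-way priority (the FreeBSD code additionally splits each class into several sublists, which this ignores; not part of this change):

#include <stdio.h>

enum arc_state_kind { MFU, MRU };
enum arc_buf_kind { METADATA, DATA };

struct l2arc_pick {
	enum arc_state_kind	state;
	enum arc_buf_kind	type;
};

static struct l2arc_pick
l2arc_pick_list(int list_num)
{
	/* Metadata is scanned before data, and MFU before MRU. */
	static const struct l2arc_pick order[4] = {
		{ MFU, METADATA },
		{ MRU, METADATA },
		{ MFU, DATA },
		{ MRU, DATA },
	};
	return (order[list_num & 3]);
}

int
main(void)
{
	for (int i = 0; i < 4; i++) {
		struct l2arc_pick p = l2arc_pick_list(i);
		printf("pass %d: %s %s\n", i,
		    p.state == MFU ? "MFU" : "MRU",
		    p.type == METADATA ? "metadata" : "data");
	}
	return (0);
}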
*/ - list_remove(buflist, ab); + list_remove(buflist, hdr); mutex_exit(hash_lock); continue; } - if (!all && ab->b_l2hdr != NULL && - (ab->b_l2hdr->b_daddr > taddr || - ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { + if (!all && hdr->b_l2hdr != NULL && + (hdr->b_l2hdr->b_daddr > taddr || + hdr->b_l2hdr->b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } - if (HDR_FREE_IN_PROGRESS(ab)) { + if (HDR_FREE_IN_PROGRESS(hdr)) { /* * Already on the path to destruction. */ mutex_exit(hash_lock); continue; } - if (ab->b_state == arc_l2c_only) { - ASSERT(!HDR_L2_READING(ab)); + if (hdr->b_state == arc_l2c_only) { + ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_size. */ - arc_change_state(arc_anon, ab, hash_lock); - arc_hdr_destroy(ab); + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); } else { /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ - if (HDR_L2_READING(ab)) { + if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); - ab->b_flags |= ARC_L2_EVICTED; + hdr->b_flags |= ARC_FLAG_L2_EVICTED; } /* * Tell ARC this no longer exists in L2ARC. */ - if (ab->b_l2hdr != NULL) { - abl2 = ab->b_l2hdr; + if (hdr->b_l2hdr != NULL) { + abl2 = hdr->b_l2hdr; ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_evicted += abl2->b_asize; - ab->b_l2hdr = NULL; + hdr->b_l2hdr = NULL; /* * We are destroying l2hdr, so ensure that * its compressed buffer, if any, is not leaked. */ ASSERT(abl2->b_tmp_cdata == NULL); kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); - ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); } - list_remove(buflist, ab); + list_remove(buflist, hdr); /* * This may have been leftover after a * failed write. */ - ab->b_flags &= ~ARC_L2_WRITING; + hdr->b_flags &= ~ARC_FLAG_L2_WRITING; } mutex_exit(hash_lock); } mutex_exit(&l2arc_buflist_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0); dev->l2ad_evict = taddr; } /* * Find and write ARC buffers to the L2ARC device. * - * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid + * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { - arc_buf_hdr_t *ab, *ab_prev, *head; + arc_buf_hdr_t *hdr, *hdr_prev, *head; list_t *list; uint64_t write_asize, write_psize, write_sz, headroom, buf_compress_minsz; void *buf_data; kmutex_t *list_lock; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); const boolean_t do_headroom_boost = *headroom_boost; int try; ASSERT(dev->l2ad_vdev != NULL); /* Lower the flag now, we might want to raise it again later. 
*/ *headroom_boost = B_FALSE; pio = NULL; write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); - head->b_flags |= ARC_L2_WRITE_HEAD; + head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); /* * We will want to try to compress buffers that are at least 2x the * device sector size. */ buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; /* * Copy buffers for L2ARC writing. */ mutex_enter(&l2arc_buflist_mtx); for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { uint64_t passed_sz = 0; list = l2arc_list_locked(try, &list_lock); ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) - ab = list_head(list); + hdr = list_head(list); else - ab = list_tail(list); - if (ab == NULL) + hdr = list_tail(list); + if (hdr == NULL) ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS; if (do_headroom_boost) headroom = (headroom * l2arc_headroom_boost) / 100; - for (; ab; ab = ab_prev) { + for (; hdr; hdr = hdr_prev) { l2arc_buf_hdr_t *l2hdr; kmutex_t *hash_lock; uint64_t buf_sz; if (arc_warm == B_FALSE) - ab_prev = list_next(list, ab); + hdr_prev = list_next(list, hdr); else - ab_prev = list_prev(list, ab); - ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size); + hdr_prev = list_prev(list, hdr); + ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); /* * Skip this buffer rather than waiting. */ continue; } - passed_sz += ab->b_size; + passed_sz += hdr->b_size; if (passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); break; } - if (!l2arc_write_eligible(guid, ab)) { + if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } - if ((write_sz + ab->b_size) > target_sz) { + if ((write_sz + hdr->b_size) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_full); break; } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ list_insert_head(dev->l2ad_buflist, head); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); ARCSTAT_BUMP(arcstat_l2_write_pios); } /* * Create and add a new L2ARC header. */ l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); l2hdr->b_dev = dev; - ab->b_flags |= ARC_L2_WRITING; + hdr->b_flags |= ARC_FLAG_L2_WRITING; /* * Temporarily stash the data buffer in b_tmp_cdata. * The subsequent write step will pick it up from - * there. This is because can't access ab->b_buf + * there. This is because can't access hdr->b_buf * without holding the hash_lock, which we in turn * can't access without holding the ARC list locks * (which we want to avoid during compression/writing). 
*/ l2hdr->b_compress = ZIO_COMPRESS_OFF; - l2hdr->b_asize = ab->b_size; - l2hdr->b_tmp_cdata = ab->b_buf->b_data; + l2hdr->b_asize = hdr->b_size; + l2hdr->b_tmp_cdata = hdr->b_buf->b_data; - buf_sz = ab->b_size; - ab->b_l2hdr = l2hdr; + buf_sz = hdr->b_size; + hdr->b_l2hdr = l2hdr; - list_insert_head(dev->l2ad_buflist, ab); + list_insert_head(dev->l2ad_buflist, hdr); /* * Compute and store the buffer cksum before * writing. On debug the cksum is verified first. */ - arc_cksum_verify(ab->b_buf); - arc_cksum_compute(ab->b_buf, B_TRUE); + arc_cksum_verify(hdr->b_buf); + arc_cksum_compute(hdr->b_buf, B_TRUE); mutex_exit(hash_lock); write_sz += buf_sz; } mutex_exit(list_lock); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_sz); mutex_exit(&l2arc_buflist_mtx); kmem_cache_free(hdr_cache, head); return (0); } /* * Now start writing the buffers. We're starting at the write head * and work backwards, retracing the course of the buffer selector * loop above. */ - for (ab = list_prev(dev->l2ad_buflist, head); ab; - ab = list_prev(dev->l2ad_buflist, ab)) { + for (hdr = list_prev(dev->l2ad_buflist, head); hdr; + hdr = list_prev(dev->l2ad_buflist, hdr)) { l2arc_buf_hdr_t *l2hdr; uint64_t buf_sz; /* * We shouldn't need to lock the buffer here, since we flagged - * it as ARC_L2_WRITING in the previous step, but we must take - * care to only access its L2 cache parameters. In particular, - * ab->b_buf may be invalid by now due to ARC eviction. + * it as ARC_FLAG_L2_WRITING in the previous step, but we must + * take care to only access its L2 cache parameters. In + * particular, hdr->b_buf may be invalid by now due to + * ARC eviction. */ - l2hdr = ab->b_l2hdr; + l2hdr = hdr->b_l2hdr; l2hdr->b_daddr = dev->l2ad_hand; - if ((ab->b_flags & ARC_L2COMPRESS) && + if ((hdr->b_flags & ARC_FLAG_L2COMPRESS) && l2hdr->b_asize >= buf_compress_minsz) { if (l2arc_compress_buf(l2hdr)) { /* * If compression succeeded, enable headroom * boost on the next scan cycle. */ *headroom_boost = B_TRUE; } } /* * Pick up the buffer data we had previously stashed away * (and now potentially also compressed). */ buf_data = l2hdr->b_tmp_cdata; buf_sz = l2hdr->b_asize; /* * If the data has not been compressed, then clear b_tmp_cdata * to make sure that it points only to a temporary compression * buffer. */ if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress)) l2hdr->b_tmp_cdata = NULL; /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { uint64_t buf_p_sz; wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); write_asize += buf_sz; /* * Keep the clock hand suitably device-aligned. */ buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); write_psize += buf_p_sz; dev->l2ad_hand += buf_p_sz; } } mutex_exit(&l2arc_buflist_mtx); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); ARCSTAT_INCR(arcstat_l2_asize, write_asize); vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. 
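The hand advance in the write loop above rounds each physical write up to the device's allocation size via vdev_psize_to_asize(). A standalone sketch of that rounding, under the simplifying assumption of a single-disk cache vdev where asize is just psize rounded up to 1 << ashift (toy code, not part of this change):

#include <stdint.h>
#include <stdio.h>

static uint64_t
psize_to_asize(uint64_t psize, uint64_t ashift)
{
	uint64_t align = 1ULL << ashift;

	/* Round the write up to a whole number of device sectors. */
	return ((psize + align - 1) & ~(align - 1));
}

int
main(void)
{
	/*
	 * 4K-sector cache device (ashift = 12): a 5000-byte compressed
	 * buffer still consumes two full sectors, so the clock hand
	 * advances by 8192 bytes.
	 */
	printf("%ju\n", (uintmax_t)psize_to_asize(5000, 12));
	return (0);
}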
*/ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; } dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; return (write_asize); } /* * Compresses an L2ARC buffer. * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its * size in l2hdr->b_asize. This routine tries to compress the data and * depending on the compression result there are three possible outcomes: * *) The buffer was incompressible. The original l2hdr contents were left * untouched and are ready for writing to an L2 device. * *) The buffer was all-zeros, so there is no need to write it to an L2 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. * *) Compression succeeded and b_tmp_cdata was replaced with a temporary * data buffer which holds the compressed data to be written, and b_asize * tells us how much data there is. b_compress is set to the appropriate * compression algorithm. Once writing is done, invoke * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. * * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the * buffer was incompressible). */ static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) { void *cdata; size_t csize, len, rounded; ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); ASSERT(l2hdr->b_tmp_cdata != NULL); len = l2hdr->b_asize; cdata = zio_data_buf_alloc(len); csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, cdata, l2hdr->b_asize); if (csize == 0) { /* zero block, indicate that there's nothing to write */ zio_data_buf_free(cdata, len); l2hdr->b_compress = ZIO_COMPRESS_EMPTY; l2hdr->b_asize = 0; l2hdr->b_tmp_cdata = NULL; ARCSTAT_BUMP(arcstat_l2_compress_zeros); return (B_TRUE); } rounded = P2ROUNDUP(csize, (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift); if (rounded < len) { /* * Compression succeeded, we'll keep the cdata around for * writing and release it afterwards. */ if (rounded > csize) { bzero((char *)cdata + csize, rounded - csize); csize = rounded; } l2hdr->b_compress = ZIO_COMPRESS_LZ4; l2hdr->b_asize = csize; l2hdr->b_tmp_cdata = cdata; ARCSTAT_BUMP(arcstat_l2_compress_successes); return (B_TRUE); } else { /* * Compression failed, release the compressed buffer. * l2hdr will be left unmodified. */ zio_data_buf_free(cdata, len); ARCSTAT_BUMP(arcstat_l2_compress_failures); return (B_FALSE); } } /* * Decompresses a zio read back from an l2arc device. On success, the * underlying zio's io_data buffer is overwritten by the uncompressed * version. On decompression error (corrupt compressed stream), the * zio->io_error value is set to signal an I/O error. * * Please note that the compressed data stream is not checksummed, so * if the underlying device is experiencing data corruption, we may feed * corrupt data to the decompressor, so the decompressor needs to be * able to handle this situation (LZ4 does). */ static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) { ASSERT(L2ARC_IS_VALID_COMPRESS(c)); if (zio->io_error != 0) { /* * An io error has occured, just restore the original io * size in preparation for a main pool read. */ zio->io_orig_size = zio->io_size = hdr->b_size; return; } if (c == ZIO_COMPRESS_EMPTY) { /* * An empty buffer results in a null zio, which means we * need to fill its io_data after we're done restoring the * buffer's contents. 
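The ZIO_COMPRESS_EMPTY case here is one of the three outcomes documented above l2arc_compress_buf(): keep the original buffer, write nothing for an all-zero buffer, or keep the compressed copy when it saves at least one sector. A toy sketch of that decision, given the sizes the compressor reports (not part of this change):

#include <stddef.h>
#include <stdio.h>

enum l2_compress_outcome { KEEP_RAW, EMPTY_BLOCK, KEEP_COMPRESSED };

/*
 * 'csize' is what the compressor returned (0 means all zeros), 'len'
 * is the original size, 'ashift' is the cache device's sector shift.
 */
static enum l2_compress_outcome
classify(size_t csize, size_t len, int ashift)
{
	size_t align = (size_t)1 << ashift;
	size_t rounded = (csize + align - 1) & ~(align - 1);

	if (csize == 0)
		return (EMPTY_BLOCK);		/* nothing to write at all */
	if (rounded < len)
		return (KEEP_COMPRESSED);	/* saved at least one sector */
	return (KEEP_RAW);			/* not worth it, write original */
}

int
main(void)
{
	printf("%d\n", classify(0, 16384, 9));		/* EMPTY_BLOCK */
	printf("%d\n", classify(3000, 16384, 9));	/* KEEP_COMPRESSED */
	printf("%d\n", classify(16100, 16384, 9));	/* KEEP_RAW: rounds to 16384 */
	return (0);
}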
*/ ASSERT(hdr->b_buf != NULL); bzero(hdr->b_buf->b_data, hdr->b_size); zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; } else { ASSERT(zio->io_data != NULL); /* * We copy the compressed data from the start of the arc buffer * (the zio_read will have pulled in only what we need, the * rest is garbage which we will overwrite at decompression) * and then decompress back to the ARC data buffer. This way we * can minimize copying by simply decompressing back over the * original compressed data (rather than decompressing to an * aux buffer and then copying back the uncompressed buffer, * which is likely to be much larger). */ uint64_t csize; void *cdata; csize = zio->io_size; cdata = zio_data_buf_alloc(csize); bcopy(zio->io_data, cdata, csize); if (zio_decompress_data(c, cdata, zio->io_data, csize, hdr->b_size) != 0) zio->io_error = EIO; zio_data_buf_free(cdata, csize); } /* Restore the expected uncompressed IO size. */ zio->io_orig_size = zio->io_size = hdr->b_size; } /* * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. * This buffer serves as a temporary holder of compressed data while * the buffer entry is being written to an l2arc device. Once that is * done, we can dispose of it. */ static void -l2arc_release_cdata_buf(arc_buf_hdr_t *ab) +l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr; + l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress)); if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) { /* * If the data was compressed, then we've allocated a * temporary buffer for it, so now we need to release it. */ ASSERT(l2hdr->b_tmp_cdata != NULL); - zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); + zio_data_buf_free(l2hdr->b_tmp_cdata, hdr->b_size); l2hdr->b_tmp_cdata = NULL; } else { ASSERT(l2hdr->b_tmp_cdata == NULL); } } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ static void l2arc_feed_thread(void *dummy __unused) { callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); boolean_t headroom_boost = B_FALSE; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next - ddi_get_lbolt()); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT(spa != NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. 
*/ if (arc_reclaim_needed()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev != NULL); } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; ASSERT(!l2arc_vdev_present(vd)); vdev_ashift_optimize(vd); /* * Create a new l2arc device entry. */ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; /* * This is a list of all ARC buffers that are still valid on the * device. */ adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *dev, *nextdev, *remdev = NULL; /* * Find the device by vdev */ mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { nextdev = list_next(l2arc_dev_list, dev); if (vd == dev->l2ad_vdev) { remdev = dev; break; } } ASSERT(remdev != NULL); /* * Remove device from global list */ list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. 
*/ l2arc_evict(remdev, 0, B_TRUE); list_destroy(remdev->l2ad_buflist); kmem_free(remdev->l2ad_buflist, sizeof (list_t)); kmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; l2arc_writes_sent = 0; l2arc_writes_done = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { /* * This is called from dmu_fini(), which is called from spa_fini(); * Because of this, we can assume that all l2arc devices have * already been removed when the pools themselves were removed. */ l2arc_do_free_on_write(); mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_buflist_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & FWRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, minclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & FWRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 275810) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 275811) @@ -1,2836 +1,2837 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Number of times that zfs_free_range() took the slow path while doing * a zfs receive. A nonzero value indicates a potential performance problem. 
*/ uint64_t zfs_free_range_recv_miss; static void dbuf_destroy(dmu_buf_impl_t *db); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_cache; /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) { dmu_buf_impl_t *db = vdb; bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); refcount_create(&db->db_holds); return (0); } /* ARGSUSED */ static void dbuf_dest(void *vdb, void *unused) { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); refcount_destroy(&db->db_holds); } /* * dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; static uint64_t dbuf_hash_count; static uint64_t dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) { uintptr_t osv = (uintptr_t)os; uint64_t crc = -1ULL; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); return (crc); } #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ (dbuf)->db_level == (level) && \ (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = dn->dn_objset; uint64_t obj = dn->dn_object; uint64_t hv = DBUF_HASH(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *db; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. */ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid = db->db_blkid; uint64_t hv = DBUF_HASH(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); } } mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_inc_64(&dbuf_hash_count); return (NULL); } /* * Remove an entry from the hash table. It must be in the EVICTING state. 
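dbuf_find() above locates a dbuf by hashing the (objset, object, level, blkid) tuple and walking a per-bucket chain under the bucket mutex. A simplified, lock-free sketch of the same lookup shape, with a multiplicative mix standing in for the zfs_crc64_table folding (toy code, not part of this change):

#include <stdint.h>
#include <stddef.h>

struct toy_dbuf {
	void		*db_objset;
	uint64_t	 db_object;
	uint8_t		 db_level;
	uint64_t	 db_blkid;
	struct toy_dbuf	*db_hash_next;	/* collision chain */
};

#define	TOY_TABLE_SIZE	(1u << 12)	/* must stay a power of two */
static struct toy_dbuf *toy_table[TOY_TABLE_SIZE];

static uint64_t
toy_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uint64_t h = (uintptr_t)os >> 6;	/* drop low pointer bits */

	h = (h ^ obj) * 0x9e3779b97f4a7c15ULL;
	h = (h ^ lvl) * 0x9e3779b97f4a7c15ULL;
	h = (h ^ blkid) * 0x9e3779b97f4a7c15ULL;
	return (h);
}

static struct toy_dbuf *
toy_find(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uint64_t idx = toy_hash(os, obj, lvl, blkid) & (TOY_TABLE_SIZE - 1);

	/* Walk the per-bucket chain comparing the full tuple. */
	for (struct toy_dbuf *db = toy_table[idx]; db != NULL;
	    db = db->db_hash_next) {
		if (db->db_objset == os && db->db_object == obj &&
		    db->db_level == lvl && db->db_blkid == blkid)
			return (db);
	}
	return (NULL);
}

int
main(void)
{
	struct toy_dbuf db = { (void *)0x1000, 5, 0, 42, NULL };
	uint64_t idx = toy_hash(db.db_objset, db.db_object, db.db_level,
	    db.db_blkid) & (TOY_TABLE_SIZE - 1);

	toy_table[idx] = &db;	/* insert at the head of its bucket */
	return (toy_find(db.db_objset, 5, 0, 42) == &db ? 0 : 1);
}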
*/ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf, **dbp; /* * We musn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; ASSERT(dbf != NULL); } *dbp = db->db_hash_next; db->db_hash_next = NULL; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_hash_count); } static arc_evict_func_t dbuf_do_evict; static void dbuf_evict_user(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level != 0 || db->db_evict_func == NULL) return; db->db_evict_func(&db->db, db->db_user_ptr); db->db_user_ptr = NULL; db->db_evict_func = NULL; } boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { if (db->db_level > 0) { return (B_TRUE); } else { boolean_t is_metadata; DB_DNODE_ENTER(db); is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); } } void dbuf_evict(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_buf == NULL); ASSERT(db->db_data_pending == NULL); dbuf_clear(db); dbuf_destroy(db); } void dbuf_init(void) { uint64_t hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; int i; /* * The hash table is big enough to fill all of physical memory * with an average 4K block size. The table will take up * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). */ while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) hsize <<= 1; retry: h->hash_table_mask = hsize - 1; h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); if (h->hash_table == NULL) { /* XXX - we should really return an error instead of assert */ ASSERT(hsize > (1ULL << 10)); hsize >>= 1; goto retry; } dbuf_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); } void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; int i; for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); kmem_cache_destroy(dbuf_cache); } /* * Other stuff. 
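The sizing rule in dbuf_init() above gives the table one bucket head per average 4K block of physical memory, which with 8-byte pointers works out to roughly 2MB of table per GB of RAM. A small worked example of that arithmetic (physmem expressed in bytes here rather than pages, purely for illustration; not part of this change):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t physmem_bytes = 16ULL << 30;	/* example: 16 GB of RAM */
	uint64_t hsize = 1ULL << 16;

	/* Double the bucket count until every 4K of RAM has a bucket. */
	while (hsize * 4096 < physmem_bytes)
		hsize <<= 1;

	printf("buckets: %ju\n", (uintmax_t)hsize);
	printf("table size: %ju MB\n",
	    (uintmax_t)(hsize * sizeof (void *) >> 20));	/* 32 MB here */
	return (0);
}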
*/ #ifdef ZFS_DEBUG static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) return; ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. */ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ if (db->db_blkptr) { if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't * have the struct_rwlock. XXX indblksz no longer * grows. safe to do this now? */ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); } } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. */ if (db->db_dirtycnt == 0) { uint64_t *buf = db->db.db_data; int i; for (i = 0; i < db->db.db_size >> 3; i++) { ASSERT(buf[i] == 0); } } } DB_DNODE_EXIT(db); } #endif static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); db->db_buf = buf; if (buf != NULL) { ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; if (!arc_released(buf)) arc_set_callback(buf, dbuf_do_evict, db); } else { dbuf_evict_user(db); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; } } /* * Loan out an arc_buf for read. Return the loaned arc_buf. 
*/ arc_buf_t * dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); dbuf_set_data(db, NULL); mutex_exit(&db->db_mtx); } return (abuf); } uint64_t dbuf_whichblock(dnode_t *dn, uint64_t offset) { if (dn->dn_datablkshift) { return (offset >> dn->dn_datablkshift); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); } } static void dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); if (db->db_level == 0 && db->db_freed_in_flight) { /* we were freed in flight; disregard any error */ arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else if (zio == NULL || zio->io_error == 0) { dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); VERIFY(arc_buf_remove_ref(buf, db)); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL); } static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { dnode_t *dn; zbookmark_phys_t zb; - uint32_t aflags = ARC_NOWAIT; + arc_flags_t aflags = ARC_FLAG_NOWAIT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. */ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DMU_BONUS_BLKID) { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } /* * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); DB_DNODE_EXIT(db); dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; mutex_exit(&db->db_mtx); return; } DB_DNODE_EXIT(db); db->db_state = DB_READ; mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) - aflags |= ARC_L2CACHE; + aflags |= ARC_FLAG_L2CACHE; if (DBUF_IS_L2COMPRESSIBLE(db)) - aflags |= ARC_L2COMPRESS; + aflags |= ARC_FLAG_L2COMPRESS; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); - if (aflags & ARC_CACHED) + if (aflags & ARC_FLAG_CACHED) *flags |= DB_RF_CACHED; } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; boolean_t havepzio = (zio != NULL); boolean_t prefetch; dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it * can't be freed while we have a hold on the buffer. */ ASSERT(!refcount_is_zero(&db->db_holds)); if (db->db_state == DB_NOFILL) return (SET_ERROR(EIO)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; if (zio == NULL) zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_read_impl(db, zio, &flags); /* dbuf_read_impl has dropped db_mtx for us */ if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, flags & DB_RF_CACHED); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); if (!havepzio) err = zio_wait(zio); } else { /* * Another reader came in while the dbuf was in flight * between UNCACHED and CACHED. Either a writer will finish * writing the buffer (sending the dbuf to CACHED) or the * first reader's request will reach the read_done callback * and send the dbuf to CACHED. Otherwise, a failure * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); /* Skip the wait per the caller's request. */ mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, zio_t *, zio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); } mutex_exit(&db->db_mtx); } ASSERT(err || havepzio || db->db_state == DB_CACHED); return (err); } static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_set_data(db, NULL); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } mutex_exit(&db->db_mtx); } /* * This is our just-in-time copy function. 
 * It makes a copy of buffers that have been modified in a previous
 * transaction group, before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next, db_search;
	uint64_t txg = tx->tx_txg;
	avl_index_t where;

	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
		end_blkid = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

	db_search.db_level = 0;
	db_search.db_blkid = start_blkid;
	db_search.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
		/* There can't be any dbufs in this range; no need to search.
*/ #ifdef DEBUG db = avl_find(&dn->dn_dbufs, &db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); ASSERT(db == NULL || db->db_level > 0); #endif mutex_exit(&dn->dn_dbufs_mtx); return; } else if (dmu_objset_is_receiving(dn->dn_objset)) { /* * If we are receiving, we expect there to be no dbufs in * the range to be freed, because receive modifies each * block at most once, and in offset order. If this is * not the case, it can lead to performance problems, * so note that we unexpectedly took the slow path. */ atomic_inc_64(&zfs_free_range_recv_miss); } db = avl_find(&dn->dn_dbufs, &db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = db_next) { db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level != 0 || db->db_blkid > end_blkid) { break; } ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); if (dbuf_undirty(db, tx)) { /* mutex has been dropped and dbuf destroyed */ continue; } if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); continue; } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_clear(db); continue; } /* The dbuf is referenced */ if (db->db_last_dirty != NULL) { dbuf_dirty_record_t *dr = db->db_last_dirty; if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file * size to reflect that this buffer may * contain new data when we sync. */ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { /* * This dbuf is not dirty in the open context. * Either uncache it (if its not referenced in * the open context) or reset its contents to * empty. */ dbuf_fix_old_data(db, txg); } } /* clear the contents if its cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); bzero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); } mutex_exit(&db->db_mtx); } mutex_exit(&dn->dn_dbufs_mtx); } static int dbuf_block_freeable(dmu_buf_impl_t *db) { dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; uint64_t birth_txg = 0; /* * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr. * * This logic ensures that only block births for * filled blocks are considered. */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_last_dirty && (db->db_blkptr == NULL || !BP_IS_HOLE(db->db_blkptr))) { birth_txg = db->db_last_dirty->dr_txg; } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { birth_txg = db->db_blkptr->blk_birth; } /* * If this block don't exist or is in a snapshot, it can't be freed. * Don't pass the bp to dsl_dataset_block_freeable() since we * are holding the db_mtx lock and might deadlock if we are * prefetching a dedup-ed block. 
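Concretely, the birth txg computed above (and tested right below) comes from the pending dirty record when one exists, otherwise from the on-disk block pointer, and a result of zero means a hole that can never be freed. A toy sketch of that selection (not part of this change):

#include <stdint.h>
#include <stdio.h>

struct toy_block {
	uint64_t dirty_txg;	/* 0 if not dirty */
	uint64_t bp_birth_txg;	/* 0 if the blkptr is a hole or missing */
};

static uint64_t
birth_txg(const struct toy_block *b)
{
	/* A pending dirty record takes precedence over the on-disk bp. */
	if (b->dirty_txg != 0)
		return (b->dirty_txg);
	return (b->bp_birth_txg);	/* may be 0: nothing to free */
}

int
main(void)
{
	struct toy_block dirty = { 105, 90 };
	struct toy_block clean = { 0, 90 };
	struct toy_block hole = { 0, 0 };

	printf("%ju %ju %ju\n", (uintmax_t)birth_txg(&dirty),
	    (uintmax_t)birth_txg(&clean), (uintmax_t)birth_txg(&hole));
	return (0);
}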
*/ if (birth_txg != 0) return (ds == NULL || dsl_dataset_block_freeable(ds, NULL, birth_txg)); else return (B_FALSE); } void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *obuf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* XXX does *this* func really need the lock? */ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held * is OK, because there can be no other references to the db * when we are changing its size, so no concurrent DB_FILL can * be happening. */ /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) bzero((uint8_t *)buf->b_data + osize, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); VERIFY(arc_buf_remove_ref(obuf, db)); db->db.db_size = size; if (db->db_level == 0) { ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); db->db_last_dirty->dt.dl.dr_data = buf; } mutex_exit(&db->db_mtx); dnode_willuse_space(dn, size-osize, tx); DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { objset_t *os = db->db_objset; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); (void) arc_release(db->db_buf, db); } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* * XXX make this true for indirects too? The problem is that * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); /* * Don't set dirtyctx to SYNC if we're just modifying this as we * initialize the objset. */ if (dn->dn_dirtyctx == DN_UNDIRTIED && !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN); ASSERT(dn->dn_dirtyctx_firstset == NULL); dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); } mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. */ drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && db->db_state != DB_NOFILL) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); return (dr); } /* * Only valid if not already dirty. */ ASSERT(dn->dn_object == 0 || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || dn->dn_next_nlevels[txgoff] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); /* * We should only be dirtying in syncing context if it's the * mos or we're initializing the os or it's a special object. * However, we are allowed to dirty in syncing context provided * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ os = dn->dn_objset; ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID) { /* * Update the accounting. * Note: we delay "free accounting" until after we drop * the db_mtx. This keeps us from grabbing other locks * (and possibly deadlocking) in bp_get_dsize() while * also holding the db_mtx. */ dnode_willuse_space(dn, db->db.db_size, tx); do_free_accounting = dbuf_block_freeable(db); } /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); if (db->db_level == 0) { void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so * that we can modify it without impacting * possible other users of this cached data * block. Note that indirect blocks and * private objects are not released until the * syncing state (since they are only modified * then). 
*/ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; dr->dr_next = *drp; *drp = dr; /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } /* * This buffer is now part of this txg */ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); db->db_dirtycnt += 1; ASSERT3U(db->db_dirtycnt, <=, 3); mutex_exit(&db->db_mtx); if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } else if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? bp_get_dsize(os->os_spa, bp) : db->db.db_size; /* * This is only a guess -- if the dbuf is dirty * in a previous txg, we don't know how much * space it will use on disk yet. We should * really have the struct_rwlock to access * db_blkptr, but since this is just a guess, * it's OK if we get an odd answer. */ ddt_prefetch(os->os_spa, bp); dnode_willuse_space(dn, -willfree, tx); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } if (db->db_level == 0) { dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); ASSERT(dn->dn_maxblkid >= db->db_blkid); } if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; parent = dbuf_hold_level(dn, db->db_level+1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level+1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); /* * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. 
*/ if (db->db_last_dirty == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); } else { ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); /* * If this buffer is not dirty, we're done. */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) return (B_FALSE); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); /* * Any space we accounted for in dp_dirty_* will be cleaned up by * dsl_pool_sync(). This is relatively rare so the discrepancy * is not a big deal. */ *drp = dr->dr_next; /* * Note that there are three places in dbuf_dirty() * where this dirty record may be put on a list. * Make sure to do a list_remove corresponding to * every one of those list_insert calls. 
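 * Specifically: bonus and spill buffers are put on dn_dirty_records[],
 * buffers with an indirect parent are put on that parent dirty
 * record's dr_children list, and top-level buffers referenced directly
 * from the dnode are put on dn_dirty_records[] as well.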
*/ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_blkid == DMU_SPILL_BLKID || db->db_level+1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } DB_DNODE_EXIT(db); if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); } if (db->db_level != 0) { mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { arc_buf_t *buf = db->db_buf; ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); dbuf_set_data(db, NULL); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); return (B_TRUE); } return (B_FALSE); } void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); (void) dbuf_dirty(db, tx); } void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_state = DB_NOFILL; dmu_buf_will_fill(db_fake, tx); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); dbuf_noread(db); (void) dbuf_dirty(db, tx); } #pragma weak dmu_buf_fill_done = dbuf_fill_done /* ARGSUSED */ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) { mutex_enter(&db->db_mtx); DBUF_VERIFY(db); if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? 
*/ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; } db->db_state = DB_CACHED; cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); } void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); dmu_buf_will_not_fill(dbuf, tx); ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); dl = &db->db_last_dirty->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); BP_SET_TYPE(&dl->dr_overridden_by, type); BP_SET_LEVEL(&dl->dr_overridden_by, 0); BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; } /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. */ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); ASSERT(buf != NULL); ASSERT(arc_buf_size(buf) == db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); ASSERT(arc_released(buf)); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); if (db->db_state == DB_CACHED && refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); VERIFY(arc_buf_remove_ref(buf, db)); xuio_stat_wbuf_copied(); return; } xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); if (!arc_released(db->db_buf)) { ASSERT(dr->dt.dl.dr_override_state == DR_OVERRIDDEN); arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; VERIFY(arc_buf_remove_ref(db->db_buf, db)); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); VERIFY(arc_buf_remove_ref(db->db_buf, db)); } db->db_buf = NULL; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); db->db_state = DB_FILL; mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); dmu_buf_fill_done(&db->db, tx); } /* * "Clear" the contents of this dbuf. This will mark the dbuf * EVICTING and clear *most* of its references. Unfortunately, * when we are not holding the dn_dbufs_mtx, we can't clear the * entry in the dn_dbufs list. We have to wait until dbuf_destroy() * in this case. For callers from the DMU we will usually see: * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() * For the arc callback, we will usually see: * dbuf_do_evict()->dbuf_clear();dbuf_destroy() * Sometimes, though, we will get a mix of these two: * DMU: dbuf_clear()->arc_clear_callback() * ARC: dbuf_do_evict()->dbuf_destroy() * * This routine will dissociate the dbuf from the arc, by calling * arc_clear_callback(), but will not evict the data from the ARC. 
*/ void dbuf_clear(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; boolean_t dbuf_gone = B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); dbuf_evict_user(db); if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); if (db->db_blkid == DMU_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; db->db_state = DB_UNCACHED; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), * so the dnode cannot be moved until after we release the hold. * The membar_producer() ensures visibility of the decremented * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. */ dnode_rele(dn, db); db->db_dnode_handle = NULL; } else { DB_DNODE_EXIT(db); } if (db->db_buf) dbuf_gone = arc_clear_callback(db->db_buf); if (!dbuf_gone) mutex_exit(&db->db_mtx); /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) dbuf_rele(parent, db); } static int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp) { int nlevels, epbs; *parentp = NULL; *bpp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); if (blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) *bpp = &dn->dn_phys->dn_spill; else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; mutex_exit(&dn->dn_mtx); return (0); } if (dn->dn_phys->dn_nlevels == 0) nlevels = 1; else nlevels = dn->dn_phys->dn_nlevels; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(level * epbs, <, 64); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (level >= nlevels || (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err = dbuf_hold_impl(dn, level+1, blkid >> epbs, fail_sparse, NULL, parentp); if (err) return (err); err = dbuf_read(*parentp, NULL, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); return (0); } else { /* the block is referenced from the dnode */ ASSERT3U(level, ==, nlevels-1); ASSERT(dn->dn_phys->dn_nblkptr == 0 || blkid < dn->dn_phys->dn_nblkptr); if (dn->dn_dbuf) { dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; } *bpp = &dn->dn_phys->dn_blkptr[blkid]; return (0); } } static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; 
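/*
 * Worked example for the bonus-buffer sizing a few lines below
 * (assuming the classic 512-byte dnode layout, where DN_MAX_BONUSLEN
 * is 320 and sizeof (blkptr_t) is 128): a dnode with dn_nblkptr == 1
 * gets a 320-byte bonus dbuf, while one with dn_nblkptr == 3 gets
 * 320 - 2 * 128 = 64 bytes.
 */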
db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; db->db_user_ptr = NULL; db->db_evict_func = NULL; db->db_immediate_evict = 0; db->db_freed_in_flight = 0; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone * trying to look up this dbuf before its added to the * dn_dbufs list. */ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ kmem_cache_free(dbuf_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } avl_add(&dn->dn_dbufs, db); if (db->db_level == 0 && db->db_blkid >= dn->dn_unlisted_l0_blkid) dn->dn_unlisted_l0_blkid = db->db_blkid + 1; db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || refcount_count(&dn->dn_holds) > 0); (void) refcount_add(&dn->dn_holds, db); atomic_inc_32(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); return (db); } static int dbuf_do_evict(void *private) { dmu_buf_impl_t *db = private; if (!MUTEX_HELD(&db->db_mtx)) mutex_enter(&db->db_mtx); ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_state != DB_EVICTING) { ASSERT(db->db_state == DB_CACHED); DBUF_VERIFY(db); db->db_buf = NULL; dbuf_evict(db); } else { mutex_exit(&db->db_mtx); dbuf_destroy(db); } return (0); } static void dbuf_destroy(dmu_buf_impl_t *db) { ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_blkid != DMU_BONUS_BLKID) { /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. */ if (db->db_dnode_handle != NULL) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); mutex_exit(&dn->dn_dbufs_mtx); DB_DNODE_EXIT(db); /* * Decrementing the dbuf count means that the hold * corresponding to the removed dbuf is no longer * discounted in dnode_move(), so the dnode cannot be * moved until after we release the hold. 
*/ dnode_rele(dn, db); db->db_dnode_handle = NULL; } dbuf_hash_remove(db); } db->db_parent = NULL; db->db_buf = NULL; ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); } void dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) { dmu_buf_impl_t *db = NULL; blkptr_t *bp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (dnode_block_freed(dn, blkid)) return; /* dbuf_find() returns with db_mtx held */ if (db = dbuf_find(dn, 0, blkid)) { /* * This dbuf is already in the cache. We assume that * it is already CACHED, or else about to be either * read or filled. */ mutex_exit(&db->db_mtx); return; } if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t aflags = + ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, 0, blkid); (void) arc_read(NULL, dn->dn_objset->os_spa, bp, NULL, NULL, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } if (db) dbuf_rele(db, NULL); } } /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); *dbp = NULL; top: /* dbuf_find() returns with db_mtx held */ db = dbuf_find(dn, level, blkid); if (db == NULL) { blkptr_t *bp = NULL; int err; ASSERT3P(parent, ==, NULL); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if (fail_sparse) { if (err == 0 && bp && BP_IS_HOLE(bp)) err = SET_ERROR(ENOENT); if (err) { if (parent) dbuf_rele(parent, NULL); return (err); } } if (err && err != ENOENT) return (err); db = dbuf_create(dn, level, blkid, parent, bp); } if (db->db_buf && refcount_is_zero(&db->db_holds)) { arc_buf_add_ref(db->db_buf, db); if (db->db_buf->b_data == NULL) { dbuf_clear(db); if (parent) { dbuf_rele(parent, NULL); parent = NULL; } goto top; } ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); /* * If this buffer is currently syncing out, and we are are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. 
*/ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; if (dr->dt.dl.dr_data == db->db_buf) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } } (void) refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (parent) dbuf_rele(parent, NULL); ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; return (0); } dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); return (err ? NULL : db); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); return (err ? NULL : db); } void dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); } int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); DB_DNODE_ENTER(db); dn = DB_DNODE(db); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); return (0); } void dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) { dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = refcount_add(&db->db_holds, tag); ASSERT(holds > 1); } /* * If you call dbuf_rele() you had better not be referencing the dnode handle * unless you have some other direct or indirect hold on the dnode. (An indirect * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag); } void dmu_buf_rele(dmu_buf_t *db, void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. */ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) { int64_t holds; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus * buffer has a corresponding dnode hold. */ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); /* * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ if (db->db_buf && holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) arc_buf_freeze(db->db_buf); if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { mutex_exit(&db->db_mtx); /* * If the dnode moves here, we cannot cross this barrier * until the move completes. */ DB_DNODE_ENTER(db); atomic_dec_32(&DB_DNODE(db)->dn_dbufs_count); DB_DNODE_EXIT(db); /* * The bonus buffer's dnode hold is no longer discounted * in dnode_move(). The dnode cannot move until after * the dnode_rele(). */ dnode_rele(DB_DNODE(db), db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_evict(db); } else if (arc_released(db->db_buf)) { arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ dbuf_set_data(db, NULL); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); } else { VERIFY(!arc_buf_remove_ref(db->db_buf, db)); /* * A dbuf will be eligible for eviction if either the * 'primarycache' property is set or a duplicate * copy of this buffer is already cached in the arc. * * In the case of the 'primarycache' a buffer * is considered for eviction if it matches the * criteria set in the property. * * To decide if our buffer is considered a * duplicate, we must call into the arc to determine * if multiple buffers are referencing the same * block on-disk. If so, then we simply evict * ourselves. */ if (!DBUF_IS_CACHEABLE(db)) { if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr) && !BP_IS_EMBEDDED(db->db_blkptr)) { spa_t *spa = dmu_objset_spa(db->db_objset); blkptr_t bp = *db->db_blkptr; dbuf_clear(db); arc_freed(spa, &bp); } else { dbuf_clear(db); } } else if (arc_buf_eviction_needed(db->db_buf)) { dbuf_clear(db); } else { mutex_exit(&db->db_mtx); } } } else { mutex_exit(&db->db_mtx); } } #pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { return (refcount_count(&db->db_holds)); } void * dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, dmu_buf_evict_func_t *evict_func) { return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); } void * dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, dmu_buf_evict_func_t *evict_func) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_immediate_evict = TRUE; return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); } void * dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, dmu_buf_evict_func_t *evict_func) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_level == 0); ASSERT((user_ptr == NULL) == (evict_func == NULL)); mutex_enter(&db->db_mtx); if (db->db_user_ptr == old_user_ptr) { db->db_user_ptr = user_ptr; db->db_evict_func = evict_func; } else { old_user_ptr = db->db_user_ptr; } mutex_exit(&db->db_mtx); return (old_user_ptr); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(!refcount_is_zero(&db->db_holds)); return (db->db_user_ptr); } boolean_t dmu_buf_freeable(dmu_buf_t *dbuf) { boolean_t res = B_FALSE; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; if (db->db_blkptr) res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, db->db_blkptr, db->db_blkptr->blk_birth); return (res); } blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_blkptr); } static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { /* 
ASSERT(dmu_tx_is_syncing(tx) */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_blkptr != NULL) return; if (db->db_blkid == DMU_SPILL_BLKID) { db->db_blkptr = &dn->dn_phys->dn_spill; BP_ZERO(db->db_blkptr); return; } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was * no available blkptrs from the dnode, or it was * inappropriate to hook it in (i.e., nlevels mis-match). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); } else { dmu_buf_impl_t *parent = db->db_parent; int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT(dn->dn_phys->dn_nlevels > 1); if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); (void) dbuf_hold_impl(dn, db->db_level+1, db->db_blkid >> epbs, FALSE, db, &parent); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; } db->db_blkptr = (blkptr_t *)parent->db.db_data + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } } static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); ASSERT(db->db_level > 0); DBUF_VERIFY(db); /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); DB_DNODE_EXIT(db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); } static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). 
*/ if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_dirty_record_t **drp; ASSERT(*datap != NULL); ASSERT0(db->db_level); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); DB_DNODE_EXIT(db); if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; ASSERT(dr->dr_next == NULL); ASSERT(dr->dr_dbuf == db); *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); return; } os = dn->dn_objset; /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we * don't check the dr_override_state until we have returned from * dbuf_check_blkptr. */ dbuf_check_blkptr(dn, db); /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), * then make a copy before we start the write so that * any modifications from the open txg will not leak * into this write. * * NOTE: this copy does not need to be made for * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). */ int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); *datap = arc_buf_alloc(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); DB_DNODE_EXIT(db); } else { /* * Although zio_nowait() does not "wait for an IO", it does * initiate the IO. If this is an empty write it seems plausible * that the IO could actually be completed before the nowait * returns. We need to DB_DNODE_EXIT() first in case * zio_nowait() invalidates the dbuf. */ DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } void dbuf_sync_list(list_t *list, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; while (dr = list_head(list)) { if (dr->dr_zio != NULL) { /* * If we find an already initialized zio then we * are processing the meta-dnode, and we have finished. * The dbufs for all dnodes are put back on the list * during processing, so that we can zio_wait() * these IOs after initiating all child IOs. 
ASSERT3U(dr->dr_dbuf->db.db_object, ==, DMU_META_DNODE_OBJECT); break; } list_remove(list, dr); if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); } } /* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; ASSERT3P(db->db_blkptr, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (bp->blk_birth != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_bonustype) || BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); } #endif if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && db->db_blkid != DMU_SPILL_BLKID) dn->dn_phys->dn_maxblkid = db->db_blkid; mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { dnode_phys_t *dnp = db->db.db_data; for (i = db->db.db_size >> DNODE_SHIFT; i > 0; i--, dnp++) { if (dnp->dn_type != DMU_OT_NONE) fill++; } } else { if (BP_IS_HOLE(bp)) { fill = 0; } else { fill = 1; } } } else { blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) bp->blk_fill = fill; mutex_exit(&db->db_mtx); } /* * The SPA will call this callback several times for each zio - once * for every physical child i/o (zio->io_phys_children times). This * allows the DMU to monitor the progress of each logical i/o. For example, * there may be 2 copies of an indirect block, or many fragments of a RAID-Z * block. There may be a long delay before all copies/fragments are completed, * so this callback allows us to retire dirty space gradually, as the physical * i/os complete. */ /* ARGSUSED */ static void dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) { dmu_buf_impl_t *db = arg; objset_t *os = db->db_objset; dsl_pool_t *dp = dmu_objset_pool(os); dbuf_dirty_record_t *dr; int delta = 0; dr = db->db_data_pending; ASSERT3U(dr->dr_txg, ==, zio->io_txg); /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding * error will be cleaned up by dsl_pool_sync()'s call to * dsl_pool_undirty_space(). */ delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); } /* ARGSUSED */ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; dbuf_dirty_record_t **drp, *dr; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting. 
*/ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } mutex_enter(&db->db_mtx); DBUF_VERIFY(db); drp = &db->db_last_dirty; while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); DB_DNODE_EXIT(db); } #endif if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); else if (!arc_released(db->db_buf)) arc_set_callback(db->db_buf, dbuf_do_evict, db); } } else { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(db->db_blkid, <=, dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); if (!arc_released(db->db_buf)) arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); } static void dbuf_write_nofill_ready(zio_t *zio) { dbuf_write_ready(zio, NULL, zio->io_private); } static void dbuf_write_nofill_done(zio_t *zio) { dbuf_write_done(zio, NULL, zio->io_private); } static void dbuf_write_override_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; dbuf_write_ready(zio, NULL, db); } static void dbuf_write_override_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *obp = &dr->dt.dl.dr_overridden_by; mutex_enter(&db->db_mtx); if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); } /* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; int wp_flag = 0; DB_DNODE_ENTER(db); dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* * Private object buffers are released here rather * than in dbuf_dirty() since they are only modified * in the syncing context and we don't want the * overhead of making multiple copies of the data. */ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { dbuf_release_bp(db); } } } if (parent != dn->dn_dbuf) { /* Our parent is an indirect block. 
*/ /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); /* * We're about to modify our parent's db_data by modifying * our block pointer, so the parent must be released. */ ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { /* Our parent is the dnode itself. */ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); zio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ASSERT(zio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); DB_DNODE_EXIT(db); if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ void *contents = (data != NULL) ? data->b_data : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, contents, db->db.db_size, &zp, dbuf_write_override_ready, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, NULL, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); dr->dr_zio = arc_write(zio, os->os_spa, txg, db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c (revision 275810) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c (revision 275811) @@ -1,251 +1,251 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct diffarg { struct file *da_fp; /* file to which we are reporting */ offset_t *da_offp; int da_err; /* error that stopped diff search */ dmu_diff_record_t da_ddr; kthread_t *da_td; }; static int write_bytes(struct diffarg *da) { struct uio auio; struct iovec aiov; aiov.iov_base = (caddr_t)&da->da_ddr; aiov.iov_len = sizeof (da->da_ddr); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = aiov.iov_len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_offset = (off_t)-1; auio.uio_td = da->da_td; #ifdef _KERNEL if (da->da_fp->f_type == DTYPE_VNODE) bwillwrite(); return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td)); #else fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); return (EOPNOTSUPP); #endif } static int write_record(struct diffarg *da) { if (da->da_ddr.ddr_type == DDR_NONE) { da->da_err = 0; return (0); } da->da_err = write_bytes(da); *da->da_offp += sizeof (da->da_ddr); return (da->da_err); } static int report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last) { ASSERT(first <= last); if (da->da_ddr.ddr_type != DDR_FREE || first != da->da_ddr.ddr_last + 1) { if (write_record(da) != 0) return (da->da_err); da->da_ddr.ddr_type = DDR_FREE; da->da_ddr.ddr_first = first; da->da_ddr.ddr_last = last; return (0); } da->da_ddr.ddr_last = last; return (0); } static int report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) { ASSERT(dnp != NULL); if (dnp->dn_type == DMU_OT_NONE) return (report_free_dnode_range(da, object, object)); if (da->da_ddr.ddr_type != DDR_INUSE || object != da->da_ddr.ddr_last + 1) { if (write_record(da) != 0) return (da->da_err); da->da_ddr.ddr_type = DDR_INUSE; da->da_ddr.ddr_first = da->da_ddr.ddr_last = object; return (0); } da->da_ddr.ddr_last = object; return (0); } #define DBP_SPAN(dnp, level) \ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) /* ARGSUSED */ static int diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { struct diffarg *da = arg; int err = 0; if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); if (zb->zb_object != DMU_META_DNODE_OBJECT) return (0); if (BP_IS_HOLE(bp)) { uint64_t span = DBP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = report_free_dnode_range(da, dnobj, dnobj + (span >> DNODE_SHIFT) - 1); if (err) return (err); } else if (zb->zb_level == 0) { dnode_phys_t *blk; arc_buf_t *abuf; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; int blksz = BP_GET_LSIZE(bp); int i; if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (SET_ERROR(EIO)); blk = abuf->b_data; for (i = 0; i < blksz >> DNODE_SHIFT; i++) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; err = report_dnode(da, dnobj, blk+i); if (err) break; } (void) arc_buf_remove_ref(abuf, &abuf); if (err) 
return (err); /* Don't care about the data blocks */ return (TRAVERSE_VISIT_NO_CHILDREN); } return (0); } int dmu_diff(const char *tosnap_name, const char *fromsnap_name, #ifdef illumos struct vnode *vp, offset_t *offp) #else struct file *fp, offset_t *offp) #endif { struct diffarg da; dsl_dataset_t *fromsnap; dsl_dataset_t *tosnap; dsl_pool_t *dp; int error; uint64_t fromtxg; if (strchr(tosnap_name, '@') == NULL || strchr(fromsnap_name, '@') == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(tosnap_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EXDEV)); } fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg; dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_long_hold(tosnap, FTAG); dsl_pool_rele(dp, FTAG); da.da_fp = fp; da.da_offp = offp; da.da_ddr.ddr_type = DDR_NONE; da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; da.da_err = 0; da.da_td = curthread; error = traverse_dataset(tosnap, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); if (error != 0) { da.da_err = error; } else { /* we set the da.da_err we return as side-effect */ (void) write_record(&da); } dsl_dataset_long_rele(tosnap, FTAG); dsl_dataset_rele(tosnap, FTAG); return (da.da_err); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 275810) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 275811) @@ -1,1829 +1,1829 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Needed to close a window in dnode_move() that allows the objset to be freed * before it can be safely accessed. 
*/ krwlock_t os_lock; void dmu_objset_init(void) { rw_init(&os_lock, NULL, RW_DEFAULT, NULL); } void dmu_objset_fini(void) { rw_destroy(&os_lock); } spa_t * dmu_objset_spa(objset_t *os) { return (os->os_spa); } zilog_t * dmu_objset_zil(objset_t *os) { return (os->os_zil); } dsl_pool_t * dmu_objset_pool(objset_t *os) { dsl_dataset_t *ds; if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) return (ds->ds_dir->dd_pool); else return (spa_get_dsl(os->os_spa)); } dsl_dataset_t * dmu_objset_ds(objset_t *os) { return (os->os_dsl_dataset); } dmu_objset_type_t dmu_objset_type(objset_t *os) { return (os->os_phys->os_type); } void dmu_objset_name(objset_t *os, char *buf) { dsl_dataset_name(os->os_dsl_dataset, buf); } uint64_t dmu_objset_id(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; return (ds ? ds->ds_object : 0); } zfs_sync_type_t dmu_objset_syncprop(objset_t *os) { return (os->os_sync); } zfs_logbias_op_t dmu_objset_logbias(objset_t *os) { return (os->os_logbias); } static void checksum_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); } static void compression_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval != ZIO_COMPRESS_INHERIT); os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); } static void copies_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval > 0); ASSERT(newval <= spa_max_replication(os->os_spa)); os->os_copies = newval; } static void dedup_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; spa_t *spa = os->os_spa; enum zio_checksum checksum; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); } static void primary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_primary_cache = newval; } static void secondary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_secondary_cache = newval; } static void sync_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || newval == ZFS_SYNC_DISABLED); os->os_sync = newval; if (os->os_zil) zil_set_sync(os->os_zil, newval); } static void redundant_metadata_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. 
*/ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || newval == ZFS_REDUNDANT_METADATA_MOST); os->os_redundant_metadata = newval; } static void logbias_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; ASSERT(newval == ZFS_LOGBIAS_LATENCY || newval == ZFS_LOGBIAS_THROUGHPUT); os->os_logbias = newval; if (os->os_zil) zil_set_logbias(os->os_zil, newval); } static void recordsize_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; os->os_recordsize = newval; } void dmu_objset_byteswap(void *buf, size_t size) { objset_phys_t *osp = buf; ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); dnode_byteswap(&osp->os_meta_dnode); byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); osp->os_type = BSWAP_64(osp->os_type); osp->os_flags = BSWAP_64(osp->os_flags); if (size == sizeof (objset_phys_t)) { dnode_byteswap(&osp->os_userused_dnode); dnode_byteswap(&osp->os_groupused_dnode); } } int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, objset_t **osp) { objset_t *os; int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; os->os_spa = spa; os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (DMU_OS_IS_L2CACHEABLE(os)) - aflags |= ARC_L2CACHE; + aflags |= ARC_FLAG_L2CACHE; if (DMU_OS_IS_L2COMPRESSIBLE(os)) - aflags |= ARC_L2COMPRESS; + aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err != 0) { kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { arc_buf_t *buf = arc_buf_alloc(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); (void) arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } os->os_phys = os->os_phys_buf->b_data; os->os_flags = os->os_phys->os_flags; } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; os->os_phys_buf = arc_buf_alloc(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); } /* * Note: the changed_cb will be called once before the register * func returns, thus changing the checksum/compression from the * default (fletcher2/off). Snapshots don't need to know about * checksum/compression/copies. 
*/ if (ds != NULL) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } if (!dsl_dataset_is_snapshot(ds)) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name( ZFS_PROP_REDUNDANT_METADATA), redundant_metadata_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), recordsize_changed_cb, os); } } if (err != 0) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); kmem_free(os, sizeof (objset_t)); return (err); } } else { /* It's the meta-objset. */ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_LZJB; os->os_copies = spa_max_replication(spa); os->os_dedup_checksum = ZIO_CHECKSUM_OFF; os->os_dedup_verify = B_FALSE; os->os_logbias = ZFS_LOGBIAS_LATENCY; os->os_sync = ZFS_SYNC_STANDARD; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; } if (ds == NULL || !dsl_dataset_is_snapshot(ds)) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); list_create(&os->os_free_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); } list_create(&os->os_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_link)); list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); DMU_META_DNODE(os) = dnode_special_open(os, &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { DMU_USERUSED_DNODE(os) = dnode_special_open(os, &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, &os->os_userused_dnode); DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); } *osp = os; return (0); } int dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { int err = 0; mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { objset_t *os; err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, dsl_dataset_get_blkptr(ds), &os); if (err == 0) { mutex_enter(&ds->ds_lock); ASSERT(ds->ds_objset == NULL); ds->ds_objset = os; mutex_exit(&ds->ds_lock); } } *osp = ds->ds_objset; mutex_exit(&ds->ds_opening_lock); return (err); } /* * Holds the pool while the objset is held. Therefore only one objset * can be held at a time. 
*/ int dmu_objset_hold(const char *name, void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; err = dsl_pool_hold(name, tag, &dp); if (err != 0) return (err); err = dsl_dataset_hold(dp, name, tag, &ds); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } err = dmu_objset_from_ds(ds, osp); if (err != 0) { dsl_dataset_rele(ds, tag); dsl_pool_rele(dp, tag); } return (err); } /* * dsl_pool must not be held when this is called. * Upon successful return, there will be a longhold on the dataset, * and the dsl_pool will not be held. */ int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_own(dp, name, tag, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } err = dmu_objset_from_ds(ds, osp); dsl_pool_rele(dp, FTAG); if (err != 0) { dsl_dataset_disown(ds, tag); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EINVAL)); } else if (!readonly && dsl_dataset_is_snapshot(ds)) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EROFS)); } return (err); } void dmu_objset_rele(objset_t *os, void *tag) { dsl_pool_t *dp = dmu_objset_pool(os); dsl_dataset_rele(os->os_dsl_dataset, tag); dsl_pool_rele(dp, tag); } /* * When we are called, os MUST refer to an objset associated with a dataset * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner * == tag. We will then release and reacquire ownership of the dataset while * holding the pool config_rwlock to avoid intervening namespace or ownership * changes may occur. * * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to * release the hold on its dataset and acquire a new one on the dataset of the * same name so that it can be partially torn down and reconstructed. */ void dmu_objset_refresh_ownership(objset_t *os, void *tag) { dsl_pool_t *dp; dsl_dataset_t *ds, *newds; char name[MAXNAMELEN]; ds = os->os_dsl_dataset; VERIFY3P(ds, !=, NULL); VERIFY3P(ds->ds_owner, ==, tag); VERIFY(dsl_dataset_long_held(ds)); dsl_dataset_name(ds, name); dp = dmu_objset_pool(os); dsl_pool_config_enter(dp, FTAG); dmu_objset_disown(os, tag); VERIFY0(dsl_dataset_own(dp, name, tag, &newds)); VERIFY3P(newds, ==, os->os_dsl_dataset); dsl_pool_config_exit(dp, FTAG); } void dmu_objset_disown(objset_t *os, void *tag) { dsl_dataset_disown(os->os_dsl_dataset, tag); } void dmu_objset_evict_dbufs(objset_t *os) { dnode_t *dn; mutex_enter(&os->os_lock); /* process the mdn last, since the other dnodes have holds on it */ list_remove(&os->os_dnodes, DMU_META_DNODE(os)); list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); /* * Find the first dnode with holds. We have to do this dance * because dnode_add_ref() only works if you already have a * hold. If there are no holds then it has no dbufs so OK to * skip. 
*/ for (dn = list_head(&os->os_dnodes); dn && !dnode_add_ref(dn, FTAG); dn = list_next(&os->os_dnodes, dn)) continue; while (dn) { dnode_t *next_dn = dn; do { next_dn = list_next(&os->os_dnodes, next_dn); } while (next_dn && !dnode_add_ref(next_dn, FTAG)); mutex_exit(&os->os_lock); dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); mutex_enter(&os->os_lock); dn = next_dn; } mutex_exit(&os->os_lock); } void dmu_objset_evict(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; for (int t = 0; t < TXG_SIZE; t++) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { if (!dsl_dataset_is_snapshot(ds)) { VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA), redundant_metadata_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), recordsize_changed_cb, os)); } VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os)); } if (os->os_sa) sa_tear_down(os); dmu_objset_evict_dbufs(os); dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); ASSERT3P(list_head(&os->os_dnodes), ==, NULL); VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* * This is a barrier to prevent the objset from going away in * dnode_move() until we can safely ensure that the objset is still in * use. We consider the objset valid before the barrier and invalid * after the barrier. */ rw_enter(&os_lock, RW_READER); rw_exit(&os_lock); mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); kmem_free(os, sizeof (objset_t)); } timestruc_t dmu_objset_snap_cmtime(objset_t *os) { return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } /* called from dsl for meta-objset */ objset_t * dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx) { objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); if (ds != NULL) VERIFY0(dmu_objset_from_ds(ds, &os)); else VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); /* * We don't want to have to increase the meta-dnode's nlevels * later, because then we could do it in quescing context while * we are also accessing it in open context. * * This precaution is not necessary for the MOS (ds == NULL), * because the MOS is only updated in syncing context. * This is most fortunate: the MOS is the only objset that * needs to be synced multiple times as spa_sync() iterates * to convergence, so minimizing its dn_nlevels matters. */ if (ds != NULL) { int levels = 1; /* * Determine the number of levels necessary for the meta-dnode * to contain DN_MAX_OBJECT dnodes. 
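As a rough, self-contained illustration of how the level calculation in the loop that follows converges, the sketch below mirrors the same shift-and-compare with placeholder constants; none of these values are taken from this change or claimed to be the kernel's actual tunables.

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		/* Placeholder values, for illustration only. */
		const uint64_t nblkptr = 3;	/* block pointers in the meta-dnode */
		const int datablkshift = 14;	/* 16 KB dnode blocks */
		const int epbs = 7;		/* log2(pointers per indirect block) */
		const uint64_t needed = (1ULL << 48) * 512;	/* objects * dnode size */
		int levels = 1;

		while ((nblkptr << (datablkshift + (levels - 1) * epbs)) < needed)
			levels++;
		(void) printf("levels = %d\n", levels);	/* 7 with these values */
		return (0);
	}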
*/ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < DN_MAX_OBJECT * sizeof (dnode_phys_t)) levels++; mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = mdn->dn_nlevels = levels; } ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); os->os_phys->os_type = type; if (dmu_objset_userused_enabled(os)) { os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; os->os_flags = os->os_phys->os_flags; } dsl_dataset_dirty(ds, tx); return (os); } typedef struct dmu_objset_create_arg { const char *doca_name; cred_t *doca_cred; void (*doca_userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *doca_userarg; dmu_objset_type_t doca_type; uint64_t doca_flags; } dmu_objset_create_arg_t; /*ARGSUSED*/ static int dmu_objset_create_check(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; int error; if (strchr(doca->doca_name, '@') != NULL) return (SET_ERROR(EINVAL)); error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); if (error != 0) return (error); if (tail == NULL) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred); dsl_dir_rele(pdd, FTAG); return (error); } static void dmu_objset_create_sync(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; dsl_dataset_t *ds; uint64_t obj; blkptr_t *bp; objset_t *os; VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, doca->doca_cred, tx); VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); bp = dsl_dataset_get_blkptr(ds); os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, ds, bp, doca->doca_type, tx); if (doca->doca_userfunc != NULL) { doca->doca_userfunc(os, doca->doca_userarg, doca->doca_cred, tx); } spa_history_log_internal_ds(ds, "create", tx, ""); dsl_dataset_rele(ds, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dmu_objset_create_arg_t doca; doca.doca_name = name; doca.doca_cred = CRED(); doca.doca_flags = flags; doca.doca_userfunc = func; doca.doca_userarg = arg; doca.doca_type = type; return (dsl_sync_task(name, dmu_objset_create_check, dmu_objset_create_sync, &doca, 5, ZFS_SPACE_CHECK_NORMAL)); } typedef struct dmu_objset_clone_arg { const char *doca_clone; const char *doca_origin; cred_t *doca_cred; } dmu_objset_clone_arg_t; /*ARGSUSED*/ static int dmu_objset_clone_check(void *arg, dmu_tx_t *tx) { dmu_objset_clone_arg_t *doca = arg; dsl_dir_t *pdd; const char *tail; int error; dsl_dataset_t *origin; dsl_pool_t *dp = dmu_tx_pool(tx); if (strchr(doca->doca_clone, '@') != NULL) return (SET_ERROR(EINVAL)); error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); if (error != 0) return (error); if (tail == NULL) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } /* You can't clone across pools. 
*/ if (pdd->dd_pool != dp) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EXDEV)); } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EDQUOT)); } dsl_dir_rele(pdd, FTAG); error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); if (error != 0) return (error); /* You can't clone across pools. */ if (origin->ds_dir->dd_pool != dp) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EXDEV)); } /* You can only clone snapshots, not the head datasets. */ if (!dsl_dataset_is_snapshot(origin)) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EINVAL)); } dsl_dataset_rele(origin, FTAG); return (0); } static void dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) { dmu_objset_clone_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; dsl_dataset_t *origin, *ds; uint64_t obj; char namebuf[MAXNAMELEN]; VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); obj = dsl_dataset_create_sync(pdd, tail, origin, 0, doca->doca_cred, tx); VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); dsl_dataset_name(origin, namebuf); spa_history_log_internal_ds(ds, "clone", tx, "origin=%s (%llu)", namebuf, origin->ds_object); dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(origin, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_clone(const char *clone, const char *origin) { dmu_objset_clone_arg_t doca; doca.doca_clone = clone; doca.doca_origin = origin; doca.doca_cred = CRED(); return (dsl_sync_task(clone, dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5, ZFS_SPACE_CHECK_NORMAL)); } int dmu_objset_snapshot_one(const char *fsname, const char *snapname) { int err; char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); nvlist_t *snaps = fnvlist_alloc(); fnvlist_add_boolean(snaps, longsnap); strfree(longsnap); err = dsl_dataset_snapshot(snaps, NULL, NULL); fnvlist_free(snaps); return (err); } static void dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) { dnode_t *dn; while (dn = list_head(list)) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ASSERT(dn->dn_dbuf->db_data_pending); /* * Initialize dn_zio outside dnode_sync() because the * meta-dnode needs to set it ouside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); list_remove(list, dn); if (newlist) { (void) dnode_add_ref(dn, newlist); list_insert_tail(newlist, dn); } dnode_sync(dn, tx); } } /* ARGSUSED */ static void dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3P(bp, ==, os->os_rootbp); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); ASSERT0(BP_GET_LEVEL(bp)); /* * Update rootbp fill count: it should be the number of objects * allocated in the object set (not counting the "special" * objects that are stored in the objset_phys_t -- the meta * dnode and user/group accounting objects). 
*/ bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); } /* ARGSUSED */ static void dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; objset_t *os = arg; if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } } /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; list_t *list; list_t *newlist = NULL; dbuf_dirty_record_t *dr; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); ASSERT(dmu_tx_is_syncing(tx)); /* XXX the write_done callback should really give us the tx... */ os->os_synctx = tx; if (os->os_dsl_dataset == NULL) { /* * This is the MOS. If we have upgraded, * spa_max_replication() could change, so reset * os_copies here. */ os->os_copies = spa_max_replication(os->os_spa); } /* * Create the root block IO */ SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); arc_release(os->os_phys_buf, &os->os_phys_buf); dmu_write_policy(os, NULL, 0, 0, &zp); zio = arc_write(pio, os->os_spa, tx->tx_txg, os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* * Sync special dnodes - the parent IO for the sync is the root block */ DMU_META_DNODE(os)->dn_zio = zio; dnode_sync(DMU_META_DNODE(os), tx); os->os_phys->os_flags = os->os_flags; if (DMU_USERUSED_DNODE(os) && DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { DMU_USERUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_USERUSED_DNODE(os), tx); DMU_GROUPUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_GROUPUSED_DNODE(os), tx); } txgoff = tx->tx_txg & TXG_MASK; if (dmu_objset_userused_enabled(os)) { newlist = &os->os_synced_dnodes; /* * We must create the list here because it uses the * dn_dirty_link[] of this txg. */ list_create(newlist, sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[txgoff])); } dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while (dr = list_head(list)) { ASSERT0(dr->dr_dbuf->db_level); list_remove(list, dr); if (dr->dr_zio) zio_nowait(dr->dr_zio); } /* * Free intent log blocks up to this tx. 
*/ zil_sync(os->os_zil, tx); os->os_phys->os_zil_header = os->os_zil_header; zio_nowait(zio); } boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg) { return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); } static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) { used_cbs[ost] = cb; } boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && used_cbs[os->os_phys->os_type] != NULL && DMU_USERUSED_DNODE(os) != NULL); } static void do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) { if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { int64_t delta = DNODE_SIZE + used; if (subtract) delta = -delta; VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, user, delta, tx)); VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, group, delta, tx)); } } void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) { dnode_t *dn; list_t *list = &os->os_synced_dnodes; ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); while (dn = list_head(list)) { int flags; ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED); /* Allocate the user/groupused objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { VERIFY(0 == zap_create_claim(os, DMU_USERUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); VERIFY(0 == zap_create_claim(os, DMU_GROUPUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } /* * We intentionally modify the zap object even if the * net delta is zero. Otherwise * the block of the zap obj could be shared between * datasets but need to be different between them after * a bprewrite. */ flags = dn->dn_id_flags; ASSERT(flags); if (flags & DN_ID_OLD_EXIST) { do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); } if (flags & DN_ID_NEW_EXIST) { do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid, B_FALSE, tx); } mutex_enter(&dn->dn_mtx); dn->dn_oldused = 0; dn->dn_oldflags = 0; if (dn->dn_id_flags & DN_ID_NEW_EXIST) { dn->dn_olduid = dn->dn_newuid; dn->dn_oldgid = dn->dn_newgid; dn->dn_id_flags |= DN_ID_OLD_EXIST; if (dn->dn_bonuslen == 0) dn->dn_id_flags |= DN_ID_CHKED_SPILL; else dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); mutex_exit(&dn->dn_mtx); list_remove(list, dn); dnode_rele(dn, list); } } /* * Returns a pointer to data to find uid/gid from * * If a dirty record for transaction group that is syncing can't * be found then NULL is returned. In the NULL case it is assumed * the uid/gid aren't changing. 
*/ static void * dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) { dbuf_dirty_record_t *dr, **drp; void *data; if (db->db_dirtycnt == 0) return (db->db.db_data); /* Nothing is changing */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg == tx->tx_txg) break; if (dr == NULL) { data = NULL; } else { dnode_t *dn; DB_DNODE_ENTER(dr->dr_dbuf); dn = DB_DNODE(dr->dr_dbuf); if (dn->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) data = dr->dt.dl.dr_data->b_data; else data = dr->dt.dl.dr_data; DB_DNODE_EXIT(dr->dr_dbuf); } return (data); } void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; void *data = NULL; dmu_buf_impl_t *db = NULL; uint64_t *user = NULL; uint64_t *group = NULL; int flags = dn->dn_id_flags; int error; boolean_t have_spill = B_FALSE; if (!dmu_objset_userused_enabled(dn->dn_objset)) return; if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| DN_ID_CHKED_SPILL))) return; if (before && dn->dn_bonuslen != 0) data = DN_BONUS(dn->dn_phys); else if (!before && dn->dn_bonuslen != 0) { if (dn->dn_bonus) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); data = dmu_objset_userquota_find_data(db, tx); } else { data = DN_BONUS(dn->dn_phys); } } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { int rf = 0; if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; error = dmu_spill_hold_by_dnode(dn, rf | DB_RF_MUST_SUCCEED, FTAG, (dmu_buf_t **)&db); ASSERT(error == 0); mutex_enter(&db->db_mtx); data = (before) ? db->db.db_data : dmu_objset_userquota_find_data(db, tx); have_spill = B_TRUE; } else { mutex_enter(&dn->dn_mtx); dn->dn_id_flags |= DN_ID_CHKED_BONUS; mutex_exit(&dn->dn_mtx); return; } if (before) { ASSERT(data); user = &dn->dn_olduid; group = &dn->dn_oldgid; } else if (data) { user = &dn->dn_newuid; group = &dn->dn_newgid; } /* * Must always call the callback in case the object * type has changed and that type isn't an object type to track */ error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, user, group); /* * Preserve existing uid/gid when the callback can't determine * what the new uid/gid are and the callback returned EEXIST. * The EEXIST error tells us to just use the existing uid/gid. * If we don't know what the old values are then just assign * them to 0, since that is a new file being created. */ if (!before && data == NULL && error == EEXIST) { if (flags & DN_ID_OLD_EXIST) { dn->dn_newuid = dn->dn_olduid; dn->dn_newgid = dn->dn_oldgid; } else { dn->dn_newuid = 0; dn->dn_newgid = 0; } error = 0; } if (db) mutex_exit(&db->db_mtx); mutex_enter(&dn->dn_mtx); if (error == 0 && before) dn->dn_id_flags |= DN_ID_OLD_EXIST; if (error == 0 && !before) dn->dn_id_flags |= DN_ID_NEW_EXIST; if (have_spill) { dn->dn_id_flags |= DN_ID_CHKED_SPILL; } else { dn->dn_id_flags |= DN_ID_CHKED_BONUS; } mutex_exit(&dn->dn_mtx); if (have_spill) dmu_buf_rele((dmu_buf_t *)db, FTAG); } boolean_t dmu_objset_userspace_present(objset_t *os) { return (os->os_phys->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); } int dmu_objset_userspace_upgrade(objset_t *os) { uint64_t obj; int err = 0; if (dmu_objset_userspace_present(os)) return (0); if (!dmu_objset_userused_enabled(os)) return (SET_ERROR(ENOTSUP)); if (dmu_objset_is_snapshot(os)) return (SET_ERROR(EINVAL)); /* * We simply need to mark every object dirty, so that it will be * synced out and now accounted. 
If this is called * concurrently, or if we already did some work before crashing, * that's fine, since we track each object's accounted state * independently. */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { dmu_tx_t *tx; dmu_buf_t *db; int objerr; if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); objerr = dmu_bonus_hold(os, obj, FTAG, &db); if (objerr != 0) continue; tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); if (objerr != 0) { dmu_tx_abort(tx); continue; } dmu_buf_will_dirty(db, tx); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; txg_wait_synced(dmu_objset_pool(os), 0); return (0); } void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, usedobjsp, availobjsp); } uint64_t dmu_objset_fsid_guid(objset_t *os) { return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); } void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) { stat->dds_type = os->os_phys->os_type; if (os->os_dsl_dataset) dsl_dataset_fast_stat(os->os_dsl_dataset, stat); } void dmu_objset_stats(objset_t *os, nvlist_t *nv) { ASSERT(os->os_dsl_dataset || os->os_phys->os_type == DMU_OST_META); if (os->os_dsl_dataset != NULL) dsl_dataset_stats(os->os_dsl_dataset, nv); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, os->os_phys->os_type); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, dmu_objset_userspace_present(os)); } int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); else return (B_FALSE); } int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, real, maxlen, conflict)); } int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; zap_cursor_t cursor; zap_attribute_t attr; ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (SET_ERROR(ENAMETOOLONG)); } (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; if (case_conflict) *case_conflict = attr.za_normalization_conflict; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp) { dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; zap_cursor_t cursor; zap_attribute_t attr; /* there is no next dir on a snapshot! 
*/ if (os->os_dsl_dataset->ds_object != dsl_dir_phys(dd)->dd_head_dataset_obj) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (SET_ERROR(ENAMETOOLONG)); } (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } /* * Find objsets under and including ddobj, call func(ds) on each. */ int dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) { dsl_dir_t *dd; dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; uint64_t thisobj; int err; ASSERT(dsl_pool_config_held(dp)); err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); if (err != 0) return (err); /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); return (0); } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); err = dmu_objset_find_dp(dp, attr->za_first_integer, func, arg, flags); if (err != 0) break; } zap_cursor_fini(&zc); if (err != 0) { dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } } /* * Iterate over all snapshots. */ if (flags & DS_FIND_SNAPSHOTS) { dsl_dataset_t *ds; err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); err = dsl_dataset_hold_obj(dp, attr->za_first_integer, FTAG, &ds); if (err != 0) break; err = func(dp, ds, arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; } zap_cursor_fini(&zc); } } dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); if (err != 0) return (err); /* * Apply to self. */ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err != 0) return (err); err = func(dp, ds, arg); dsl_dataset_rele(ds, FTAG); return (err); } /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. * The dp_config_rwlock must not be held when this is called, and it * will not be held when the callback is called. * Therefore this function should only be used when the pool is not changing * (e.g. in syncing context), or the callback can deal with the possible races. */ static int dmu_objset_find_impl(spa_t *spa, const char *name, int func(const char *, void *), void *arg, int flags) { dsl_dir_t *dd; dsl_pool_t *dp = spa_get_dsl(spa); dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; char *child; uint64_t thisobj; int err; dsl_pool_config_enter(dp, FTAG); err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); if (err != 0) { dsl_pool_config_exit(dp, FTAG); return (err); } /* Don't visit hidden ($MOS & $ORIGIN) objsets. 
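The name-based traversal described in the comment above (call 'func(child_name, arg)' for each objset under a name) is clearest from the caller's side. The sketch below is hypothetical and not part of this change: it shows a callback handed to the public wrapper dmu_objset_find(), defined further down, with the callback's name and behavior invented for illustration.

	/*
	 * Hypothetical callback: counts every name visited.  Returning
	 * non-zero would stop the traversal and be returned to the caller.
	 */
	static int
	example_count_cb(const char *name, void *arg)
	{
		uint64_t *countp = arg;

		(*countp)++;
		return (0);
	}

	static int
	example_count_datasets(char *name, uint64_t *countp)
	{
		*countp = 0;
		/* Recurse into child filesystems and include their snapshots. */
		return (dmu_objset_find(name, example_count_cb, countp,
		    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS));
	}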
*/ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); dsl_pool_config_exit(dp, FTAG); return (0); } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s/%s", name, attr->za_name); dsl_pool_config_exit(dp, FTAG); err = dmu_objset_find_impl(spa, child, func, arg, flags); dsl_pool_config_enter(dp, FTAG); strfree(child); if (err != 0) break; } zap_cursor_fini(&zc); if (err != 0) { dsl_dir_rele(dd, FTAG); dsl_pool_config_exit(dp, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } } /* * Iterate over all snapshots. */ if (flags & DS_FIND_SNAPSHOTS) { err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s@%s", name, attr->za_name); dsl_pool_config_exit(dp, FTAG); err = func(child, arg); dsl_pool_config_enter(dp, FTAG); strfree(child); if (err != 0) break; } zap_cursor_fini(&zc); } } dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); dsl_pool_config_exit(dp, FTAG); if (err != 0) return (err); /* Apply to self. */ return (func(name, arg)); } /* * See comment above dmu_objset_find_impl(). */ int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags) { spa_t *spa; int error; error = spa_open(name, &spa, FTAG); if (error != 0) return (error); error = dmu_objset_find_impl(spa, name, func, arg, flags); spa_close(spa, FTAG); return (error); } void dmu_objset_set_user(objset_t *os, void *user_ptr) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); os->os_user_ptr = user_ptr; } void * dmu_objset_get_user(objset_t *os) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); return (os->os_user_ptr); } /* * Determine name of filesystem, given name of snapshot. * buf must be at least MAXNAMELEN bytes */ int dmu_fsname(const char *snapname, char *buf) { char *atp = strchr(snapname, '@'); if (atp == NULL) return (SET_ERROR(EINVAL)); if (atp - snapname >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(buf, snapname, atp - snapname + 1); return (0); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c (revision 275810) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c (revision 275811) @@ -1,2225 +1,2225 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska . All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __FreeBSD__ #undef dump_write #define dump_write dmu_dump_write #endif /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; static char *dmu_recv_tag = "dmu_recv_tag"; static const char *recv_clone_name = "%recv"; static int dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) { dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; struct uio auio; struct iovec aiov; ASSERT0(len % 8); fletcher_4_incremental_native(buf, len, &dsp->dsa_zc); aiov.iov_base = buf; aiov.iov_len = len; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_offset = (off_t)-1; auio.uio_td = dsp->dsa_td; #ifdef _KERNEL if (dsp->dsa_fp->f_type == DTYPE_VNODE) bwillwrite(); dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, dsp->dsa_td); #else fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); dsp->dsa_err = EOPNOTSUPP; #endif mutex_enter(&ds->ds_sendstream_lock); *dsp->dsa_off += len; mutex_exit(&ds->ds_sendstream_lock); return (dsp->dsa_err); } static int dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, uint64_t length) { struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); /* * When we receive a free record, dbuf_free_range() assumes * that the receiving system doesn't have any dbufs in the range * being freed. This is always true because there is a one-record * constraint: we only send one WRITE record for any given * object+offset. We know that the one-record constraint is * true because we always send data in increasing order by * object,offset. * * If the increasing-order constraint ever changes, we should find * another way to assert that the one-record constraint is still * satisfied. */ ASSERT(object > dsp->dsa_last_data_object || (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); /* * If we are doing a non-incremental send, then there can't * be any data in the dataset we're receiving into. Therefore * a free record would simply be a no-op. Save space by not * sending it to begin with. */ if (!dsp->dsa_incremental) return (0); if (length != -1ULL && offset + length < offset) length = -1ULL; /* * If there is a pending op, but it's not PENDING_FREE, push it out, * since free block aggregation can only be done for blocks of the * same type (i.e., DRR_FREE records can only be aggregated with * other DRR_FREE records. DRR_FREEOBJECTS records can only be * aggregated with other DRR_FREEOBJECTS records. 
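An editorial illustration of the aggregation rule just described, with an invented call sequence (the object number and lengths are arbitrary):

	dump_free(dsp, 5, 0, 131072)        becomes the pending DRR_FREE
	dump_free(dsp, 5, 131072, 131072)   drr_offset + drr_length matches the
	                                    new offset, so drr_length grows to
	                                    262144 and no record is emitted yet
	dump_write(dsp, ...)                a different record type is next, so
	                                    the merged DRR_FREE is pushed to the
	                                    stream before the DRR_WRITE record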
*/ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREE) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } if (dsp->dsa_pending_op == PENDING_FREE) { /* * There should never be a PENDING_FREE if length is -1 * (because dump_dnode is the only place where this * function is called with a -1, and only after flushing * any pending record). */ ASSERT(length != -1ULL); /* * Check to see whether this free block can be aggregated * with pending one. */ if (drrf->drr_object == object && drrf->drr_offset + drrf->drr_length == offset) { drrf->drr_length += length; return (0); } else { /* not a continuation. Push out pending record */ if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } } /* create a FREE record and make it pending */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; drrf->drr_length = length; drrf->drr_toguid = dsp->dsa_toguid; if (length == -1ULL) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); } else { dsp->dsa_pending_op = PENDING_FREE; } return (0); } static int dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) { struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); /* * We send data in increasing object, offset order. * See comment in dump_free() for details. */ ASSERT(object > dsp->dsa_last_data_object || (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); dsp->dsa_last_data_object = object; dsp->dsa_last_data_offset = offset + blksz - 1; /* * If there is any kind of pending aggregation (currently either * a grouping of free objects or free blocks), push it out to * the stream, since aggregation can't be done across operations * of different types. */ if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write a DATA record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; if (bp == NULL || BP_IS_EMBEDDED(bp)) { /* * There's no pre-computed checksum for partial-block * writes or embedded BP's, so (like * fletcher4-checkummed blocks) userland will have to * compute a dedup-capable checksum itself. 
*/ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; } else { drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); drrw->drr_key.ddk_cksum = bp->blk_cksum; } if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); if (dump_bytes(dsp, data, blksz) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp) { char buf[BPE_PAYLOAD_SIZE]; struct drr_write_embedded *drrw = &(dsp->dsa_drr->drr_u.drr_write_embedded); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); dsp->dsa_pending_op = PENDING_NONE; } ASSERT(BP_IS_EMBEDDED(bp)); bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; drrw->drr_object = object; drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; drrw->drr_compression = BP_GET_COMPRESS(bp); drrw->drr_etype = BPE_GET_ETYPE(bp); drrw->drr_lsize = BPE_GET_LSIZE(bp); drrw->drr_psize = BPE_GET_PSIZE(bp); decode_embedded_bp_compressed(bp, buf); if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) return (EINTR); return (0); } static int dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) { struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write a SPILL record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; drrs->drr_toguid = dsp->dsa_toguid; if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) return (SET_ERROR(EINTR)); if (dump_bytes(dsp, data, blksz)) return (SET_ERROR(EINTR)); return (0); } static int dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); /* See comment in dump_free(). */ if (!dsp->dsa_incremental) return (0); /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for * blocks of the same type (i.e., DRR_FREE records can only be * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records * can only be aggregated with other DRR_FREEOBJECTS records. */ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREEOBJECTS) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { /* * See whether this free object array can be aggregated * with pending one */ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { drrfo->drr_numobjs += numobjs; return (0); } else { /* can't be aggregated. 
Push out pending record */ if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } } /* write a FREEOBJECTS record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; drrfo->drr_toguid = dsp->dsa_toguid; dsp->dsa_pending_op = PENDING_FREEOBJECTS; return (0); } static int dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) { struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(dsp, object, 1)); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write an OBJECT record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; drro->drr_bonustype = dnp->dn_bonustype; drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; drro->drr_bonuslen = dnp->dn_bonuslen; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; drro->drr_toguid = dsp->dsa_toguid; if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) return (SET_ERROR(EINTR)); /* Free anything past the end of the file. */ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) return (SET_ERROR(EINTR)); if (dsp->dsa_err != 0) return (SET_ERROR(EINTR)); return (0); } static boolean_t backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) { if (!BP_IS_EMBEDDED(bp)) return (B_FALSE); /* * Compression function must be legacy, or explicitly enabled. */ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) return (B_FALSE); /* * Embed type must be explicitly enabled. */ switch (BPE_GET_ETYPE(bp)) { case BP_EMBEDDED_TYPE_DATA: if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) return (B_TRUE); break; default: return (B_FALSE); } return (B_FALSE); } #define BP_SPAN(dnp, level) \ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) /* ARGSUSED */ static int backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { dmu_sendarg_t *dsp = arg; dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); } else if (zb->zb_level == ZB_ZIL_LEVEL) { /* * If we are sending a non-snapshot (which is allowed on * read-only pools), it may have a ZIL, which must be ignored. 
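To put numbers on the BP_SPAN() macro defined above: assuming, purely for illustration, 512-byte sectors, a 128 KB data block (dn_datablkszsec of 256), and 1024 block pointers per indirect block, the span grows by a factor of 1024 per level:

	level 0:  256 sectors * 512 bytes              = 128 KB
	level 1:  128 KB * 1024 pointers per indirect  = 128 MB
	level 2:  128 MB * 1024 pointers per indirect  = 128 GB

backup_cb() uses this span to turn a hole at a given level and blkid into the byte (or object) range that the corresponding DRR_FREE or DRR_FREEOBJECTS record must cover.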
*/ return (0); } else if (BP_IS_HOLE(bp) && zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); } else if (BP_IS_HOLE(bp)) { uint64_t span = BP_SPAN(dnp, zb->zb_level); err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { dnode_phys_t *blk; int i; int blksz = BP_GET_LSIZE(bp); - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (SET_ERROR(EIO)); blk = abuf->b_data; for (i = 0; i < blksz >> DNODE_SHIFT; i++) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; err = dump_dnode(dsp, dnobj, blk+i); if (err != 0) break; } (void) arc_buf_remove_ref(abuf, &abuf); } else if (type == DMU_OT_SA) { - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (SET_ERROR(EIO)); err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); } else if (backup_do_embed(dsp, bp)) { /* it's an embedded level-0 block of a regular object */ int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; err = dump_write_embedded(dsp, zb->zb_object, zb->zb_blkid * blksz, blksz, bp); } else { /* it's a level-0 block of a regular object */ - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); uint64_t offset; ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { if (zfs_send_corrupt_data) { /* Send a block filled with 0x"zfs badd bloc" */ abuf = arc_buf_alloc(spa, blksz, &abuf, ARC_BUFC_DATA); uint64_t *ptr; for (ptr = abuf->b_data; (char *)ptr < (char *)abuf->b_data + blksz; ptr++) *ptr = 0x2f5baddb10c; } else { return (SET_ERROR(EIO)); } } offset = zb->zb_blkid * blksz; if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && blksz > SPA_OLD_MAXBLOCKSIZE) { char *buf = abuf->b_data; while (blksz > 0 && err == 0) { int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); err = dump_write(dsp, type, zb->zb_object, offset, n, NULL, buf); offset += n; buf += n; blksz -= n; } } else { err = dump_write(dsp, type, zb->zb_object, offset, blksz, bp, abuf->b_data); } (void) arc_buf_remove_ref(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); return (err); } /* * Releases dp using the specified tag. 
*/ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, #ifdef illumos boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) #else boolean_t large_block_ok, int outfd, struct file *fp, offset_t *off) #endif { objset_t *os; dmu_replay_record_t *drr; dmu_sendarg_t *dsp; int err; uint64_t fromtxg = 0; uint64_t featureflags = 0; err = dmu_objset_from_ds(ds, &os); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, DMU_SUBSTREAM); #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); dsl_pool_rele(dp, tag); return (SET_ERROR(EINVAL)); } if (version >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } } #endif if (large_block_ok && ds->ds_large_blocks) featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; if (embedok && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; } else { embedok = B_FALSE; } DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, featureflags); drr->drr_u.drr_begin.drr_creation_time = dsl_dataset_phys(ds)->ds_creation_time; drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); if (is_clone) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid; if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; if (fromzb != NULL) { drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; fromtxg = fromzb->zbm_creation_txg; } dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); if (!dsl_dataset_is_snapshot(ds)) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); dsp->dsa_drr = drr; dsp->dsa_outfd = outfd; dsp->dsa_proc = curproc; dsp->dsa_td = curthread; dsp->dsa_fp = fp; dsp->dsa_os = os; dsp->dsa_off = off; dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_incremental = (fromzb != NULL); dsp->dsa_featureflags = featureflags; mutex_enter(&ds->ds_sendstream_lock); list_insert_head(&ds->ds_sendstreams, dsp); mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, tag); if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { err = dsp->dsa_err; goto out; } err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, backup_cb, dsp); if (dsp->dsa_pending_op != PENDING_NONE) if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) err = SET_ERROR(EINTR); if (err != 0) { if (err == EINTR && dsp->dsa_err != 0) err = dsp->dsa_err; goto out; } bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { err = dsp->dsa_err; goto out; } out: mutex_enter(&ds->ds_sendstream_lock); list_remove(&ds->ds_sendstreams, dsp); mutex_exit(&ds->ds_sendstream_lock); kmem_free(drr, sizeof 
(dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); dsl_dataset_long_rele(ds, FTAG); return (err); } int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, #ifdef illumos int outfd, vnode_t *vp, offset_t *off) #else int outfd, struct file *fp, offset_t *off) #endif { dsl_pool_t *dp; dsl_dataset_t *ds; dsl_dataset_t *fromds = NULL; int err; err = dsl_pool_hold(pool, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (fromsnap != 0) { zfs_bookmark_phys_t zb; boolean_t is_clone; err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); if (err != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (err); } if (!dsl_dataset_is_before(ds, fromds, 0)) err = SET_ERROR(EXDEV); zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, large_block_ok, outfd, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, large_block_ok, outfd, fp, off); } dsl_dataset_rele(ds, FTAG); return (err); } int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, boolean_t large_block_ok, #ifdef illumos int outfd, vnode_t *vp, offset_t *off) #else int outfd, struct file *fp, offset_t *off) #endif { dsl_pool_t *dp; dsl_dataset_t *ds; int err; boolean_t owned = B_FALSE; if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) return (SET_ERROR(EINVAL)); err = dsl_pool_hold(tosnap, FTAG, &dp); if (err != 0) return (err); if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { /* * We are sending a filesystem or volume. Ensure * that it doesn't change by owning the dataset. */ err = dsl_dataset_own(dp, tosnap, FTAG, &ds); owned = B_TRUE; } else { err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); } if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (fromsnap != NULL) { zfs_bookmark_phys_t zb; boolean_t is_clone = B_FALSE; int fsnamelen = strchr(tosnap, '@') - tosnap; /* * If the fromsnap is in a different filesystem, then * mark the send stream as a clone. 
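A few hypothetical name pairs make the clone and bookmark handling below easier to follow (all names are invented):

	tosnap = tank/fs@today   fromsnap = tank/fs@yesterday
	    same filesystem prefix with '@' at the boundary: an ordinary
	    incremental; is_clone stays B_FALSE
	tosnap = tank/fs@today   fromsnap = tank/other@base
	    the prefix differs, so the stream is marked as a clone send
	tosnap = tank/fs@today   fromsnap = tank/fs#keep
	    no '@' in fromsnap, so dsl_bookmark_lookup() resolves the
	    bookmark instead of holding a from-snapshot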
*/ if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || (fromsnap[fsnamelen] != '@' && fromsnap[fsnamelen] != '#')) { is_clone = B_TRUE; } if (strchr(fromsnap, '@')) { dsl_dataset_t *fromds; err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); if (err == 0) { if (!dsl_dataset_is_before(ds, fromds, 0)) err = SET_ERROR(EXDEV); zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (ds->ds_dir != fromds->ds_dir); dsl_dataset_rele(fromds, FTAG); } } else { err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); } if (err != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, large_block_ok, outfd, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, large_block_ok, outfd, fp, off); } if (owned) dsl_dataset_disown(ds, FTAG); else dsl_dataset_rele(ds, FTAG); return (err); } int dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) { dsl_pool_t *dp = ds->ds_dir->dd_pool; int err; uint64_t size; ASSERT(dsl_pool_config_held(dp)); /* tosnap must be a snapshot */ if (!dsl_dataset_is_snapshot(ds)) return (SET_ERROR(EINVAL)); /* * fromsnap must be an earlier snapshot from the same fs as tosnap, * or the origin's fs. */ if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) return (SET_ERROR(EXDEV)); /* Get uncompressed size estimate of changed data. */ if (fromds == NULL) { size = dsl_dataset_phys(ds)->ds_uncompressed_bytes; } else { uint64_t used, comp; err = dsl_dataset_space_written(fromds, ds, &used, &comp, &size); if (err != 0) return (err); } /* * Assume that space (both on-disk and in-stream) is dominated by * data. We will adjust for indirect blocks and the copies property, * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). */ /* * Subtract out approximate space used by indirect blocks. * Assume most space is used by data blocks (non-indirect, non-dnode). * Assume all blocks are recordsize. Assume ditto blocks and * internal fragmentation counter out compression. * * Therefore, space used by indirect blocks is sizeof(blkptr_t) per * block, which we observe in practice. */ uint64_t recordsize; err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); if (err != 0) return (err); size -= size / recordsize * sizeof (blkptr_t); /* Add in the space for the record associated with each block. */ size += size / recordsize * sizeof (dmu_replay_record_t); *sizep = size; return (0); } typedef struct dmu_recv_begin_arg { const char *drba_origin; dmu_recv_cookie_t *drba_cookie; cred_t *drba_cred; uint64_t drba_snapobj; } dmu_recv_begin_arg_t; static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid) { uint64_t val; int error; dsl_pool_t *dp = ds->ds_dir->dd_pool; /* temporary clone name must not exist */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EBUSY : error); /* new snapshot name must not exist */ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EEXIST : error); /* * Check snapshot limit before receiving. We'll recheck again at the * end, but might as well abort before receiving if we're already over * the limit. 
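A rough numeric instance of the dmu_send_estimate() arithmetic above, with illustrative figures only (a 128 KB recordsize, a 128-byte blkptr_t, and 1 GiB of changed data; the exact sizeof (dmu_replay_record_t) is left symbolic):

	changed data                   1 GiB, i.e. 8192 records of 128 KiB
	indirect-block correction    - 8192 * 128 bytes = 1 MiB
	per-record stream overhead   + 8192 * sizeof (dmu_replay_record_t)

As long as the replay record is a few hundred bytes, both adjustments are well under one percent of the data itself, so the returned estimate stays close to the uncompressed byte count of the changed data.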
* * Note that we do not check the file system limit with * dsl_dir_fscount_check because the temporary %clones don't count * against that limit. */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); if (error != 0) return (error); if (fromguid != 0) { dsl_dataset_t *snap; uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Find snapshot in this dir that matches fromguid. */ while (obj != 0) { error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) return (SET_ERROR(ENODEV)); if (snap->ds_dir != ds->ds_dir) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ENODEV)); } if (dsl_dataset_phys(snap)->ds_guid == fromguid) break; obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); } if (obj == 0) return (SET_ERROR(ENODEV)); if (drba->drba_cookie->drc_force) { drba->drba_snapobj = obj; } else { /* * If we are not forcing, there must be no * changes since fromsnap. */ if (dsl_dataset_modified_since_snap(ds, snap)) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ETXTBSY)); } drba->drba_snapobj = ds->ds_prev->ds_object; } dsl_dataset_rele(snap, FTAG); } else { /* if full, most recent snapshot must be $ORIGIN */ if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= TXG_INITIAL) return (SET_ERROR(ENODEV)); drba->drba_snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } return (0); } static int dmu_recv_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; int error; uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES || ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) return (SET_ERROR(EINVAL)); /* Verify pool version supports SA if SA_SPILL feature set */ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && spa_version(dp->dp_spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED * record to a plan WRITE record, so the pool must have the * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED * records. Same with WRITE_EMBEDDED records that use LZ4 compression. */ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); /* * The receiving code doesn't know how to translate large blocks * to smaller ones, so the pool must have the LARGE_BLOCKS * feature enabled if the stream has LARGE_BLOCKS. 
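/*
 * dmu_recv_begin_check() admits a stream only when every feature it carries
 * (SA_SPILL, EMBED_DATA, EMBED_DATA_LZ4, LARGE_BLOCKS) is supported by the
 * receiving pool, via the individual checks above and just below.  A compact
 * way to express the same "stream features must be a subset of pool
 * features" requirement, using hypothetical bitmasks rather than the
 * separate spa_feature_is_enabled() calls:
 */
#include <stdint.h>

static int
stream_features_ok(uint64_t stream_features, uint64_t pool_features)
{
	/* Refuse the stream if it needs anything the pool lacks. */
	return ((stream_features & ~pool_features) == 0);
}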
*/ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ENOTSUP)); error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { /* target fs already exists; recv into temp clone */ /* Can't recv a clone into an existing fs */ if (flags & DRR_FLAG_CLONE) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } error = recv_begin_check_existing_impl(drba, ds, fromguid); dsl_dataset_rele(ds, FTAG); } else if (error == ENOENT) { /* target fs does not exist; must be a full backup or clone */ char buf[MAXNAMELEN]; /* * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. */ if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) return (SET_ERROR(ENOENT)); /* Open the parent of tofs */ ASSERT3U(strlen(tofs), <, MAXNAMELEN); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); error = dsl_dataset_hold(dp, buf, FTAG, &ds); if (error != 0) return (error); /* * Check filesystem and snapshot limits before receiving. We'll * recheck snapshot limits again at the end (we create the * filesystems and increment those counts during begin_sync). */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (drba->drba_origin != NULL) { dsl_dataset_t *origin; error = dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (!dsl_dataset_is_snapshot(origin)) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } if (dsl_dataset_phys(origin)->ds_guid != fromguid) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENODEV)); } dsl_dataset_rele(origin, FTAG); } dsl_dataset_rele(ds, FTAG); error = 0; } return (error); } static void dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; const char *tofs = drba->drba_cookie->drc_tofs; dsl_dataset_t *ds, *newds; uint64_t dsobj; int error; uint64_t crflags; crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? DS_FLAG_CI_DATASET : 0; error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { /* create temporary clone */ dsl_dataset_t *snap = NULL; if (drba->drba_snapobj != 0) { VERIFY0(dsl_dataset_hold_obj(dp, drba->drba_snapobj, FTAG, &snap)); } dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, snap, crflags, drba->drba_cred, tx); dsl_dataset_rele(snap, FTAG); dsl_dataset_rele(ds, FTAG); } else { dsl_dir_t *dd; const char *tail; dsl_dataset_t *origin = NULL; VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); if (drba->drba_origin != NULL) { VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin)); } /* Create new dataset. 
*/ dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1, origin, crflags, drba->drba_cred, tx); if (origin != NULL) dsl_dataset_rele(origin, FTAG); dsl_dir_rele(dd, FTAG); drba->drba_cookie->drc_newfs = B_TRUE; } VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && !newds->ds_large_blocks) { dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); newds->ds_large_blocks = B_TRUE; } dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; /* * If we actually created a non-clone, we need to create the * objset in our new dataset. */ if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } drba->drba_cookie->drc_ds = newds; spa_history_log_internal_ds(newds, "receive", tx, ""); } /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, boolean_t force, char *origin, dmu_recv_cookie_t *drc) { dmu_recv_begin_arg_t drba = { 0 }; dmu_replay_record_t *drr; bzero(drc, sizeof (dmu_recv_cookie_t)); drc->drc_drrb = drrb; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; drc->drc_cred = CRED(); if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) drc->drc_byteswap = B_TRUE; else if (drrb->drr_magic != DMU_BACKUP_MAGIC) return (SET_ERROR(EINVAL)); drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin = *drc->drc_drrb; if (drc->drc_byteswap) { fletcher_4_incremental_byteswap(drr, sizeof (dmu_replay_record_t), &drc->drc_cksum); } else { fletcher_4_incremental_native(drr, sizeof (dmu_replay_record_t), &drc->drc_cksum); } kmem_free(drr, sizeof (dmu_replay_record_t)); if (drc->drc_byteswap) { drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL)); } struct restorearg { int err; boolean_t byteswap; kthread_t *td; struct file *fp; char *buf; uint64_t voff; int bufsize; /* amount of memory allocated for buf */ zio_cksum_t cksum; avl_tree_t *guid_to_ds_map; }; typedef struct guid_map_entry { uint64_t guid; dsl_dataset_t *gme_ds; avl_node_t avlnode; } guid_map_entry_t; static int guid_compare(const void *arg1, const void *arg2) { const guid_map_entry_t *gmep1 = arg1; const guid_map_entry_t *gmep2 = arg2; if (gmep1->guid < gmep2->guid) return (-1); else if (gmep1->guid > gmep2->guid) return (1); return (0); } static void free_guid_map_onexit(void *arg) { avl_tree_t *ca = arg; void *cookie = NULL; guid_map_entry_t *gmep; while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { dsl_dataset_long_rele(gmep->gme_ds, gmep); dsl_dataset_rele(gmep->gme_ds, gmep); kmem_free(gmep, sizeof (guid_map_entry_t)); } avl_destroy(ca); kmem_free(ca, sizeof (avl_tree_t)); } static int restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid) { struct uio auio; struct iovec aiov; int error; aiov.iov_base = buf; aiov.iov_len = len; 
auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_offset = off; auio.uio_td = ra->td; #ifdef _KERNEL error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); #else fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); error = EOPNOTSUPP; #endif *resid = auio.uio_resid; return (error); } static void * restore_read(struct restorearg *ra, int len, char *buf) { int done = 0; if (buf == NULL) buf = ra->buf; /* some things will require 8-byte alignment, so everything must */ ASSERT0(len % 8); ASSERT3U(len, <=, ra->bufsize); while (done < len) { ssize_t resid; ra->err = restore_bytes(ra, buf + done, len - done, ra->voff, &resid); if (resid == len - done) ra->err = SET_ERROR(EINVAL); ra->voff += len - done - resid; done = len - resid; if (ra->err != 0) return (NULL); } ASSERT3U(done, ==, len); if (ra->byteswap) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); else fletcher_4_incremental_native(buf, len, &ra->cksum); return (buf); } static void backup_byteswap(dmu_replay_record_t *drr) { #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); DO64(drr_begin.drr_versioninfo); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); DO32(drr_begin.drr_flags); DO64(drr_begin.drr_toguid); DO64(drr_begin.drr_fromguid); break; case DRR_OBJECT: DO64(drr_object.drr_object); DO32(drr_object.drr_type); DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); DO64(drr_object.drr_toguid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); DO64(drr_freeobjects.drr_numobjs); DO64(drr_freeobjects.drr_toguid); break; case DRR_WRITE: DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); DO64(drr_write.drr_length); DO64(drr_write.drr_toguid); DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); DO64(drr_write.drr_key.ddk_prop); break; case DRR_WRITE_BYREF: DO64(drr_write_byref.drr_object); DO64(drr_write_byref.drr_offset); DO64(drr_write_byref.drr_length); DO64(drr_write_byref.drr_toguid); DO64(drr_write_byref.drr_refguid); DO64(drr_write_byref.drr_refobject); DO64(drr_write_byref.drr_refoffset); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); DO64(drr_write_byref.drr_key.ddk_prop); break; case DRR_WRITE_EMBEDDED: DO64(drr_write_embedded.drr_object); DO64(drr_write_embedded.drr_offset); DO64(drr_write_embedded.drr_length); DO64(drr_write_embedded.drr_toguid); DO32(drr_write_embedded.drr_lsize); DO32(drr_write_embedded.drr_psize); break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); DO64(drr_free.drr_toguid); break; case DRR_SPILL: DO64(drr_spill.drr_object); DO64(drr_spill.drr_length); DO64(drr_spill.drr_toguid); break; case DRR_END: DO64(drr_end.drr_checksum.zc_word[0]); DO64(drr_end.drr_checksum.zc_word[1]); DO64(drr_end.drr_checksum.zc_word[2]); DO64(drr_end.drr_checksum.zc_word[3]); DO64(drr_end.drr_toguid); break; } #undef DO64 #undef DO32 } static inline uint8_t 
deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) { if (bonus_type == DMU_OT_SA) { return (1); } else { return (1 + ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT)); } } static int restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) { dmu_object_info_t doi; dmu_tx_t *tx; void *data = NULL; uint64_t object; int err; if (drro->drr_type == DMU_OT_NONE || !DMU_OT_IS_VALID(drro->drr_type) || !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) || drro->drr_bonuslen > DN_MAX_BONUSLEN) { return (SET_ERROR(EINVAL)); } err = dmu_object_info(os, drro->drr_object, &doi); if (err != 0 && err != ENOENT) return (SET_ERROR(EINVAL)); object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; if (drro->drr_bonuslen) { data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8), NULL); if (ra->err != 0) return (ra->err); } /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file * contents before we can change this type of metadata in the dnode. */ if (err == 0) { int nblkptr; nblkptr = deduce_nblkptr(drro->drr_bonustype, drro->drr_bonuslen); if (drro->drr_blksz != doi.doi_data_block_size || nblkptr < doi.doi_nblkptr) { err = dmu_free_long_range(os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } } tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } if (object == DMU_NEW_OBJECT) { /* currently free, want to be allocated */ err = dmu_object_claim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } else if (drro->drr_type != doi.doi_type || drro->drr_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* currently allocated, but with different properties */ err = dmu_object_reclaim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } if (err != 0) { dmu_tx_commit(tx); return (SET_ERROR(EINVAL)); } dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, tx); dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); if (data != NULL) { dmu_buf_t *db; VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); if (ra->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); dmu_ot_byteswap[byteswap].ob_func(db->db_data, drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); return (0); } /* ARGSUSED */ static int restore_freeobjects(struct restorearg *ra, objset_t *os, struct drr_freeobjects *drrfo) { uint64_t obj; if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) return (SET_ERROR(EINVAL)); for (obj = drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs; (void) dmu_object_next(os, &obj, FALSE, 0)) { int err; if (dmu_object_info(os, obj, NULL) != 0) continue; err = dmu_free_long_object(os, obj); if (err != 0) return (err); } return (0); } static int restore_write(struct restorearg *ra, objset_t *os, struct drr_write *drrw) { dmu_tx_t *tx; void *data; int err; if 
(drrw->drr_offset + drrw->drr_length < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); if (dmu_object_info(os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); dmu_buf_t *bonus; if (dmu_bonus_hold(os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); arc_buf_t *abuf = dmu_request_arcbuf(bonus, drrw->drr_length); data = restore_read(ra, drrw->drr_length, abuf->b_data); if (data == NULL) { dmu_return_arcbuf(abuf); dmu_buf_rele(bonus, FTAG); return (ra->err); } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_return_arcbuf(abuf); dmu_buf_rele(bonus, FTAG); dmu_tx_abort(tx); return (err); } if (ra->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); } dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); return (0); } /* * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed * streams to refer to a copy of the data that is already on the * system because it came in earlier in the stream. This function * finds the earlier copy of the data, and uses that copy instead of * data from the stream to fulfill this write. */ static int restore_write_byref(struct restorearg *ra, objset_t *os, struct drr_write_byref *drrwbr) { dmu_tx_t *tx; int err; guid_map_entry_t gmesrch; guid_map_entry_t *gmep; avl_index_t where; objset_t *ref_os = NULL; dmu_buf_t *dbp; if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) return (SET_ERROR(EINVAL)); /* * If the GUID of the referenced dataset is different from the * GUID of the target dataset, find the referenced dataset. 
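/*
 * A userland sketch of the guid-to-dataset lookup that restore_write_byref()
 * performs just below with an AVL tree.  The ordering matches guid_compare();
 * the structure and helper names are illustrative, and the map is assumed to
 * be kept sorted so bsearch() can stand in for avl_find().
 */
#include <stdint.h>
#include <stdlib.h>

struct guid_map_ent {
	uint64_t	guid;
	void		*ds;		/* stand-in for the held dataset */
};

static int
guid_cmp(const void *a, const void *b)
{
	const struct guid_map_ent *ga = a;
	const struct guid_map_ent *gb = b;

	if (ga->guid < gb->guid)
		return (-1);
	if (ga->guid > gb->guid)
		return (1);
	return (0);
}

static struct guid_map_ent *
guid_lookup(struct guid_map_ent *map, size_t nent, uint64_t guid)
{
	struct guid_map_ent key = { .guid = guid, .ds = NULL };

	return (bsearch(&key, map, nent, sizeof (key), guid_cmp));
}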
*/ if (drrwbr->drr_toguid != drrwbr->drr_refguid) { gmesrch.guid = drrwbr->drr_refguid; if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, &where)) == NULL) { return (SET_ERROR(EINVAL)); } if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) return (SET_ERROR(EINVAL)); } else { ref_os = os; } err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } dmu_write(os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); dmu_tx_commit(tx); return (0); } static int restore_write_embedded(struct restorearg *ra, objset_t *os, struct drr_write_embedded *drrwnp) { dmu_tx_t *tx; int err; void *data; if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) return (EINVAL); if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) return (EINVAL); if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) return (EINVAL); if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8), NULL); if (data == NULL) return (ra->err); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, drrwnp->drr_object, drrwnp->drr_offset, drrwnp->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } dmu_write_embedded(os, drrwnp->drr_object, drrwnp->drr_offset, data, drrwnp->drr_etype, drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); dmu_tx_commit(tx); return (0); } static int restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) { dmu_tx_t *tx; void *data; dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os))) return (SET_ERROR(EINVAL)); data = restore_read(ra, drrs->drr_length, NULL); if (data == NULL) return (ra->err); if (dmu_object_info(os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } tx = dmu_tx_create(os); dmu_tx_hold_spill(tx, db->db_object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_abort(tx); return (err); } dmu_buf_will_dirty(db_spill, tx); if (db_spill->db_size < drrs->drr_length) VERIFY(0 == dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); bcopy(data, db_spill->db_data, drrs->drr_length); dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_commit(tx); return (0); } /* ARGSUSED */ static int restore_free(struct restorearg *ra, objset_t *os, struct drr_free *drrf) { int err; if (drrf->drr_length != -1ULL && drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); if (dmu_object_info(os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); err = dmu_free_long_range(os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); return (err); } /* used to destroy the drc_ds on error */ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { char name[MAXNAMELEN]; dsl_dataset_name(drc->drc_ds, name); dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); (void) dsl_destroy_head(name); } /* * NB: callers *must* call dmu_recv_end() if this succeeds. 
*/ int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, int cleanup_fd, uint64_t *action_handlep) { struct restorearg ra = { 0 }; dmu_replay_record_t *drr; objset_t *os; zio_cksum_t pcksum; int featureflags; ra.byteswap = drc->drc_byteswap; ra.cksum = drc->drc_cksum; ra.td = curthread; ra.fp = fp; ra.voff = *voffp; ra.bufsize = SPA_MAXBLOCKSIZE; ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, DMU_SUBSTREAM); ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); /* if this stream is dedup'ed, set up the avl tree for guid mapping */ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { minor_t minor; if (cleanup_fd == -1) { ra.err = SET_ERROR(EBADF); goto out; } ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); if (ra.err != 0) { cleanup_fd = -1; goto out; } if (*action_handlep == 0) { ra.guid_to_ds_map = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); avl_create(ra.guid_to_ds_map, guid_compare, sizeof (guid_map_entry_t), offsetof(guid_map_entry_t, avlnode)); ra.err = zfs_onexit_add_cb(minor, free_guid_map_onexit, ra.guid_to_ds_map, action_handlep); if (ra.err != 0) goto out; } else { ra.err = zfs_onexit_cb_data(minor, *action_handlep, (void **)&ra.guid_to_ds_map); if (ra.err != 0) goto out; } drc->drc_guid_to_ds_map = ra.guid_to_ds_map; } /* * Read records and process them. */ pcksum = ra.cksum; while (ra.err == 0 && NULL != (drr = restore_read(&ra, sizeof (*drr), NULL))) { if (issig(JUSTLOOKING) && issig(FORREAL)) { ra.err = SET_ERROR(EINTR); goto out; } if (ra.byteswap) backup_byteswap(drr); switch (drr->drr_type) { case DRR_OBJECT: { /* * We need to make a copy of the record header, * because restore_{object,write} may need to * restore_read(), which will invalidate drr. */ struct drr_object drro = drr->drr_u.drr_object; ra.err = restore_object(&ra, os, &drro); break; } case DRR_FREEOBJECTS: { struct drr_freeobjects drrfo = drr->drr_u.drr_freeobjects; ra.err = restore_freeobjects(&ra, os, &drrfo); break; } case DRR_WRITE: { struct drr_write drrw = drr->drr_u.drr_write; ra.err = restore_write(&ra, os, &drrw); break; } case DRR_WRITE_BYREF: { struct drr_write_byref drrwbr = drr->drr_u.drr_write_byref; ra.err = restore_write_byref(&ra, os, &drrwbr); break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded drrwe = drr->drr_u.drr_write_embedded; ra.err = restore_write_embedded(&ra, os, &drrwe); break; } case DRR_FREE: { struct drr_free drrf = drr->drr_u.drr_free; ra.err = restore_free(&ra, os, &drrf); break; } case DRR_END: { struct drr_end drre = drr->drr_u.drr_end; /* * We compare against the *previous* checksum * value, because the stored checksum is of * everything before the DRR_END record. */ if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) ra.err = SET_ERROR(ECKSUM); goto out; } case DRR_SPILL: { struct drr_spill drrs = drr->drr_u.drr_spill; ra.err = restore_spill(&ra, os, &drrs); break; } default: ra.err = SET_ERROR(EINVAL); goto out; } pcksum = ra.cksum; } ASSERT(ra.err != 0); out: if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) zfs_onexit_fd_rele(cleanup_fd); if (ra.err != 0) { /* * destroy what we created, so we don't leave it in the * inconsistent restoring state. 
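/*
 * A minimal fletcher-4 accumulator in the spirit of
 * fletcher_4_incremental_native(), included to show why the stream checksum
 * in dmu_recv_stream() above can be folded in record by record, and why the
 * DRR_END record is compared against the value accumulated before DRR_END
 * itself.  This is a sketch, not the kernel implementation; it assumes size
 * is a multiple of four bytes, which holds for stream records.
 */
#include <stddef.h>
#include <stdint.h>

struct stream_cksum {
	uint64_t	zc_word[4];
};

static void
fletcher4_incremental(const void *buf, size_t size, struct stream_cksum *zcp)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a = zcp->zc_word[0], b = zcp->zc_word[1];
	uint64_t c = zcp->zc_word[2], d = zcp->zc_word[3];

	for (; ip < ipend; ip++) {
		a += *ip;
		b += a;
		c += b;
		d += c;
	}
	zcp->zc_word[0] = a;
	zcp->zc_word[1] = b;
	zcp->zc_word[2] = c;
	zcp->zc_word[3] = d;
}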
*/ dmu_recv_cleanup_ds(drc); } kmem_free(ra.buf, ra.bufsize); *voffp = ra.voff; return (ra.err); } static int dmu_recv_end_check(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); int error; ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); if (!drc->drc_newfs) { dsl_dataset_t *origin_head; error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); if (error != 0) return (error); if (drc->drc_force) { /* * We will destroy any snapshots in tofs (i.e. before * origin_head) that are after the origin (which is * the snap before drc_ds, because drc_ds can not * have any snaps of its own). */ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) return (error); if (snap->ds_dir != origin_head->ds_dir) error = SET_ERROR(EINVAL); if (error == 0) { error = dsl_destroy_snapshot_check_impl( snap, B_FALSE); } obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); if (error != 0) return (error); } } error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, origin_head, drc->drc_force, drc->drc_owner, tx); if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } error = dsl_dataset_snapshot_check_impl(origin_head, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); dsl_dataset_rele(origin_head, FTAG); if (error != 0) return (error); error = dsl_destroy_head_check_impl(drc->drc_ds, 1); } else { error = dsl_dataset_snapshot_check_impl(drc->drc_ds, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); } return (error); } static void dmu_recv_end_sync(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_history_log_internal_ds(drc->drc_ds, "finish receiving", tx, "snap=%s", drc->drc_tosnap); if (!drc->drc_newfs) { dsl_dataset_t *origin_head; VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head)); if (drc->drc_force) { /* * Destroy any snapshots of drc_tofs (origin_head) * after the origin (the snap before drc_ds). 
*/ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &snap)); ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_destroy_snapshot_sync_impl(snap, B_FALSE, tx); dsl_dataset_rele(snap, FTAG); } } VERIFY3P(drc->drc_ds->ds_prev, ==, origin_head->ds_prev); dsl_dataset_clone_swap_sync_impl(drc->drc_ds, origin_head, tx); dsl_dataset_snapshot_sync_impl(origin_head, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(origin_head->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(origin_head->ds_dbuf, tx); dsl_dataset_phys(origin_head)->ds_flags &= ~DS_FLAG_INCONSISTENT; dsl_dataset_rele(origin_head, FTAG); dsl_destroy_head_sync_impl(drc->drc_ds, tx); if (drc->drc_owner != NULL) VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); } else { dsl_dataset_t *ds = drc->drc_ds; dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); dsl_dataset_phys(ds->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(ds->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(ds->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; } drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; /* * Release the hold from dmu_recv_begin. This must be done before * we return to open context, so that when we free the dataset's dnode, * we can evict its bonus buffer. */ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); drc->drc_ds = NULL; } static int add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) { dsl_pool_t *dp; dsl_dataset_t *snapds; guid_map_entry_t *gmep; int err; ASSERT(guid_map != NULL); err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); if (err == 0) { gmep->guid = dsl_dataset_phys(snapds)->ds_guid; gmep->gme_ds = snapds; avl_add(guid_map, gmep); dsl_dataset_long_hold(snapds, gmep); } else kmem_free(gmep, sizeof (*gmep)); dsl_pool_rele(dp, FTAG); return (err); } static int dmu_recv_end_modified_blocks = 3; static int dmu_recv_existing_end(dmu_recv_cookie_t *drc) { int error; char name[MAXNAMELEN]; #ifdef _KERNEL /* * We will be destroying the ds; make sure its origin is unmounted if * necessary. 
*/ dsl_dataset_name(drc->drc_ds, name); zfs_destroy_unmount_origin(name); #endif error = dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); if (error != 0) dmu_recv_cleanup_ds(drc); return (error); } static int dmu_recv_new_end(dmu_recv_cookie_t *drc) { int error; error = dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); if (error != 0) { dmu_recv_cleanup_ds(drc); } else if (drc->drc_guid_to_ds_map != NULL) { (void) add_ds_to_guidmap(drc->drc_tofs, drc->drc_guid_to_ds_map, drc->drc_newsnapobj); } return (error); } int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) { drc->drc_owner = owner; if (drc->drc_newfs) return (dmu_recv_new_end(drc)); else return (dmu_recv_existing_end(drc)); } /* * Return TRUE if this objset is currently being received into. */ boolean_t dmu_objset_is_receiving(objset_t *os) { return (os->os_dsl_dataset != NULL && os->os_dsl_dataset->ds_owner == dmu_recv_tag); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c (revision 275810) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c (revision 275811) @@ -1,648 +1,648 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include int zfs_pd_blks_max = 100; typedef struct prefetch_data { kmutex_t pd_mtx; kcondvar_t pd_cv; int pd_blks_max; int pd_blks_fetched; int pd_flags; boolean_t pd_cancel; boolean_t pd_exited; } prefetch_data_t; typedef struct traverse_data { spa_t *td_spa; uint64_t td_objset; blkptr_t *td_rootbp; uint64_t td_min_txg; zbookmark_phys_t *td_resume; int td_flags; prefetch_data_t *td_pfd; boolean_t td_paused; uint64_t td_hole_birth_enabled_txg; blkptr_cb_t *td_func; void *td_arg; } traverse_data_t; static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object); static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, uint64_t objset, uint64_t object); static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { traverse_data_t *td = arg; zbookmark_phys_t zb; if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) return (0); SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); return (0); } static int traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { traverse_data_t *td = arg; if (lrc->lrc_txtype == TX_WRITE) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 || bp->blk_birth < claim_txg) return (0); SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); } return (0); } static void traverse_zil(traverse_data_t *td, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zilog_t *zilog; /* * We only want to visit blocks that have been claimed but not yet * replayed; plus, in read-only mode, blocks that are already stable. */ if (claim_txg == 0 && spa_writeable(td->td_spa)) return; zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, claim_txg); zil_free(zilog); } typedef enum resume_skip { RESUME_SKIP_ALL, RESUME_SKIP_NONE, RESUME_SKIP_CHILDREN } resume_skip_t; /* * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and * the block indicated by zb does not need to be visited at all. Returns * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the * resume point. This indicates that this block should be visited but not its * children (since they must have been visited in a previous traversal). * Otherwise returns RESUME_SKIP_NONE. */ static resume_skip_t resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { /* * If we already visited this bp & everything below, * don't bother doing it again. */ if (zbookmark_is_before(dnp, zb, td->td_resume)) return (RESUME_SKIP_ALL); /* * If we found the block we're trying to resume from, zero * the bookmark out to indicate that we have resumed. 
*/ if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { bzero(td->td_resume, sizeof (*zb)); if (td->td_flags & TRAVERSE_POST) return (RESUME_SKIP_CHILDREN); } } return (RESUME_SKIP_NONE); } static void traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_phys_t *zb) { - uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) return; /* * If we are in the process of resuming, don't prefetch, because * some children will not be needed (and in fact may have already * been freed). */ if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) return; if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) return; if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) return; (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); } static boolean_t prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) { ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) return (B_FALSE); return (B_TRUE); } static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { zbookmark_phys_t czb; int err = 0; arc_buf_t *buf = NULL; prefetch_data_t *pd = td->td_pfd; boolean_t hard = td->td_flags & TRAVERSE_HARD; switch (resume_skip_check(td, dnp, zb)) { case RESUME_SKIP_ALL: return (0); case RESUME_SKIP_CHILDREN: goto post; case RESUME_SKIP_NONE: break; default: ASSERT(0); } if (bp->blk_birth == 0) { /* * Since this block has a birth time of 0 it must be a * hole created before the SPA_FEATURE_HOLE_BIRTH * feature was enabled. If SPA_FEATURE_HOLE_BIRTH * was enabled before the min_txg for this traveral we * know the hole must have been created before the * min_txg for this traveral, so we can skip it. If * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg * for this traveral we cannot tell if the hole was * created before or after the min_txg for this * traversal, so we cannot skip it. 
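/*
 * The hole-birth decision described above and applied just below, restated
 * as a standalone predicate.  A birth time of zero marks a hole created
 * before SPA_FEATURE_HOLE_BIRTH existed; such a hole can only be skipped
 * when the feature was already enabled before this traversal's minimum txg.
 * The function is illustrative and not part of the traversal code.
 */
#include <stdint.h>

static int
block_can_be_skipped(uint64_t blk_birth, uint64_t hole_birth_enabled_txg,
    uint64_t min_txg)
{
	if (blk_birth == 0)
		return (hole_birth_enabled_txg < min_txg);
	return (blk_birth <= min_txg);
}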
*/ if (td->td_hole_birth_enabled_txg < td->td_min_txg) return (0); } else if (bp->blk_birth <= td->td_min_txg) { return (0); } if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) { mutex_enter(&pd->pd_mtx); ASSERT(pd->pd_blks_fetched >= 0); while (pd->pd_blks_fetched == 0 && !pd->pd_exited) cv_wait(&pd->pd_cv, &pd->pd_mtx); pd->pd_blks_fetched--; cv_broadcast(&pd->pd_cv); mutex_exit(&pd->pd_mtx); } if (BP_IS_HOLE(bp)) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err != 0) goto post; return (0); } if (td->td_flags & TRAVERSE_PRE) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); if (err != 0) goto post; } if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; cbp = buf->b_data; for (i = 0; i < epb; i++) { SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); traverse_prefetch_metadata(td, &cbp[i], &czb); } /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i++) { SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, &cbp[i], &czb); if (err != 0) break; } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; dnp = buf->b_data; for (i = 0; i < epb; i++) { prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); } /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i++) { err = traverse_dnode(td, &dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; dnode_phys_t *dnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; osp = buf->b_data; dnp = &osp->os_meta_dnode; prefetch_dnode_metadata(td, dnp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { prefetch_dnode_metadata(td, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); prefetch_dnode_metadata(td, &osp->os_userused_dnode, zb->zb_objset, DMU_USERUSED_OBJECT); } err = traverse_dnode(td, dnp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { dnp = &osp->os_groupused_dnode; err = traverse_dnode(td, dnp, zb->zb_objset, DMU_GROUPUSED_OBJECT); } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { dnp = &osp->os_userused_dnode; err = traverse_dnode(td, dnp, zb->zb_objset, DMU_USERUSED_OBJECT); } } if (buf) (void) arc_buf_remove_ref(buf, &buf); post: if (err == 0 && (td->td_flags & TRAVERSE_POST)) err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (hard && (err == EIO || err == ECKSUM)) { /* * Ignore this disk error as requested by the HARD flag, * and continue traversal. */ err = 0; } /* * If we are stopping here, set td_resume. 
*/ if (td->td_resume != NULL && err != 0 && !td->td_paused) { td->td_resume->zb_objset = zb->zb_objset; td->td_resume->zb_object = zb->zb_object; td->td_resume->zb_level = 0; /* * If we have stopped on an indirect block (e.g. due to * i/o error), we have not visited anything below it. * Set the bookmark to the first level-0 block that we need * to visit. This way, the resuming code does not need to * deal with resuming from indirect blocks. */ td->td_resume->zb_blkid = zb->zb_blkid << (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); td->td_paused = B_TRUE; } return (err); } static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int j; zbookmark_phys_t czb; for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); traverse_prefetch_metadata(td, &dnp->dn_spill, &czb); } } static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int j, err = 0; zbookmark_phys_t czb; for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); if (err != 0) break; } if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); } return (err); } /* ARGSUSED */ static int traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { prefetch_data_t *pfd = arg; - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; ASSERT(pfd->pd_blks_fetched >= 0); if (pfd->pd_cancel) return (SET_ERROR(EINTR)); if (!prefetch_needed(pfd, bp)) return (0); mutex_enter(&pfd->pd_mtx); while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max) cv_wait(&pfd->pd_cv, &pfd->pd_mtx); pfd->pd_blks_fetched++; cv_broadcast(&pfd->pd_cv); mutex_exit(&pfd->pd_mtx); (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); return (0); } static void traverse_prefetch_thread(void *arg) { traverse_data_t *td_main = arg; traverse_data_t td = *td_main; zbookmark_phys_t czb; td.td_func = traverse_prefetcher; td.td_arg = td_main->td_pfd; td.td_pfd = NULL; SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb); mutex_enter(&td_main->td_pfd->pd_mtx); td_main->td_pfd->pd_exited = B_TRUE; cv_broadcast(&td_main->td_pfd->pd_cv); mutex_exit(&td_main->td_pfd->pd_mtx); } /* * NB: dataset must not be changing on-disk (eg, is a snapshot or we are * in syncing context). */ static int traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg) { traverse_data_t td; prefetch_data_t pd = { 0 }; zbookmark_phys_t czb; int err; ASSERT(ds == NULL || objset == ds->ds_object); ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); /* * The data prefetching mechanism (the prefetch thread) is incompatible * with resuming from a bookmark. 
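/*
 * A worked example of the resume-bookmark adjustment made in
 * traverse_visitbp() above.  Assuming 16K indirect blocks
 * (dn_indblkshift = 14) and 128-byte block pointers (SPA_BLKPTRSHIFT = 7),
 * each indirect level fans out to 2^(14 - 7) = 128 children, so pausing on
 * level-2 blkid 3 resumes at level-0 blkid 3 << (2 * 7) = 49152.  The
 * constants are common defaults used here only for illustration.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t zb_blkid = 3;
	uint64_t zb_level = 2;
	int dn_indblkshift = 14;
	int spa_blkptrshift = 7;
	uint64_t resume_blkid =
	    zb_blkid << (zb_level * (dn_indblkshift - spa_blkptrshift));

	(void) printf("resume at level-0 blkid %ju\n", (uintmax_t)resume_blkid);
	return (0);
}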
*/ ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA)); td.td_spa = spa; td.td_objset = objset; td.td_rootbp = rootbp; td.td_min_txg = txg_start; td.td_resume = resume; td.td_func = func; td.td_arg = arg; td.td_pfd = &pd; td.td_flags = flags; td.td_paused = B_FALSE; if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { VERIFY(spa_feature_enabled_txg(spa, SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg)); } else { td.td_hole_birth_enabled_txg = 0; } pd.pd_blks_max = zfs_pd_blks_max; pd.pd_flags = flags; mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); /* See comment on ZIL traversal in dsl_scan_visitds. */ if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; err = arc_read(NULL, td.td_spa, rootbp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); if (err != 0) return (err); osp = buf->b_data; traverse_zil(&td, &osp->os_zil_header); (void) arc_buf_remove_ref(buf, &buf); } if (!(flags & TRAVERSE_PREFETCH_DATA) || 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, &td, TQ_NOQUEUE)) pd.pd_exited = B_TRUE; SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = traverse_visitbp(&td, NULL, rootbp, &czb); mutex_enter(&pd.pd_mtx); pd.pd_cancel = B_TRUE; cv_broadcast(&pd.pd_cv); while (!pd.pd_exited) cv_wait(&pd.pd_cv, &pd.pd_mtx); mutex_exit(&pd.pd_mtx); mutex_destroy(&pd.pd_mtx); cv_destroy(&pd.pd_cv); return (err); } /* * NB: dataset must not be changing on-disk (eg, is a snapshot or we are * in syncing context). */ int traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg)); } int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, blkptr, txg_start, resume, flags, func, arg)); } /* * NB: pool must not be changing on-disk (eg, from zdb or sync context). 
*/ int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { int err; uint64_t obj; dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), txg_start, NULL, flags, func, arg); if (err != 0) return (err); /* visit each dataset */ for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); if (err != 0) { if (hard) continue; break; } if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) { dsl_dataset_t *ds; uint64_t txg = txg_start; dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); dsl_pool_config_exit(dp, FTAG); if (err != 0) { if (hard) continue; break; } if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg) txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; } } if (err == ESRCH) err = 0; return (err); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 275810) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 275811) @@ -1,1835 +1,1835 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; static void dsl_scan_cancel_sync(void *, dmu_tx_t *); static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ SYSCTL_DECL(_vfs_zfs); SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN, &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN, &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, &zfs_scan_idle, 0, "Idle scan window in clock ticks"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN, &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN, &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN, &zfs_no_scrub_io, 0, "Disable scrub I/O"); SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching"); enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ uint64_t zfs_free_max_blocks = UINT64_MAX; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG"); #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) extern int zfs_txg_timeout; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { int err; dsl_scan_t *scn; spa_t *spa = dp->dp_spa; uint64_t f; scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); scn->scn_dp = dp; /* * It's possible that we're resuming a scan after a reboot so * make sure that the scan_async_destroying flag is initialized * appropriately. */ ASSERT(!scn->scn_async_destroying); scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { /* * There was an old-style scrub in progress. 
Restart a * new-style scrub from the beginning. */ scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress; " "restarting new-style scrub in txg %llu", scn->scn_restart_txg); /* * Load the queue obj from the old location so that it * can be freed by dsl_scan_done(). */ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_queue", sizeof (uint64_t), 1, &scn->scn_phys.scn_queue_obj); } else { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); if (err == ENOENT) return (0); else if (err) return (err); if (scn->scn_phys.scn_state == DSS_SCANNING && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old * pool, and the pool was accessed by old * software. Restart from the beginning, since * the old software may have changed the pool in * the meantime. */ scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", scn->scn_restart_txg); } } spa_scan_stat_init(spa); return (0); } void dsl_scan_fini(dsl_pool_t *dp) { if (dp->dp_scan) { kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); dp->dp_scan = NULL; } } /* ARGSUSED */ static int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (scn->scn_phys.scn_state == DSS_SCANNING) return (SET_ERROR(EBUSY)); return (0); } static void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); bzero(&scn->scn_phys, sizeof (scn->scn_phys)); scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = 0; scn->scn_phys.scn_max_txg = tx->tx_txg; scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; scn->scn_restart_txg = 0; scn->scn_done_txg = 0; spa_scan_stat_init(spa); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; /* rewrite all disk labels */ vdev_config_dirty(spa->spa_root_vdev); if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); } else { spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START); } spa->spa_scrub_started = B_TRUE; /* * If this is an incremental scrub, limit the DDT scrub phase * to just the auto-ditto class (for correctness); the rest * of the scrub should go faster using top-down pruning. */ if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; } /* back to the generic stuff */ if (dp->dp_blkstats == NULL) { dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); } bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? 
ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); dsl_scan_sync_state(scn, tx); spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); } /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { static const char *old_names[] = { "scrub_bookmark", "scrub_ddt_bookmark", "scrub_ddt_class_max", "scrub_queue", "scrub_min_txg", "scrub_max_txg", "scrub_func", "scrub_errors", NULL }; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; int i; /* Remove any remnants of an old-style scrub. */ for (i = 0; old_names[i]; i++) { (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); } if (scn->scn_phys.scn_queue_obj != 0) { VERIFY(0 == dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = 0; } /* * If we were "restarted" from a stopped state, don't bother * with anything else. */ if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (complete) scn->scn_phys.scn_state = DSS_FINISHED; else scn->scn_phys.scn_state = DSS_CANCELED; spa_history_log_internal(spa, "scan done", tx, "complete=%u", complete); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > 0) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } mutex_exit(&spa->spa_scrub_lock); spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; /* * If the scrub/resilver completed, update all DTLs to * reflect this. Whether it succeeded or not, vacate * all temporary scrub DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); if (complete) { spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); } spa_errlog_rotate(spa); /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. 
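/*
 * The pause policy spelled out in the comment inside dsl_scan_check_pause()
 * below, restated as a standalone predicate.  The thresholds correspond to
 * the tunables named there (zfs_txg_timeout, the per-function minimum scan
 * time, and the dirty-data percentage); the helper and its parameter names
 * are illustrative only.
 */
#include <stdint.h>

#define	ILLUS_NANOSEC	1000000000ULL

static int
scan_should_pause(uint64_t elapsed_ns, uint64_t mintime_ms,
    uint64_t txg_timeout_sec, int dirty_pct, int dirty_pct_limit,
    int txg_sync_waiting, int spa_shutting_down)
{
	if (elapsed_ns / ILLUS_NANOSEC >= txg_timeout_sec)
		return (1);
	if (elapsed_ns / 1000000 > mintime_ms &&
	    (txg_sync_waiting || dirty_pct >= dirty_pct_limit))
		return (1);
	return (spa_shutting_down != 0);
}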
*/ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); } scn->scn_phys.scn_end_time = gethrestime_sec(); } /* ARGSUSED */ static int dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (scn->scn_phys.scn_state != DSS_SCANNING) return (SET_ERROR(ENOENT)); return (0); } /* ARGSUSED */ static void dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); dsl_scan_sync_state(scn, tx); } int dsl_scan_cancel(dsl_pool_t *dp) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx); static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) { zio_free(dp->dp_spa, txg, bp); } void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { ASSERT(dsl_pool_sync_context(dp)); zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), pio->io_flags)); } static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; if (dsl_dataset_is_snapshot(ds)) return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); return (smt); } static void dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) { VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); } extern int zfs_vdev_async_write_active_min_dirty_percent; static boolean_t dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb) { /* we never skip user/group accounting objects */ if (zb && (int64_t)zb->zb_object < 0) return (B_FALSE); if (scn->scn_pausing) return (B_TRUE); /* we're already pausing */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ if (zb && zb->zb_level != 0) return (B_FALSE); /* * We pause if: * - we have scanned for the maximum time: an entire txg * timeout (default 5 sec) * or * - we have scanned for at least the minimum time (default 1 sec * for scrub, 3 sec for resilver), and either we have sufficient * dirty data that we are starting to write more quickly * (default 30%), or someone is explicitly waiting for this txg * to complete. * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. */ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 
zfs_resilver_min_time_ms : zfs_scan_min_time_ms; uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout || (NSEC2MSEC(elapsed_nanosecs) > mintime && (txg_sync_waiting(scn->scn_dp) || dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) || spa_shutting_down(scn->scn_dp->dp_spa)) { if (zb) { dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); scn->scn_phys.scn_bookmark = *zb; } dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); scn->scn_pausing = B_TRUE; return (B_TRUE); } return (B_FALSE); } typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; } zil_scan_arg_t; /* ARGSUSED */ static int dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; zbookmark_phys_t zb; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * One block ("stubby") can be allocated a long time ago; we * want to visit that one because it has been allocated * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). */ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); return (0); } /* ARGSUSED */ static int dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { if (lrc->lrc_txtype == TX_WRITE) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * birth can be < claim_txg if this record's txg is * already txg sync'ed (but this log block contains * other records that are not synced) */ if (claim_txg == 0 || bp->blk_birth < claim_txg) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); } return (0); } static void dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zil_scan_arg_t zsa = { dp, zh }; zilog_t *zilog; /* * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). 
*/ if (claim_txg == 0 && spa_writeable(dp->dp_spa)) return; zilog = zil_alloc(dp->dp_meta_objset, zh); (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, claim_txg); zil_free(zilog); } /* ARGSUSED */ static void dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, uint64_t object, uint64_t blkid) { zbookmark_phys_t czb; - uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; if (zfs_no_scrub_prefetch) return; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) return; SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); } static boolean_t dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { /* * We never skip over user/group accounting objects (obj<0) */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* * If we found the block we're trying to resume from, or * we went past it to a different object, zero it out to * indicate that it's OK to start checking for pausing * again. */ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { dprintf("resuming at %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); } } return (B_FALSE); } /* * Return nonzero on i/o error. * Return new buf to write out in *bufp. 
*/ static int dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_object, zb->zb_blkid * epb + i); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } (void) arc_buf_remove_ref(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; int i, j; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { for (j = 0; j < cdnp->dn_nblkptr; j++) { blkptr_t *cbp = &cdnp->dn_blkptr[j]; dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_blkid * epb + i, j); } } for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } (void) arc_buf_remove_ref(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } osp = buf->b_data; dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); if (OBJSET_BUF_HAS_USERUSED(buf)) { /* * We also always visit user/group accounting * objects, and never skip them, even if we are * pausing. This is necessary so that the space * deltas from this txg get integrated. */ dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_groupused_dnode, DMU_GROUPUSED_OBJECT, tx); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } (void) arc_buf_remove_ref(buf, &buf); } return (0); } static void dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx) { int j; for (j = 0; j < dnp->dn_nblkptr; j++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, dnp->dn_nlevels - 1, j); dsl_scan_visitbp(&dnp->dn_blkptr[j], &czb, dnp, ds, scn, ostype, tx); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 0, DMU_SPILL_BLKID); dsl_scan_visitbp(&dnp->dn_spill, &czb, dnp, ds, scn, ostype, tx); } } /* * The arguments are in this order because mdb can only print the * first 5; we want them to be useful. 
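* (Here those first five are: bp, zb, dnp, ds and scn.)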
*/ static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; arc_buf_t *buf = NULL; blkptr_t bp_toread = *bp; /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ if (dsl_scan_check_pause(scn, zb)) return; if (dsl_scan_check_resume(scn, dnp, zb)) return; if (BP_IS_HOLE(bp)) return; scn->scn_visited_this_txg++; dprintf_bp(bp, "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", ds, ds ? ds->ds_object : 0, zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, bp); if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return; if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0) return; /* * If dsl_scan_ddt() has already visited this block, it will have * already done any translations or scrubbing, so don't call the * callback again. */ if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { ASSERT(buf == NULL); return; } /* * If this block is from the future (after cur_max_txg), then we * are doing this on behalf of a deleted snapshot, and we will * revisit the future block on the next pass of this dataset. * Don't scan it now unless we need to because something * under it was modified. */ if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) { scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); } } static void dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); dprintf_ds(ds, "finished scan%s", ""); } void dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { if (dsl_dataset_is_snapshot(ds)) { /* Note, scn_cur_{min,max}_txg stays the same. */ scn->scn_phys.scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; } else { SET_BOOKMARK(&scn->scn_phys.scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset bookmark to -1,0,0,0", (u_longlong_t)ds->ds_object); } } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); if (dsl_dataset_is_snapshot(ds)) { /* * We keep the same mintxg; it could be > * ds_creation_txg if the previous snapshot was * deleted too. */ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("destroying ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); } else { zfs_dbgmsg("destroying ds %llu; in queue; removing", (u_longlong_t)ds->ds_object); } } else { zfs_dbgmsg("destroying ds %llu; ignoring", (u_longlong_t)ds->ds_object); } /* * dsl_scan_sync() should be called after this, and should sync * out our changed state, but just to be safe, do it here.
*/ dsl_scan_sync_state(scn, tx); } void dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_prev_snap_obj; zfs_dbgmsg("snapshotting ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("snapshotting ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } dsl_scan_sync_state(scn, tx); } void dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) { dsl_pool_t *dp = ds1->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg) == 0) { int err; ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); err = zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); VERIFY(err == 0 || err == EEXIST); if (err == EEXIST) { /* Both were there to begin with */ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); } zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } dsl_scan_sync_state(scn, tx); } struct enqueue_clones_arg { dmu_tx_t *tx; uint64_t originobj; }; /* ARGSUSED */ static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { struct enqueue_clones_arg *eca = arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; if 
(dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj) return (0); err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); dsl_dataset_rele(ds, FTAG); if (err) return (err); ds = prev; } VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0); dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; objset_t *os; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (dmu_objset_from_ds(ds, &os)) goto out; /* * Only the ZIL in the head (non-snapshot) is valid. Even though * snapshots can have ZIL block pointers (which may be the same * BP as in the head), they must be ignored. So we traverse the * ZIL here, rather than in scan_recurse(), because the regular * snapshot block-sharing rules don't apply to it. */ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds)) dsl_scan_zil(dp, &os->os_zil_header); /* * Iterate over the bps in this ds. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " "pausing=%u", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_cur_max_txg, (int)scn->scn_pausing); kmem_free(dsname, ZFS_MAXNAMELEN); if (scn->scn_pausing) goto out; /* * We've finished this pass over this dataset. */ /* * If we did not completely visit this dataset, do another pass. */ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { zfs_dbgmsg("incomplete pass; visiting again"); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, scn->scn_phys.scn_cur_max_txg, tx) == 0); goto out; } /* * Add descendent datasets to work queue. */ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0); } if (dsl_dataset_phys(ds)->ds_num_children > 1) { boolean_t usenext = B_FALSE; if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { uint64_t count; /* * A bug in a previous version of the code could * cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a * missing entry. Therefore we can only use the * next_clones_obj when its count is correct. 
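* (Correct here means ds_num_children - 1 entries, i.e. one entry per
* clone, excluding the next snapshot itself.)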
*/ int err = zap_count(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj, &count); if (err == 0 && count == dsl_dataset_phys(ds)->ds_num_children - 1) usenext = B_TRUE; } if (usenext) { VERIFY0(zap_join_key(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_creation_txg, tx)); } else { struct enqueue_clones_arg eca; eca.tx = tx; eca.originobj = ds->ds_object; VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); } } out: dsl_dataset_rele(ds, FTAG); } /* ARGSUSED */ static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } /* * If this is a clone, we don't need to worry about it for now. */ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(prev, FTAG); return (0); } dsl_dataset_rele(ds, FTAG); ds = prev; } VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0); dsl_dataset_rele(ds, FTAG); return (0); } /* * Scrub/dedup interaction. * * If there are N references to a deduped block, we don't want to scrub it * N times -- ideally, we should scrub it exactly once. * * We leverage the fact that the dde's replication class (enum ddt_class) * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. * * To prevent excess scrubbing, the scrub begins by walking the DDT * to find all blocks with refcnt > 1, and scrubs each of these once. * Since there are two replication classes which contain blocks with * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. * * There would be nothing more to say if a block's refcnt couldn't change * during a scrub, but of course it can so we must account for changes * in a block's replication class. * * Here's an example of what can occur: * * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 * when visited during the top-down scrub phase, it will be scrubbed twice. * This negates our scrub optimization, but is otherwise harmless. * * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 * on each visit during the top-down scrub phase, it will never be scrubbed. * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 * while a scrub is in progress, it scrubs the block right then. 
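*
* To summarize, dsl_scan_visit() below proceeds in this order:
*
*	1. walk the DDT classes up to scn_ddt_class_max (dsl_scan_ddt());
*	2. visit the MOS and $ORIGIN;
*	3. visit each dataset on the scan queue, top-down.
*
* dsl_scan_visitbp() then skips any block still covered by the DDT
* classes walked in step 1 (the ddt_class_contains() check).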
*/ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; ddt_entry_t dde = { 0 }; int error; uint64_t n = 0; while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { ddt_t *ddt; if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) break; dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", (longlong_t)ddb->ddb_class, (longlong_t)ddb->ddb_type, (longlong_t)ddb->ddb_checksum, (longlong_t)ddb->ddb_cursor); /* There should be no pending changes to the dedup table */ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); n++; if (dsl_scan_check_pause(scn, NULL)) break; } zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u", (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_pausing); ASSERT(error == 0 || error == ENOENT); ASSERT(error != ENOENT || ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); } /* ARGSUSED */ void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_entry_t *dde, dmu_tx_t *tx) { const ddt_key_t *ddk = &dde->dde_key; ddt_phys_t *ddp = dde->dde_phys; blkptr_t bp; zbookmark_phys_t zb = { 0 }; if (scn->scn_phys.scn_state != DSS_SCANNING) return; for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) continue; ddt_bp_create(checksum, ddk, ddp, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); } } static void dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; zap_cursor_t zc; zap_attribute_t za; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_ddt(scn, tx); if (scn->scn_pausing) return; } if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { /* First do the MOS & ORIGIN */ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_pausing) return; if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_cb, tx, DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); } ASSERT(!scn->scn_pausing); } else if (scn->scn_phys.scn_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { /* * If we were paused, continue from here. Note if the * ds we were paused on was deleted, the zb_objset may * be -1, so we will skip this and find a new objset * below. */ dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); if (scn->scn_pausing) return; } /* * In case we were paused right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. 
*/ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); /* keep pulling things out of the zap-object-as-queue */ while (zap_cursor_init(&zc, dp->dp_meta_objset, scn->scn_phys.scn_queue_obj), zap_cursor_retrieve(&zc, &za) == 0) { dsl_dataset_t *ds; uint64_t dsobj; dsobj = strtonum(za.za_name, NULL); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsobj, tx)); /* Set up min/max txg */ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (za.za_first_integer != 0) { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, za.za_first_integer); } else { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, dsl_dataset_phys(ds)->ds_prev_snap_txg); } scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); dsl_dataset_rele(ds, FTAG); dsl_scan_visitds(scn, dsobj, tx); zap_cursor_fini(&zc); if (scn->scn_pausing) return; } zap_cursor_fini(&zc); } static boolean_t dsl_scan_free_should_pause(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; if (zfs_recover) return (B_FALSE); if (scn->scn_visited_this_txg >= zfs_free_max_blocks) return (B_TRUE); elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); } static int dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_scan_t *scn = arg; if (!scn->scn_is_bptree || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { if (dsl_scan_free_should_pause(scn)) return (SET_ERROR(ERESTART)); } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); scn->scn_visited_this_txg++; return (0); } boolean_t dsl_scan_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t used = 0, comp, uncomp; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); if (scn->scn_phys.scn_state == DSS_SCANNING || (scn->scn_async_destroying && !scn->scn_async_stalled)) return (B_TRUE); if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } return (used != 0); } void dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; int err = 0; /* * Check for scn_restart_txg before checking spa_load_state, so * that we can restart an old-style scan while the pool is being * imported (see dsl_scan_init). */ if (scn->scn_restart_txg != 0 && scn->scn_restart_txg <= tx->tx_txg) { pool_scan_func_t func = POOL_SCAN_SCRUB; dsl_scan_done(scn, B_FALSE, tx); if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u txg=%llu", func, tx->tx_txg); dsl_scan_setup_sync(&func, tx); } /* * If the scan is inactive due to a stalled async destroy, try again. */ if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) || spa_sync_pass(dp->dp_spa) > 1) return; scn->scn_visited_this_txg = 0; scn->scn_pausing = B_FALSE; scn->scn_sync_start_time = gethrtime(); spa->spa_scrub_active = B_TRUE; /* * First process the async destroys. If we pause, don't do * any scrubbing or resilvering. 
This ensures that there are no * async destroys while we are scanning, so the scan code doesn't * have to worry about traversing it. It is also faster to free the * blocks than to scrub them. */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); } if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { ASSERT(scn->scn_async_destroying); scn->scn_is_bptree = B_TRUE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); if (err == EIO || err == ECKSUM) { err = 0; } else if (err != 0 && err != ERESTART) { zfs_panic_recover("error %u from " "traverse_dataset_destroyed()", err); } if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { /* finished; deactivate async destroy feature */ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); ASSERT(!spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)); VERIFY0(zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, tx)); VERIFY0(bptree_free(dp->dp_meta_objset, dp->dp_bptree_obj, tx)); dp->dp_bptree_obj = 0; scn->scn_async_destroying = B_FALSE; scn->scn_async_stalled = B_FALSE; } else { /* * If we didn't make progress, mark the async * destroy as stalled, so that we will not initiate * a spa_sync() on its behalf. Note that we only * check this if we are not finished, because if the * bptree had no blocks for us to visit, we can * finish without "making progress". */ scn->scn_async_stalled = (scn->scn_visited_this_txg == 0); } } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " "free_bpobj/bptree txg %llu; err=%d", (longlong_t)scn->scn_visited_this_txg, (longlong_t) NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)tx->tx_txg, err); scn->scn_visited_this_txg = 0; /* * Write out changes to the DDT that may be required as a * result of the blocks freed. This ensures that the DDT * is clean when a scrub/resilver runs. */ ddt_sync(spa, tx->tx_txg); } if (err != 0) return; if (!scn->scn_async_destroying && zfs_free_leak_on_eio && (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { /* * We have finished background destroying, but there is still * some space left in the dp_free_dir. Transfer this leaked * space to the dp_leak_dir. 
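* (dp_leak_dir is created on demand below under LEAK_DIR_NAME, so the
* leaked space stays accounted in a dedicated dir instead of leaving
* dp_free_dir permanently non-zero.)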
*/ if (dp->dp_leak_dir == NULL) { rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); (void) dsl_dir_create_sync(dp, dp->dp_root_dir, LEAK_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir)); rrw_exit(&dp->dp_config_rwlock, FTAG); } dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } if (!scn->scn_async_destroying) { /* finished; verify that space accounting went to zero */ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_done_txg == tx->tx_txg) { ASSERT(!scn->scn_pausing); /* finished with scan. */ zfs_dbgmsg("txg %llu scan complete", tx->tx_txg); dsl_scan_done(scn, B_TRUE, tx); ASSERT3U(spa->spa_scrub_inflight, ==, 0); dsl_scan_sync_state(scn, tx); return; } if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { zfs_dbgmsg("doing scan sync txg %llu; " "ddt bm=%llu/%llu/%llu/%llx", (longlong_t)tx->tx_txg, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); } else { zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", (longlong_t)tx->tx_txg, (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, (longlong_t)scn->scn_phys.scn_bookmark.zb_object, (longlong_t)scn->scn_phys.scn_bookmark.zb_level, (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); } scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); dsl_pool_config_enter(dp, FTAG); dsl_scan_visit(scn, tx); dsl_pool_config_exit(dp, FTAG); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; zfs_dbgmsg("visited %llu blocks in %llums", (longlong_t)scn->scn_visited_this_txg, (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time)); if (!scn->scn_pausing) { scn->scn_done_txg = tx->tx_txg + 1; zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu", tx->tx_txg, scn->scn_done_txg); } if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > 0) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } mutex_exit(&spa->spa_scrub_lock); } dsl_scan_sync_state(scn, tx); } /* * This will start a new scan, or restart an existing one. 
*/ void dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) { if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); txg = dmu_tx_get_txg(tx); dp->dp_scan->scn_restart_txg = txg; dmu_tx_commit(tx); } else { dp->dp_scan->scn_restart_txg = txg; } zfs_dbgmsg("restarting resilver txg=%llu", txg); } boolean_t dsl_scan_resilvering(dsl_pool_t *dp) { return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); } /* * scrub consumers */ static void count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. */ if (zab == NULL) return; for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; if (t & DMU_OT_NEWTYPE) t = DMU_OT_OTHER; zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; zb->zb_count++; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) zb->zb_ditto_2_of_2_samevdev++; break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal == 1) zb->zb_ditto_2_of_3_samevdev++; else if (equal == 3) zb->zb_ditto_3_of_3_samevdev++; break; } } } static void dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; zio_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; cv_broadcast(&spa->spa_scrub_io_cv); if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; } mutex_exit(&spa->spa_scrub_lock); } static int dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; size_t size = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); boolean_t needs_io; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; unsigned int scan_delay = 0; if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) return (0); count_block(dp->dp_blkstats, bp); if (BP_IS_EMBEDDED(bp)) return (0); ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; needs_io = B_TRUE; scan_delay = zfs_scrub_delay; } else { ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; needs_io = B_FALSE; scan_delay = zfs_resilver_delay; } /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; for (int d = 0; d < BP_GET_NDVAS(bp); d++) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[d])); /* * Keep track of how much data we've examined so that * zpool(1M) status can make useful progress reports. 
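* (scn_examined is cumulative for the whole scan, while
* spa_scan_pass_exam only covers the current pass and is reset by
* spa_scan_stat_init().)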
*/ scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); /* if it's a resilver, this may not be in the target range */ if (!needs_io) { if (DVA_GET_GANG(&bp->blk_dva[d])) { /* * Gang members may be spread across multiple * vdevs, so the best estimate we have is the * scrub range, which has already been checked. * XXX -- it would be better to change our * allocation policy to ensure that all * gang members reside on the same vdev. */ needs_io = B_TRUE; } else { needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1); } } } if (needs_io && !zfs_no_scrub_io) { vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * MAX(zfs_top_maxinflight, 1); void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); /* * If we're seeing recent (zfs_scan_idle) "important" I/Os * then throttle our workload to limit the impact of a scan. */ if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) delay(MAX((int)scan_delay, 0)); zio_nowait(zio_read(NULL, spa, bp, data, size, dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* do not relocate this block */ return (0); } int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) { spa_t *spa = dp->dp_spa; /* * Purge all vdev caches and probe all devices. We do this here * rather than in sync context because this requires a writer lock * on the spa_config lock, which we can't do from sync context. The * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ spa_vdev_state_enter(spa, SCL_NONE); spa->spa_scrub_reopen = B_TRUE; vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE)); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h (revision 275810) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h (revision 275811) @@ -1,146 +1,167 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
*/ #ifndef _SYS_ARC_H #define _SYS_ARC_H #include #ifdef __cplusplus extern "C" { #endif #include #include #include typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); typedef int arc_evict_func_t(void *priv); /* generic arc_done_func_t's which you can use */ arc_done_func_t arc_bcopy_func; arc_done_func_t arc_getbuf_func; +typedef enum arc_flags +{ + /* + * Public flags that can be passed into the ARC by external consumers. + */ + ARC_FLAG_NONE = 1 << 0, /* No flags set */ + ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ + ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ + + /* + * Private ARC flags. These flags are private ARC only flags that + * will show up in b_flags in the arc_buf_hdr_t. These flags should + * only be set by ARC code. + */ + ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ + ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */ + ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */ + ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */ + ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */ + ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */ +} arc_flags_t; + struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; kmutex_t b_evict_lock; void *b_data; arc_evict_func_t *b_efunc; void *b_private; }; typedef enum arc_buf_contents { ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES } arc_buf_contents_t; -/* - * These are the flags we pass into calls to the arc - */ -#define ARC_WAIT (1 << 1) /* perform I/O synchronously */ -#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ -#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ -#define ARC_CACHED (1 << 4) /* I/O was already in cache */ -#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ -#define ARC_L2COMPRESS (1 << 6) /* compress in L2ARC */ /* * The following breakdowns of arc_size exist for kstat only.
*/ typedef enum arc_space_type { ARC_SPACE_DATA, ARC_SPACE_HDRS, ARC_SPACE_L2HDRS, ARC_SPACE_OTHER, ARC_SPACE_NUMTYPES } arc_space_type_t; void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); arc_buf_t *arc_loan_buf(spa_t *spa, int size); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *priv, zio_priority_t priority, int flags, - uint32_t *arc_flags, const zbookmark_phys_t *zb); + arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, arc_done_func_t *done, void *priv, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb); void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv); boolean_t arc_clear_callback(arc_buf_t *buf); void arc_flush(spa_t *spa); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); void arc_init(void); void arc_fini(void); /* * Level 2 ARC */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); void l2arc_init(void); void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); #ifdef illumos #ifndef _KERNEL extern boolean_t arc_watch; extern int arc_procfd; #endif #endif /* illumos */ #ifdef __cplusplus } #endif #endif /* _SYS_ARC_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (revision 275810) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (revision 275811) @@ -1,2144 +1,2144 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 
*/ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The zfs intent log (ZIL) saves transaction records of system calls * that change the file system in memory with enough information * to be able to replay them. These are stored in memory until * either the DMU transaction group (txg) commits them to the stable pool * and they can be discarded, or they are flushed to the stable log * (also in the pool) due to a fsync, O_DSYNC or other synchronous * requirement. In the event of a panic or power fail then those log * records (transactions) are replayed. * * There is one ZIL per file system. Its on-disk (pool) format consists * of 3 parts: * * - ZIL header * - ZIL blocks * - ZIL records * * A log record holds a system call transaction. Log blocks can * hold many log records and the blocks are chained together. * Each ZIL block contains a block pointer (blkptr_t) to the next * ZIL block in the chain. The ZIL header points to the first * block in the chain. Note there is not a fixed place in the pool * to hold blocks. They are dynamically allocated and freed as * needed from the blocks available. Figure X shows the ZIL structure: */ /* * Disable intent logging replay. This global ZIL switch affects all pools. */ int zil_replay_disable = 0; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN, &zil_replay_disable, 0, "Disable intent logging replay"); /* * Tunable parameter for debugging or performance analysis. Setting * zfs_nocacheflush will cause corruption on power loss if a volatile * out-of-order write cache is enabled. */ boolean_t zfs_nocacheflush = B_FALSE; SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN, &zfs_nocacheflush, 0, "Disable cache flush"); boolean_t zfs_trim_enabled = B_TRUE; SYSCTL_DECL(_vfs_zfs_trim); SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0, "Enable ZFS TRIM"); static kmem_cache_t *zil_lwb_cache; static void zil_async_to_sync(zilog_t *zilog, uint64_t foid); #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) /* * ziltest is by and large an ugly hack, but very useful in * checking replay without tedious work. * When running ziltest we want to keep all itx's and so maintain * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG * We subtract TXG_CONCURRENT_STATES to allow for common code. 
*/ #define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES) static int zil_bp_compare(const void *x1, const void *x2) { const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) return (-1); if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) return (1); if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) return (-1); if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) return (1); return (0); } static void zil_bp_tree_init(zilog_t *zilog) { avl_create(&zilog->zl_bp_tree, zil_bp_compare, sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void zil_bp_tree_fini(zilog_t *zilog) { avl_tree_t *t = &zilog->zl_bp_tree; zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; const dva_t *dva; zil_bp_node_t *zn; avl_index_t where; if (BP_IS_EMBEDDED(bp)) return (0); dva = BP_IDENTITY(bp); if (avl_find(t, dva, &where) != NULL) return (SET_ERROR(EEXIST)); zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); return (0); } static zil_header_t * zil_header_in_syncing_context(zilog_t *zilog) { return ((zil_header_t *)zilog->zl_header); } static void zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) { zio_cksum_t *zc = &bp->blk_cksum; zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, char **end) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) zio_flags |= ZIO_FLAG_SPECULATIVE; SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; /* * Validate the checksummed log block. * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. * * Also check the log chain linkage and size used. 
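* (For ZIO_CHECKSUM_ZILOG2 blocks the zil_chain_t lives at the start
* of the block; for older ZILOG blocks it lives at the end. Both
* layouts are handled below.)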
*/ cksum.zc_word[ZIL_ZC_SEQ]++; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = abuf->b_data; char *lr = (char *)(zilc + 1); uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { error = SET_ERROR(ECKSUM); } else { ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); bcopy(lr, dst, len); *end = (char *)dst + len; *nbp = zilc->zc_next_blk; } } else { char *lr = abuf->b_data; uint64_t size = BP_GET_LSIZE(bp); zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { ASSERT3U(zilc->zc_nused, <=, SPA_OLD_MAXBLOCKSIZE); bcopy(lr, dst, zilc->zc_nused); *end = (char *)dst + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } VERIFY(arc_buf_remove_ref(abuf, &abuf)); } return (error); } /* * Read a TX_WRITE log data block. */ static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (BP_IS_HOLE(bp)) { if (wbuf != NULL) bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { if (wbuf != NULL) bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); (void) arc_buf_remove_ref(abuf, &abuf); } return (error); } /* * Parse the intent log, and call parse_func for each valid record within. */ int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) { const zil_header_t *zh = zilog->zl_header; boolean_t claimed = !!zh->zh_claim_txg; uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; uint64_t max_blk_seq = 0; uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk; char *lrbuf, *lrp; int error = 0; /* * Old logs didn't record the maximum zh_claim_lr_seq. */ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. * For each block in the chain we strongly check that block to * ensure its validity. We stop when an invalid block is found. * For each block pointer in the chain we call parse_blk_func(). * For each record in each valid block we call parse_lr_func(). * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. 
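* (For an unclaimed log, claim_blk_seq and claim_lr_seq were set to
* UINT64_MAX above, so the walk then stops only at the first invalid
* block.)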
*/ lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; char *end; if (blk_seq > claim_blk_seq) break; if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) break; ASSERT3U(max_blk_seq, <, blk_seq); max_blk_seq = blk_seq; blk_count++; if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) break; error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); if (error != 0) break; for (lrp = lrbuf; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); if (lr->lrc_seq > claim_lr_seq) goto done; if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) goto done; ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } } done: zilog->zl_parse_error = error; zilog->zl_parse_blk_seq = max_blk_seq; zilog->zl_parse_lr_seq = max_lr_seq; zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); zil_bp_tree_fini(zilog); zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); return (error); } static int zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) { /* * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); return (zio_wait(zio_claim(NULL, zilog->zl_spa, tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } static int zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; if (lrc->lrc_txtype != TX_WRITE) return (0); /* * If the block is not readable, don't claim it. This can happen * in normal operation when a log block is written to disk before * some of the dmu_sync() blocks it points to. In this case, the * transaction cannot have been committed to anyone (we would have * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ if (lr->lr_blkptr.blk_birth >= first_txg && (error = zil_read_log_data(zilog, lr, NULL)) != 0) return (error); return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } /* ARGSUSED */ static int zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) { zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; /* * If we previously claimed it, we need to free it. 
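* (zil_bp_tree_add() returns 0 only the first time a given DVA is
* seen, so each claimed block is freed at most once even if multiple
* records reference it.)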
*/ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static lwb_t * zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; lwb->lwb_blk = *bp; lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); lwb->lwb_max_txg = txg; lwb->lwb_zio = NULL; lwb->lwb_tx = NULL; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { lwb->lwb_nused = sizeof (zil_chain_t); lwb->lwb_sz = BP_GET_LSIZE(bp); } else { lwb->lwb_nused = 0; lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); } mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); mutex_exit(&zilog->zl_lock); return (lwb); } /* * Called when we create in-memory log transactions so that we know * to cleanup the itxs at the end of spa_sync(). */ void zilog_dirty(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); if (dsl_dataset_is_snapshot(ds)) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, zilog); } } boolean_t zilog_is_dirty(zilog_t *zilog) { dsl_pool_t *dp = zilog->zl_dmu_pool; for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) return (B_TRUE); } return (B_FALSE); } /* * Create an on-disk intent log. */ static lwb_t * zil_create(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb = NULL; uint64_t txg = 0; dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ASSERT(zh->zh_claim_txg == 0); ASSERT(zh->zh_replay_seq == 0); blk = zh->zh_log; /* * Allocate an initial log block if: * - there isn't one already * - the existing block is the wrong endianess */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); if (!BP_IS_HOLE(&blk)) { zio_free_zil(zilog->zl_spa, txg, &blk); BP_ZERO(&blk); } error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL, ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); if (error == 0) zil_init_log_chain(zilog, &blk); } /* * Allocate a log write buffer (lwb) for the first log block. */ if (error == 0) lwb = zil_alloc_lwb(zilog, &blk, txg); /* * If we just allocated the first log block, commit our transaction * and wait for zil_sync() to stuff the block poiner into zh_log. * (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); return (lwb); } /* * In one tx, free all log blocks and clear the log header. * If keep_first is set, then we're replaying a log with no content. * We want to keep the first block, however, so that the first * synchronous transaction doesn't require a txg_wait_synced() * in zil_create(). We don't need to txg_wait_synced() here either * when keep_first is set, because both zil_create() and zil_destroy() * will wait for any in-progress destroys to complete. 
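 *
 * For example, zil_replay() below calls zil_destroy(zilog, B_TRUE)
 * when the dataset has nothing to replay, keeping the first block so
 * that the next synchronous transaction does not need a
 * txg_wait_synced() in zil_create().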
*/ void zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) return; tx = dmu_tx_create(zilog->zl_os); VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); mutex_enter(&zilog->zl_lock); ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { list_remove(&zilog->zl_lwb_list, lwb); if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk); kmem_cache_free(zil_lwb_cache, lwb); } } else if (!keep_first) { zil_destroy_sync(zilog, tx); } mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); } void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, zilog->zl_header->zh_claim_txg); } int zil_claim(const char *osname, void *txarg) { dmu_tx_t *tx = txarg; uint64_t first_txg = dmu_tx_get_txg(tx); zilog_t *zilog; zil_header_t *zh; objset_t *os; int error; error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os); if (error != 0) { /* * EBUSY indicates that the objset is inconsistent, in which * case it can not have a ZIL. */ if (error != EBUSY) { cmn_err(CE_WARN, "can't open objset for %s, error %u", osname, error); } return (0); } zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) { if (!BP_IS_HOLE(&zh->zh_log)) zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); BP_ZERO(&zh->zh_log); dsl_dataset_dirty(dmu_objset_ds(os), tx); dmu_objset_disown(os, FTAG); return (0); } /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can * read only part of the log now (e.g. due to a missing device), * but we can read the entire log later, we will not try to replay * or destroy beyond the last block we successfully claimed. */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, first_txg); zh->zh_claim_txg = first_txg; zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) zh->zh_flags |= ZIL_REPLAY_NEEDED; zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); dmu_objset_disown(os, FTAG); return (0); } /* * Check the log by walking the log chain. * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. 
*/ int zil_check_log_chain(const char *osname, void *tx) { zilog_t *zilog; objset_t *os; blkptr_t *bp; int error; ASSERT(tx == NULL); error = dmu_objset_hold(osname, FTAG, &os); if (error != 0) { cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); } zilog = dmu_objset_zil(os); bp = (blkptr_t *)&zilog->zl_header->zh_log; /* * Check the first block and determine if it's on a log device * which may have been removed or faulted prior to loading this * pool. If so, there's no point in checking the rest of the log * as its content should have already been synced to the pool. */ if (!BP_IS_HOLE(bp)) { vdev_t *vd; boolean_t valid = B_TRUE; spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (vd->vdev_islog && vdev_is_dead(vd)) valid = vdev_log_state_valid(vd); spa_config_exit(os->os_spa, SCL_STATE, FTAG); if (!valid) { dmu_objset_rele(os, FTAG); return (0); } } /* * Because tx == NULL, zil_claim_log_block() will not actually claim * any blocks, but just determine whether it is possible to do so. * In addition to checking the log chain, zil_claim_log_block() * will invoke zio_claim() with a done func of spa_claim_notify(), * which will update spa_max_claim_txg. See spa_load() for details. */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa)); dmu_objset_rele(os, FTAG); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } static int zil_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; if (v1 < v2) return (-1); if (v1 > v2) return (1); return (0); } void zil_add_block(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_vdev_tree; avl_index_t where; zil_vdev_node_t *zv, zvsearch; int ndvas = BP_GET_NDVAS(bp); int i; if (zfs_nocacheflush) return; ASSERT(zilog->zl_writer); /* * Even though we're zl_writer, we still need a lock because the * zl_get_data() callbacks may have dmu_sync() done callbacks * that will run concurrently. */ mutex_enter(&zilog->zl_vdev_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { zv = kmem_alloc(sizeof (*zv), KM_SLEEP); zv->zv_vdev = zvsearch.zv_vdev; avl_insert(t, zv, where); } } mutex_exit(&zilog->zl_vdev_lock); } static void zil_flush_vdevs(zilog_t *zilog) { spa_t *spa = zilog->zl_spa; avl_tree_t *t = &zilog->zl_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; zio_t *zio; ASSERT(zilog->zl_writer); /* * We don't need zl_vdev_lock here because we're the zl_writer, * and all zl_get_data() callbacks are done. */ if (avl_numnodes(t) == 0) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); if (vd != NULL) zio_flush(zio, vd); kmem_free(zv, sizeof (*zv)); } /* * Wait for all the flushes to complete. Not all devices actually * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. 
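 *
 * (The vdevs to flush were recorded via zil_add_block() as the log
 * writes were issued; zil_commit_writer() calls zil_flush_vdevs() only
 * after waiting on its root zio, so all outstanding log writes have
 * completed before the flush is sent.)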
*/ (void) zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); } /* * Function called when a log block write completes */ static void zil_lwb_write_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; dmu_tx_t *tx = lwb->lwb_tx; ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); ASSERT(!BP_IS_GANG(zio->io_bp)); ASSERT(!BP_IS_HOLE(zio->io_bp)); ASSERT(BP_GET_FILL(zio->io_bp) == 0); /* * Ensure the lwb buffer pointer is cleared before releasing * the txg. If we have had an allocation failure and * the txg is waiting to sync then we want want zil_sync() * to remove the lwb so that it's not picked up as the next new * one in zil_commit_writer(). zil_sync() will only remove * the lwb if lwb_buf is null. */ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_buf = NULL; lwb->lwb_tx = NULL; mutex_exit(&zilog->zl_lock); /* * Now that we've written this log block, we have a stable pointer * to the next block in the chain, so it's OK to let the txg in * which we allocated the next block sync. */ dmu_tx_commit(tx); } /* * Initialize the io for a log block. */ static void zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) { zbookmark_phys_t zb; SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); if (zilog->zl_root_zio == NULL) { zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, ZIO_FLAG_CANFAIL); } if (lwb->lwb_zio == NULL) { lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); } } /* * Define a limited set of intent log block sizes. * * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. */ uint64_t zil_block_buckets[] = { 4096, /* non TX_WRITE */ 8192+4096, /* data base */ 32*1024 + 4096, /* NFS writes */ UINT64_MAX }; /* * Use the slog as long as the logbias is 'latency' and the current commit size * is less than the limit or the total list size is less than 2X the limit. * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX. */ uint64_t zil_slog_limit = 1024 * 1024; #define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \ (((zilog)->zl_cur_used < zil_slog_limit) || \ ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))) /* * Start a log block write and advance to the next log block. * Calls are serialized. */ static lwb_t * zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) { lwb_t *nlwb = NULL; zil_chain_t *zilc; spa_t *spa = zilog->zl_spa; blkptr_t *bp; dmu_tx_t *tx; uint64_t txg; uint64_t zil_blksz, wsz; int i, error; if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { zilc = (zil_chain_t *)lwb->lwb_buf; bp = &zilc->zc_next_blk; } else { zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); bp = &zilc->zc_next_blk; } ASSERT(lwb->lwb_nused <= lwb->lwb_sz); /* * Allocate the next block and save its address in this block * before writing it in order to establish the log chain. * Note that if the allocation of nlwb synced before we wrote * the block that points at it (lwb), we'd leak it if we crashed. 
* Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). * We dirty the dataset to ensure that zil_sync() will be called * to clean up in the event of allocation failure or I/O failure. */ tx = dmu_tx_create(zilog->zl_os); VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); lwb->lwb_tx = tx; /* * Log blocks are pre-allocated. Here we select the size of the next * block, based on size used in the last block. * - first find the smallest bucket that will fit the block from a * limited set of block sizes. This is because it's faster to write * blocks allocated from the same metaslab as they are adjacent or * close. * - next find the maximum from the new suggested size and an array of * previous sizes. This lessens a picket fence effect of wrongly * guesssing the size if we have a stream of say 2k, 64k, 2k, 64k * requests. * * Note we only write what is used, but we can't just allocate * the maximum block size because we can exhaust the available * pool log space. */ zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i]; i++) continue; zil_blksz = zil_block_buckets[i]; if (zil_blksz == UINT64_MAX) zil_blksz = SPA_OLD_MAXBLOCKSIZE; zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); BP_ZERO(bp); /* pass the old blkptr in order to spread log blocks across devs */ error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, USE_SLOG(zilog)); if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; /* * Allocate a new log write buffer (lwb). */ nlwb = zil_alloc_lwb(zilog, bp, txg); /* Record the block for later vdev flushing */ zil_add_block(zilog, &lwb->lwb_blk); } if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { /* For Slim ZIL only write what is used. */ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); ASSERT3U(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_zio, wsz); } else { wsz = lwb->lwb_sz; } zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; /* * clear unused data for security */ bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */ /* * If there was an allocation failure then nlwb will be null which * forces a txg_wait_synced(). */ return (nlwb); } static lwb_t * zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) { lr_t *lrc = &itx->itx_lr; /* common log record */ lr_write_t *lrw = (lr_write_t *)lrc; char *lr_buf; uint64_t txg = lrc->lrc_txg; uint64_t reclen = lrc->lrc_reclen; uint64_t dlen = 0; if (lwb == NULL) return (NULL); ASSERT(lwb->lwb_buf != NULL); ASSERT(zilog_is_dirty(zilog) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); zilog->zl_cur_used += (reclen + dlen); zil_lwb_write_init(zilog, lwb); /* * If this record won't fit in the current log block, start a new one. 
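 *
 * For example, a record whose reclen + dlen exceeds the space left in
 * this block (lwb_sz - lwb_nused) is carried over into the next block
 * returned by zil_lwb_write_start(); if it does not fit even in an
 * empty block, we fall back to txg_wait_synced() below.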
*/ if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { lwb = zil_lwb_write_start(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_init(zilog, lwb); ASSERT(LWB_EMPTY(lwb)); if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { txg_wait_synced(zilog->zl_dmu_pool, txg); return (lwb); } } lr_buf = lwb->lwb_buf + lwb->lwb_nused; bcopy(lrc, lr_buf, reclen); lrc = (lr_t *)lr_buf; lrw = (lr_write_t *)lrc; /* * If it's a write, fetch the data or get its blkptr as appropriate. */ if (lrc->lrc_txtype == TX_WRITE) { if (txg > spa_freeze_txg(zilog->zl_spa)) txg_wait_synced(zilog->zl_dmu_pool, txg); if (itx->itx_wr_state != WR_COPIED) { char *dbuf; int error; if (dlen) { ASSERT(itx->itx_wr_state == WR_NEED_COPY); dbuf = lr_buf + reclen; lrw->lr_common.lrc_reclen += dlen; } else { ASSERT(itx->itx_wr_state == WR_INDIRECT); dbuf = NULL; } error = zilog->zl_get_data( itx->itx_private, lrw, dbuf, lwb->lwb_zio); if (error == EIO) { txg_wait_synced(zilog->zl_dmu_pool, txg); return (lwb); } if (error != 0) { ASSERT(error == ENOENT || error == EEXIST || error == EALREADY); return (lwb); } } } /* * We're actually making an entry, so update lrc_seq to be the * log record sequence number. Note that this is generally not * equal to the itx sequence number because not all transactions * are synchronous, and sometimes spa_sync() gets there first. */ lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ lwb->lwb_nused += reclen + dlen; lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); return (lwb); } itx_t * zil_itx_create(uint64_t txtype, size_t lrsize) { itx_t *itx; lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */ itx->itx_lr.lrc_seq = 0; /* defensive */ itx->itx_sync = B_TRUE; /* default is synchronous */ return (itx); } void zil_itx_destroy(itx_t *itx) { kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } /* * Free up the sync and async itxs. The itxs_t has already been detached * so no locks are needed. */ static void zil_itxg_clean(itxs_t *itxs) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; itx_async_node_t *ian; list = &itxs->i_sync_list; while ((itx = list_head(list)) != NULL) { list_remove(list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } cookie = NULL; t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; while ((itx = list_head(list)) != NULL) { list_remove(list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } list_destroy(list); kmem_free(ian, sizeof (itx_async_node_t)); } avl_destroy(t); kmem_free(itxs, sizeof (itxs_t)); } static int zil_aitx_compare(const void *x1, const void *x2) { const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; if (o1 < o2) return (-1); if (o1 > o2) return (1); return (0); } /* * Remove all async itx with the given oid. 
*/ static void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; list_t clean_list; itx_t *itx; ASSERT(oid != 0); list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; ian = avl_find(t, &oid, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } while ((itx = list_head(&clean_list)) != NULL) { list_remove(&clean_list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } list_destroy(&clean_list); } void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t txg; itxg_t *itxg; itxs_t *itxs, *clean = NULL; /* * Object ids can be re-instantiated in the next txg so * remove any async transactions to avoid future leaks. * This can happen if a fsync occurs on the re-instantiated * object for a WR_INDIRECT or WR_NEED_COPY write, which gets * the new file data and flushes a write record for the old object. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) zil_remove_async(zilog, itx->itx_oid); /* * Ensure the data of a renamed file is committed before the rename. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); itxs = itxg->itxg_itxs; if (itxg->itxg_txg != txg) { if (itxs != NULL) { /* * The zil_clean callback hasn't got around to cleaning * this itxg. Save the itxs for release below. * This should be rare. */ atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); itxg->itxg_sod = 0; clean = itxg->itxg_itxs; } ASSERT(itxg->itxg_sod == 0); itxg->itxg_txg = txg; itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); list_create(&itxs->i_sync_list, sizeof (itx_t), offsetof(itx_t, itx_node)); avl_create(&itxs->i_async_tree, zil_aitx_compare, sizeof (itx_async_node_t), offsetof(itx_async_node_t, ia_node)); } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod); itxg->itxg_sod += itx->itx_sod; } else { avl_tree_t *t = &itxs->i_async_tree; uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid; itx_async_node_t *ian; avl_index_t where; ian = avl_find(t, &foid, &where); if (ian == NULL) { ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); list_create(&ian->ia_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ian->ia_foid = foid; avl_insert(t, ian, where); } list_insert_tail(&ian->ia_list, itx); } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); zilog_dirty(zilog, txg); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ if (clean != NULL) zil_itxg_clean(clean); } /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we * have written out the uberblocks (i.e. txg has been comitted) so that * don't inadvertently clean out in-memory log records that would be required * by zil_commit(). 
*/ void zil_clean(zilog_t *zilog, uint64_t synced_txg) { itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; itxs_t *clean_me; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); return; } ASSERT3U(itxg->itxg_txg, <=, synced_txg); ASSERT(itxg->itxg_txg != 0); ASSERT(zilog->zl_clean_taskq != NULL); atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); itxg->itxg_sod = 0; clean_me = itxg->itxg_itxs; itxg->itxg_itxs = NULL; itxg->itxg_txg = 0; mutex_exit(&itxg->itxg_lock); /* * Preferably start a task queue to free up the old itxs but * if taskq_dispatch can't allocate resources to do that then * free it in-line. This should be rare. Note, using TQ_SLEEP * created a bad performance problem. */ if (taskq_dispatch(zilog->zl_clean_taskq, (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0) zil_itxg_clean(clean_me); } /* * Get the list of itxs to commit into zl_itx_commit_list. */ static void zil_get_commit_list(zilog_t *zilog) { uint64_t otxg, txg; list_t *commit_list = &zilog->zl_itx_commit_list; uint64_t push_sod = 0; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); push_sod += itxg->itxg_sod; itxg->itxg_sod = 0; mutex_exit(&itxg->itxg_lock); } atomic_add_64(&zilog->zl_itx_list_sz, -push_sod); } /* * Move the async itxs for a specified object to commit into sync lists. */ static void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If a foid is specified then find that node and append its * list. Otherwise walk the tree appending all the lists * to the sync list. We add to the end rather than the * beginning to ensure the create has happened. */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { ian = avl_find(t, &foid, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); } } else { void *cookie = NULL; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); list_destroy(&ian->ia_list); kmem_free(ian, sizeof (itx_async_node_t)); } } mutex_exit(&itxg->itxg_lock); } } static void zil_commit_writer(zilog_t *zilog) { uint64_t txg; itx_t *itx; lwb_t *lwb; spa_t *spa = zilog->zl_spa; int error = 0; ASSERT(zilog->zl_root_zio == NULL); mutex_exit(&zilog->zl_lock); zil_get_commit_list(zilog); /* * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). 
*/ if (list_head(&zilog->zl_itx_commit_list) == NULL) { mutex_enter(&zilog->zl_lock); return; } if (zilog->zl_suspend) { lwb = NULL; } else { lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) lwb = zil_create(zilog); } DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); while (itx = list_head(&zilog->zl_itx_commit_list)) { txg = itx->itx_lr.lrc_txg; ASSERT(txg); if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa)) lwb = zil_lwb_commit(zilog, itx, lwb); list_remove(&zilog->zl_itx_commit_list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); /* write the last block out */ if (lwb != NULL && lwb->lwb_zio != NULL) lwb = zil_lwb_write_start(zilog, lwb); zilog->zl_cur_used = 0; /* * Wait if necessary for the log blocks to be on stable storage. */ if (zilog->zl_root_zio) { error = zio_wait(zilog->zl_root_zio); zilog->zl_root_zio = NULL; zil_flush_vdevs(zilog); } if (error || lwb == NULL) txg_wait_synced(zilog->zl_dmu_pool, 0); mutex_enter(&zilog->zl_lock); /* * Remember the highest committed log sequence number for ztest. * We only update this value when all the log writes succeeded, * because ztest wants to ASSERT that it got the whole log chain. */ if (error == 0 && lwb != NULL) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } /* * Commit zfs transactions to stable storage. * If foid is 0 push out all transactions, otherwise push only those * for that object or might reference that object. * * itxs are committed in batches. In a heavily stressed zil there will be * a commit writer thread who is writing out a bunch of itxs to the log * for a set of committing threads (cthreads) in the same batch as the writer. * Those cthreads are all waiting on the same cv for that batch. * * There will also be a different and growing batch of threads that are * waiting to commit (qthreads). When the committing batch completes * a transition occurs such that the cthreads exit and the qthreads become * cthreads. One of the new cthreads becomes the writer thread for the * batch. Any new threads arriving become new qthreads. * * Only 2 condition variables are needed and there's no transition * between the two cvs needed. They just flip-flop between qthreads * and cthreads. * * Using this scheme we can efficiently wakeup up only those threads * that have been committed. */ void zil_commit(zilog_t *zilog, uint64_t foid) { uint64_t mybatch; if (zilog->zl_sync == ZFS_SYNC_DISABLED) return; /* move the async itxs for the foid to the sync queues */ zil_async_to_sync(zilog, foid); mutex_enter(&zilog->zl_lock); mybatch = zilog->zl_next_batch; while (zilog->zl_writer) { cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock); if (mybatch <= zilog->zl_com_batch) { mutex_exit(&zilog->zl_lock); return; } } zilog->zl_next_batch++; zilog->zl_writer = B_TRUE; zil_commit_writer(zilog); zilog->zl_com_batch = mybatch; zilog->zl_writer = B_FALSE; mutex_exit(&zilog->zl_lock); /* wake up one thread to become the next writer */ cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]); /* wake up all threads waiting for this batch to be committed */ cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]); } /* * Called in syncing context to free committed log blocks and update log header. 
*/ void zil_sync(zilog_t *zilog, dmu_tx_t *tx) { zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; /* * We don't zero out zl_destroy_txg, so make sure we don't try * to destroy it twice. */ if (spa_sync_pass(spa) != 1) return; mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); zh->zh_replay_seq = *replayed_seq; *replayed_seq = 0; } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; ASSERT(list_head(&zilog->zl_lwb_list) == NULL); bzero(zh, sizeof (zil_header_t)); bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* * If this block was part of log chain that couldn't * be claimed because a device was missing during * zil_claim(), but that device later returns, * then this block could erroneously appear valid. * To guard against this, assign a new GUID to the new * log chain so it doesn't matter what blk points to. */ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); zio_free_zil(spa, txg, &lwb->lwb_blk); kmem_cache_free(zil_lwb_cache, lwb); /* * If we don't have anything left in the lwb list then * we've had an allocation failure and we need to zero * out the zil_header blkptr so that we don't end * up freeing the same block twice. */ if (list_head(&zilog->zl_lwb_list) == NULL) BP_ZERO(&zh->zh_log); } mutex_exit(&zilog->zl_lock); } void zil_init(void) { zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zil_fini(void) { kmem_cache_destroy(zil_lwb_cache); } void zil_set_sync(zilog_t *zilog, uint64_t sync) { zilog->zl_sync = sync; } void zil_set_logbias(zilog_t *zilog, uint64_t logbias) { zilog->zl_logbias = logbias; } zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { zilog_t *zilog; zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); zilog->zl_header = zh_phys; zilog->zl_os = os; zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; zilog->zl_logbias = dmu_objset_logbias(os); zilog->zl_sync = dmu_objset_syncprop(os); zilog->zl_next_batch = 1; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); for (int i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, MUTEX_DEFAULT, NULL); } list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL); return (zilog); } void zil_free(zilog_t *zilog) { zilog->zl_stop_sync = 1; ASSERT0(zilog->zl_suspend); ASSERT0(zilog->zl_suspending); ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); avl_destroy(&zilog->zl_vdev_tree); mutex_destroy(&zilog->zl_vdev_lock); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); 
list_destroy(&zilog->zl_itx_commit_list); for (int i = 0; i < TXG_SIZE; i++) { /* * It's possible for an itx to be generated that doesn't dirty * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() * callback to remove the entry. We remove those here. * * Also free up the ziltest itxs. */ if (zilog->zl_itxg[i].itxg_itxs) zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); mutex_destroy(&zilog->zl_itxg[i].itxg_lock); } mutex_destroy(&zilog->zl_lock); cv_destroy(&zilog->zl_cv_writer); cv_destroy(&zilog->zl_cv_suspend); cv_destroy(&zilog->zl_cv_batch[0]); cv_destroy(&zilog->zl_cv_batch[1]); kmem_free(zilog, sizeof (zilog_t)); } /* * Open an intent log. */ zilog_t * zil_open(objset_t *os, zil_get_data_t *get_data) { zilog_t *zilog = dmu_objset_zil(os); ASSERT(zilog->zl_clean_taskq == NULL); ASSERT(zilog->zl_get_data == NULL); ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, 2, 2, TASKQ_PREPOPULATE); return (zilog); } /* * Close an intent log. */ void zil_close(zilog_t *zilog) { lwb_t *lwb; uint64_t txg = 0; zil_commit(zilog, 0); /* commit all itx */ /* * The lwb_max_txg for the stubby lwb will reflect the last activity * for the zil. After a txg_wait_synced() on the txg we know all the * callbacks have occurred that may clean the zil. Only then can we * destroy the zl_clean_taskq. */ mutex_enter(&zilog->zl_lock); lwb = list_tail(&zilog->zl_lwb_list); if (lwb != NULL) txg = lwb->lwb_max_txg; mutex_exit(&zilog->zl_lock); if (txg) txg_wait_synced(zilog->zl_dmu_pool, txg); ASSERT(!zilog_is_dirty(zilog)); taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; zilog->zl_get_data = NULL; /* * We should have only one LWB left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); lwb = list_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(lwb == list_tail(&zilog->zl_lwb_list)); list_remove(&zilog->zl_lwb_list, lwb); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); kmem_cache_free(zil_lwb_cache, lwb); } mutex_exit(&zilog->zl_lock); } static char *suspend_tag = "zil suspending"; /* * Suspend an intent log. While in suspended mode, we still honor * synchronous semantics, but we rely on txg_wait_synced() to do it. * On old version pools, we suspend the log briefly when taking a * snapshot so that it will have an empty intent log. * * Long holds are not really intended to be used the way we do here -- * held for such a short time. A concurrent caller of dsl_dataset_long_held() * could fail. Therefore we take pains to only put a long hold if it is * actually necessary. Fortunately, it will only be necessary if the * objset is currently mounted (or the ZVOL equivalent). In that case it * will already have a long hold, so we are not really making things any worse. * * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or * zvol_state_t), and use their mechanism to prevent their hold from being * dropped (e.g. VFS_HOLD()). However, that would be even more pain for * very little gain. * * if cookiep == NULL, this does both the suspend & resume. * Otherwise, it returns with the dataset "long held", and the cookie * should be passed into zil_resume(). 
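 *
 * A typical caller therefore looks like (illustrative sketch only,
 * error handling elided):
 *
 *	void *cookie;
 *	int error;
 *
 *	error = zil_suspend(osname, &cookie);
 *	if (error == 0) {
 *		... operate on the dataset with its ZIL quiesced ...
 *		zil_resume(cookie);
 *	}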
*/ int zil_suspend(const char *osname, void **cookiep) { objset_t *os; zilog_t *zilog; const zil_header_t *zh; int error; error = dmu_objset_hold(osname, suspend_tag, &os); if (error != 0) return (error); zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } /* * Don't put a long hold in the cases where we can avoid it. This * is when there is no cookie so we are doing a suspend & resume * (i.e. called from zil_vdev_offline()), and there's nothing to do * for the suspend because it's already suspended, or there's no ZIL. */ if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (0); } dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; if (zilog->zl_suspend > 1) { /* * Someone else is already suspending it. * Just wait for them to finish. */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } /* * If there is no pointer to an on-disk block, this ZIL must not * be active (e.g. filesystem not mounted), so there's nothing * to clean up. */ if (BP_IS_HOLE(&zh->zh_log)) { ASSERT(cookiep != NULL); /* fast path already handled */ *cookiep = os; mutex_exit(&zilog->zl_lock); return (0); } zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); zil_commit(zilog, 0); zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); zilog->zl_suspending = B_FALSE; cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } void zil_resume(void *cookie) { objset_t *os = cookie; zilog_t *zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_suspend != 0); zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); } typedef struct zil_replay_arg { zil_replay_func_t **zr_replay; void *zr_arg; boolean_t zr_byteswap; char *zr_lr; } zil_replay_arg_t; static int zil_replay_error(zilog_t *zilog, lr_t *lr, int error) { char name[MAXNAMELEN]; zilog->zl_replaying_seq--; /* didn't actually replay this one */ dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)(lr->lrc_txtype & ~TX_CI), (lr->lrc_txtype & TX_CI) ? "CI" : ""); return (error); } static int zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; int error = 0; zilog->zl_replaying_seq = lr->lrc_seq; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return (0); if (lr->lrc_txg < claim_txg) /* already committed */ return (0); /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; if (txtype == 0 || txtype >= TX_MAX_TYPE) return (zil_replay_error(zilog, lr, EINVAL)); /* * If this record type can be logged out of order, the object * (lr_foid) may no longer exist. That's legitimate, not an error. 
*/ if (TX_OOO(txtype)) { error = dmu_object_info(zilog->zl_os, ((lr_ooo_t *)lr)->lr_foid, NULL); if (error == ENOENT || error == EEXIST) return (0); } /* * Make a copy of the data so we can revise and extend it. */ bcopy(lr, zr->zr_lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. */ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { error = zil_read_log_data(zilog, (lr_write_t *)lr, zr->zr_lr + reclen); if (error != 0) return (zil_replay_error(zilog, lr, error)); } /* * The log block containing this lr may have been byteswapped * so that we can easily examine common fields like lrc_txtype. * However, the log is a mix of different record types, and only the * replay vectors know how to byteswap their records. Therefore, if * the lr was byteswapped, undo it before invoking the replay vector. */ if (zr->zr_byteswap) byteswap_uint64_array(zr->zr_lr, reclen); /* * We must now do two things atomically: replay this log record, * and update the log header sequence number to reflect the fact that * we did so. At the end of each replay function the sequence number * is updated if we are in replay mode. */ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); if (error != 0) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with * EEXIST. So if we receive any error we try syncing out * any removes then retry the transaction. Note that we * specify B_FALSE for byteswap now, so we don't do it twice. */ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); if (error != 0) return (zil_replay_error(zilog, lr, error)); } return (0); } /* ARGSUSED */ static int zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zilog->zl_replay_blks++; return (0); } /* * If this dataset has a non-empty intent log, replay it and destroy it. */ void zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { zil_destroy(zilog, B_TRUE); return; } //printf("ZFS: Replaying ZIL on %s...\n", os->os->os_spa->spa_name); zr.zr_replay = replay_func; zr.zr_arg = arg; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* * Wait for in-progress removes to sync before starting replay. 
*/ txg_wait_synced(zilog->zl_dmu_pool, 0); zilog->zl_replay = B_TRUE; zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg); kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; //printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name); } boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx) { if (zilog->zl_sync == ZFS_SYNC_DISABLED) return (B_TRUE); if (zilog->zl_replay) { dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = zilog->zl_replaying_seq; return (B_TRUE); } return (B_FALSE); } /* ARGSUSED */ int zil_vdev_offline(const char *osname, void *arg) { int error; error = zil_suspend(osname, NULL); if (error != 0) return (SET_ERROR(EEXIST)); return (0); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 275810) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 275811) @@ -1,3493 +1,3493 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); #if defined(__amd64__) static int zio_use_uma = 1; #else static int zio_use_uma = 0; #endif SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, "Use uma(9) for ZIO allocations"); static int zio_exclude_metadata = 0; SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well"); zio_trim_stats_t zio_trim_stats = { { "bytes", KSTAT_DATA_UINT64, "Number of bytes successfully TRIMmed" }, { "success", KSTAT_DATA_UINT64, "Number of successful TRIM requests" }, { "unsupported", KSTAT_DATA_UINT64, "Number of TRIM requests that failed because TRIM is not supported" }, { "failed", KSTAT_DATA_UINT64, "Number of TRIM requests that failed for reasons other than not supported" }, }; static kstat_t *zio_trim_ksp; /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *zio_type_name[ZIO_TYPES] = { "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", "zio_ioctl" }; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #ifdef _KERNEL extern vmem_t *zio_alloc_arena; #endif #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. */ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. 
*/ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; #ifdef ZFS_DEBUG int zio_buf_debug_limit = 16384; #else int zio_buf_debug_limit = 0; #endif void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); if (!zio_use_uma) goto out; /* * For small buffers, we want a cache for each multiple of * SPA_MINBLOCKSIZE. For larger buffers, we want a cache * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; while (!ISP2(p2)) p2 &= p2 - 1; #ifdef illumos #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; #endif #endif /* illumos */ if (size <= 4 * SPA_MINBLOCKSIZE) { align = SPA_MINBLOCKSIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = MIN(p2 >> 2, PAGESIZE); } if (align != 0) { char name[36]; (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); /* * Since zio_data bufs do not appear in crash dumps, we * pass KMC_NOTOUCH so that no allocator metadata is * stored with the buffers. */ (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags | KMC_NOTOUCH | KMC_NODEBUG); } } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } out: zio_inject_init(); zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", KSTAT_TYPE_NAMED, sizeof(zio_trim_stats) / sizeof(kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zio_trim_ksp != NULL) { zio_trim_ksp->ks_data = &zio_trim_stats; kstat_install(zio_trim_ksp); } } void zio_fini(void) { size_t c; kmem_cache_t *last_cache = NULL; kmem_cache_t *last_data_cache = NULL; for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; kmem_cache_destroy(zio_buf_cache[c]); } zio_buf_cache[c] = NULL; if (zio_data_buf_cache[c] != last_data_cache) { last_data_cache = zio_data_buf_cache[c]; kmem_cache_destroy(zio_data_buf_cache[c]); } zio_data_buf_cache[c] = NULL; } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); if (zio_trim_ksp != NULL) { kstat_delete(zio_trim_ksp); zio_trim_ksp = NULL; } } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. 
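 *
 * When zio_use_uma is set, buffers come from a kmem cache selected by
 * size: a request of size bytes maps to cache index
 * (size - 1) >> SPA_MINBLOCKSHIFT, and that cache holds objects of
 * (index + 1) << SPA_MINBLOCKSHIFT bytes.  With SPA_MINBLOCKSHIFT == 9,
 * a 4K request maps to index 7, i.e. the "zio_buf_4096" cache.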
*/ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; int flags = zio_exclude_metadata ? KM_NODEBUG : 0; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); else return (kmem_alloc(size, KM_SLEEP|flags)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); else return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) kmem_cache_free(zio_buf_cache[c], buf); else kmem_free(buf, size); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) kmem_cache_free(zio_data_buf_cache[c], buf); else kmem_free(buf, size); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ static void zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zt->zt_orig_data = zio->io_data; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_data = data; zio->io_size = size; } static void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_data, zt->zt_orig_size); if (zt->zt_bufsize != 0) zio_buf_free(zio->io_data, zt->zt_bufsize); zio->io_data = zt->zt_orig_data; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks and decompression * ========================================================================== */ static void zio_subblock(zio_t *zio, void *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) bcopy(zio->io_data, data, size); } static void zio_decompress(zio_t *zio, void *data, uint64_t size) { if (zio->io_error == 0 && zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_data, data, zio->io_size, size) != 0) zio->io_error = SET_ERROR(EIO); } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ /* * NOTE - Callers to zio_walk_parents() and zio_walk_children must * continue calling these functions until they return NULL. * Otherwise, the next caller will pick up the list walk in * some indeterminate state. 
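 *
 * A conforming walk therefore drains the iterator, e.g. (sketch):
 *
 *	zio_t *cio;
 *
 *	while ((cio = zio_walk_children(pio)) != NULL) {
 *		... examine cio ...
 *	}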
(Otherwise every caller would * have to pass in a cookie to keep the state represented by * io_walk_link, which gets annoying.) */ zio_t * zio_walk_parents(zio_t *cio) { zio_link_t *zl = cio->io_walk_link; list_t *pl = &cio->io_parent_list; zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); cio->io_walk_link = zl; if (zl == NULL) return (NULL); ASSERT(zl->zl_child == cio); return (zl->zl_parent); } zio_t * zio_walk_children(zio_t *pio) { zio_link_t *zl = pio->io_walk_link; list_t *cl = &pio->io_child_list; zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); pio->io_walk_link = zl; if (zl == NULL) return (NULL); ASSERT(zl->zl_parent == pio); return (zl->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_t *pio = zio_walk_parents(cio); VERIFY(zio_walk_parents(cio) == NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT(cio->io_child_type <= pio->io_child_type); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); pio->io_child_count++; cio->io_parent_count++; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); pio->io_child_count--; cio->io_parent_count--; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) { uint64_t *countp = &zio->io_children[child][wait]; boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); if (*countp != 0) { zio->io_stage >>= 1; zio->io_stall = countp; waiting = B_TRUE; } mutex_exit(&zio->io_lock); return (waiting); } static void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { pio->io_stall = NULL; mutex_exit(&pio->io_lock); zio_execute(pio); } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, 
vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_data = zio->io_data = data; zio->io_orig_size = zio->io_size = size; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } return (zio); } static void zio_destroy(zio_t *zio) { list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) { if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { zfs_panic_recover("blkptr at %p has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if 
(BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); if (vdevid >= spa->spa_root_vdev->vdev_children) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); } if (vd->vdev_ops == &vdev_hole_ops) { zfs_panic_recover("blkptr at %p DVA %u has hole " "VDEV %llu", bp, i, (longlong_t)vdevid); } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. */ continue; } uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "OFFSET %llu", bp, i, (longlong_t)offset); } } } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; zfs_blkptr_verify(spa, bp); zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_physdone = physdone; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). 
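/*
 * Illustrative sketch (not part of the original source): the usual caller
 * pattern built from the primitives above.  A root zio is created with
 * zio_root(), child reads are hung off it with zio_nowait(), and a single
 * zio_wait() on the root reaps everything and returns the worst child
 * error.  The flags and priority chosen here are assumptions for the
 * example only.
 */
static int
example_read_blkptrs(spa_t *spa, const blkptr_t *bps, void **bufs, int count,
    const zbookmark_phys_t *zb)
{
	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	for (int i = 0; i < count; i++) {
		zio_nowait(zio_read(rio, spa, &bps[i], bufs[i],
		    BP_GET_PSIZE(&bps[i]), NULL, NULL,
		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, zb));
	}

	return (zio_wait(rio));
}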
*/ if (data == NULL && zio->io_prop.zp_dedup_verify) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. */ if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, BP_GET_PSIZE(bp), 0))); } } zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, uint64_t size, enum zio_flag flags) { zio_t *zio; enum zio_stage stage = ZIO_FREE_PIPELINE; ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); metaslab_check_free(spa, bp); arc_freed(spa, bp); if (zfs_trim_enabled) stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | ZIO_STAGE_VDEV_IO_ASSESS; /* * GANG and DEDUP blocks can induce a read (for the gang block header, * or the DDT), so issue them asynchronously so that this thread is * not tied up. */ else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) stage |= ZIO_STAGE_ISSUE_ASYNC; flags |= ZIO_FLAG_DONT_QUEUE; zio = zio_create(pio, spa, txg, bp, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); return (zio); } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; dprintf_bp(bp, "claiming in txg %llu", txg); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. 
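/*
 * Illustrative sketch (not part of the original source): the deferral test
 * from zio_free() above, pulled out as a predicate.  A free is appended to
 * the per-txg bplist (and processed later during spa_sync()) unless it is
 * for the currently-syncing txg, needs no read (neither gang nor dedup),
 * TRIM is disabled, and we are still in an early sync pass.
 */
static boolean_t
example_free_must_defer(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	return (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free);
}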
Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); ASSERT(txg == spa_first_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, offset, size, done, private, priority, flags)); } return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_eck) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ void *wbuf = zio_buf_alloc(size); bcopy(data, wbuf, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; ASSERT(vd->vdev_parent == (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. 
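/*
 * Illustrative sketch (an assumption, not taken from the original source):
 * how an interior vdev's io_start routine might use zio_vdev_child_io() to
 * fan a parent I/O out to each of its children.  Real vdev implementations
 * (mirror, raidz) are considerably more involved; this only shows the
 * parent/child plumbing.
 */
static void
example_fanout_io_start(zio_t *pio)
{
	vdev_t *vd = pio->io_vd;

	for (int c = 0; c < vd->vdev_children; c++) {
		zio_nowait(zio_vdev_child_io(pio, pio->io_bp,
		    vd->vdev_child[c], pio->io_offset, pio->io_data,
		    pio->io_size, pio->io_type, pio->io_priority, 0,
		    NULL, NULL));
	}
}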
This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* Not all IO types require vdev io done stage e.g. free */ if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; if (vd->vdev_children == 0) offset += VDEV_LABEL_START_SIZE; flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) zio->io_logical->io_phys_children++; return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } zio_t * zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) { ASSERT(vd->vdev_ops->vdev_op_leaf); return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT(zio->io_executor == NULL); ASSERT(zio->io_orig_size == zio->io_size); ASSERT(size <= zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) zio->io_orig_size = zio->io_size = size; } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static int zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); void *cbuf = zio_buf_alloc(psize); zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, zio->io_data); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static int zio_write_bp_init(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_size; uint64_t psize = lsize; int pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) return (ZIO_PIPELINE_STOP); if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { ASSERT(bp->blk_birth != zio->io_txg); ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) return (ZIO_PIPELINE_CONTINUE); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (ZIO_PIPELINE_CONTINUE); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (ZIO_PIPELINE_CONTINUE); ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (ZIO_PIPELINE_CONTINUE); } zio->io_bp_override = NULL; BP_ZERO(bp); } if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. 
*/ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } if (compress != ZIO_COMPRESS_OFF) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (ZIO_PIPELINE_CONTINUE); } else { /* * Round up compressed size to MINBLOCKSIZE and * zero the tail. */ size_t rounded = P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE); if (rounded > psize) { bzero((char *)cbuf + psize, rounded - psize); psize = rounded; } if (psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else { zio_push_transform(zio, cbuf, psize, lsize, NULL); } } } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. 
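/*
 * Illustrative sketch (standalone, not part of the original source): the
 * size handling applied above when a block does compress.  The compressed
 * size is rounded up to SPA_MINBLOCKSIZE (512 bytes) and the tail is
 * zeroed so the on-disk image and its checksum are deterministic; if the
 * rounded size climbs back to the logical size, compression is abandoned.
 * For example, a 16K block that compresses to 3000 bytes is stored as
 * ceil(3000/512) * 512 = 3072 bytes with a 72-byte zero tail.
 */
#include <stdint.h>
#include <string.h>

#define	EX_MINBLOCKSIZE	512ULL		/* stand-in for SPA_MINBLOCKSIZE */

static uint64_t
example_round_psize(char *cbuf, uint64_t psize)
{
	/* P2ROUNDUP(psize, 512) for a power-of-two alignment */
	uint64_t rounded = (psize + EX_MINBLOCKSIZE - 1) &
	    ~(EX_MINBLOCKSIZE - 1);

	memset(cbuf + psize, 0, rounded - psize);	/* zero the tail */
	return (rounded);
}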
*/ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { ASSERT(psize != 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (ZIO_PIPELINE_CONTINUE); } static int zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if (zio->io_priority == ZIO_PRIORITY_NOW && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. 
*/ #if defined(illumos) || !defined(_KERNEL) ASSERT(zio->io_tqent.tqent_next == NULL); #else ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); #endif spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { kthread_t *executor = zio->io_executor; spa_t *spa = zio->io_spa; for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (taskq_member(tqs->stqs_taskq[i], executor)) return (B_TRUE); } } return (B_FALSE); } static int zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } void zio_interrupt(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } /* * Execute the I/O pipeline until one of the following occurs: * * (1) the I/O completes * (2) the pipeline stalls waiting for dependent child I/Os * (3) the I/O issues, so we're waiting for an I/O completion interrupt * (4) the I/O is delegated by vdev-level caching or aggregation * (5) the I/O is deferred due to vdev-level queueing * (6) the I/O is handed off to another thread. * * In all cases, the pipeline stops whenever there's no CPU work; it never * burns a thread in cv_wait(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; void zio_execute(zio_t *zio) { zio->io_executor = curthread; while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; int rv; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; rv = zio_pipeline[highbit64(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) return; ASSERT(rv == ZIO_PIPELINE_CONTINUE); } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { int error; ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_executor == NULL); zio->io_waiter = curthread; zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) cv_wait(&zio->io_cv, &zio->io_lock); mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { ASSERT(zio->io_executor == NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && zio_unique_parent(zio) == NULL) { /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. 
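/*
 * Illustrative sketch (standalone, with made-up stage bits): io_pipeline
 * and io_stage are bitmasks with one bit per pipeline stage.  zio_execute()
 * above advances by shifting io_stage left until it lands on the next bit
 * set in io_pipeline, then calls zio_pipeline[highbit64(stage) - 1].
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t pipeline = (1ULL << 1) | (1ULL << 4) | (1ULL << 9);
	uint64_t stage = 1ULL << 1;	/* currently at stage bit 1 */

	do {
		stage <<= 1;
	} while ((stage & pipeline) == 0);

	/* Prints 0x10: the next enabled stage after bit 1 is bit 4. */
	printf("next stage mask: 0x%jx\n", (uintmax_t)stage);
	return (0);
}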
*/ spa_t *spa = zio->io_spa; zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); } zio_execute(zio); } /* * ========================================================================== * Reexecute or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(zio_t *pio) { zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio); mutex_enter(&pio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); } /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on him. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) zio_execute(pio); } void zio_suspend(spa_t *spa, zio_t *zio) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = B_TRUE; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = B_FALSE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. 
When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. 
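/*
 * Illustrative sketch (standalone, not part of the original source): the
 * split performed by zio_write_gang_block() below.  Each gang member gets
 * roughly resid / (members remaining), rounded up to SPA_MINBLOCKSIZE, so
 * a 100K request becomes three members of 33.5K, 33.5K and 33K.  The real
 * code additionally asserts that each member size stays within resid.
 */
#include <stdint.h>
#include <stdio.h>

#define	EX_GBH_NBLKPTRS	3	/* stand-in for SPA_GBH_NBLKPTRS */
#define	EX_MINBLOCK	512ULL	/* stand-in for SPA_MINBLOCKSIZE */

int
main(void)
{
	uint64_t resid = 100 * 1024;

	for (int g = 0; resid != 0; g++) {
		uint64_t lsize = (resid / (EX_GBH_NBLKPTRS - g) +
		    EX_MINBLOCK - 1) & ~(EX_MINBLOCK - 1);

		printf("gang member %d: %ju bytes\n", g, (uintmax_t)lsize);
		resid -= lsize;
	}
	return (0);
}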
* ========================================================================== */ static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { zio_t *zio; if (gn != NULL) { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), data, BP_GET_PSIZE(bp)); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. */ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } /* ARGSUSED */ zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(zio->io_child_count == 0); if (zio->io_error) return; if (BP_SHOULD_BYTESWAP(bp)) 
byteswap_uint64_array(zio->io_data, zio->io_size); ASSERT(zio->io_data == gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. */ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); data = (char *)data + BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree && gio->io_data != NULL) ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); if (zio != pio) zio_nowait(zio); } static int zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (ZIO_PIPELINE_CONTINUE); } static int zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); zio_t *gio = zio->io_gang_leader; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static int zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); zio_prop_t zp; int error; error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); if (error) { pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio_nowait(zio); return (ZIO_PIPELINE_CONTINUE); } /* * The zio_nop_write stage in the pipeline determines if allocating * a new bp is necessary. By leveraging a cryptographically secure checksum, * such as SHA256, we can compare the checksums of the new data and the old * to determine if allocating a new block is required. The nopwrite * feature can handle writes in either syncing or open context (i.e. zil * writes) and as a result is mutually exclusive with dedup. */ static int zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (ZIO_PIPELINE_CONTINUE); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. 
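/*
 * Illustrative sketch (standalone, not part of the original source): the
 * heart of the nopwrite test applied just below.  The stored block pointer
 * carries a 256-bit checksum of the old data; if all four 64-bit words
 * match the checksum of the new data (and the block properties match), the
 * old bp is reused and no new block is allocated or written.
 */
#include <stdbool.h>
#include <stdint.h>

typedef struct ex_cksum {
	uint64_t	word[4];	/* 256 bits, as in a zio checksum */
} ex_cksum_t;

static bool
ex_cksum_equal(const ex_cksum_t *a, const ex_cksum_t *b)
{
	return (a->word[0] == b->word[0] && a->word[1] == b->word[1] &&
	    a->word[2] == b->word[2] && a->word[3] == b->word[3]);
}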
*/ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, sizeof (uint64_t)) == 0); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_data == NULL) dde->dde_repair_data = zio->io_data; else zio_buf_free(zio->io_data, zio->io_size); mutex_exit(&pio->io_lock); } static int zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (ZIO_PIPELINE_CONTINUE); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, zio_buf_alloc(zio->io_size), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (ZIO_PIPELINE_CONTINUE); } static int zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (ZIO_PIPELINE_CONTINUE); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } if (dde->dde_repair_data != NULL) { bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (ZIO_PIPELINE_CONTINUE); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. 
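/*
 * Illustrative sketch (standalone, not part of the original source): the
 * comparison zio_ddt_collision() performs in the loops that follow.  The
 * candidate block is checked byte for byte against either an in-flight
 * lead write or the existing on-disk copy read back through the ARC; a
 * collision is declared whenever the sizes differ or any byte differs,
 * even though the checksums matched.
 */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

static bool
ex_ddt_verify_collision(const void *newdata, size_t newsize,
    const void *olddata, size_t oldsize)
{
	return (newsize != oldsize ||
	    memcmp(newdata, olddata, newsize) != 0);
}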
*/ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || bcmp(zio->io_orig_data, lio->io_orig_data, zio->io_orig_size) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL;
-	uint32_t aflags = ARC_WAIT;
+	arc_flags_t aflags = ARC_FLAG_WAIT;
blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (arc_buf_size(abuf) != zio->io_orig_size || bcmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); VERIFY(arc_buf_remove_ref(abuf, &abuf)); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); while ((pio = zio_walk_parents(zio)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { while (zio_walk_parents(zio) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static void zio_ddt_ditto_write_done(zio_t *zio) { int p = DDT_PHYS_DITTO; zio_prop_t *zp = &zio->io_prop; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_key_t *ddk = &dde->dde_key; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); if (ddp->ddp_phys_birth != 0) ddt_phys_free(ddt, ddk, ddp, zio->io_txg); ddt_phys_fill(ddp, bp); } ddt_exit(ddt); } static int zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; int ditto_copies; zio_t *cio = NULL; zio_t *dio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?)
*/ if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; } zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); ASSERT(ditto_copies < SPA_DVAS_PER_BP); if (ditto_copies > ddt_ditto_copies_present(dde) && dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { zio_prop_t czp = *zp; czp.zp_copies = ditto_copies; /* * If we arrived here with an override bp, we won't have run * the transform stack, so we won't have the data we need to * generate a child i/o. So, toss the override bp and restart. * This is safe, because using the override bp is just an * optimization; and it's rare, so the cost doesn't matter. */ if (zio->io_bp_override) { zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; zio->io_pipeline = ZIO_WRITE_PIPELINE; zio->io_bp_override = NULL; BP_ZERO(bp); ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, zio->io_orig_size, &czp, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); if (cio) zio_nowait(cio); if (dio) zio_nowait(dio); return (ZIO_PIPELINE_CONTINUE); } ddt_entry_t *freedde; /* for debugging */ static int zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static int zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); /* * The dump device does not support gang blocks so allocation on * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid * the "fast" gang feature. */ flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
METASLAB_GANG_CHILD : 0; error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags); if (error) { spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) return (zio_write_gang_block(zio)); zio->io_error = error; } return (ZIO_PIPELINE_CONTINUE); } static int zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (ZIO_PIPELINE_CONTINUE); } static int zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t use_slog) { int error = 1; ASSERT(txg > spa_syncing_txg(spa)); /* * ZIL blocks are always contiguous (i.e. not gang blocks) so we * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" * when allocating them. */ if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); } if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); } if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); } return (error); } /* * Free an intent log block. */ void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) { ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); ASSERT(!BP_IS_GANG(bp)); zio_free(spa, txg, bp); } /* * ========================================================================== * Read, write and delete to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. 
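/*
 * Illustrative sketch (standalone; the helpers are hypothetical stand-ins
 * for metaslab_alloc() on the two allocation classes): the two-tier policy
 * used by zio_alloc_zil() above -- try the separate log class first when a
 * slog should be used, and fall back to the normal class if that fails or
 * no slog allocation was attempted.
 */
#include <stdint.h>

static int
ex_alloc_from_log_class(uint64_t size)
{
	(void)size;
	return (1);		/* pretend the slog is full */
}

static int
ex_alloc_from_normal_class(uint64_t size)
{
	(void)size;
	return (0);		/* the normal class succeeds */
}

static int
ex_alloc_zil_block(uint64_t size, int use_slog)
{
	int error = 1;		/* nonzero: nothing allocated yet */

	if (use_slog)
		error = ex_alloc_from_log_class(size);
	if (error)
		error = ex_alloc_from_normal_class(size);
	return (error);
}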
*/ static int zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; int ret; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (ZIO_PIPELINE_STOP); } if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && zio->io_priority == ZIO_PRIORITY_NOW) { trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); return (ZIO_PIPELINE_CONTINUE); } /* * We keep track of time-sensitive I/Os so that the scan thread * can quickly react to certain workloads. In particular, we care * about non-scrubbing, top-level reads and writes with the following * characteristics: * - synchronous writes of user data to non-slog devices * - any reads of user data * When these conditions are met, adjust the timestamp of spa_last_io * which allows the scan thread to adjust its workload accordingly. */ if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && vd == vd->vdev_top && !vd->vdev_islog && zio->io_bookmark.zb_objset != DMU_META_OBJSET && zio->io_txg != spa_syncing_txg(spa)) { uint64_t old = spa->spa_last_io; uint64_t new = ddi_get_lbolt64(); if (old != new) (void) atomic_cas_64(&spa->spa_last_io, old, new); } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); char *abuf = NULL; if (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE) abuf = zio_buf_alloc(asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { bcopy(zio->io_data, abuf, zio->io_size); bzero(abuf + zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, abuf ? asize : 0, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering with nested replication. * For example, given a mirror of mirrors, (A+B)+(C+D), if only * A is out of date, we'll read from C+D, then use the data to * resilver A+B -- but we don't actually want to resilver B, just A. * The top-level mirror has no way to know this, so instead we just * discard unnecessary repairs as we work our way down the vdev tree. * The same logic applies to any form of nested replication: * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
*/ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (ZIO_PIPELINE_CONTINUE); } if (vd->vdev_ops->vdev_op_leaf) { switch (zio->io_type) { case ZIO_TYPE_READ: if (vdev_cache_read(zio)) return (ZIO_PIPELINE_CONTINUE); /* FALLTHROUGH */ case ZIO_TYPE_WRITE: case ZIO_TYPE_FREE: if ((zio = vdev_queue_io(zio)) == NULL) return (ZIO_PIPELINE_STOP); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (ZIO_PIPELINE_STOP); } break; } /* * Note that we ignore repair writes for TRIM because they can * conflict with normal writes. This isn't an issue because, by * definition, we only repair blocks that aren't freed. */ if (zio->io_type == ZIO_TYPE_WRITE && !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && !trim_map_write_start(zio)) return (ZIO_PIPELINE_STOP); } vd->vdev_ops->vdev_op_io_start(zio); return (ZIO_PIPELINE_STOP); } static int zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); if (vd != NULL && vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE)) { if (zio->io_type == ZIO_TYPE_WRITE && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) trim_map_write_done(zio); vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injection(vd, zio, EIO); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error) { if (zio->io_error == ENOTSUP && zio->io_type == ZIO_TYPE_FREE) { /* Not all devices support TRIM. */ } else if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error) VERIFY(vdev_probe(vd, zio) == NULL); return (ZIO_PIPELINE_CONTINUE); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. 
*/ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const void *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } /*ARGSUSED*/ void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *buf = zio_buf_alloc(zio->io_size); bcopy(zio->io_data, buf, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = buf; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_buf_free; } static int zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); if (zio->io_type == ZIO_TYPE_FREE && zio->io_priority != ZIO_PRIORITY_NOW) { switch (zio->io_error) { case 0: ZIO_TRIM_STAT_INCR(bytes, zio->io_size); ZIO_TRIM_STAT_BUMP(success); break; case EOPNOTSUPP: ZIO_TRIM_STAT_BUMP(unsupported); break; default: ZIO_TRIM_STAT_BUMP(failed); break; } } /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (ZIO_PIPELINE_STOP); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vd->vdev_cant_write = B_TRUE; } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && zio->io_physdone != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); zio->io_physdone(zio->io_logical); } return (ZIO_PIPELINE_CONTINUE); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static int zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). 
		 * We're either generating a label checksum, or none at all.
		 */
		checksum = zio->io_prop.zp_checksum;

		if (checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
	} else {
		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
			ASSERT(!IO_IS_ALLOCATING(zio));
			checksum = ZIO_CHECKSUM_GANG_HEADER;
		} else {
			checksum = BP_GET_CHECKSUM(bp);
		}
	}

	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t info;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(zio->io_vd != NULL);

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio, &info)) != 0) {
		zio->io_error = error;
		if (error == ECKSUM &&
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			zfs_ereport_start_checksum(zio->io_spa,
			    zio->io_vd, zio, zio->io_offset,
			    zio->io_size, NULL, &info);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}

/*
 * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
int
zio_worst_error(int e1, int e2)
{
	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
	int r1, r2;

	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
		if (e1 == zio_error_rank[r1])
			break;

	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
		if (e2 == zio_error_rank[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}

/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio, *pio_next;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (zio->io_ready) {
		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

		zio->io_ready(zio);
	}

	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_READY] = 1;
	pio = zio_walk_parents(zio);
	mutex_exit(&zio->io_lock);

	/*
	 * As we notify zio's parents, new parents could be added.
	 * New parents go to the head of zio's io_parent_list, however,
	 * so we will (correctly) not notify them.  The remainder of zio's
	 * io_parent_list, from 'pio_next' onward, cannot change because
	 * all parents must wait for us to be done before they can be done.
*/ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio); zio_notify_parent(pio, zio, ZIO_WAIT_READY); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (ZIO_PIPELINE_CONTINUE); } static int zio_done(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp; vdev_t *vd = zio->io_vd; uint64_t psize = zio->io_size; zio_t *pio, *pio_next; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (bp != NULL && !BP_IS_EMBEDDED(bp)) { ASSERT(bp->blk_pad[0] == 0); ASSERT(bp->blk_pad[1] == 0); ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT(!BP_SHOULD_BYTESWAP(bp)); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); ASSERT(BP_COUNT_GANG(bp) == 0 || (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. */ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); char *abuf = zio->io_data; if (asize != psize) { abuf = zio_buf_alloc(asize); bcopy(zio->io_data, abuf, psize); bzero(abuf + psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, abuf); zfs_ereport_free_checksum(zcr); if (asize != psize) zio_buf_free(abuf, asize); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == lio) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(spa, zio); zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 0, 0); } } if (zio->io_error && zio == lio) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). 
*/ ASSERT(vd == NULL && bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(spa) == SPA_LOAD_NONE && spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute = 0; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { zio_link_t *zl = zio->io_walk_link; pio_next = zio_walk_parents(zio); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(spa, zio); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. 
*/ #if defined(illumos) || !defined(_KERNEL) ASSERT(zio->io_tqent.tqent_next == NULL); #else ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); #endif spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); } return (ZIO_PIPELINE_STOP); } ASSERT(zio->io_child_count == 0); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. */ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { zio_link_t *zl = zio->io_walk_link; pio_next = zio_walk_parents(zio); zio_remove_child(pio, zio, zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (ZIO_PIPELINE_STOP); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_free_bp_init, zio_issue_async, zio_write_bp_init, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* dnp is the dnode for zb1->zb_object */ boolean_t zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { uint64_t zb1nextL0, zb2thisobj; ASSERT(zb1->zb_objset == zb2->zb_objset); ASSERT(zb2->zb_level == 0); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); zb1nextL0 = (zb1->zb_blkid + 1) << ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); zb2thisobj = zb2->zb_object ? zb2->zb_object : zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { uint64_t nextobj = zb1nextL0 * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; return (nextobj <= zb2thisobj); } if (zb1->zb_object < zb2thisobj) return (B_TRUE); if (zb1->zb_object > zb2thisobj) return (B_FALSE); if (zb2->zb_object == DMU_META_DNODE_OBJECT) return (B_FALSE); return (zb1nextL0 <= zb2->zb_blkid); } Index: head/sys/cddl/contrib/opensolaris =================================================================== --- head/sys/cddl/contrib/opensolaris (revision 275810) +++ head/sys/cddl/contrib/opensolaris (revision 275811) Property changes on: head/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor-sys/illumos/dist:r275783
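Editorial note: the standalone sketch below is not part of revision 275811. It restates, under the stated assumptions, the ranking that zio_worst_error() applies when child errors are merged in zio_done() (0 < ENXIO < ECKSUM < EIO < anything else), so the ordering can be exercised in user space. ECKSUM is ZFS-private, so a hypothetical stand-in value is used here.

#include <errno.h>
#include <stdio.h>

#define	ECKSUM_STANDIN	122	/* stand-in for the ZFS-private ECKSUM */

/* Mirrors zio_worst_error(): unknown errnos rank worse than known ones. */
static int
worst_error(int e1, int e2)
{
	static const int rank[] = { 0, ENXIO, ECKSUM_STANDIN, EIO };
	int r1, r2;

	for (r1 = 0; r1 < (int)(sizeof (rank) / sizeof (int)); r1++)
		if (e1 == rank[r1])
			break;
	for (r2 = 0; r2 < (int)(sizeof (rank) / sizeof (int)); r2++)
		if (e2 == rank[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}

int
main(void)
{
	/* ENXIO beats success; checksum errors beat ENXIO; EIO beats both. */
	printf("%d\n", worst_error(0, ENXIO));			/* ENXIO */
	printf("%d\n", worst_error(ENXIO, ECKSUM_STANDIN));	/* ECKSUM_STANDIN */
	printf("%d\n", worst_error(EIO, ECKSUM_STANDIN));	/* EIO */
	printf("%d\n", worst_error(EIO, ENOSPC));		/* ENOSPC: unexpected, ranked worst */
	return (0);
}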