Index: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c =================================================================== --- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c (revision 351076) +++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c (revision 351077) @@ -1,5674 +1,5651 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. * Copyright 2017 RackTop Systems. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef verify #include #include "zdb.h" #define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ zio_compress_table[(idx)].ci_name : "UNKNOWN") #define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ zio_checksum_table[(idx)].ci_name : "UNKNOWN") #define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \ dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \ dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN") #define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ DMU_OT_ZAP_OTHER : \ (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \ DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) #ifndef lint extern int reference_tracking_enable; extern boolean_t zfs_recover; extern uint64_t zfs_arc_max, zfs_arc_meta_limit; extern int zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern int aok; #else int reference_tracking_enable; boolean_t zfs_recover; uint64_t zfs_arc_max, zfs_arc_meta_limit; int zfs_vdev_async_read_max_active; boolean_t spa_load_verify_dryrun; int aok; #endif static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); static uint64_t *zopt_object = NULL; static unsigned zopt_objects = 0; static libzfs_handle_t *g_zfs; static uint64_t max_inflight = 1000; static int leaked_objects = 0; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *); static void mos_obj_refd(uint64_t); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init() { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void usage(void) { (void) fprintf(stderr, "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p ...]] " "[-I ]\n" "\t\t[-o =]... [-t ] [-U ] [-x ]\n" "\t\t[ [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] " "[ ...]\n" "\t%s -C [-A] [-U ]\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" "\t%s -O \n" "\t%s -R [-A] [-e [-V] [-p ...]] [-U ]\n" "\t\t ::[:]\n" "\t%s -E [-A] word0:word1:...:word15\n" "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); (void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n"); (void) fprintf(stderr, " If object numbers are specified, only " "those objects are dumped\n\n"); (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b block statistics\n"); (void) fprintf(stderr, " -c checksum all metadata (twice for " "all data) blocks\n"); (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); (void) fprintf(stderr, " -d dataset(s)\n"); (void) fprintf(stderr, " -D dedup statistics\n"); (void) fprintf(stderr, " -E decode and display block from an " "embedded block pointer\n"); (void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -i intent logs\n"); (void) fprintf(stderr, " -l read label contents\n"); (void) fprintf(stderr, " -k examine the checkpointed state " "of the pool\n"); (void) fprintf(stderr, " -L disable leak tracking (do not " "load spacemaps)\n"); (void) fprintf(stderr, " -m metaslabs\n"); (void) fprintf(stderr, " -M metaslab groups\n"); (void) fprintf(stderr, " -O perform object lookups by path\n"); (void) fprintf(stderr, " -R read and display block from a " "device\n"); (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v verbose (applies to all " "others)\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A ignore assertions (-A), enable " "panic recovery (-AA) or both (-AAA)\n"); (void) fprintf(stderr, " -e pool is exported/destroyed/" "has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -F attempt automatic rewind within " "safe range of transaction groups\n"); (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " "exiting\n"); (void) fprintf(stderr, " -I -- " "specify the maximum number of " "checksumming I/Os [default is 200]\n"); (void) fprintf(stderr, " -o = set global " "variable to an unsigned 32-bit integer value\n"); (void) fprintf(stderr, " -p -- use one or more with " "-e to specify path to vdev dir\n"); (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -q don't print label contents\n"); (void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n"); (void) fprintf(stderr, " -u uberblock\n"); (void) fprintf(stderr, " -U -- use alternate " "cachefile\n"); (void) fprintf(stderr, " -V do verbatim import\n"); (void) fprintf(stderr, " -x -- " "dump all read blocks into specified directory\n"); (void) fprintf(stderr, " -X attempt extreme rewind (does not " "work with dataset)\n\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); exit(1); } static void dump_debug_buffer() { if (dump_opt['G']) { (void) printf("\n"); zfs_dbgmsg_print("zdb"); } } /* * Called for usage errors that are discovered after a call to spa_open(), * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. */ static void fatal(const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) fprintf(stderr, "%s: ", cmdname); (void) vfprintf(stderr, fmt, ap); va_end(ap); (void) fprintf(stderr, "\n"); dump_debug_buffer(); exit(1); } /* ARGSUSED */ static void dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) { nvlist_t *nv; size_t nvsize = *(uint64_t *)data; char *packed = umem_alloc(nvsize, UMEM_NOFAIL); VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); umem_free(packed, nvsize); dump_nvlist(nv, 8); nvlist_free(nv); } /* ARGSUSED */ static void dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) { spa_history_phys_t *shp = data; if (shp == NULL) return; (void) printf("\t\tpool_create_len = %llu\n", (u_longlong_t)shp->sh_pool_create_len); (void) printf("\t\tphys_max_off = %llu\n", (u_longlong_t)shp->sh_phys_max_off); (void) printf("\t\tbof = %llu\n", (u_longlong_t)shp->sh_bof); (void) printf("\t\teof = %llu\n", (u_longlong_t)shp->sh_eof); (void) printf("\t\trecords_lost = %llu\n", (u_longlong_t)shp->sh_records_lost); } static void zdb_nicenum(uint64_t num, char *buf, size_t buflen) { if (dump_opt['P']) (void) snprintf(buf, buflen, "%llu", (longlong_t)num); else nicenum(num, buf, sizeof (buf)); } static const char histo_stars[] = "****************************************"; static const uint64_t histo_width = sizeof (histo_stars) - 1; static void dump_histogram(const uint64_t *histo, int size, int offset) { int i; int minidx = size - 1; int maxidx = 0; uint64_t max = 0; for (i = 0; i < size; i++) { if (histo[i] > max) max = histo[i]; if (histo[i] > 0 && i > maxidx) maxidx = i; if (histo[i] > 0 && i < minidx) minidx = i; } if (max < histo_width) max = histo_width; for (i = minidx; i <= maxidx; i++) { (void) printf("\t\t\t%3u: %6llu %s\n", i + offset, (u_longlong_t)histo[i], &histo_stars[(max - histo[i]) * histo_width / max]); } } static void dump_zap_stats(objset_t *os, uint64_t object) { int error; zap_stats_t zs; error = zap_get_stats(os, object, &zs); if (error) return; if (zs.zs_ptrtbl_len == 0) { ASSERT(zs.zs_num_blocks == 1); (void) printf("\tmicrozap: %llu bytes, %llu entries\n", (u_longlong_t)zs.zs_blocksize, (u_longlong_t)zs.zs_num_entries); return; } (void) printf("\tFat ZAP stats:\n"); (void) printf("\t\tPointer table:\n"); (void) printf("\t\t\t%llu elements\n", (u_longlong_t)zs.zs_ptrtbl_len); (void) printf("\t\t\tzt_blk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_blk); (void) printf("\t\t\tzt_numblks: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_numblks); (void) printf("\t\t\tzt_shift: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_shift); (void) printf("\t\t\tzt_blks_copied: %llu\n", (u_longlong_t)zs.zs_ptrtbl_blks_copied); (void) printf("\t\t\tzt_nextblk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_nextblk); (void) printf("\t\tZAP entries: %llu\n", (u_longlong_t)zs.zs_num_entries); (void) printf("\t\tLeaf blocks: %llu\n", (u_longlong_t)zs.zs_num_leafs); (void) printf("\t\tTotal blocks: %llu\n", (u_longlong_t)zs.zs_num_blocks); (void) printf("\t\tzap_block_type: 0x%llx\n", (u_longlong_t)zs.zs_block_type); (void) printf("\t\tzap_magic: 0x%llx\n", (u_longlong_t)zs.zs_magic); (void) printf("\t\tzap_salt: 0x%llx\n", (u_longlong_t)zs.zs_salt); (void) printf("\t\tLeafs with 2^n pointers:\n"); dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks with n*5 entries:\n"); dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks n/10 full:\n"); dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tEntries with n chunks:\n"); dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBuckets with n entries:\n"); dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); } /*ARGSUSED*/ static void dump_none(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) { (void) printf("\tUNKNOWN OBJECT TYPE\n"); } /*ARGSUSED*/ static void dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_zap(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; void *prop; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } prop = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); (void) zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, prop); if (attr.za_integer_length == 1) { (void) printf("%s", (char *)prop); } else { for (i = 0; i < attr.za_num_integers; i++) { switch (attr.za_integer_length) { case 2: (void) printf("%u ", ((uint16_t *)prop)[i]); break; case 4: (void) printf("%u ", ((uint32_t *)prop)[i]); break; case 8: (void) printf("%lld ", (u_longlong_t)((int64_t *)prop)[i]); break; } } } (void) printf("\n"); umem_free(prop, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } static void dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) { bpobj_phys_t *bpop = data; char bytes[32], comp[32], uncomp[32]; /* make sure the output won't get truncated */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (bpop == NULL) return; zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); (void) printf("\t\tnum_blkptrs = %llu\n", (u_longlong_t)bpop->bpo_num_blkptrs); (void) printf("\t\tbytes = %s\n", bytes); if (size >= BPOBJ_SIZE_V1) { (void) printf("\t\tcomp = %s\n", comp); (void) printf("\t\tuncomp = %s\n", uncomp); } if (size >= sizeof (*bpop)) { (void) printf("\t\tsubobjs = %llu\n", (u_longlong_t)bpop->bpo_subobjs); (void) printf("\t\tnum_subobjs = %llu\n", (u_longlong_t)bpop->bpo_num_subobjs); } if (dump_opt['d'] < 5) return; for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) { char blkbuf[BP_SPRINTF_LEN]; blkptr_t bp; int err = dmu_read(os, object, i * sizeof (bp), sizeof (bp), &bp, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); break; } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp); (void) printf("\t%s\n", blkbuf); } } /* ARGSUSED */ static void dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) { dmu_object_info_t doi; VERIFY0(dmu_object_info(os, object, &doi)); uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(subobjs, doi.doi_max_offset); return; } int64_t last_nonzero = -1; for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) { if (subobjs[i] != 0) last_nonzero = i; } for (int64_t i = 0; i <= last_nonzero; i++) { (void) printf("\t%llu\n", (longlong_t)subobjs[i]); } kmem_free(subobjs, doi.doi_max_offset); } /*ARGSUSED*/ static void dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) { dump_zap_stats(os, object); /* contents are printed elsewhere, properly decoded */ } /*ARGSUSED*/ static void dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } (void) printf(" %llx : [%d:%d:%d]\n", (u_longlong_t)attr.za_first_integer, (int)ATTR_LENGTH(attr.za_first_integer), (int)ATTR_BSWAP(attr.za_first_integer), (int)ATTR_NUM(attr.za_first_integer)); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; uint16_t *layout_attrs; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = [", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } VERIFY(attr.za_integer_length == 2); layout_attrs = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); VERIFY(zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, layout_attrs) == 0); for (i = 0; i != attr.za_num_integers; i++) (void) printf(" %d ", (int)layout_attrs[i]); (void) printf("]\n"); umem_free(layout_attrs, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; const char *typenames[] = { /* 0 */ "not specified", /* 1 */ "FIFO", /* 2 */ "Character Device", /* 3 */ "3 (invalid)", /* 4 */ "Directory", /* 5 */ "5 (invalid)", /* 6 */ "Block Device", /* 7 */ "7 (invalid)", /* 8 */ "Regular File", /* 9 */ "9 (invalid)", /* 10 */ "Symbolic Link", /* 11 */ "11 (invalid)", /* 12 */ "Socket", /* 13 */ "Door", /* 14 */ "Event Port", /* 15 */ "15 (invalid)", }; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = %lld (type: %s)\n", attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); } zap_cursor_fini(&zc); } static int get_dtl_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_ops->vdev_op_leaf) { space_map_t *sm = vd->vdev_dtl_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) return (1); return (0); } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_dtl_refcount(vd->vdev_child[c]); return (refcount); } static int get_metaslab_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd) { for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { space_map_t *sm = vd->vdev_ms[m]->ms_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) refcount++; } } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_metaslab_refcount(vd->vdev_child[c]); return (refcount); } static int get_obsolete_refcount(vdev_t *vd) { int refcount = 0; uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); if (vd->vdev_top == vd && obsolete_sm_obj != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, obsolete_sm_obj, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { refcount++; } } else { ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); ASSERT3U(obsolete_sm_obj, ==, 0); } for (unsigned c = 0; c < vd->vdev_children; c++) { refcount += get_obsolete_refcount(vd->vdev_child[c]); } return (refcount); } static int get_prev_obsolete_spacemap_refcount(spa_t *spa) { uint64_t prev_obj = spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; if (prev_obj != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { return (1); } } return (0); } static int get_checkpoint_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && zap_contains(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) refcount++; for (uint64_t c = 0; c < vd->vdev_children; c++) refcount += get_checkpoint_refcount(vd->vdev_child[c]); return (refcount); } static int verify_spacemap_refcounts(spa_t *spa) { uint64_t expected_refcount = 0; uint64_t actual_refcount; (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], &expected_refcount); actual_refcount = get_dtl_refcount(spa->spa_root_vdev); actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); actual_refcount += get_prev_obsolete_spacemap_refcount(spa); actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " "actual %lld\n", (longlong_t)expected_refcount, (longlong_t)actual_refcount); return (2); } return (0); } static void dump_spacemap(objset_t *os, space_map_t *sm) { char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", "INVALID", "INVALID", "INVALID", "INVALID" }; if (sm == NULL) return; (void) printf("space map object %llu:\n", (longlong_t)sm->sm_phys->smp_object); (void) printf(" smp_objsize = 0x%llx\n", (longlong_t)sm->sm_phys->smp_objsize); (void) printf(" smp_alloc = 0x%llx\n", (longlong_t)sm->sm_phys->smp_alloc); /* * Print out the freelist entries in both encoded and decoded form. */ uint8_t mapshift = sm->sm_shift; int64_t alloc = 0; uint64_t word; for (uint64_t offset = 0; offset < space_map_length(sm); offset += sizeof (word)) { VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (word), &word, DMU_READ_PREFETCH)); if (sm_entry_is_debug(word)) { (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n", (u_longlong_t)(offset / sizeof (word)), ddata[SM_DEBUG_ACTION_DECODE(word)], (u_longlong_t)SM_DEBUG_TXG_DECODE(word), (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word)); continue; } uint8_t words; char entry_type; uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; if (sm_entry_is_single_word(word)) { entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM_OFFSET_DECODE(word) << mapshift) + sm->sm_start; entry_run = SM_RUN_DECODE(word) << mapshift; words = 1; } else { /* it is a two-word entry so we read another word */ ASSERT(sm_entry_is_double_word(word)); uint64_t extra_word; offset += sizeof (extra_word); VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (extra_word), &extra_word, DMU_READ_PREFETCH)); ASSERT3U(offset, <=, space_map_length(sm)); entry_run = SM2_RUN_DECODE(word) << mapshift; entry_vdev = SM2_VDEV_DECODE(word); entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM2_OFFSET_DECODE(extra_word) << mapshift) + sm->sm_start; words = 2; } (void) printf("\t [%6llu] %c range:" " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", (u_longlong_t)(offset / sizeof (word)), entry_type, (u_longlong_t)entry_off, (u_longlong_t)(entry_off + entry_run), (u_longlong_t)entry_run, (u_longlong_t)entry_vdev, words); if (entry_type == 'A') alloc += entry_run; else alloc -= entry_run; } if ((uint64_t)alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%lld) INCONSISTENT " "with space map summary (%lld)\n", (longlong_t)space_map_allocated(sm), (longlong_t)alloc); } } static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; range_tree_t *rt = msp->ms_allocatable; avl_tree_t *t = &msp->ms_allocatable_by_size; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", avl_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; space_map_t *sm = msp->ms_sm; char freebuf[32]; zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, sizeof (freebuf)); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, (u_longlong_t)space_map_object(sm), freebuf); if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); metaslab_load_wait(msp); if (!msp->ms_loaded) { VERIFY0(metaslab_load(msp)); range_tree_stat_verify(msp->ms_allocatable); } dump_metaslab_stats(msp); metaslab_unload(msp); mutex_exit(&msp->ms_lock); } if (dump_opt['m'] > 1 && sm != NULL && spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { /* * The space map histogram represents free space in chunks * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). */ (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", (u_longlong_t)msp->ms_fragmentation); dump_histogram(sm->sm_phys->smp_histogram, SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); } } static void print_vdev_metaslab_header(vdev_t *vd) { (void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n", (u_longlong_t)vd->vdev_id, "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %10s\n", "---------------", "-------------------", "---------------", "-------------"); } static void dump_metaslab_groups(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; metaslab_class_t *mc = spa_normal_class(spa); uint64_t fragmentation; metaslab_class_histogram_verify(mc); for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg->mg_class != mc) continue; metaslab_group_histogram_verify(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" "fragmentation", (u_longlong_t)tvd->vdev_id, (u_longlong_t)tvd->vdev_ms_count); if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { (void) printf("%3s\n", "-"); } else { (void) printf("%3llu%%\n", (u_longlong_t)mg->mg_fragmentation); } dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } (void) printf("\tpool %s\tfragmentation", spa_name(spa)); fragmentation = metaslab_class_fragmentation(mc); if (fragmentation == ZFS_FRAG_INVALID) (void) printf("\t%3s\n", "-"); else (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void print_vdev_indirect(vdev_t *vd) { vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib = vd->vdev_indirect_births; if (vim == NULL) { ASSERT3P(vib, ==, NULL); return; } ASSERT3U(vdev_indirect_mapping_object(vim), ==, vic->vic_mapping_object); ASSERT3U(vdev_indirect_births_object(vib), ==, vic->vic_births_object); (void) printf("indirect births obj %llu:\n", (longlong_t)vic->vic_births_object); (void) printf(" vib_count = %llu\n", (longlong_t)vdev_indirect_births_count(vib)); for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { vdev_indirect_birth_entry_phys_t *cur_vibe = &vib->vib_entries[i]; (void) printf("\toffset %llx -> txg %llu\n", (longlong_t)cur_vibe->vibe_offset, (longlong_t)cur_vibe->vibe_phys_birth_txg); } (void) printf("\n"); (void) printf("indirect mapping obj %llu:\n", (longlong_t)vic->vic_mapping_object); (void) printf(" vim_max_offset = 0x%llx\n", (longlong_t)vdev_indirect_mapping_max_offset(vim)); (void) printf(" vim_bytes_mapped = 0x%llx\n", (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); (void) printf(" vim_count = %llu\n", (longlong_t)vdev_indirect_mapping_num_entries(vim)); if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) return; uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; (void) printf("\t<%llx:%llx:%llx> -> " "<%llx:%llx:%llx> (%x obsolete)\n", (longlong_t)vd->vdev_id, (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), counts[i]); } (void) printf("\n"); uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); if (obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; (void) printf("obsolete space map object %llu:\n", (u_longlong_t)obsolete_sm_object); ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, obsolete_sm_object); dump_spacemap(mos, vd->vdev_obsolete_sm); (void) printf("\n"); } } static void dump_metaslabs(spa_t *spa) { vdev_t *vd, *rvd = spa->spa_root_vdev; uint64_t m, c = 0, children = rvd->vdev_children; (void) printf("\nMetaslabs:\n"); if (!dump_opt['d'] && zopt_objects > 0) { c = zopt_object[0]; if (c >= children) (void) fatal("bad vdev id: %llu", (u_longlong_t)c); if (zopt_objects > 1) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); for (m = 1; m < zopt_objects; m++) { if (zopt_object[m] < vd->vdev_ms_count) dump_metaslab( vd->vdev_ms[zopt_object[m]]); else (void) fprintf(stderr, "bad metaslab " "number %llu\n", (u_longlong_t)zopt_object[m]); } (void) printf("\n"); return; } children = c + 1; } for (; c < children; c++) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); print_vdev_indirect(vd); for (m = 0; m < vd->vdev_ms_count; m++) dump_metaslab(vd->vdev_ms[m]); (void) printf("\n"); } } static void dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) { const ddt_phys_t *ddp = dde->dde_phys; const ddt_key_t *ddk = &dde->dde_key; const char *types[4] = { "ditto", "single", "double", "triple" }; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu %s %s\n", (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, types[p], blkbuf); } } static void dump_dedup_ratio(const ddt_stat_t *dds) { double rL, rP, rD, D, dedup, compress, copies; if (dds->dds_blocks == 0) return; rL = (double)dds->dds_ref_lsize; rP = (double)dds->dds_ref_psize; rD = (double)dds->dds_ref_dsize; D = (double)dds->dds_dsize; dedup = rD / D; compress = rL / rP; copies = rD / rP; (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " "dedup * compress / copies = %.2f\n\n", dedup, compress, copies, dedup * compress / copies); } static void dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) { char name[DDT_NAMELEN]; ddt_entry_t dde; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; int error; error = ddt_object_info(ddt, type, class, &doi); if (error == ENOENT) return; ASSERT(error == 0); error = ddt_object_count(ddt, type, class, &count); ASSERT(error == 0); if (count == 0) return; dspace = doi.doi_physical_blocks_512 << 9; mspace = doi.doi_fill_count * doi.doi_data_block_size; ddt_object_name(ddt, type, class, name); (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", name, (u_longlong_t)count, (u_longlong_t)(dspace / count), (u_longlong_t)(mspace / count)); if (dump_opt['D'] < 3) return; zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); if (dump_opt['D'] < 4) return; if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) return; (void) printf("%s contents:\n\n", name); while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) dump_dde(ddt, &dde, walk); ASSERT3U(error, ==, ENOENT); (void) printf("\n"); } static void dump_all_ddts(spa_t *spa) { ddt_histogram_t ddh_total; ddt_stat_t dds_total; bzero(&ddh_total, sizeof (ddh_total)); bzero(&dds_total, sizeof (dds_total)); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; for (enum ddt_type type = 0; type < DDT_TYPES; type++) { for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { dump_ddt(ddt, type, class); } } } ddt_get_dedup_stats(spa, &dds_total); if (dds_total.dds_blocks == 0) { (void) printf("All DDTs are empty\n"); return; } (void) printf("\n"); if (dump_opt['D'] > 1) { (void) printf("DDT histogram (aggregated over all DDTs):\n"); ddt_get_dedup_histogram(spa, &ddh_total); zpool_dump_ddt(&dds_total, &ddh_total); } dump_dedup_ratio(&dds_total); } static void dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { char *prefix = arg; (void) printf("%s [%llu,%llu) length %llu\n", prefix, (u_longlong_t)start, (u_longlong_t)(start + size), (u_longlong_t)(size)); } static void dump_dtl(vdev_t *vd, int indent) { spa_t *spa = vd->vdev_spa; boolean_t required; const char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" }; char prefix[256]; spa_vdev_state_enter(spa, SCL_NONE); required = vdev_dtl_required(vd); (void) spa_vdev_state_exit(spa, NULL, 0); if (indent == 0) (void) printf("\nDirty time logs:\n\n"); (void) printf("\t%*s%s [%s]\n", indent, "", vd->vdev_path ? vd->vdev_path : vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), required ? "DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { range_tree_t *rt = vd->vdev_dtl[t]; if (range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); range_tree_walk(rt, dump_dtl_seg, prefix); if (dump_opt['d'] > 5 && vd->vdev_children == 0) dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } for (unsigned c = 0; c < vd->vdev_children; c++) dump_dtl(vd->vdev_child[c], indent + 4); } /* from spa_history.c: spa_history_create_obj() */ #define HIS_BUF_LEN_DEF (128 << 10) #define HIS_BUF_LEN_MAX (1 << 30) static void dump_history(spa_t *spa) { nvlist_t **events = NULL; char *buf = NULL; uint64_t bufsize = HIS_BUF_LEN_DEF; uint64_t resid, len, off = 0; uint_t num = 0; int error; time_t tsec; struct tm t; char tbuf[30]; char internalstr[MAXPATHLEN]; if ((buf = malloc(bufsize)) == NULL) (void) fprintf(stderr, "Unable to read history: " "out of memory\n"); do { len = bufsize; if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { (void) fprintf(stderr, "Unable to read history: " "error %d\n", error); return; } if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) break; off -= resid; /* * If the history block is too big, double the buffer * size and try again. */ if (resid == len) { free(buf); buf = NULL; bufsize <<= 1; if ((bufsize >= HIS_BUF_LEN_MAX) || ((buf = malloc(bufsize)) == NULL)) { (void) fprintf(stderr, "Unable to read history: " "out of memory\n"); return; } } } while (len != 0); free(buf); (void) printf("\nHistory:\n"); for (unsigned i = 0; i < num; i++) { uint64_t time, txg, ievent; char *cmd, *intstr; boolean_t printed = B_FALSE; if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, &time) != 0) goto next; if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, &cmd) != 0) { if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_INT_EVENT, &ievent) != 0) goto next; verify(nvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG, &txg) == 0); verify(nvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR, &intstr) == 0); if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) goto next; (void) snprintf(internalstr, sizeof (internalstr), "[internal %s txg:%ju] %s", zfs_history_event_names[ievent], (uintmax_t)txg, intstr); cmd = internalstr; } tsec = time; (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); (void) printf("%s %s\n", tbuf, cmd); printed = B_TRUE; next: if (dump_opt['h'] > 1) { if (!printed) (void) printf("unrecognized record:\n"); dump_nvlist(events[i], 2); } } } /*ARGSUSED*/ static void dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) { } static uint64_t blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { if (dnp == NULL) { ASSERT(zb->zb_level < 0); if (zb->zb_object == 0) return (zb->zb_blkid); return (zb->zb_blkid * BP_GET_LSIZE(bp)); } ASSERT(zb->zb_level >= 0); return ((zb->zb_blkid << (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } static void snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; if (dump_opt['b'] >= 6) { snprintf_blkptr(blkbuf, buflen, bp); return; } if (BP_IS_EMBEDDED(bp)) { (void) sprintf(blkbuf, "EMBEDDED et=%u %llxL/%llxP B=%llu", (int)BPE_GET_ETYPE(bp), (u_longlong_t)BPE_GET_LSIZE(bp), (u_longlong_t)BPE_GET_PSIZE(bp), (u_longlong_t)bp->blk_birth); return; } blkbuf[0] = '\0'; for (int i = 0; i < ndvas; i++) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); if (BP_IS_HOLE(bp)) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL B=%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)bp->blk_birth); } else { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); } } static void print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; int l; if (!BP_IS_EMBEDDED(bp)) { ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); } (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { (void) printf("L%llx", (u_longlong_t)zb->zb_level); } else { (void) printf(" "); } } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_phys_t *zb) { int err = 0; if (bp->blk_birth == 0) return (0); print_indirect(bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); ASSERT(buf->b_data); /* recursively visit blocks below this */ cbp = buf->b_data; for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = visit_indirect(spa, dnp, cbp, &czb); if (err) break; fill += BP_GET_FILL(cbp); } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); arc_buf_destroy(buf, &buf); } return (err); } /*ARGSUSED*/ static void dump_indirect(dnode_t *dn) { dnode_phys_t *dnp = dn->dn_phys; int j; zbookmark_phys_t czb; (void) printf("Indirect blocks:\n"); SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), dn->dn_object, dnp->dn_nlevels - 1, 0); for (j = 0; j < dnp->dn_nblkptr; j++) { czb.zb_blkid = j; (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, &dnp->dn_blkptr[j], &czb); } (void) printf("\n"); } /*ARGSUSED*/ static void dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dir_phys_t *dd = data; time_t crtime; char nice[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ); if (dd == NULL) return; ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); crtime = dd->dd_creation_time; (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\thead_dataset_obj = %llu\n", (u_longlong_t)dd->dd_head_dataset_obj); (void) printf("\t\tparent_dir_obj = %llu\n", (u_longlong_t)dd->dd_parent_obj); (void) printf("\t\torigin_obj = %llu\n", (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", (u_longlong_t)dd->dd_child_dir_zapobj); zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); (void) printf("\t\tused_bytes = %s\n", nice); zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); (void) printf("\t\tcompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); (void) printf("\t\tuncompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); (void) printf("\t\tquota = %s\n", nice); zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); (void) printf("\t\tdeleg_zapobj = %llu\n", (u_longlong_t)dd->dd_deleg_zapobj); (void) printf("\t\tflags = %llx\n", (u_longlong_t)dd->dd_flags); #define DO(which) \ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ sizeof (nice)); \ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) DO(HEAD); DO(SNAP); DO(CHILD); DO(CHILD_RSRV); DO(REFRSRV); #undef DO (void) printf("\t\tclones = %llu\n", (u_longlong_t)dd->dd_clones); } /*ARGSUSED*/ static void dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dataset_phys_t *ds = data; time_t crtime; char used[32], compressed[32], uncompressed[32], unique[32]; char blkbuf[BP_SPRINTF_LEN]; /* make sure nicenum has enough space */ CTASSERT(sizeof (used) >= NN_NUMBUF_SZ); CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ); CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ); if (ds == NULL) return; ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, sizeof (uncompressed)); zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); (void) printf("\t\tprev_snap_obj = %llu\n", (u_longlong_t)ds->ds_prev_snap_obj); (void) printf("\t\tprev_snap_txg = %llu\n", (u_longlong_t)ds->ds_prev_snap_txg); (void) printf("\t\tnext_snap_obj = %llu\n", (u_longlong_t)ds->ds_next_snap_obj); (void) printf("\t\tsnapnames_zapobj = %llu\n", (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); (void) printf("\t\tuserrefs_obj = %llu\n", (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); (void) printf("\t\tdeadlist_obj = %llu\n", (u_longlong_t)ds->ds_deadlist_obj); (void) printf("\t\tused_bytes = %s\n", used); (void) printf("\t\tcompressed_bytes = %s\n", compressed); (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); (void) printf("\t\tunique = %s\n", unique); (void) printf("\t\tfsid_guid = %llu\n", (u_longlong_t)ds->ds_fsid_guid); (void) printf("\t\tguid = %llu\n", (u_longlong_t)ds->ds_guid); (void) printf("\t\tflags = %llx\n", (u_longlong_t)ds->ds_flags); (void) printf("\t\tnext_clones_obj = %llu\n", (u_longlong_t)ds->ds_next_clones_obj); (void) printf("\t\tprops_obj = %llu\n", (u_longlong_t)ds->ds_props_obj); (void) printf("\t\tbp = %s\n", blkbuf); } /* ARGSUSED */ static int dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; if (bp->blk_birth != 0) { snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } return (0); } static void dump_bptree(objset_t *os, uint64_t obj, const char *name) { char bytes[32]; bptree_phys_t *bt; dmu_buf_t *db; /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); bt = db->db_data; zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); dmu_buf_rele(db, FTAG); if (dump_opt['d'] < 5) return; (void) printf("\n"); (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); } /* ARGSUSED */ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); return (0); } static void dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) { char bytes[32]; char comp[32]; char uncomp[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); (void) printf(" %*s: object %llu, %llu local blkptrs, " "%llu subobjs in object %llu, %s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } dump_full_bpobj(&subbpo, "subobj", indent + 1); bpobj_close(&subbpo); } } else { (void) printf(" %*s: object %llu, %llu blkptrs, %s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes); } if (dump_opt['d'] < 5) return; if (indent == 0) { (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); (void) printf("\n"); } } static void bpobj_count_refd(bpobj_t *bpo) { mos_obj_refd(bpo->bpo_object); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { mos_obj_refd(bpo->bpo_phys->bpo_subobjs); for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } bpobj_count_refd(&subbpo); bpobj_close(&subbpo); } } } static void dump_deadlist(dsl_deadlist_t *dl) { dsl_deadlist_entry_t *dle; uint64_t unused; char bytes[32]; char comp[32]; char uncomp[32]; uint64_t empty_bpobj = dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj; /* force the tree to be loaded */ dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); if (dl->dl_oldfmt) { if (dl->dl_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dl->dl_bpobj); } else { mos_obj_refd(dl->dl_object); for (dle = avl_first(&dl->dl_tree); dle; dle = AVL_NEXT(&dl->dl_tree, dle)) { if (dle->dle_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dle->dle_bpobj); } } /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; if (dl->dl_oldfmt) { dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); return; } zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); (void) printf("\n Deadlist: %s (%s/%s comp)\n", bytes, comp, uncomp); if (dump_opt['d'] < 4) return; (void) printf("\n"); for (dle = avl_first(&dl->dl_tree); dle; dle = AVL_NEXT(&dl->dl_tree, dle)) { if (dump_opt['d'] >= 5) { char buf[128]; (void) snprintf(buf, sizeof (buf), "mintxg %llu -> obj %llu", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); dump_full_bpobj(&dle->dle_bpobj, buf, 0); } else { (void) printf("mintxg %llu -> obj %llu\n", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); } } } static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; static objset_t *sa_os = NULL; static sa_attr_type_t *sa_attr_table = NULL; static int open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp) { int err; uint64_t sa_attrs = 0; uint64_t version = 0; VERIFY3P(sa_os, ==, NULL); err = dmu_objset_own(path, type, B_TRUE, tag, osp); if (err != 0) { (void) fprintf(stderr, "failed to own dataset '%s': %s\n", path, strerror(err)); return (err); } if (dmu_objset_type(*osp) == DMU_OST_ZFS) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &version); if (version >= ZPL_VERSION_SA) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_attrs); } err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, &sa_attr_table); if (err != 0) { (void) fprintf(stderr, "sa_setup failed: %s\n", strerror(err)); dmu_objset_disown(*osp, tag); *osp = NULL; } } sa_os = *osp; return (0); } static void close_objset(objset_t *os, void *tag) { VERIFY3P(os, ==, sa_os); if (os->os_sa != NULL) sa_tear_down(os); dmu_objset_disown(os, tag); sa_attr_table = NULL; sa_os = NULL; } static void fuid_table_destroy() { if (fuid_table_loaded) { zfs_fuid_table_destroy(&idx_tree, &domain_tree); fuid_table_loaded = B_FALSE; } } /* * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. * For CIFS files with FUID the fuid is printed in hex followed by * the domain-rid string. */ static void print_idstr(uint64_t id, const char *id_type) { if (FUID_INDEX(id)) { char *domain; domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); (void) printf("\t%s %llx [%s-%d]\n", id_type, (u_longlong_t)id, domain, (int)FUID_RID(id)); } else { (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); } } static void dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) { uint32_t uid_idx, gid_idx; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); /* Load domain table, if not already loaded */ if (!fuid_table_loaded && (uid_idx || gid_idx)) { uint64_t fuid_obj; /* first find the fuid object. It lives in the master node */ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj) == 0); zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; } print_idstr(uid, "uid"); print_idstr(gid, "gid"); } /*ARGSUSED*/ static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) { char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ sa_handle_t *hdl; uint64_t xattr, rdev, gen; uint64_t uid, gid, mode, fsize, parent, links; uint64_t pflags; uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; time_t z_crtime, z_atime, z_mtime, z_ctime; sa_bulk_attr_t bulk[12]; int idx = 0; int error; VERIFY3P(os, ==, sa_os); if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { (void) printf("Failed to get handle for SA znode\n"); return; } SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, &fsize, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, acctm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, modtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, crtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, chgtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, &pflags, 8); if (sa_bulk_lookup(hdl, bulk, idx)) { (void) sa_handle_destroy(hdl); return; } z_crtime = (time_t)crtm[0]; z_atime = (time_t)acctm[0]; z_mtime = (time_t)modtm[0]; z_ctime = (time_t)chgtm[0]; if (dump_opt['d'] > 4) { error = zfs_obj_to_path(os, object, path, sizeof (path)); if (error == ESTALE) { (void) snprintf(path, sizeof (path), "on delete queue"); } else if (error != 0) { leaked_objects++; (void) snprintf(path, sizeof (path), "path not found, possibly leaked"); } (void) printf("\tpath %s\n", path); } dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); (void) printf("\tctime %s", ctime(&z_ctime)); (void) printf("\tcrtime %s", ctime(&z_crtime)); (void) printf("\tgen %llu\n", (u_longlong_t)gen); (void) printf("\tmode %llo\n", (u_longlong_t)mode); (void) printf("\tsize %llu\n", (u_longlong_t)fsize); (void) printf("\tparent %llu\n", (u_longlong_t)parent); (void) printf("\tlinks %llu\n", (u_longlong_t)links); (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, sizeof (uint64_t)) == 0) (void) printf("\txattr %llu\n", (u_longlong_t)xattr); if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, sizeof (uint64_t)) == 0) (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); sa_handle_destroy(hdl); } /*ARGSUSED*/ static void dump_acl(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) { } static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_none, /* unallocated */ dump_zap, /* object directory */ dump_uint64, /* object array */ dump_none, /* packed nvlist */ dump_packed_nvlist, /* packed nvlist size */ dump_none, /* bpobj */ dump_bpobj, /* bpobj header */ dump_none, /* SPA space map header */ dump_none, /* SPA space map */ dump_none, /* ZIL intent log */ dump_dnode, /* DMU dnode */ dump_dmu_objset, /* DMU objset */ dump_dsl_dir, /* DSL directory */ dump_zap, /* DSL directory child map */ dump_zap, /* DSL dataset snap map */ dump_zap, /* DSL props */ dump_dsl_dataset, /* DSL dataset */ dump_znode, /* ZFS znode */ dump_acl, /* ZFS V0 ACL */ dump_uint8, /* ZFS plain file */ dump_zpldir, /* ZFS directory */ dump_zap, /* ZFS master node */ dump_zap, /* ZFS delete queue */ dump_uint8, /* zvol object */ dump_zap, /* zvol prop */ dump_uint8, /* other uint8[] */ dump_uint64, /* other uint64[] */ dump_zap, /* other ZAP */ dump_zap, /* persistent error log */ dump_uint8, /* SPA history */ dump_history_offsets, /* SPA history offsets */ dump_zap, /* Pool properties */ dump_zap, /* DSL permissions */ dump_acl, /* ZFS ACL */ dump_uint8, /* ZFS SYSACL */ dump_none, /* FUID nvlist */ dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ dump_zap, /* ZFS user/group used */ dump_zap, /* ZFS user/group quota */ dump_zap, /* snapshot refcount tags */ dump_ddt_zap, /* DDT ZAP object */ dump_zap, /* DDT statistics */ dump_znode, /* SA object */ dump_zap, /* SA Master Node */ dump_sa_attrs, /* SA attribute registration */ dump_sa_layouts, /* SA attribute layouts */ dump_zap, /* DSL scrub translations */ dump_none, /* fake dedup BP */ dump_zap, /* deadlist */ dump_none, /* deadlist hdr */ dump_zap, /* dsl clones */ dump_bpobj_subobjs, /* bpobj subobjs */ dump_unknown, /* Unknown type, must be last */ }; static void -dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, - uint64_t *dnode_slots_used) +dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) { dmu_buf_t *db = NULL; dmu_object_info_t doi; dnode_t *dn; void *bonus = NULL; size_t bsize = 0; char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; char bonus_size[32]; char aux[50]; int error; /* make sure nicenum has enough space */ CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ); CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ); CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); if (*print_header) { - (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", + (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", "lsize", "%full", "type"); *print_header = 0; } if (object == 0) { dn = DMU_META_DNODE(os); } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); bonus = db->db_data; bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } dmu_object_info_from_dnode(dn, &doi); - if (dnode_slots_used != NULL) - *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; - zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / doi.doi_max_offset); aux[0] = '\0'; if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); } if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); } - (void) printf("%10" PRIu64 - " %3u %5s %5s %5s %5s %5s %6s %s%s\n", - object, doi.doi_indirection, iblk, dblk, + (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", + (u_longlong_t)object, doi.doi_indirection, iblk, dblk, asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", "", "", "", "", "", "", bonus_size, "bonus", ZDB_OT_NAME(doi.doi_bonus_type)); } if (verbosity >= 4) { (void) printf("\tdnode flags: %s%s%s\n", (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? "USED_BYTES " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? "USERUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? "SPILL_BLKPTR" : ""); (void) printf("\tdnode maxblkid: %llu\n", (longlong_t)dn->dn_phys->dn_maxblkid); object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, bonus, bsize); object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); *print_header = 1; } if (verbosity >= 5) dump_indirect(dn); if (verbosity >= 5) { /* * Report the list of segments that comprise the object. */ uint64_t start = 0; uint64_t end; uint64_t blkfill = 1; int minlvl = 1; if (dn->dn_type == DMU_OT_DNODE) { minlvl = 0; blkfill = DNODES_PER_BLOCK; } for (;;) { char segsize[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ); error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) break; end = start; error = dnode_next_offset(dn, DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); zdb_nicenum(end - start, segsize, sizeof (segsize)); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start, (u_longlong_t)end, segsize); if (error) break; start = end; } } if (db != NULL) dmu_buf_rele(db, FTAG); } static void count_dir_mos_objects(dsl_dir_t *dd) { mos_obj_refd(dd->dd_object); mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_clones); } static void count_ds_mos_objects(dsl_dataset_t *ds) { mos_obj_refd(ds->ds_object); mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); if (!dsl_dataset_is_snapshot(ds)) { count_dir_mos_objects(ds->ds_dir); } } static const char *objset_types[DMU_OST_NUMTYPES] = { "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; static void dump_dir(objset_t *os) { dmu_objset_stats_t dds; uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[32]; char blkbuf[BP_SPRINTF_LEN + 20]; char osname[ZFS_MAX_DATASET_NAME_LEN]; const char *type = "UNKNOWN"; int verbosity = dump_opt['d']; int print_header = 1; unsigned i; int error; - uint64_t total_slots_used = 0; - uint64_t max_slot_used = 0; - uint64_t dnode_slots; /* make sure nicenum has enough space */ CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; usedobjs = BP_GET_FILL(os->os_rootbp); refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); if (verbosity >= 4) { (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); (void) snprintf_blkptr(blkbuf + strlen(blkbuf), sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } dmu_objset_name(os, osname); (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " "%s, %llu objects%s%s\n", osname, type, (u_longlong_t)dmu_objset_id(os), (u_longlong_t)dds.dds_creation_txg, numbuf, (u_longlong_t)usedobjs, blkbuf, (dds.dds_inconsistent) ? " (inconsistent)" : ""); if (zopt_objects != 0) { for (i = 0; i < zopt_objects; i++) dump_object(os, zopt_object[i], verbosity, - &print_header, NULL); + &print_header); (void) printf("\n"); return; } if (dump_opt['i'] != 0 || verbosity >= 2) dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) { dsl_dataset_t *ds = dmu_objset_ds(os); dump_deadlist(&ds->ds_deadlist); if (dsl_dataset_remap_deadlist_exists(ds)) { (void) printf("ds_remap_deadlist:\n"); dump_deadlist(&ds->ds_remap_deadlist); } count_ds_mos_objects(ds); } if (verbosity < 2) return; if (BP_IS_HOLE(os->os_rootbp)) return; - dump_object(os, 0, verbosity, &print_header, NULL); + dump_object(os, 0, verbosity, &print_header); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { - dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, - NULL); - dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, - NULL); + dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); + dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); } object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { - dump_object(os, object, verbosity, &print_header, &dnode_slots); + dump_object(os, object, verbosity, &print_header); object_count++; - total_slots_used += dnode_slots; - max_slot_used = object + dnode_slots - 1; } (void) printf("\n"); - (void) printf(" Dnode slots:\n"); - (void) printf("\tTotal used: %10llu\n", - (u_longlong_t)total_slots_used); - (void) printf("\tMax used: %10llu\n", - (u_longlong_t)max_slot_used); - (void) printf("\tPercent empty: %10lf\n", - (double)(max_slot_used - total_slots_used)*100 / - (double)max_slot_used); - - (void) printf("\n"); - if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); } ASSERT3U(object_count, ==, usedobjs); if (leaked_objects != 0) { (void) printf("%d potentially leaked objects detected\n", leaked_objects); leaked_objects = 0; } } static void dump_uberblock(uberblock_t *ub, const char *header, const char *footer) { time_t timestamp = ub->ub_timestamp; (void) printf("%s", header ? header : ""); (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); if (dump_opt['u'] >= 3) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf("\tcheckpoint_txg = %llu\n", (u_longlong_t)ub->ub_checkpoint_txg); (void) printf("%s", footer ? footer : ""); } static void dump_config(spa_t *spa) { dmu_buf_t *db; size_t nvsize = 0; int error = 0; error = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object, FTAG, &db); if (error == 0) { nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); dump_packed_nvlist(spa->spa_meta_objset, spa->spa_config_object, (void *)&nvsize, 1); } else { (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", (u_longlong_t)spa->spa_config_object, error); } } static void dump_cachefile(const char *cachefile) { int fd; struct stat64 statbuf; char *buf; nvlist_t *config; if ((fd = open64(cachefile, O_RDONLY)) < 0) { (void) fprintf(stderr, "cannot open '%s': %s\n", cachefile, strerror(errno)); exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) fprintf(stderr, "failed to stat '%s': %s\n", cachefile, strerror(errno)); exit(1); } if ((buf = malloc(statbuf.st_size)) == NULL) { (void) fprintf(stderr, "failed to allocate %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) fprintf(stderr, "failed to read %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { (void) fprintf(stderr, "failed to unpack nvlist\n"); exit(1); } free(buf); dump_nvlist(config, 0); nvlist_free(config); } #define ZDB_MAX_UB_HEADER_SIZE 32 static void dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift) { vdev_t vd; vdev_t *vdp = &vd; char header[ZDB_MAX_UB_HEADER_SIZE]; vd.vdev_ashift = ashift; vdp->vdev_top = vdp; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i); uberblock_t *ub = (void *)((char *)lbl + uoff); if (uberblock_verify(ub)) continue; (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, "Uberblock[%d]\n", i); dump_uberblock(ub, header, ""); } } static char curpath[PATH_MAX]; /* * Iterate through the path components, recursively passing * current one's obj and remaining path until we find the obj * for the last one. */ static int dump_path_impl(objset_t *os, uint64_t obj, char *name) { int err; int header = 1; uint64_t child_obj; char *s; dmu_buf_t *db; dmu_object_info_t doi; if ((s = strchr(name, '/')) != NULL) *s = '\0'; err = zap_lookup(os, obj, name, 8, 1, &child_obj); (void) strlcat(curpath, name, sizeof (curpath)); if (err != 0) { (void) fprintf(stderr, "failed to lookup %s: %s\n", curpath, strerror(err)); return (err); } child_obj = ZFS_DIRENT_OBJ(child_obj); err = sa_buf_hold(os, child_obj, FTAG, &db); if (err != 0) { (void) fprintf(stderr, "failed to get SA dbuf for obj %llu: %s\n", (u_longlong_t)child_obj, strerror(err)); return (EINVAL); } dmu_object_info_from_db(db, &doi); sa_buf_rele(db, FTAG); if (doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) { (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", doi.doi_bonus_type, (u_longlong_t)child_obj); return (EINVAL); } if (dump_opt['v'] > 6) { (void) printf("obj=%llu %s type=%d bonustype=%d\n", (u_longlong_t)child_obj, curpath, doi.doi_type, doi.doi_bonus_type); } (void) strlcat(curpath, "/", sizeof (curpath)); switch (doi.doi_type) { case DMU_OT_DIRECTORY_CONTENTS: if (s != NULL && *(s + 1) != '\0') return (dump_path_impl(os, child_obj, s + 1)); /*FALLTHROUGH*/ case DMU_OT_PLAIN_FILE_CONTENTS: - dump_object(os, child_obj, dump_opt['v'], &header, NULL); + dump_object(os, child_obj, dump_opt['v'], &header); return (0); default: (void) fprintf(stderr, "object %llu has non-file/directory " "type %d\n", (u_longlong_t)obj, doi.doi_type); break; } return (EINVAL); } /* * Dump the blocks for the object specified by path inside the dataset. */ static int dump_path(char *ds, char *path) { int err; objset_t *os; uint64_t root_obj; err = open_objset(ds, DMU_OST_ZFS, FTAG, &os); if (err != 0) return (err); err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); if (err != 0) { (void) fprintf(stderr, "can't lookup root znode: %s\n", strerror(err)); dmu_objset_disown(os, FTAG); return (EINVAL); } (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); err = dump_path_impl(os, root_obj, path); close_objset(os, FTAG); return (err); } static int dump_label(const char *dev) { int fd; vdev_label_t label; char path[MAXPATHLEN]; char *buf = label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist); struct stat64 statbuf; uint64_t psize, ashift; boolean_t label_found = B_FALSE; (void) strlcpy(path, dev, sizeof (path)); if (dev[0] == '/') { if (strncmp(dev, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0) { (void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD)); } } else if (stat64(path, &statbuf) != 0) { char *s; (void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD, dev); if (((s = strrchr(dev, 's')) == NULL && (s = strchr(dev, 'p')) == NULL) || !isdigit(*(s + 1))) (void) strlcat(path, "s0", sizeof (path)); } if ((fd = open64(path, O_RDONLY)) < 0) { (void) fprintf(stderr, "cannot open '%s': %s\n", path, strerror(errno)); exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) fprintf(stderr, "failed to stat '%s': %s\n", path, strerror(errno)); (void) close(fd); exit(1); } if (S_ISBLK(statbuf.st_mode)) { (void) fprintf(stderr, "cannot use '%s': character device required\n", path); (void) close(fd); exit(1); } psize = statbuf.st_size; psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *config = NULL; if (!dump_opt['q']) { (void) printf("------------------------------------\n"); (void) printf("LABEL %d\n", l); (void) printf("------------------------------------\n"); } if (pread64(fd, &label, sizeof (label), vdev_label_offset(psize, l, 0)) != sizeof (label)) { if (!dump_opt['q']) (void) printf("failed to read label %d\n", l); continue; } if (nvlist_unpack(buf, buflen, &config, 0) != 0) { if (!dump_opt['q']) (void) printf("failed to unpack label %d\n", l); ashift = SPA_MINBLOCKSHIFT; } else { nvlist_t *vdev_tree = NULL; if (!dump_opt['q']) dump_nvlist(config, 4); if ((nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) ashift = SPA_MINBLOCKSHIFT; nvlist_free(config); label_found = B_TRUE; } if (dump_opt['u']) dump_label_uberblocks(&label, ashift); } (void) close(fd); return (label_found ? 0 : 2); } static uint64_t dataset_feature_count[SPA_FEATURES]; static uint64_t remap_deadlist_count = 0; /*ARGSUSED*/ static int dump_one_dir(const char *dsname, void *arg) { int error; objset_t *os; error = open_objset(dsname, DMU_OST_ANY, FTAG, &os); if (error != 0) return (0); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (!dmu_objset_ds(os)->ds_feature_inuse[f]) continue; ASSERT(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); dataset_feature_count[f]++; } if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { remap_deadlist_count++; } dump_dir(os); close_objset(os, FTAG); fuid_table_destroy(); return (0); } /* * Block statistics. */ #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { uint64_t zb_asize; uint64_t zb_lsize; uint64_t zb_psize; uint64_t zb_count; uint64_t zb_gangs; uint64_t zb_ditto_samevdev; uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; /* * Extended object types to report deferred frees and dedup auto-ditto blocks. */ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static const char *zdb_ot_extname[] = { "deferred free", "dedup ditto", "other", "Total", }; #define ZB_TOTAL DN_MAX_LEVELS typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] [BPE_PAYLOAD_SIZE]; uint64_t zcb_start; hrtime_t zcb_lastprint; uint64_t zcb_totalasize; uint64_t zcb_errors[256]; int zcb_readfails; int zcb_haderrors; spa_t *zcb_spa; uint32_t **zcb_vd_obsolete_counts; } zdb_cb_t; static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { uint64_t refcnt = 0; ASSERT(type < ZDB_OT_TOTAL); if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; int equal; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_count++; /* * The histogram is only big enough to record blocks up to * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, * "other", bucket. */ unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); zb->zb_psize_histogram[idx]++; zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) zb->zb_ditto_samevdev++; break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal != 0) zb->zb_ditto_samevdev++; break; } } if (BP_IS_EMBEDDED(bp)) { zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] [BPE_GET_PSIZE(bp)]++; return; } if (dump_opt['L']) return; if (BP_GET_DEDUP(bp)) { ddt_t *ddt; ddt_entry_t *dde; ddt = ddt_select(zcb->zcb_spa, bp); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_FALSE); if (dde == NULL) { refcnt = 0; } else { ddt_phys_t *ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); refcnt = ddp->ddp_refcnt; if (ddt_phys_total_refcnt(dde) == 0) ddt_remove(ddt, dde); } ddt_exit(ddt); } VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } /* ARGSUSED */ static void zdb_blkptr_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; int ioerr = zio->io_error; zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; spa->spa_load_verify_ios--; cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { char blkbuf[BP_SPRINTF_LEN]; zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); else blkbuf[0] = '\0'; (void) printf("zdb_blkptr_cb: " "Got error %d reading " "<%llu, %llu, %lld, %llx> %s -- skipping\n", ioerr, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, blkbuf); } mutex_exit(&spa->spa_scrub_lock); } /* ARGSUSED */ static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; dmu_object_type_t type; boolean_t is_metadata; if (bp == NULL) return (0); if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("objset %llu object %llu " "level %lld offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (u_longlong_t)blkid2offset(dnp, bp, zb), blkbuf); } if (BP_IS_HOLE(bp)) return (0); type = BP_GET_TYPE(bp); zdb_count_block(zcb, zilog, bp, (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_ios > max_inflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; spa->spa_load_verify_ios++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; /* only call gethrtime() every 100 blocks */ static int iters; if (++iters > 100) iters = 0; else return (0); if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); char buf[10]; uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; int kb_per_sec = 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); int sec_remaining = (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; /* make sure nicenum has enough space */ CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ); zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "\r%5s completed (%4dMB/s) " "estimated time remaining: %uhr %02umin %02usec ", buf, kb_per_sec / 1024, sec_remaining / 60 / 60, sec_remaining / 60 % 60, sec_remaining % 60); zcb->zcb_lastprint = now; } return (0); } static void zdb_leak(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); } static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; static void zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) { ddt_bookmark_t ddb; ddt_entry_t dde; int error; bzero(&ddb, sizeof (ddb)); while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { blkptr_t blk; ddt_phys_t *ddp = dde.dde_phys; if (ddb.ddb_class == DDT_CLASS_UNIQUE) return; ASSERT(ddt_phys_total_refcnt(&dde) > 1); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddb.ddb_checksum, &dde.dde_key, ddp, &blk); if (p == DDT_PHYS_DITTO) { zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); } else { zcb->zcb_dedup_asize += BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); zcb->zcb_dedup_blocks++; } } if (!dump_opt['L']) { ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; ddt_enter(ddt); VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); ddt_exit(ddt); } } ASSERT(error == ENOENT); } /* ARGSUSED */ static void claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { /* * This callback was called through a remap from * a device being removed. Therefore, the vdev that * this callback is applied to is a concrete * vdev. */ ASSERT(vdev_is_concrete(vd)); VERIFY0(metaslab_claim_impl(vd, offset, size, spa_min_claim_txg(vd->vdev_spa))); } static void claim_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; vdev_indirect_ops.vdev_op_remap(vd, offset, size, claim_segment_impl_cb, NULL); } /* * After accounting for all allocated blocks that are directly referenced, * we might have missed a reference to a block from a partially complete * (and thus unused) indirect mapping object. We perform a secondary pass * through the metaslabs we have already mapped and claim the destination * blocks. */ static void zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) { if (spa->spa_vdev_removal == NULL) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) break; ASSERT0(range_tree_space(svr->svr_allocd_segs)); if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); /* * Clear everything past what has been synced unless * it's past the spacemap, because we have not allocated * mappings for it yet. */ uint64_t vim_max_offset = vdev_indirect_mapping_max_offset(vim); uint64_t sm_end = msp->ms_sm->sm_start + msp->ms_sm->sm_size; if (sm_end > vim_max_offset) range_tree_clear(svr->svr_allocd_segs, vim_max_offset, sm_end - vim_max_offset); } zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); } spa_config_exit(spa, SCL_CONFIG, FTAG); } /* ARGSUSED */ static int increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zdb_cb_t *zcb = arg; spa_t *spa = zcb->zcb_spa; vdev_t *vd; const dva_t *dva = &bp->blk_dva[0]; ASSERT(!dump_opt['L']); ASSERT3U(BP_GET_NDVAS(bp), ==, 1); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); vdev_indirect_mapping_increment_obsolete_count( vd->vdev_indirect_mapping, DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), zcb->zcb_vd_obsolete_counts[vd->vdev_id]); return (0); } static uint32_t * zdb_load_obsolete_counts(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint32_t *counts; EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL); counts = vdev_indirect_mapping_load_obsolete_counts(vim); if (vd->vdev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, vd->vdev_obsolete_sm); } if (scip->scip_vdev == vd->vdev_id && scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); space_map_update(prev_obsolete_sm); vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, prev_obsolete_sm); space_map_close(prev_obsolete_sm); } return (counts); } typedef struct checkpoint_sm_exclude_entry_arg { vdev_t *cseea_vd; uint64_t cseea_checkpoint_size; } checkpoint_sm_exclude_entry_arg_t; static int checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) { checkpoint_sm_exclude_entry_arg_t *cseea = arg; vdev_t *vd = cseea->cseea_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); /* * Since the vdev_checkpoint_sm exists in the vdev level * and the ms_sm space maps exist in the metaslab level, * an entry in the checkpoint space map could theoretically * cross the boundaries of the metaslab that it belongs. * * In reality, because of the way that we populate and * manipulate the checkpoint's space maps currently, * there shouldn't be any entries that cross metaslabs. * Hence the assertion below. * * That said, there is no fundamental requirement that * the checkpoint's space map entries should not cross * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * By removing the entry from the allocated segments we * also verify that the entry is there to begin with. */ mutex_enter(&ms->ms_lock); range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); cseea->cseea_checkpoint_size += sme->sme_run; return (0); } static void zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) { spa_t *spa = vd->vdev_spa; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; /* * If there is no vdev_top_zap, we are in a pool whose * version predates the pool checkpoint feature. */ if (vd->vdev_top_zap == 0) return; /* * If there is no reference of the vdev_checkpoint_sm in * the vdev_top_zap, then one of the following scenarios * is true: * * 1] There is no checkpoint * 2] There is a checkpoint, but no checkpointed blocks * have been freed yet * 3] The current vdev is indirect * * In these cases we return immediately. */ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) return; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); checkpoint_sm_exclude_entry_arg_t cseea; cseea.cseea_vd = vd; cseea.cseea_checkpoint_size = 0; VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); space_map_update(checkpoint_sm); VERIFY0(space_map_iterate(checkpoint_sm, checkpoint_sm_exclude_entry_cb, &cseea)); space_map_close(checkpoint_sm); zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; } static void zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); } } static void load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; ASSERT3U(i, ==, vd->vdev_id); if (vd->vdev_ops == &vdev_indirect_ops) continue; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; (void) fprintf(stderr, "\rloading concrete vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)msp->ms_id, (longlong_t)vd->vdev_ms_count); mutex_enter(&msp->ms_lock); metaslab_unload(msp); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, msp->ms_allocatable, maptype)); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } } } /* * vm_idxp is an in-out parameter which (for indirect vdevs) is the * index in vim_entries that has the first entry in this metaslab. * On return, it will be set to the first entry after this metaslab. */ static void load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, uint64_t *vim_idxp) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; mutex_enter(&msp->ms_lock); metaslab_unload(msp); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); (*vim_idxp)++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[*vim_idxp]; uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); ASSERT3U(ent_offset, >=, msp->ms_start); if (ent_offset >= msp->ms_start + msp->ms_size) break; /* * Mappings do not cross metaslab boundaries, * because we create them by walking the metaslabs. */ ASSERT3U(ent_offset + ent_len, <=, msp->ms_start + msp->ms_size); range_tree_add(msp->ms_allocatable, ent_offset, ent_len); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } static void zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; ASSERT3U(c, ==, vd->vdev_id); if (vd->vdev_ops != &vdev_indirect_ops) continue; /* * Note: we don't check for mapping leaks on * removing vdevs because their ms_allocatable's * are used to look for leaks in allocated space. */ zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); /* * Normally, indirect vdevs don't have any * metaslabs. We want to set them up for * zio_claim(). */ VERIFY0(vdev_metaslab_init(vd, 0)); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t vim_idx = 0; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { (void) fprintf(stderr, "\rloading indirect vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vd->vdev_ms[m]->ms_id, (longlong_t)vd->vdev_ms_count); load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], &vim_idx); } ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); } } static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; if (!dump_opt['L']) { dsl_pool_t *dp = spa->spa_dsl_pool; vdev_t *rvd = spa->spa_root_vdev; /* * We are going to be changing the meaning of the metaslab's * ms_allocatable. Ensure that the allocator doesn't try to * use the tree. */ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; zcb->zcb_vd_obsolete_counts = umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), UMEM_NOFAIL); /* * For leak detection, we overload the ms_allocatable trees * to contain allocated segments instead of free segments. * As a result, we can't use the normal metaslab_load/unload * interfaces. */ zdb_leak_init_prepare_indirect_vdevs(spa, zcb); load_concrete_ms_allocatable_trees(spa, SM_ALLOC); /* * On load_concrete_ms_allocatable_trees() we loaded all the * allocated entries from the ms_sm to the ms_allocatable for * each metaslab. If the pool has a checkpoint or is in the * middle of discarding a checkpoint, some of these blocks * may have been freed but their ms_sm may not have been * updated because they are referenced by the checkpoint. In * order to avoid false-positives during leak-detection, we * go through the vdev's checkpoint space map and exclude all * its entries from their relevant ms_allocatable. * * We also aggregate the space held by the checkpoint and add * it to zcb_checkpoint_size. * * Note that at this point we are also verifying that all the * entries on the checkpoint_sm are marked as allocated in * the ms_sm of their relevant metaslab. * [see comment in checkpoint_sm_exclude_entry_cb()] */ zdb_leak_init_exclude_checkpoint(spa, zcb); /* for cleaner progress output */ (void) fprintf(stderr, "\n"); if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } } else { /* * If leak tracing is disabled, we still need to consider * any checkpointed space in our space verification. */ zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa); } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); zdb_ddt_leak_init(spa, zcb); spa_config_exit(spa, SCL_CONFIG, FTAG); } static boolean_t zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) { boolean_t leaks = B_FALSE; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t total_leaked = 0; ASSERT(vim != NULL); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; uint64_t obsolete_bytes = 0; uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; /* * This is not very efficient but it's easy to * verify correctness. */ for (uint64_t inner_offset = 0; inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); inner_offset += 1 << vd->vdev_ashift) { if (range_tree_contains(msp->ms_allocatable, offset + inner_offset, 1 << vd->vdev_ashift)) { obsolete_bytes += 1 << vd->vdev_ashift; } } int64_t bytes_leaked = obsolete_bytes - zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); if (bytes_leaked != 0 && (vdev_obsolete_counts_are_precise(vd) || dump_opt['d'] >= 5)) { (void) printf("obsolete indirect mapping count " "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (u_longlong_t)bytes_leaked); } total_leaked += ABS(bytes_leaked); } if (!vdev_obsolete_counts_are_precise(vd) && total_leaked > 0) { int pct_leaked = total_leaked * 100 / vdev_indirect_mapping_bytes_mapped(vim); (void) printf("cannot verify obsolete indirect mapping " "counts of vdev %llu because precise feature was not " "enabled when it was removed: %d%% (%llx bytes) of mapping" "unreferenced\n", (u_longlong_t)vd->vdev_id, pct_leaked, (u_longlong_t)total_leaked); } else if (total_leaked > 0) { (void) printf("obsolete indirect mapping count mismatch " "for vdev %llu -- %llx total bytes mismatched\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)total_leaked); leaks |= B_TRUE; } vdev_indirect_mapping_free_obsolete_counts(vim, zcb->zcb_vd_obsolete_counts[vd->vdev_id]); zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; return (leaks); } static boolean_t zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) { boolean_t leaks = B_FALSE; if (!dump_opt['L']) { vdev_t *rvd = spa->spa_root_vdev; for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; metaslab_group_t *mg = vd->vdev_mg; if (zcb->zcb_vd_obsolete_counts[c] != NULL) { leaks |= zdb_check_for_obsolete_leaks(vd, zcb); } for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; ASSERT3P(mg, ==, msp->ms_group); /* * ms_allocatable has been overloaded * to contain allocated segments. Now that * we finished traversing all blocks, any * block that remains in the ms_allocatable * represents an allocated block that we * did not claim during the traversal. * Claimed blocks would have been removed * from the ms_allocatable. For indirect * vdevs, space remaining in the tree * represents parts of the mapping that are * not referenced, which is not a bug. */ if (vd->vdev_ops == &vdev_indirect_ops) { range_tree_vacate(msp->ms_allocatable, NULL, NULL); } else { range_tree_vacate(msp->ms_allocatable, zdb_leak, vd); } if (msp->ms_loaded) { msp->ms_loaded = B_FALSE; } } } umem_free(zcb->zcb_vd_obsolete_counts, rvd->vdev_children * sizeof (uint32_t *)); zcb->zcb_vd_obsolete_counts = NULL; } return (leaks); } /* ARGSUSED */ static int count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zdb_cb_t *zcb = arg; if (dump_opt['b'] >= 5) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("[%s] %s\n", "deferred free", blkbuf); } zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); return (0); } static int dump_block_stats(spa_t *spa) { zdb_cb_t zcb; zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; boolean_t leaks = B_FALSE; bzero(&zcb, sizeof (zcb)); (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", dump_opt['c'] ? "checksums " : "", (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", !dump_opt['L'] ? "nothing leaked " : ""); /* * Load all space maps as SM_ALLOC maps, then traverse the pool * claiming each block we discover. If the pool is perfectly * consistent, the space maps will be empty when we're done. * Anything left over is a leak; any block we can't claim (because * it's not part of any space map) is a double allocation, * reference to a freed block, or an unclaimed log block. */ zdb_leak_init(spa, &zcb); /* * If there's a deferred-free bplist, process that first. */ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, count_block_cb, &zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, count_block_cb, &zcb, NULL); } zdb_claim_removing(spa, &zcb); if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, &zcb, NULL)); } if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); /* * If we've traversed the data blocks then we need to wait for those * I/Os to complete. We leverage "The Godfather" zio to wait on * all async I/Os to complete. */ if (dump_opt['c']) { for (int i = 0; i < max_ncpus; i++) { (void) zio_wait(spa->spa_async_zio_root[i]); spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } } if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); for (int e = 0; e < 256; e++) { if (zcb.zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb.zcb_errors[e]); } } } /* * Report any leaked segments. */ leaks |= zdb_leak_fini(spa, &zcb); tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)); total_found = tzb->zb_asize - zcb.zcb_dedup_asize + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; if (total_found == total_alloc) { if (!dump_opt['L']) (void) printf("\n\tNo leaks (block sum matches space" " maps exactly)\n"); } else { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", (u_longlong_t)total_found, (u_longlong_t)total_alloc, (dump_opt['L']) ? "unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); leaks = B_TRUE; } if (tzb->zb_count == 0) return (2); (void) printf("\n"); (void) printf("\tbp count: %10llu\n", (u_longlong_t)tzb->zb_count); (void) printf("\tganged count: %10llu\n", (longlong_t)tzb->zb_gangs); (void) printf("\tbp logical: %10llu avg: %6llu\n", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); (void) printf("\tbp physical: %10llu avg:" " %6llu compression: %6.2f\n", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); (void) printf("\tbp allocated: %10llu avg:" " %6llu compression: %6.2f\n", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); (void) printf("\tbp deduped: %10llu ref>1:" " %6llu deduplication: %6.2f\n", (u_longlong_t)zcb.zcb_dedup_asize, (u_longlong_t)zcb.zcb_dedup_blocks, (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb.zcb_embedded_blocks[i] == 0) continue; (void) printf("\n"); (void) printf("\tadditional, non-pointer bps of type %u: " "%10llu\n", i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); if (dump_opt['b'] >= 3) { (void) printf("\t number of (compressed) bytes: " "number of bps\n"); dump_histogram(zcb.zcb_embedded_histogram[i], sizeof (zcb.zcb_embedded_histogram[i]) / sizeof (zcb.zcb_embedded_histogram[i][0]), 0); } } if (tzb->zb_ditto_samevdev != 0) { (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); } for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; if (vim == NULL) { continue; } char mem[32]; zdb_nicenum(vdev_indirect_mapping_num_entries(vim), mem, vdev_indirect_mapping_size(vim)); (void) printf("\tindirect vdev id %llu has %llu segments " "(%s in memory)\n", (longlong_t)vd->vdev_id, (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); } if (dump_opt['b'] >= 2) { int l, t, level; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); for (t = 0; t <= ZDB_OT_TOTAL; t++) { char csize[32], lsize[32], psize[32], asize[32]; char avg[32], gang[32]; const char *typename; /* make sure nicenum has enough space */ CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ); CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ); if (t < DMU_OT_NUMTYPES) typename = dmu_ot[t].ot_name; else typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" "\t%5s\t%5s\t%6s\t%s\n", "-", "-", "-", "-", "-", "-", "-", typename); continue; } for (l = ZB_TOTAL - 1; l >= -1; l--) { level = (l == -1 ? ZB_TOTAL : l); zb = &zcb.zcb_type[level][t]; if (zb->zb_asize == 0) continue; if (dump_opt['b'] < 3 && level != ZB_TOTAL) continue; if (level == 0 && zb->zb_asize == zcb.zcb_type[ZB_TOTAL][t].zb_asize) continue; zdb_nicenum(zb->zb_count, csize, sizeof (csize)); zdb_nicenum(zb->zb_lsize, lsize, sizeof (lsize)); zdb_nicenum(zb->zb_psize, psize, sizeof (psize)); zdb_nicenum(zb->zb_asize, asize, sizeof (asize)); zdb_nicenum(zb->zb_asize / zb->zb_count, avg, sizeof (avg)); zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)zb->zb_lsize / zb->zb_psize, 100.0 * zb->zb_asize / tzb->zb_asize); if (level == ZB_TOTAL) (void) printf("%s\n", typename); else (void) printf(" L%d %s\n", level, typename); if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { (void) printf("\t number of ganged " "blocks: %s\n", gang); } if (dump_opt['b'] >= 4) { (void) printf("psize " "(in 512-byte sectors): " "number of blocks\n"); dump_histogram(zb->zb_psize_histogram, PSIZE_HISTO_SIZE, 0); } } } } (void) printf("\n"); if (leaks) return (2); if (zcb.zcb_haderrors) return (3); return (0); } typedef struct zdb_ddt_entry { ddt_key_t zdde_key; uint64_t zdde_ref_blocks; uint64_t zdde_ref_lsize; uint64_t zdde_ref_psize; uint64_t zdde_ref_dsize; avl_node_t zdde_node; } zdb_ddt_entry_t; /* ARGSUSED */ static int zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { avl_tree_t *t = arg; avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { (void) printf("traversing objset %llu, %llu objects, " "%lu blocks so far\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)BP_GET_FILL(bp), avl_numnodes(t)); } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); zdde = avl_find(t, &zdde_search, &where); if (zdde == NULL) { zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); zdde->zdde_key = zdde_search.zdde_key; avl_insert(t, zdde, where); } zdde->zdde_ref_blocks += 1; zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); zdde->zdde_ref_psize += BP_GET_PSIZE(bp); zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); return (0); } static void dump_simulated_ddt(spa_t *spa) { avl_tree_t t; void *cookie = NULL; zdb_ddt_entry_t *zdde; ddt_histogram_t ddh_total; ddt_stat_t dds_total; bzero(&ddh_total, sizeof (ddh_total)); bzero(&dds_total, sizeof (dds_total)); avl_create(&t, ddt_entry_compare, sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zdb_ddt_add_cb, &t); spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { ddt_stat_t dds; uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; dds.dds_psize = zdde->zdde_ref_psize / refcnt; dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; dds.dds_ref_blocks = zdde->zdde_ref_blocks; dds.dds_ref_lsize = zdde->zdde_ref_lsize; dds.dds_ref_psize = zdde->zdde_ref_psize; dds.dds_ref_dsize = zdde->zdde_ref_dsize; ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], &dds, 0); umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); ddt_histogram_stat(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); zpool_dump_ddt(&dds_total, &ddh_total); dump_dedup_ratio(&dds_total); } static int verify_device_removal_feature_counts(spa_t *spa) { uint64_t dr_feature_refcount = 0; uint64_t oc_feature_refcount = 0; uint64_t indirect_vdev_count = 0; uint64_t precise_vdev_count = 0; uint64_t obsolete_counts_object_count = 0; uint64_t obsolete_sm_count = 0; uint64_t obsolete_counts_count = 0; uint64_t scip_count = 0; uint64_t obsolete_bpobj_count = 0; int ret = 0; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; if (scip->scip_next_mapping_object != 0) { vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); (void) printf("Condensing indirect vdev %llu: new mapping " "object %llu, prev obsolete sm %llu\n", (u_longlong_t)scip->scip_vdev, (u_longlong_t)scip->scip_next_mapping_object, (u_longlong_t)scip->scip_prev_obsolete_sm_object); if (scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); space_map_update(prev_obsolete_sm); dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); (void) printf("\n"); space_map_close(prev_obsolete_sm); } scip_count += 2; } for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (vic->vic_mapping_object != 0) { ASSERT(vd->vdev_ops == &vdev_indirect_ops || vd->vdev_removing); indirect_vdev_count++; if (vd->vdev_indirect_mapping->vim_havecounts) { obsolete_counts_count++; } } if (vdev_obsolete_counts_are_precise(vd)) { ASSERT(vic->vic_mapping_object != 0); precise_vdev_count++; } if (vdev_obsolete_sm_object(vd) != 0) { ASSERT(vic->vic_mapping_object != 0); obsolete_sm_count++; } } (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], &dr_feature_refcount); (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], &oc_feature_refcount); if (dr_feature_refcount != indirect_vdev_count) { ret = 1; (void) printf("Number of indirect vdevs (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)indirect_vdev_count, (u_longlong_t)dr_feature_refcount); } else { (void) printf("Verified device_removal feature refcount " \ "of %llu is correct\n", (u_longlong_t)dr_feature_refcount); } if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ) == 0) { obsolete_bpobj_count++; } obsolete_counts_object_count = precise_vdev_count; obsolete_counts_object_count += obsolete_sm_count; obsolete_counts_object_count += obsolete_counts_count; obsolete_counts_object_count += scip_count; obsolete_counts_object_count += obsolete_bpobj_count; obsolete_counts_object_count += remap_deadlist_count; if (oc_feature_refcount != obsolete_counts_object_count) { ret = 1; (void) printf("Number of obsolete counts objects (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)obsolete_counts_object_count, (u_longlong_t)oc_feature_refcount); (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " "ob:%llu rd:%llu\n", (u_longlong_t)precise_vdev_count, (u_longlong_t)obsolete_sm_count, (u_longlong_t)obsolete_counts_count, (u_longlong_t)scip_count, (u_longlong_t)obsolete_bpobj_count, (u_longlong_t)remap_deadlist_count); } else { (void) printf("Verified indirect_refcount feature refcount " \ "of %llu is correct\n", (u_longlong_t)oc_feature_refcount); } return (ret); } #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" /* * Import the checkpointed state of the pool specified by the target * parameter as readonly. The function also accepts a pool config * as an optional parameter, else it attempts to infer the config by * the name of the target pool. * * Note that the checkpointed state's pool name will be the name of * the original pool with the above suffix appened to it. In addition, * if the target is not a pool name (e.g. a path to a dataset) then * the new_path parameter is populated with the updated path to * reflect the fact that we are looking into the checkpointed state. * * The function returns a newly-allocated copy of the name of the * pool containing the checkpointed state. When this copy is no * longer needed it should be freed with free(3C). Same thing * applies to the new_path parameter if allocated. */ static char * import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) { int error = 0; char *poolname, *bogus_name; /* If the target is not a pool, the extract the pool name */ char *path_start = strchr(target, '/'); if (path_start != NULL) { size_t poolname_len = path_start - target; poolname = strndup(target, poolname_len); } else { poolname = target; } if (cfg == NULL) { error = spa_get_stats(poolname, &cfg, NULL, 0); if (error != 0) { fatal("Tried to read config of pool \"%s\" but " "spa_get_stats() failed with error %d\n", poolname, error); } } (void) asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX); fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); error = spa_import(bogus_name, cfg, NULL, ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT); if (error != 0) { fatal("Tried to import pool \"%s\" but spa_import() failed " "with error %d\n", bogus_name, error); } if (new_path != NULL && path_start != NULL) (void) asprintf(new_path, "%s%s", bogus_name, path_start); if (target != poolname) free(poolname); return (bogus_name); } typedef struct verify_checkpoint_sm_entry_cb_arg { vdev_t *vcsec_vd; /* the following fields are only used for printing progress */ uint64_t vcsec_entryid; uint64_t vcsec_num_entries; } verify_checkpoint_sm_entry_cb_arg_t; #define ENTRIES_PER_PROGRESS_UPDATE 10000 static int verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) { verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; vdev_t *vd = vcsec->vcsec_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { (void) fprintf(stderr, "\rverifying vdev %llu, space map entry %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vcsec->vcsec_entryid, (longlong_t)vcsec->vcsec_num_entries); } vcsec->vcsec_entryid++; /* * See comment in checkpoint_sm_exclude_entry_cb() */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * The entries in the vdev_checkpoint_sm should be marked as * allocated in the checkpointed state of the pool, therefore * their respective ms_allocateable trees should not contain them. */ mutex_enter(&ms->ms_lock); range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); return (0); } /* * Verify that all segments in the vdev_checkpoint_sm are allocated * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's * ms_allocatable). * * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of * each vdev in the current state of the pool to the metaslab space maps * (ms_sm) of the checkpointed state of the pool. * * Note that the function changes the state of the ms_allocatable * trees of the current spa_t. The entries of these ms_allocatable * trees are cleared out and then repopulated from with the free * entries of their respective ms_sm space maps. */ static void verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; vdev_t *current_vd = current_rvd->vdev_child[c]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * Since we don't allow device removal in a pool * that has a checkpoint, we expect that all removed * vdevs were removed from the pool before the * checkpoint. */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } /* * If the checkpoint space map doesn't exist, then nothing * here is checkpointed so there's nothing to verify. */ if (current_vd->vdev_top_zap == 0 || zap_contains(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), checkpoint_sm_obj, 0, current_vd->vdev_asize, current_vd->vdev_ashift)); space_map_update(checkpoint_sm); verify_checkpoint_sm_entry_cb_arg_t vcsec; vcsec.vcsec_vd = ckpoint_vd; vcsec.vcsec_entryid = 0; vcsec.vcsec_num_entries = space_map_length(checkpoint_sm) / sizeof (uint64_t); VERIFY0(space_map_iterate(checkpoint_sm, verify_checkpoint_sm_entry_cb, &vcsec)); dump_spacemap(current->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } /* * If we've added vdevs since we took the checkpoint, ensure * that their checkpoint space maps are empty. */ if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { for (uint64_t c = ckpoint_rvd->vdev_children; c < current_rvd->vdev_children; c++) { vdev_t *current_vd = current_rvd->vdev_child[c]; ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } /* * Verifies that all space that's allocated in the checkpoint is * still allocated in the current version, by checking that everything * in checkpoint's ms_allocatable (which is actually allocated, not * allocatable/free) is not present in current's ms_allocatable. * * Note that the function changes the state of the ms_allocatable * trees of both spas when called. The entries of all ms_allocatable * trees are cleared out and then repopulated from their respective * ms_sm space maps. In the checkpointed state we load the allocated * entries, and in the current state we load the free entries. */ static void verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); load_concrete_ms_allocatable_trees(current, SM_FREE); for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; vdev_t *current_vd = current_rvd->vdev_child[i]; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * See comment in verify_checkpoint_vdev_spacemaps() */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; metaslab_t *current_msp = current_vd->vdev_ms[m]; (void) fprintf(stderr, "\rverifying vdev %llu of %llu, " "metaslab %llu of %llu ...", (longlong_t)current_vd->vdev_id, (longlong_t)current_rvd->vdev_children, (longlong_t)current_vd->vdev_ms[m]->ms_id, (longlong_t)current_vd->vdev_ms_count); /* * We walk through the ms_allocatable trees that * are loaded with the allocated blocks from the * ms_sm spacemaps of the checkpoint. For each * one of these ranges we ensure that none of them * exists in the ms_allocatable trees of the * current state which are loaded with the ranges * that are currently free. * * This way we ensure that none of the blocks that * are part of the checkpoint were freed by mistake. */ range_tree_walk(ckpoint_msp->ms_allocatable, (range_tree_func_t *)range_tree_verify, current_msp->ms_allocatable); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } static void verify_checkpoint_blocks(spa_t *spa) { spa_t *checkpoint_spa; char *checkpoint_pool; nvlist_t *config = NULL; int error = 0; /* * We import the checkpointed state of the pool (under a different * name) so we can do verification on it against the current state * of the pool. */ checkpoint_pool = import_checkpointed_state(spa->spa_name, config, NULL); ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but spa_open() failed with " "error %d\n", checkpoint_pool, error); } /* * Ensure that ranges in the checkpoint space maps of each vdev * are allocated according to the checkpointed state's metaslab * space maps. */ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); /* * Ensure that allocated ranges in the checkpoint's metaslab * space maps remain allocated in the metaslab space maps of * the current state. */ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); /* * Once we are done, we get rid of the checkpointed state. */ spa_close(checkpoint_spa, FTAG); free(checkpoint_pool); } static void dump_leftover_checkpoint_blocks(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (vd->vdev_top_zap == 0) continue; if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); space_map_update(checkpoint_sm); dump_spacemap(spa->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } } static int verify_checkpoint(spa_t *spa) { uberblock_t checkpoint; int error; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (0); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT && !dump_opt['L']) { /* * If the feature is active but the uberblock is missing * then we must be in the middle of discarding the * checkpoint. */ (void) printf("\nPartially discarded checkpoint " "state found:\n"); dump_leftover_checkpoint_blocks(spa); return (0); } else if (error != 0) { (void) printf("lookup error %d when looking for " "checkpointed uberblock in MOS\n", error); return (error); } dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); if (checkpoint.ub_checkpoint_txg == 0) { (void) printf("\nub_checkpoint_txg not set in checkpointed " "uberblock\n"); error = 3; } if (error == 0 && !dump_opt['L']) verify_checkpoint_blocks(spa); return (error); } /* ARGSUSED */ static void mos_leaks_cb(void *arg, uint64_t start, uint64_t size) { for (uint64_t i = start; i < size; i++) { (void) printf("MOS object %llu referenced but not allocated\n", (u_longlong_t)i); } } static range_tree_t *mos_refd_objs; static void mos_obj_refd(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL) range_tree_add(mos_refd_objs, obj, 1); } static void mos_leak_vdev(vdev_t *vd) { mos_obj_refd(vd->vdev_dtl_object); mos_obj_refd(vd->vdev_ms_array); mos_obj_refd(vd->vdev_top_zap); mos_obj_refd(vd->vdev_indirect_config.vic_births_object); mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); mos_obj_refd(vd->vdev_leaf_zap); if (vd->vdev_checkpoint_sm != NULL) mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); if (vd->vdev_indirect_mapping != NULL) { mos_obj_refd(vd->vdev_indirect_mapping-> vim_phys->vimp_counts_object); } if (vd->vdev_obsolete_sm != NULL) mos_obj_refd(vd->vdev_obsolete_sm->sm_object); for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *ms = vd->vdev_ms[m]; mos_obj_refd(space_map_object(ms->ms_sm)); } for (uint64_t c = 0; c < vd->vdev_children; c++) { mos_leak_vdev(vd->vdev_child[c]); } } static int dump_mos_leaks(spa_t *spa) { int rv = 0; objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; /* Visit and mark all referenced objects in the MOS */ mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); mos_obj_refd(spa->spa_pool_props_object); mos_obj_refd(spa->spa_config_object); mos_obj_refd(spa->spa_ddt_stat_object); mos_obj_refd(spa->spa_feat_desc_obj); mos_obj_refd(spa->spa_feat_enabled_txg_obj); mos_obj_refd(spa->spa_feat_for_read_obj); mos_obj_refd(spa->spa_feat_for_write_obj); mos_obj_refd(spa->spa_history); mos_obj_refd(spa->spa_errlog_last); mos_obj_refd(spa->spa_errlog_scrub); mos_obj_refd(spa->spa_all_vdev_zaps); mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); bpobj_count_refd(&spa->spa_deferred_bpobj); mos_obj_refd(dp->dp_empty_bpobj); bpobj_count_refd(&dp->dp_obsolete_bpobj); bpobj_count_refd(&dp->dp_free_bpobj); mos_obj_refd(spa->spa_l2cache.sav_object); mos_obj_refd(spa->spa_spares.sav_object); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_next_mapping_object); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_prev_obsolete_sm_object); if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(mos, spa->spa_condensing_indirect_phys.scip_next_mapping_object); mos_obj_refd(vim->vim_phys->vimp_counts_object); vdev_indirect_mapping_close(vim); } if (dp->dp_origin_snap != NULL) { dsl_dataset_t *ds; dsl_pool_config_enter(dp, FTAG); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, FTAG, &ds)); count_ds_mos_objects(ds); dump_deadlist(&ds->ds_deadlist); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); count_ds_mos_objects(dp->dp_origin_snap); dump_deadlist(&dp->dp_origin_snap->ds_deadlist); } count_dir_mos_objects(dp->dp_mos_dir); if (dp->dp_free_dir != NULL) count_dir_mos_objects(dp->dp_free_dir); if (dp->dp_leak_dir != NULL) count_dir_mos_objects(dp->dp_leak_dir); mos_leak_vdev(spa->spa_root_vdev); for (uint64_t class = 0; class < DDT_CLASSES; class++) { for (uint64_t type = 0; type < DDT_TYPES; type++) { for (uint64_t cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { ddt_t *ddt = spa->spa_ddt[cksum]; mos_obj_refd(ddt->ddt_object[type][class]); } } } /* * Visit all allocated objects and make sure they are referenced. */ uint64_t object = 0; while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { if (range_tree_contains(mos_refd_objs, object, 1)) { range_tree_remove(mos_refd_objs, object, 1); } else { dmu_object_info_t doi; const char *name; dmu_object_info(mos, object, &doi); if (doi.doi_type & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(doi.doi_type); name = dmu_ot_byteswap[bswap].ob_name; } else { name = dmu_ot[doi.doi_type].ot_name; } (void) printf("MOS object %llu (%s) leaked\n", (u_longlong_t)object, name); rv = 2; } } (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); if (!range_tree_is_empty(mos_refd_objs)) rv = 2; range_tree_vacate(mos_refd_objs, NULL, NULL); range_tree_destroy(mos_refd_objs); return (rv); } static void dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; if (dump_opt['S']) { dump_simulated_ddt(spa); return; } if (!dump_opt['e'] && dump_opt['C'] > 1) { (void) printf("\nCached configuration:\n"); dump_nvlist(spa->spa_config, 8); } if (dump_opt['C']) dump_config(spa); if (dump_opt['u']) dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); if (dump_opt['D']) dump_all_ddts(spa); if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa); if (dump_opt['d'] || dump_opt['i']) { mos_refd_objs = range_tree_create(NULL, NULL); dump_dir(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dsl_pool_t *dp = spa->spa_dsl_pool; dump_full_bpobj(&spa->spa_deferred_bpobj, "Deferred frees", 0); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { dump_full_bpobj(&dp->dp_free_bpobj, "Pool snapshot frees", 0); } if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); dump_full_bpobj(&dp->dp_obsolete_bpobj, "Pool obsolete blocks", 0); } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { dump_bptree(spa->spa_meta_objset, dp->dp_bptree_obj, "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); } (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); if (rc == 0 && !dump_opt['L']) rc = dump_mos_leaks(spa); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { uint64_t refcount; if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) { ASSERT0(dataset_feature_count[f]); continue; } (void) feature_get_refcount(spa, &spa_feature_table[f], &refcount); if (dataset_feature_count[f] != refcount) { (void) printf("%s feature refcount mismatch: " "%lld datasets != %lld refcount\n", spa_feature_table[f].fi_uname, (longlong_t)dataset_feature_count[f], (longlong_t)refcount); rc = 2; } else { (void) printf("Verified %s feature refcount " "of %llu is correct\n", spa_feature_table[f].fi_uname, (longlong_t)refcount); } } if (rc == 0) { rc = verify_device_removal_feature_counts(spa); } } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); if (rc == 0) rc = verify_spacemap_refcounts(spa); if (dump_opt['s']) show_pool_stats(spa); if (dump_opt['h']) dump_history(spa); if (rc == 0) rc = verify_checkpoint(spa); if (rc != 0) { dump_debug_buffer(); exit(rc); } } #define ZDB_FLAG_CHECKSUM 0x0001 #define ZDB_FLAG_DECOMPRESS 0x0002 #define ZDB_FLAG_BSWAP 0x0004 #define ZDB_FLAG_GBH 0x0008 #define ZDB_FLAG_INDIRECT 0x0010 #define ZDB_FLAG_PHYS 0x0020 #define ZDB_FLAG_RAW 0x0040 #define ZDB_FLAG_PRINT_BLKPTR 0x0080 static int flagbits[256]; static void zdb_print_blkptr(blkptr_t *bp, int flags) { char blkbuf[BP_SPRINTF_LEN]; if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static void zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) { int i; for (i = 0; i < nbps; i++) zdb_print_blkptr(&bp[i], flags); } static void zdb_dump_gbh(void *buf, int flags) { zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); } static void zdb_dump_block_raw(void *buf, uint64_t size, int flags) { if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array(buf, size); (void) write(1, buf, size); } static void zdb_dump_block(char *label, void *buf, uint64_t size, int flags) { uint64_t *d = (uint64_t *)buf; unsigned nwords = size / sizeof (uint64_t); int do_bswap = !!(flags & ZDB_FLAG_BSWAP); unsigned i, j; const char *hdr; char *c; if (do_bswap) hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; else hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); for (i = 0; i < nwords; i += 2) { (void) printf("%06llx: %016llx %016llx ", (u_longlong_t)(i * sizeof (uint64_t)), (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); c = (char *)&d[i]; for (j = 0; j < 2 * sizeof (uint64_t); j++) (void) printf("%c", isprint(c[j]) ? c[j] : '.'); (void) printf("\n"); } } /* * There are two acceptable formats: * leaf_name - For example: c1t0d0 or /tmp/ztest.0a * child[.child]* - For example: 0.1.1 * * The second form can be used to specify arbitrary vdevs anywhere * in the heirarchy. For example, in a pool with a mirror of * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . */ static vdev_t * zdb_vdev_lookup(vdev_t *vdev, const char *path) { char *s, *p, *q; unsigned i; if (vdev == NULL) return (NULL); /* First, assume the x.x.x.x format */ i = strtoul(path, &s, 10); if (s == path || (s && *s != '.' && *s != '\0')) goto name; if (i >= vdev->vdev_children) return (NULL); vdev = vdev->vdev_child[i]; if (*s == '\0') return (vdev); return (zdb_vdev_lookup(vdev, s+1)); name: for (i = 0; i < vdev->vdev_children; i++) { vdev_t *vc = vdev->vdev_child[i]; if (vc->vdev_path == NULL) { vc = zdb_vdev_lookup(vc, path); if (vc == NULL) continue; else return (vc); } p = strrchr(vc->vdev_path, '/'); p = p ? p + 1 : vc->vdev_path; q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; if (strcmp(vc->vdev_path, path) == 0) return (vc); if (strcmp(p, path) == 0) return (vc); if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) return (vc); } return (NULL); } /* ARGSUSED */ static int random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused) { return (random_get_pseudo_bytes(buf, len)); } /* * Read a block from a pool and print it out. The syntax of the * block descriptor is: * * pool:vdev_specifier:offset:size[:flags] * * pool - The name of the pool you wish to read from * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) * offset - offset, in hex, in bytes * size - Amount of data to read, in hex, in bytes * flags - A string of characters specifying options * b: Decode a blkptr at given offset within block * *c: Calculate and display checksums * d: Decompress data before dumping * e: Byteswap data before dumping * g: Display data as a gang block header * i: Display as an indirect block * p: Do I/O to physical offset * r: Dump raw data to stdout * * * = not yet implemented */ static void zdb_read_block(char *thing, spa_t *spa) { blkptr_t blk, *bp = &blk; dva_t *dva = bp->blk_dva; int flags = 0; uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; abd_t *pabd; void *lbuf, *buf; const char *s, *vdev; char *p, *dup, *flagstr; int i, error; dup = strdup(thing); s = strtok(dup, ":"); vdev = s ? s : ""; s = strtok(NULL, ":"); offset = strtoull(s ? s : "", NULL, 16); s = strtok(NULL, ":"); size = strtoull(s ? s : "", NULL, 16); s = strtok(NULL, ":"); if (s) flagstr = strdup(s); else flagstr = strdup(""); s = NULL; if (size == 0) s = "size must not be zero"; if (!IS_P2ALIGNED(size, DEV_BSIZE)) s = "size must be a multiple of sector size"; if (!IS_P2ALIGNED(offset, DEV_BSIZE)) s = "offset must be a multiple of sector size"; if (s) { (void) printf("Invalid block specifier: %s - %s\n", thing, s); free(flagstr); free(dup); return; } for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { for (i = 0; flagstr[i]; i++) { int bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { (void) printf("***Invalid flag: %c\n", flagstr[i]); continue; } flags |= bit; /* If it's not something with an argument, keep going */ if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_PRINT_BLKPTR)) == 0) continue; p = &flagstr[i + 1]; if (bit == ZDB_FLAG_PRINT_BLKPTR) blkptr_offset = strtoull(p, &p, 16); if (*p != ':' && *p != '\0') { (void) printf("***Invalid flag arg: '%s'\n", s); free(flagstr); free(dup); return; } i += p - &flagstr[i + 1]; /* skip over the number */ } } free(flagstr); vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { (void) printf("***Invalid vdev: %s\n", vdev); free(dup); return; } else { if (vd->vdev_path) (void) fprintf(stderr, "Found vdev: %s\n", vd->vdev_path); else (void) fprintf(stderr, "Found vdev type: %s\n", vd->vdev_ops->vdev_op_type); } psize = size; lsize = size; pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); DVA_SET_VDEV(&dva[0], vd->vdev_id); DVA_SET_OFFSET(&dva[0], offset); DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, lsize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, 0); if (vd == vd->vdev_top) { /* * Treat this as a normal block read. */ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); if (error) { (void) printf("Read of %s failed, error: %d\n", thing, error); goto out; } if (flags & ZDB_FLAG_DECOMPRESS) { /* * We don't know how the data was compressed, so just try * every decompress function at every inflated blocksize. */ enum zio_compress c; void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); abd_copy_to_buf(pbuf2, pabd, psize); VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize, random_get_pseudo_bytes_cb, NULL)); VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, SPA_MAXBLOCKSIZE - psize)); for (lsize = SPA_MAXBLOCKSIZE; lsize > psize; lsize -= SPA_MINBLOCKSIZE) { for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) { if (zio_decompress_data(c, pabd, lbuf, psize, lsize) == 0 && zio_decompress_data_buf(c, pbuf2, lbuf2, psize, lsize) == 0 && bcmp(lbuf, lbuf2, lsize) == 0) break; } if (c != ZIO_COMPRESS_FUNCTIONS) break; lsize -= SPA_MINBLOCKSIZE; } umem_free(pbuf2, SPA_MAXBLOCKSIZE); umem_free(lbuf2, SPA_MAXBLOCKSIZE); if (lsize <= psize) { (void) printf("Decompress of %s failed\n", thing); goto out; } buf = lbuf; size = lsize; } else { buf = abd_to_buf(pabd); size = psize; } if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); else if (flags & ZDB_FLAG_RAW) zdb_dump_block_raw(buf, size, flags); else if (flags & ZDB_FLAG_INDIRECT) zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) zdb_dump_gbh(buf, flags); else zdb_dump_block(thing, buf, size, flags); out: abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); } static void zdb_embedded_block(char *thing) { blkptr_t bp; unsigned long long *words = (void *)&bp; char *buf; int err; bzero(&bp, sizeof (bp)); err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", words + 0, words + 1, words + 2, words + 3, words + 4, words + 5, words + 6, words + 7, words + 8, words + 9, words + 10, words + 11, words + 12, words + 13, words + 14, words + 15); if (err != 16) { (void) fprintf(stderr, "invalid input format\n"); exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); exit(1); } err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { (void) fprintf(stderr, "decode failed: %u\n", err); free(buf); exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); free(buf); } static boolean_t pool_match(nvlist_t *cfg, char *tgt) { uint64_t v, guid = strtoull(tgt, NULL, 0); char *s; if (guid != 0) { if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) return (v == guid); } else { if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) return (strcmp(s, tgt) == 0); } return (B_FALSE); } static char * find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv) { nvlist_t *pools; nvlist_t *match = NULL; char *name = NULL; char *sepp = NULL; char sep = '\0'; int count = 0; importargs_t args; bzero(&args, sizeof (args)); args.paths = dirc; args.path = dirv; args.can_be_active = B_TRUE; if ((sepp = strpbrk(*target, "/@")) != NULL) { sep = *sepp; *sepp = '\0'; } pools = zpool_search_import(g_zfs, &args); if (pools != NULL) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, configp) == 0); if (pool_match(*configp, *target)) { count++; if (match != NULL) { /* print previously found config */ if (name != NULL) { (void) printf("%s\n", name); dump_nvlist(match, 8); name = NULL; } (void) printf("%s\n", nvpair_name(elem)); dump_nvlist(*configp, 8); } else { match = *configp; name = nvpair_name(elem); } } } } if (count > 1) (void) fatal("\tMatched %d pools - use pool GUID " "instead of pool name or \n" "\tpool name part of a dataset name to select pool", count); if (sepp) *sepp = sep; /* * If pool GUID was specified for pool id, replace it with pool name */ if (name && (strstr(*target, name) != *target)) { int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0); *target = umem_alloc(sz, UMEM_NOFAIL); (void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : ""); } *configp = name ? match : NULL; return (name); } int main(int argc, char **argv) { int c; struct rlimit rl = { 1024, 1024 }; spa_t *spa = NULL; objset_t *os = NULL; int dump_all = 1; int verbose = 0; int error = 0; char **searchdirs = NULL; int nsearch = 0; char *target; nvlist_t *policy = NULL; uint64_t max_txg = UINT64_MAX; int flags = ZFS_IMPORT_MISSING_LOG; int rewind = ZPOOL_NEVER_REWIND; char *spa_config_path_env; boolean_t target_is_spa = B_TRUE; nvlist_t *cfg = NULL; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); dprintf_setup(&argc, argv); /* * If there is an environment variable SPA_CONFIG_PATH it overrides * default spa_config_path setting. If -U flag is specified it will * override this environment variable settings once again. */ spa_config_path_env = getenv("SPA_CONFIG_PATH"); if (spa_config_path_env != NULL) spa_config_path = spa_config_path_env; while ((c = getopt(argc, argv, "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) { switch (c) { case 'b': case 'c': case 'C': case 'd': case 'D': case 'E': case 'G': case 'h': case 'i': case 'l': case 'm': case 'M': case 'O': case 'R': case 's': case 'S': case 'u': dump_opt[c]++; dump_all = 0; break; case 'A': case 'e': case 'F': case 'k': case 'L': case 'P': case 'q': case 'X': dump_opt[c]++; break; /* NB: Sort single match options below. */ case 'I': max_inflight = strtoull(optarg, NULL, 0); if (max_inflight == 0) { (void) fprintf(stderr, "maximum number " "of inflight I/Os must be greater " "than 0\n"); usage(); } break; case 'o': error = set_global_var(optarg); if (error != 0) usage(); break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); } else { char **tmp = umem_alloc((nsearch + 1) * sizeof (char *), UMEM_NOFAIL); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 't': max_txg = strtoull(optarg, NULL, 0); if (max_txg < TXG_INITIAL) { (void) fprintf(stderr, "incorrect txg " "specified: %s\n", optarg); usage(); } break; case 'U': spa_config_path = optarg; if (spa_config_path[0] != '/') { (void) fprintf(stderr, "cachefile must be an absolute path " "(i.e. start with a slash)\n"); usage(); } break; case 'v': verbose++; break; case 'V': flags = ZFS_IMPORT_VERBATIM; break; case 'x': vn_dumpdir = optarg; break; default: usage(); break; } } if (!dump_opt['e'] && searchdirs != NULL) { (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } /* * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; /* * "zdb -c" uses checksum-verifying scrub i/os which are async reads. * "zdb -b" uses traversal prefetch which uses async reads. * For good performance, let several of them be active at once. */ zfs_vdev_async_read_max_active = 10; /* * Disable reference tracking for better performance. */ reference_tracking_enable = B_FALSE; /* * Do not fail spa_load when spa_load_verify fails. This is needed * to load non-idle pools. */ spa_load_verify_dryrun = B_TRUE; kernel_init(FREAD); g_zfs = libzfs_init(); if (g_zfs == NULL) fatal("Fail to initialize zfs"); if (dump_all) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { if (dump_all && strchr("AeEFklLOPRSX", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); zfs_recover = (dump_opt['A'] > 1); argc -= optind; argv += optind; if (argc < 2 && dump_opt['R']) usage(); if (dump_opt['E']) { if (argc != 1) usage(); zdb_embedded_block(argv[0]); return (0); } if (argc < 1) { if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); return (0); } usage(); } if (dump_opt['l']) return (dump_label(argv[0])); if (dump_opt['O']) { if (argc != 2) usage(); dump_opt['v'] = verbose + 3; return (dump_path(argv[0], argv[1])); } if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) fatal("internal error: %s", strerror(ENOMEM)); error = 0; target = argv[0]; if (dump_opt['e']) { char *name = find_zpool(&target, &cfg, nsearch, searchdirs); error = ENOENT; if (name) { if (dump_opt['C'] > 1) { (void) printf("\nConfiguration for import:\n"); dump_nvlist(cfg, 8); } if (nvlist_add_nvlist(cfg, ZPOOL_LOAD_POLICY, policy) != 0) { fatal("can't open '%s': %s", target, strerror(ENOMEM)); } error = spa_import(name, cfg, NULL, flags); } } char *checkpoint_pool = NULL; char *checkpoint_target = NULL; if (dump_opt['k']) { checkpoint_pool = import_checkpointed_state(target, cfg, &checkpoint_target); if (checkpoint_target != NULL) target = checkpoint_target; } if (strpbrk(target, "/@") != NULL) { size_t targetlen; target_is_spa = B_FALSE; /* * Remove any trailing slash. Later code would get confused * by it, but we want to allow it so that "pool/" can * indicate that we want to dump the topmost filesystem, * rather than the whole pool. */ targetlen = strlen(target); if (targetlen != 0 && target[targetlen - 1] == '/') target[targetlen - 1] = '\0'; } if (error == 0) { if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { ASSERT(checkpoint_pool != NULL); ASSERT(checkpoint_target == NULL); error = spa_open(checkpoint_pool, &spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but " "spa_open() failed with error %d\n", checkpoint_pool, error); } } else if (target_is_spa || dump_opt['R']) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); if (error) { /* * If we're missing the log device then * try opening the pool after clearing the * log state. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL && spa->spa_log_state == SPA_LOG_MISSING) { spa->spa_log_state = SPA_LOG_CLEAR; error = 0; } mutex_exit(&spa_namespace_lock); if (!error) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); } } } else { error = open_objset(target, DMU_OST_ANY, FTAG, &os); } } nvlist_free(policy); if (error) fatal("can't open '%s': %s", target, strerror(error)); argv++; argc--; if (!dump_opt['R']) { if (argc > 0) { zopt_objects = argc; zopt_object = calloc(zopt_objects, sizeof (uint64_t)); for (unsigned i = 0; i < zopt_objects; i++) { errno = 0; zopt_object[i] = strtoull(argv[i], NULL, 0); if (zopt_object[i] == 0 && errno != 0) fatal("bad number %s: %s", argv[i], strerror(errno)); } } if (os != NULL) { dump_dir(os); } else if (zopt_objects > 0 && !dump_opt['m']) { dump_dir(spa->spa_meta_objset); } else { dump_zpool(spa); } } else { flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; flagbits['c'] = ZDB_FLAG_CHECKSUM; flagbits['d'] = ZDB_FLAG_DECOMPRESS; flagbits['e'] = ZDB_FLAG_BSWAP; flagbits['g'] = ZDB_FLAG_GBH; flagbits['i'] = ZDB_FLAG_INDIRECT; flagbits['p'] = ZDB_FLAG_PHYS; flagbits['r'] = ZDB_FLAG_RAW; for (int i = 0; i < argc; i++) zdb_read_block(argv[i], spa); } if (dump_opt['k']) { free(checkpoint_pool); if (!target_is_spa) free(checkpoint_target); } if (os != NULL) close_objset(os, FTAG); else spa_close(spa, FTAG); fuid_table_destroy(); dump_debug_buffer(); libzfs_fini(g_zfs); kernel_fini(); return (error); } Index: head/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c =================================================================== --- head/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c (revision 351076) +++ head/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c (revision 351077) @@ -1,424 +1,424 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ /* * Print intent log header and statistics. */ #include #include #include #include #include #include #include #include #include #include #include #include #include "zdb.h" extern uint8_t dump_opt[256]; static char tab_prefix[4] = "\t\t\t"; static void print_log_bp(const blkptr_t *bp, const char *prefix) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s%s\n", prefix, blkbuf); } /* ARGSUSED */ static void zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg) { lr_create_t *lr = arg; time_t crtime = lr->lr_crtime[0]; char *name, *link; lr_attr_t *lrattr; name = (char *)(lr + 1); if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR || lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) { lrattr = (lr_attr_t *)(lr + 1); name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); } if (txtype == TX_SYMLINK) { link = name + strlen(name) + 1; (void) printf("%s%s -> %s\n", tab_prefix, name, link); } else if (txtype != TX_MKXATTR) { (void) printf("%s%s\n", tab_prefix, name); } (void) printf("%s%s", tab_prefix, ctime(&crtime)); - (void) printf("%sdoid %" PRIu64 ", foid %" PRIu64 ", slots %" PRIu64 - ", mode %" PRIo64 "\n", - tab_prefix, lr->lr_doid, - (uint64_t)LR_FOID_GET_OBJ(lr->lr_foid), - (uint64_t)LR_FOID_GET_SLOTS(lr->lr_foid), - lr->lr_mode); - (void) printf("%suid %" PRIu64 ", gid %" PRIu64 ", gen %" PRIu64 - ", rdev %#" PRIx64 "\n", - tab_prefix, lr->lr_uid, lr->lr_gid, lr->lr_gen, lr->lr_rdev); + (void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n", tab_prefix, + (u_longlong_t)lr->lr_doid, + (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid), + (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid), + (longlong_t)lr->lr_mode); + (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", + tab_prefix, + (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid, + (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev); } /* ARGSUSED */ static void zil_prt_rec_remove(zilog_t *zilog, int txtype, void *arg) { lr_remove_t *lr = arg; (void) printf("%sdoid %llu, name %s\n", tab_prefix, (u_longlong_t)lr->lr_doid, (char *)(lr + 1)); } /* ARGSUSED */ static void zil_prt_rec_link(zilog_t *zilog, int txtype, void *arg) { lr_link_t *lr = arg; (void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix, (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj, (char *)(lr + 1)); } /* ARGSUSED */ static void zil_prt_rec_rename(zilog_t *zilog, int txtype, void *arg) { lr_rename_t *lr = arg; char *snm = (char *)(lr + 1); char *tnm = snm + strlen(snm) + 1; (void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix, (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid); (void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm); } /* ARGSUSED */ static int zil_prt_rec_write_cb(void *data, size_t len, void *unused) { char *cdata = data; for (size_t i = 0; i < len; i++) { if (isprint(*cdata)) (void) printf("%c ", *cdata); else (void) printf("%2X", *cdata); cdata++; } return (0); } /* ARGSUSED */ static void zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg) { lr_write_t *lr = arg; abd_t *data; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; (void) printf("%sfoid %llu, offset %llx, length %llx\n", tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length); if (txtype == TX_WRITE2 || verbose < 5) return; if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", tab_prefix, !BP_IS_HOLE(bp) && bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ? "will claim" : "won't claim"); print_log_bp(bp, tab_prefix); if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); (void) printf("%s\n", tab_prefix); return; } if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { (void) printf("%s\n", tab_prefix); return; } SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE); error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, data, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); if (error) goto out; } else { /* data is stored after the end of the lr_write record */ data = abd_alloc(lr->lr_length, B_FALSE); abd_copy_from_buf(data, lr + 1, lr->lr_length); } (void) printf("%s", tab_prefix); (void) abd_iterate_func(data, 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)), zil_prt_rec_write_cb, NULL); (void) printf("\n"); out: abd_free(data); } /* ARGSUSED */ static void zil_prt_rec_truncate(zilog_t *zilog, int txtype, void *arg) { lr_truncate_t *lr = arg; (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", tab_prefix, (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length); } /* ARGSUSED */ static void zil_prt_rec_setattr(zilog_t *zilog, int txtype, void *arg) { lr_setattr_t *lr = arg; time_t atime = (time_t)lr->lr_atime[0]; time_t mtime = (time_t)lr->lr_mtime[0]; (void) printf("%sfoid %llu, mask 0x%llx\n", tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask); if (lr->lr_mask & AT_MODE) { (void) printf("%sAT_MODE %llo\n", tab_prefix, (longlong_t)lr->lr_mode); } if (lr->lr_mask & AT_UID) { (void) printf("%sAT_UID %llu\n", tab_prefix, (u_longlong_t)lr->lr_uid); } if (lr->lr_mask & AT_GID) { (void) printf("%sAT_GID %llu\n", tab_prefix, (u_longlong_t)lr->lr_gid); } if (lr->lr_mask & AT_SIZE) { (void) printf("%sAT_SIZE %llu\n", tab_prefix, (u_longlong_t)lr->lr_size); } if (lr->lr_mask & AT_ATIME) { (void) printf("%sAT_ATIME %llu.%09llu %s", tab_prefix, (u_longlong_t)lr->lr_atime[0], (u_longlong_t)lr->lr_atime[1], ctime(&atime)); } if (lr->lr_mask & AT_MTIME) { (void) printf("%sAT_MTIME %llu.%09llu %s", tab_prefix, (u_longlong_t)lr->lr_mtime[0], (u_longlong_t)lr->lr_mtime[1], ctime(&mtime)); } } /* ARGSUSED */ static void zil_prt_rec_acl(zilog_t *zilog, int txtype, void *arg) { lr_acl_t *lr = arg; (void) printf("%sfoid %llu, aclcnt %llu\n", tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt); } typedef void (*zil_prt_rec_func_t)(zilog_t *, int, void *); typedef struct zil_rec_info { zil_prt_rec_func_t zri_print; const char *zri_name; uint64_t zri_count; } zil_rec_info_t; static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { {.zri_print = NULL, .zri_name = "Total "}, {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE "}, {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR "}, {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKXATTR "}, {.zri_print = zil_prt_rec_create, .zri_name = "TX_SYMLINK "}, {.zri_print = zil_prt_rec_remove, .zri_name = "TX_REMOVE "}, {.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "}, {.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "}, {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "}, {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "}, {.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "}, {.zri_print = zil_prt_rec_setattr, .zri_name = "TX_SETATTR "}, {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "}, {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_ACL "}, {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL "}, {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ATTR "}, {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL_ATTR "}, {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL "}, {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ATTR "}, {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL_ATTR "}, {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE2 "}, }; /* ARGSUSED */ static int print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) { int txtype; int verbose = MAX(dump_opt['d'], dump_opt['i']); /* reduce size of txtype to strip off TX_CI bit */ txtype = lr->lrc_txtype; ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE); ASSERT(lr->lrc_txg); (void) printf("\t\t%s%s len %6llu, txg %llu, seq %llu\n", (lr->lrc_txtype & TX_CI) ? "CI-" : "", zil_rec_info[txtype].zri_name, (u_longlong_t)lr->lrc_reclen, (u_longlong_t)lr->lrc_txg, (u_longlong_t)lr->lrc_seq); if (txtype && verbose >= 3) zil_rec_info[txtype].zri_print(zilog, txtype, lr); zil_rec_info[txtype].zri_count++; zil_rec_info[0].zri_count++; return (0); } /* ARGSUSED */ static int print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { char blkbuf[BP_SPRINTF_LEN + 10]; int verbose = MAX(dump_opt['d'], dump_opt['i']); const char *claim; if (verbose <= 3) return (0); if (verbose >= 5) { (void) strcpy(blkbuf, ", "); snprintf_blkptr(blkbuf + strlen(blkbuf), sizeof (blkbuf) - strlen(blkbuf), bp); } else { blkbuf[0] = '\0'; } if (claim_txg != 0) claim = "already claimed"; else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa)) claim = "will claim"; else claim = "won't claim"; (void) printf("\tBlock seqno %llu, %s%s\n", (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf); return (0); } static void print_log_stats(int verbose) { unsigned i, w, p10; if (verbose > 3) (void) printf("\n"); if (zil_rec_info[0].zri_count == 0) return; for (w = 1, p10 = 10; zil_rec_info[0].zri_count >= p10; p10 *= 10) w++; for (i = 0; i < TX_MAX_TYPE; i++) if (zil_rec_info[i].zri_count || verbose >= 3) (void) printf("\t\t%s %*llu\n", zil_rec_info[i].zri_name, w, (u_longlong_t)zil_rec_info[i].zri_count); (void) printf("\n"); } /* ARGSUSED */ void dump_intent_log(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; int verbose = MAX(dump_opt['d'], dump_opt['i']); int i; if (BP_IS_HOLE(&zh->zh_log) || verbose < 1) return; (void) printf("\n ZIL header: claim_txg %llu, " "claim_blk_seq %llu, claim_lr_seq %llu", (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_claim_blk_seq, (u_longlong_t)zh->zh_claim_lr_seq); (void) printf(" replay_seq %llu, flags 0x%llx\n", (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags); for (i = 0; i < TX_MAX_TYPE; i++) zil_rec_info[i].zri_count = 0; /* see comment in zil_claim() or zil_check_log_chain() */ if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0) return; if (verbose >= 2) { (void) printf("\n"); (void) zil_parse(zilog, print_log_block, print_log_record, NULL, zh->zh_claim_txg); print_log_stats(verbose); } } Index: head/cddl/contrib/opensolaris/cmd/zdb =================================================================== --- head/cddl/contrib/opensolaris/cmd/zdb (revision 351076) +++ head/cddl/contrib/opensolaris/cmd/zdb (revision 351077) Property changes on: head/cddl/contrib/opensolaris/cmd/zdb ___________________________________________________________________ Modified: svn:mergeinfo ## -0,1 +0,0 ## Reverse-merged /vendor/illumos/dist/cmd/zdb:r350898 Index: head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c =================================================================== --- head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c (revision 351076) +++ head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c (revision 351077) @@ -1,644 +1,642 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include /* * If dump mode is enabled, the number of bytes to print per line */ #define BYTES_PER_LINE 16 /* * If dump mode is enabled, the number of bytes to group together, separated * by newlines or spaces */ #define DUMP_GROUPING 4 uint64_t total_write_size = 0; uint64_t total_stream_len = 0; FILE *send_stream = 0; boolean_t do_byteswap = B_FALSE; boolean_t do_cksum = B_TRUE; static void usage(void) { (void) fprintf(stderr, "usage: zstreamdump [-v] [-C] [-d] < file\n"); (void) fprintf(stderr, "\t -v -- verbose\n"); (void) fprintf(stderr, "\t -C -- suppress checksum verification\n"); (void) fprintf(stderr, "\t -d -- dump contents of blocks modified, " "implies verbose\n"); exit(1); } static void * safe_malloc(size_t size) { void *rv = malloc(size); if (rv == NULL) { (void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n", size); abort(); } return (rv); } /* * ssread - send stream read. * * Read while computing incremental checksum */ static size_t ssread(void *buf, size_t len, zio_cksum_t *cksum) { size_t outlen; if ((outlen = fread(buf, len, 1, send_stream)) == 0) return (0); if (do_cksum) { if (do_byteswap) fletcher_4_incremental_byteswap(buf, len, cksum); else fletcher_4_incremental_native(buf, len, cksum); } total_stream_len += len; return (outlen); } static size_t read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); size_t r = ssread(drr, sizeof (*drr) - sizeof (zio_cksum_t), cksum); if (r == 0) return (0); zio_cksum_t saved_cksum = *cksum; r = ssread(&drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), cksum); if (r == 0) return (0); if (!ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.drr_checksum.drr_checksum) && !ZIO_CHECKSUM_EQUAL(saved_cksum, drr->drr_u.drr_checksum.drr_checksum)) { fprintf(stderr, "invalid checksum\n"); (void) printf("Incorrect checksum in record header.\n"); (void) printf("Expected checksum = %llx/%llx/%llx/%llx\n", saved_cksum.zc_word[0], saved_cksum.zc_word[1], saved_cksum.zc_word[2], saved_cksum.zc_word[3]); return (0); } return (sizeof (*drr)); } /* * Print part of a block in ASCII characters */ static void print_ascii_block(char *subbuf, int length) { int i; for (i = 0; i < length; i++) { char char_print = isprint(subbuf[i]) ? subbuf[i] : '.'; if (i != 0 && i % DUMP_GROUPING == 0) { (void) printf(" "); } (void) printf("%c", char_print); } (void) printf("\n"); } /* * print_block - Dump the contents of a modified block to STDOUT * * Assume that buf has capacity evenly divisible by BYTES_PER_LINE */ static void print_block(char *buf, int length) { int i; /* * Start printing ASCII characters at a constant offset, after * the hex prints. Leave 3 characters per byte on a line (2 digit * hex number plus 1 space) plus spaces between characters and * groupings. */ int ascii_start = BYTES_PER_LINE * 3 + BYTES_PER_LINE / DUMP_GROUPING + 2; for (i = 0; i < length; i += BYTES_PER_LINE) { int j; int this_line_length = MIN(BYTES_PER_LINE, length - i); int print_offset = 0; for (j = 0; j < this_line_length; j++) { int buf_offset = i + j; /* * Separate every DUMP_GROUPING bytes by a space. */ if (buf_offset % DUMP_GROUPING == 0) { print_offset += printf(" "); } /* * Print the two-digit hex value for this byte. */ unsigned char hex_print = buf[buf_offset]; print_offset += printf("%02x ", hex_print); } (void) printf("%*s", ascii_start - print_offset, " "); print_ascii_block(buf + i, this_line_length); } } int main(int argc, char *argv[]) { char *buf = safe_malloc(SPA_MAXBLOCKSIZE); uint64_t drr_record_count[DRR_NUMTYPES] = { 0 }; uint64_t total_records = 0; dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; struct drr_begin *drrb = &thedrr.drr_u.drr_begin; struct drr_end *drre = &thedrr.drr_u.drr_end; struct drr_object *drro = &thedrr.drr_u.drr_object; struct drr_freeobjects *drrfo = &thedrr.drr_u.drr_freeobjects; struct drr_write *drrw = &thedrr.drr_u.drr_write; struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref; struct drr_free *drrf = &thedrr.drr_u.drr_free; struct drr_spill *drrs = &thedrr.drr_u.drr_spill; struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded; struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum; char c; boolean_t verbose = B_FALSE; boolean_t very_verbose = B_FALSE; boolean_t first = B_TRUE; /* * dump flag controls whether the contents of any modified data blocks * are printed to the console during processing of the stream. Warning: * for large streams, this can obviously lead to massive prints. */ boolean_t dump = B_FALSE; int err; zio_cksum_t zc = { 0 }; zio_cksum_t pcksum = { 0 }; while ((c = getopt(argc, argv, ":vCd")) != -1) { switch (c) { case 'C': do_cksum = B_FALSE; break; case 'v': if (verbose) very_verbose = B_TRUE; verbose = B_TRUE; break; case 'd': dump = B_TRUE; verbose = B_TRUE; very_verbose = B_TRUE; break; case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); usage(); break; } } if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, "Error: Backup stream can not be read " "from a terminal.\n" "You must redirect standard input.\n"); exit(1); } send_stream = stdin; pcksum = zc; while (read_hdr(drr, &zc)) { /* * If this is the first DMU record being processed, check for * the magic bytes and figure out the endian-ness based on them. */ if (first) { if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { do_byteswap = B_TRUE; if (do_cksum) { ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); /* * recalculate header checksum now * that we know it needs to be * byteswapped. */ fletcher_4_incremental_byteswap(drr, sizeof (dmu_replay_record_t), &zc); } } else if (drrb->drr_magic != DMU_BACKUP_MAGIC) { (void) fprintf(stderr, "Invalid stream " "(bad magic number)\n"); exit(1); } first = B_FALSE; } if (do_byteswap) { drr->drr_type = BSWAP_32(drr->drr_type); drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); } /* * At this point, the leading fields of the replay record * (drr_type and drr_payloadlen) have been byte-swapped if * necessary, but the rest of the data structure (the * union of type-specific structures) is still in its * original state. */ if (drr->drr_type >= DRR_NUMTYPES) { (void) printf("INVALID record found: type 0x%x\n", drr->drr_type); (void) printf("Aborting.\n"); exit(1); } drr_record_count[drr->drr_type]++; total_records++; switch (drr->drr_type) { case DRR_BEGIN: if (do_byteswap) { drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_flags = BSWAP_32(drrb->drr_flags); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } (void) printf("BEGIN record\n"); (void) printf("\thdrtype = %lld\n", DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo)); (void) printf("\tfeatures = %llx\n", DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo)); (void) printf("\tmagic = %llx\n", (u_longlong_t)drrb->drr_magic); (void) printf("\tcreation_time = %llx\n", (u_longlong_t)drrb->drr_creation_time); (void) printf("\ttype = %u\n", drrb->drr_type); (void) printf("\tflags = 0x%x\n", drrb->drr_flags); (void) printf("\ttoguid = %llx\n", (u_longlong_t)drrb->drr_toguid); (void) printf("\tfromguid = %llx\n", (u_longlong_t)drrb->drr_fromguid); (void) printf("\ttoname = %s\n", drrb->drr_toname); if (verbose) (void) printf("\n"); if (drr->drr_payloadlen != 0) { nvlist_t *nv; int sz = drr->drr_payloadlen; if (sz > SPA_MAXBLOCKSIZE) { free(buf); buf = safe_malloc(sz); } (void) ssread(buf, sz, &zc); if (ferror(send_stream)) perror("fread"); err = nvlist_unpack(buf, sz, &nv, 0); if (err) perror(strerror(err)); nvlist_print(stdout, nv); nvlist_free(nv); } break; case DRR_END: if (do_byteswap) { drre->drr_checksum.zc_word[0] = BSWAP_64(drre->drr_checksum.zc_word[0]); drre->drr_checksum.zc_word[1] = BSWAP_64(drre->drr_checksum.zc_word[1]); drre->drr_checksum.zc_word[2] = BSWAP_64(drre->drr_checksum.zc_word[2]); drre->drr_checksum.zc_word[3] = BSWAP_64(drre->drr_checksum.zc_word[3]); } /* * We compare against the *previous* checksum * value, because the stored checksum is of * everything before the DRR_END record. */ if (do_cksum && !ZIO_CHECKSUM_EQUAL(drre->drr_checksum, pcksum)) { (void) printf("Expected checksum differs from " "checksum in stream.\n"); (void) printf("Expected checksum = " "%llx/%llx/%llx/%llx\n", pcksum.zc_word[0], pcksum.zc_word[1], pcksum.zc_word[2], pcksum.zc_word[3]); } (void) printf("END checksum = %llx/%llx/%llx/%llx\n", drre->drr_checksum.zc_word[0], drre->drr_checksum.zc_word[1], drre->drr_checksum.zc_word[2], drre->drr_checksum.zc_word[3]); ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); break; case DRR_OBJECT: if (do_byteswap) { drro->drr_object = BSWAP_64(drro->drr_object); drro->drr_type = BSWAP_32(drro->drr_type); drro->drr_bonustype = BSWAP_32(drro->drr_bonustype); drro->drr_blksz = BSWAP_32(drro->drr_blksz); drro->drr_bonuslen = BSWAP_32(drro->drr_bonuslen); drro->drr_toguid = BSWAP_64(drro->drr_toguid); } if (verbose) { - (void) printf("OBJECT object = %" PRIu64 - " type = %u bonustype = %u blksz = %u" - " bonuslen = %u dn_slots = %u\n", - drro->drr_object, + (void) printf("OBJECT object = %llu type = %u " + "bonustype = %u blksz = %u bonuslen = %u\n", + (u_longlong_t)drro->drr_object, drro->drr_type, drro->drr_bonustype, drro->drr_blksz, - drro->drr_bonuslen, - drro->drr_dn_slots); + drro->drr_bonuslen); } if (drro->drr_bonuslen > 0) { (void) ssread(buf, P2ROUNDUP(drro->drr_bonuslen, 8), &zc); if (dump) { print_block(buf, P2ROUNDUP(drro->drr_bonuslen, 8)); } } break; case DRR_FREEOBJECTS: if (do_byteswap) { drrfo->drr_firstobj = BSWAP_64(drrfo->drr_firstobj); drrfo->drr_numobjs = BSWAP_64(drrfo->drr_numobjs); drrfo->drr_toguid = BSWAP_64(drrfo->drr_toguid); } if (verbose) { (void) printf("FREEOBJECTS firstobj = %llu " "numobjs = %llu\n", (u_longlong_t)drrfo->drr_firstobj, (u_longlong_t)drrfo->drr_numobjs); } break; case DRR_WRITE: if (do_byteswap) { drrw->drr_object = BSWAP_64(drrw->drr_object); drrw->drr_type = BSWAP_32(drrw->drr_type); drrw->drr_offset = BSWAP_64(drrw->drr_offset); drrw->drr_logical_size = BSWAP_64(drrw->drr_logical_size); drrw->drr_toguid = BSWAP_64(drrw->drr_toguid); drrw->drr_key.ddk_prop = BSWAP_64(drrw->drr_key.ddk_prop); drrw->drr_compressed_size = BSWAP_64(drrw->drr_compressed_size); } uint64_t payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); /* * If this is verbose and/or dump output, * print info on the modified block */ if (verbose) { (void) printf("WRITE object = %llu type = %u " "checksum type = %u compression type = %u\n" " offset = %llu logical_size = %llu " "compressed_size = %llu " "payload_size = %llu " "props = %llx\n", (u_longlong_t)drrw->drr_object, drrw->drr_type, drrw->drr_checksumtype, drrw->drr_compressiontype, (u_longlong_t)drrw->drr_offset, (u_longlong_t)drrw->drr_logical_size, (u_longlong_t)drrw->drr_compressed_size, (u_longlong_t)payload_size, (u_longlong_t)drrw->drr_key.ddk_prop); } /* * Read the contents of the block in from STDIN to buf */ (void) ssread(buf, payload_size, &zc); /* * If in dump mode */ if (dump) { print_block(buf, payload_size); } total_write_size += payload_size; break; case DRR_WRITE_BYREF: if (do_byteswap) { drrwbr->drr_object = BSWAP_64(drrwbr->drr_object); drrwbr->drr_offset = BSWAP_64(drrwbr->drr_offset); drrwbr->drr_length = BSWAP_64(drrwbr->drr_length); drrwbr->drr_toguid = BSWAP_64(drrwbr->drr_toguid); drrwbr->drr_refguid = BSWAP_64(drrwbr->drr_refguid); drrwbr->drr_refobject = BSWAP_64(drrwbr->drr_refobject); drrwbr->drr_refoffset = BSWAP_64(drrwbr->drr_refoffset); drrwbr->drr_key.ddk_prop = BSWAP_64(drrwbr->drr_key.ddk_prop); } if (verbose) { (void) printf("WRITE_BYREF object = %llu " "checksum type = %u props = %llx\n" " offset = %llu length = %llu\n" "toguid = %llx refguid = %llx\n" " refobject = %llu refoffset = %llu\n", (u_longlong_t)drrwbr->drr_object, drrwbr->drr_checksumtype, (u_longlong_t)drrwbr->drr_key.ddk_prop, (u_longlong_t)drrwbr->drr_offset, (u_longlong_t)drrwbr->drr_length, (u_longlong_t)drrwbr->drr_toguid, (u_longlong_t)drrwbr->drr_refguid, (u_longlong_t)drrwbr->drr_refobject, (u_longlong_t)drrwbr->drr_refoffset); } break; case DRR_FREE: if (do_byteswap) { drrf->drr_object = BSWAP_64(drrf->drr_object); drrf->drr_offset = BSWAP_64(drrf->drr_offset); drrf->drr_length = BSWAP_64(drrf->drr_length); } if (verbose) { (void) printf("FREE object = %llu " "offset = %llu length = %lld\n", (u_longlong_t)drrf->drr_object, (u_longlong_t)drrf->drr_offset, (longlong_t)drrf->drr_length); } break; case DRR_SPILL: if (do_byteswap) { drrs->drr_object = BSWAP_64(drrs->drr_object); drrs->drr_length = BSWAP_64(drrs->drr_length); } if (verbose) { (void) printf("SPILL block for object = %llu " "length = %llu\n", drrs->drr_object, drrs->drr_length); } (void) ssread(buf, drrs->drr_length, &zc); if (dump) { print_block(buf, drrs->drr_length); } break; case DRR_WRITE_EMBEDDED: if (do_byteswap) { drrwe->drr_object = BSWAP_64(drrwe->drr_object); drrwe->drr_offset = BSWAP_64(drrwe->drr_offset); drrwe->drr_length = BSWAP_64(drrwe->drr_length); drrwe->drr_toguid = BSWAP_64(drrwe->drr_toguid); drrwe->drr_lsize = BSWAP_32(drrwe->drr_lsize); drrwe->drr_psize = BSWAP_32(drrwe->drr_psize); } if (verbose) { (void) printf("WRITE_EMBEDDED object = %llu " "offset = %llu length = %llu\n" " toguid = %llx comp = %u etype = %u " "lsize = %u psize = %u\n", (u_longlong_t)drrwe->drr_object, (u_longlong_t)drrwe->drr_offset, (u_longlong_t)drrwe->drr_length, (u_longlong_t)drrwe->drr_toguid, drrwe->drr_compression, drrwe->drr_etype, drrwe->drr_lsize, drrwe->drr_psize); } (void) ssread(buf, P2ROUNDUP(drrwe->drr_psize, 8), &zc); break; } if (drr->drr_type != DRR_BEGIN && very_verbose) { (void) printf(" checksum = %llx/%llx/%llx/%llx\n", (longlong_t)drrc->drr_checksum.zc_word[0], (longlong_t)drrc->drr_checksum.zc_word[1], (longlong_t)drrc->drr_checksum.zc_word[2], (longlong_t)drrc->drr_checksum.zc_word[3]); } pcksum = zc; } free(buf); /* Print final summary */ (void) printf("SUMMARY:\n"); (void) printf("\tTotal DRR_BEGIN records = %lld\n", (u_longlong_t)drr_record_count[DRR_BEGIN]); (void) printf("\tTotal DRR_END records = %lld\n", (u_longlong_t)drr_record_count[DRR_END]); (void) printf("\tTotal DRR_OBJECT records = %lld\n", (u_longlong_t)drr_record_count[DRR_OBJECT]); (void) printf("\tTotal DRR_FREEOBJECTS records = %lld\n", (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]); (void) printf("\tTotal DRR_WRITE records = %lld\n", (u_longlong_t)drr_record_count[DRR_WRITE]); (void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n", (u_longlong_t)drr_record_count[DRR_WRITE_BYREF]); (void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld\n", (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]); (void) printf("\tTotal DRR_FREE records = %lld\n", (u_longlong_t)drr_record_count[DRR_FREE]); (void) printf("\tTotal DRR_SPILL records = %lld\n", (u_longlong_t)drr_record_count[DRR_SPILL]); (void) printf("\tTotal records = %lld\n", (u_longlong_t)total_records); (void) printf("\tTotal write size = %lld (0x%llx)\n", (u_longlong_t)total_write_size, (u_longlong_t)total_write_size); (void) printf("\tTotal stream length = %lld (0x%llx)\n", (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len); return (0); } Index: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c =================================================================== --- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c (revision 351076) +++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c (revision 351077) @@ -1,6832 +1,6798 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 Martin Matuska . All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright 2017 RackTop Systems. */ /* * The objective of this program is to provide a DMU/ZAP/SPA stress test * that runs entirely in userland, is easy to use, and easy to extend. * * The overall design of the ztest program is as follows: * * (1) For each major functional area (e.g. adding vdevs to a pool, * creating and destroying datasets, reading and writing objects, etc) * we have a simple routine to test that functionality. These * individual routines do not have to do anything "stressful". * * (2) We turn these simple functionality tests into a stress test by * running them all in parallel, with as many threads as desired, * and spread across as many datasets, objects, and vdevs as desired. * * (3) While all this is happening, we inject faults into the pool to * verify that self-healing data really works. * * (4) Every time we open a dataset, we change its checksum and compression * functions. Thus even individual objects vary from block to block * in which checksum they use and whether they're compressed. * * (5) To verify that we never lose on-disk consistency after a crash, * we run the entire test in a child of the main process. * At random times, the child self-immolates with a SIGKILL. * This is the software equivalent of pulling the power cord. * The parent then runs the test again, using the existing * storage pool, as many times as desired. If backwards compatibility * testing is enabled ztest will sometimes run the "older" version * of ztest after a SIGKILL. * * (6) To verify that we don't have future leaks or temporal incursions, * many of the functional tests record the transaction group number * as part of their data. When reading old data, they verify that * the transaction group number is less than the current, open txg. * If you add a new test, please do this if applicable. * * When run with no arguments, ztest runs for about five minutes and * produces no output if successful. To get a little bit of information, * specify -V. To get more information, specify -VV, and so on. * * To turn this into an overnight stress test, use -T to specify run time. * * You can ask more more vdevs [-v], datasets [-d], or threads [-t] * to increase the pool capacity, fanout, and overall stress level. * * Use the -k option to set the desired frequency of kills. * * When ztest invokes itself it passes all relevant information through a * temporary file which is mmap-ed in the child process. This allows shared * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always * stored at offset 0 of this file and contains information on the size and * number of shared structures in the file. The information stored in this file * must remain backwards compatible with older versions of ztest so that * ztest can invoke them during backwards compatibility testing (-B). */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ztest_fd_data = -1; static int ztest_fd_rand = -1; typedef struct ztest_shared_hdr { uint64_t zh_hdr_size; uint64_t zh_opts_size; uint64_t zh_size; uint64_t zh_stats_size; uint64_t zh_stats_count; uint64_t zh_ds_size; uint64_t zh_ds_count; } ztest_shared_hdr_t; static ztest_shared_hdr_t *ztest_shared_hdr; typedef struct ztest_shared_opts { char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; char zo_alt_ztest[MAXNAMELEN]; char zo_alt_libpath[MAXNAMELEN]; uint64_t zo_vdevs; uint64_t zo_vdevtime; size_t zo_vdev_size; int zo_ashift; int zo_mirrors; int zo_raidz; int zo_raidz_parity; int zo_datasets; int zo_threads; uint64_t zo_passtime; uint64_t zo_killrate; int zo_verbose; int zo_init; uint64_t zo_time; uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; } ztest_shared_opts_t; static const ztest_shared_opts_t ztest_opts_defaults = { .zo_pool = { 'z', 't', 'e', 's', 't', '\0' }, .zo_dir = { '/', 't', 'm', 'p', '\0' }, .zo_alt_ztest = { '\0' }, .zo_alt_libpath = { '\0' }, .zo_vdevs = 5, .zo_ashift = SPA_MINBLOCKSHIFT, .zo_mirrors = 2, .zo_raidz = 4, .zo_raidz_parity = 1, .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ .zo_datasets = 7, .zo_threads = 23, .zo_passtime = 60, /* 60 seconds */ .zo_killrate = 70, /* 70% kill rate */ .zo_verbose = 0, .zo_init = 1, .zo_time = 300, /* 5 minutes */ .zo_maxloops = 50, /* max loops during spa_freeze() */ .zo_metaslab_force_ganging = 32 << 10 }; extern uint64_t metaslab_force_ganging; extern uint64_t metaslab_df_alloc_threshold; extern uint64_t zfs_deadman_synctime_ms; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; extern boolean_t zfs_abd_scatter_enabled; -extern int dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; typedef struct ztest_shared_ds { uint64_t zd_seq; } ztest_shared_ds_t; static ztest_shared_ds_t *ztest_shared_ds; #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS() \ (MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, ZTEST_IO_WRITE_PATTERN, ZTEST_IO_WRITE_ZEROES, ZTEST_IO_TRUNCATE, ZTEST_IO_SETATTR, ZTEST_IO_REWRITE, ZTEST_IO_TYPES }; typedef struct ztest_block_tag { uint64_t bt_magic; uint64_t bt_objset; uint64_t bt_object; uint64_t bt_dnodesize; uint64_t bt_offset; uint64_t bt_gen; uint64_t bt_txg; uint64_t bt_crtxg; } ztest_block_tag_t; typedef struct bufwad { uint64_t bw_index; uint64_t bw_txg; uint64_t bw_data; } bufwad_t; /* * XXX -- fix zfs range locks to be generic so we can use them here. */ typedef enum { RL_READER, RL_WRITER, RL_APPEND } rl_type_t; typedef struct rll { void *rll_writer; int rll_readers; kmutex_t rll_lock; kcondvar_t rll_cv; } rll_t; typedef struct rl { uint64_t rl_object; uint64_t rl_offset; uint64_t rl_size; rll_t *rl_lock; } rl_t; #define ZTEST_RANGE_LOCKS 64 #define ZTEST_OBJECT_LOCKS 64 /* * Object descriptor. Used as a template for object lookup/create/remove. */ typedef struct ztest_od { uint64_t od_dir; uint64_t od_object; dmu_object_type_t od_type; dmu_object_type_t od_crtype; uint64_t od_blocksize; uint64_t od_crblocksize; uint64_t od_crdnodesize; uint64_t od_gen; uint64_t od_crgen; char od_name[ZFS_MAX_DATASET_NAME_LEN]; } ztest_od_t; /* * Per-dataset state. */ typedef struct ztest_ds { ztest_shared_ds_t *zd_shared; objset_t *zd_os; krwlock_t zd_zilog_lock; zilog_t *zd_zilog; ztest_od_t *zd_od; /* debugging aid */ char zd_name[ZFS_MAX_DATASET_NAME_LEN]; kmutex_t zd_dirobj_lock; rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; } ztest_ds_t; /* * Per-iteration state. */ typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); typedef struct ztest_info { ztest_func_t *zi_func; /* test function */ uint64_t zi_iters; /* iterations per execution */ uint64_t *zi_interval; /* execute every seconds */ } ztest_info_t; typedef struct ztest_shared_callstate { uint64_t zc_count; /* per-pass count */ uint64_t zc_time; /* per-pass time */ uint64_t zc_next; /* next time to call this function */ } ztest_shared_callstate_t; static ztest_shared_callstate_t *ztest_shared_callstate; #define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) /* * Note: these aren't static because we want dladdr() to work. */ ztest_func_t ztest_dmu_read_write; ztest_func_t ztest_dmu_write_parallel; ztest_func_t ztest_dmu_object_alloc_free; -ztest_func_t ztest_dmu_object_next_chunk; ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; ztest_func_t ztest_zil_commit; ztest_func_t ztest_zil_remount; ztest_func_t ztest_dmu_read_write_zcopy; ztest_func_t ztest_dmu_objset_create_destroy; ztest_func_t ztest_dmu_prealloc; ztest_func_t ztest_fzap; ztest_func_t ztest_dmu_snapshot_create_destroy; ztest_func_t ztest_dsl_prop_get_set; ztest_func_t ztest_spa_prop_get_set; ztest_func_t ztest_spa_create_destroy; ztest_func_t ztest_fault_inject; ztest_func_t ztest_ddt_repair; ztest_func_t ztest_dmu_snapshot_hold; ztest_func_t ztest_scrub; ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_aux_add_remove; ztest_func_t ztest_split_pool; ztest_func_t ztest_reguid; ztest_func_t ztest_spa_upgrade; ztest_func_t ztest_device_removal; ztest_func_t ztest_remap_blocks; ztest_func_t ztest_spa_checkpoint_create_discard; ztest_func_t ztest_initialize; ztest_func_t ztest_verify_dnode_bt; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ ztest_info_t ztest_info[] = { { ztest_dmu_read_write, 1, &zopt_always }, { ztest_dmu_write_parallel, 10, &zopt_always }, { ztest_dmu_object_alloc_free, 1, &zopt_always }, - { ztest_dmu_object_next_chunk, 1, &zopt_sometimes }, { ztest_dmu_commit_callbacks, 1, &zopt_always }, { ztest_zap, 30, &zopt_always }, { ztest_zap_parallel, 100, &zopt_always }, { ztest_split_pool, 1, &zopt_always }, { ztest_zil_commit, 1, &zopt_incessant }, { ztest_zil_remount, 1, &zopt_sometimes }, { ztest_dmu_read_write_zcopy, 1, &zopt_often }, { ztest_dmu_objset_create_destroy, 1, &zopt_often }, { ztest_dsl_prop_get_set, 1, &zopt_often }, { ztest_spa_prop_get_set, 1, &zopt_sometimes }, #if 0 { ztest_dmu_prealloc, 1, &zopt_sometimes }, #endif { ztest_fzap, 1, &zopt_sometimes }, { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, { ztest_spa_create_destroy, 1, &zopt_sometimes }, { ztest_fault_inject, 1, &zopt_incessant }, { ztest_ddt_repair, 1, &zopt_sometimes }, { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, { ztest_reguid, 1, &zopt_rarely }, { ztest_scrub, 1, &zopt_often }, { ztest_spa_upgrade, 1, &zopt_rarely }, { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, { ztest_vdev_attach_detach, 1, &zopt_incessant }, { ztest_vdev_LUN_growth, 1, &zopt_rarely }, { ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime }, { ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime }, { ztest_device_removal, 1, &zopt_sometimes }, { ztest_remap_blocks, 1, &zopt_sometimes }, { ztest_spa_checkpoint_create_discard, 1, &zopt_rarely }, { ztest_initialize, 1, &zopt_sometimes }, { ztest_verify_dnode_bt, 1, &zopt_sometimes } }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) /* * The following struct is used to hold a list of uncalled commit callbacks. * The callbacks are ordered by txg number. */ typedef struct ztest_cb_list { kmutex_t zcl_callbacks_lock; list_t zcl_callbacks; } ztest_cb_list_t; /* * Stuff we need to share writably between parent and child. */ typedef struct ztest_shared { boolean_t zs_do_init; hrtime_t zs_proc_start; hrtime_t zs_proc_stop; hrtime_t zs_thread_start; hrtime_t zs_thread_stop; hrtime_t zs_thread_kill; uint64_t zs_enospc_count; uint64_t zs_vdev_next_leaf; uint64_t zs_vdev_aux; uint64_t zs_alloc; uint64_t zs_space; uint64_t zs_splits; uint64_t zs_mirrors; uint64_t zs_metaslab_sz; uint64_t zs_metaslab_df_alloc_threshold; uint64_t zs_guid; } ztest_shared_t; #define ID_PARALLEL -1ULL static char ztest_dev_template[] = "%s/%s.%llua"; static char ztest_aux_template[] = "%s/%s.%s.%llu"; ztest_shared_t *ztest_shared; static spa_t *ztest_spa = NULL; static ztest_ds_t *ztest_ds; static kmutex_t ztest_vdev_lock; static boolean_t ztest_device_removal_active = B_FALSE; static kmutex_t ztest_checkpoint_lock; /* * The ztest_name_lock protects the pool and dataset namespace used by * the individual tests. To modify the namespace, consumers must grab * this lock as writer. Grabbing the lock as reader will ensure that the * namespace does not change while the lock is held. */ static krwlock_t ztest_name_lock; static boolean_t ztest_dump_core = B_TRUE; static boolean_t ztest_exiting; /* Global commit callback list */ static ztest_cb_list_t zcl; enum ztest_object { ZTEST_META_DNODE = 0, ZTEST_DIROBJ, ZTEST_OBJECTS }; static void usage(boolean_t) __NORETURN; /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init() { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #define FATAL_MSG_SZ 1024 char *fatal_msg; static void fatal(int do_perror, char *message, ...) { va_list args; int save_errno = errno; char buf[FATAL_MSG_SZ]; (void) fflush(stdout); va_start(args, message); (void) sprintf(buf, "ztest: "); /* LINTED */ (void) vsprintf(buf + strlen(buf), message, args); va_end(args); if (do_perror) { (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), ": %s", strerror(save_errno)); } (void) fprintf(stderr, "%s\n", buf); fatal_msg = buf; /* to ease debugging */ if (ztest_dump_core) abort(); exit(3); } static int str2shift(const char *buf) { const char *ends = "BKMGTPEZ"; int i; if (buf[0] == '\0') return (0); for (i = 0; i < strlen(ends); i++) { if (toupper(buf[0]) == ends[i]) break; } if (i == strlen(ends)) { (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); usage(B_FALSE); } if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { return (10*i); } (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); usage(B_FALSE); /* NOTREACHED */ } static uint64_t nicenumtoull(const char *buf) { char *end; uint64_t val; val = strtoull(buf, &end, 0); if (end == buf) { (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); usage(B_FALSE); } else if (end[0] == '.') { double fval = strtod(buf, &end); fval *= pow(2, str2shift(end)); if (fval > UINT64_MAX) { (void) fprintf(stderr, "ztest: value too large: %s\n", buf); usage(B_FALSE); } val = (uint64_t)fval; } else { int shift = str2shift(end); if (shift >= 64 || (val << shift) >> shift != val) { (void) fprintf(stderr, "ztest: value too large: %s\n", buf); usage(B_FALSE); } val <<= shift; } return (val); } static void usage(boolean_t requested) { const ztest_shared_opts_t *zo = &ztest_opts_defaults; char nice_vdev_size[NN_NUMBUF_SZ]; char nice_force_ganging[NN_NUMBUF_SZ]; FILE *fp = requested ? stdout : stderr; nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size)); nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging, sizeof (nice_force_ganging)); (void) fprintf(fp, "Usage: %s\n" "\t[-v vdevs (default: %llu)]\n" "\t[-s size_of_each_vdev (default: %s)]\n" "\t[-a alignment_shift (default: %d)] use 0 for random\n" "\t[-m mirror_copies (default: %d)]\n" "\t[-r raidz_disks (default: %d)]\n" "\t[-R raidz_parity (default: %d)]\n" "\t[-d datasets (default: %d)]\n" "\t[-t threads (default: %d)]\n" "\t[-g gang_block_threshold (default: %s)]\n" "\t[-i init_count (default: %d)] initialize pool i times\n" "\t[-k kill_percentage (default: %llu%%)]\n" "\t[-p pool_name (default: %s)]\n" "\t[-f dir (default: %s)] file directory for vdev files\n" "\t[-V] verbose (use multiple times for ever more blather)\n" "\t[-E] use existing pool instead of creating new one\n" "\t[-T time (default: %llu sec)] total run time\n" "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" "\t[-P passtime (default: %llu sec)] time per pass\n" "\t[-B alt_ztest (default: )] alternate ztest path\n" "\t[-o variable=value] ... set global variable to an unsigned\n" "\t 32-bit integer value\n" "\t[-h] (print help)\n" "", zo->zo_pool, (u_longlong_t)zo->zo_vdevs, /* -v */ nice_vdev_size, /* -s */ zo->zo_ashift, /* -a */ zo->zo_mirrors, /* -m */ zo->zo_raidz, /* -r */ zo->zo_raidz_parity, /* -R */ zo->zo_datasets, /* -d */ zo->zo_threads, /* -t */ nice_force_ganging, /* -g */ zo->zo_init, /* -i */ (u_longlong_t)zo->zo_killrate, /* -k */ zo->zo_pool, /* -p */ zo->zo_dir, /* -f */ (u_longlong_t)zo->zo_time, /* -T */ (u_longlong_t)zo->zo_maxloops, /* -F */ (u_longlong_t)zo->zo_passtime); exit(requested ? 0 : 1); } static void process_options(int argc, char **argv) { char *path; ztest_shared_opts_t *zo = &ztest_opts; int opt; uint64_t value; char altdir[MAXNAMELEN] = { 0 }; bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:B:o:")) != EOF) { value = 0; switch (opt) { case 'v': case 's': case 'a': case 'm': case 'r': case 'R': case 'd': case 't': case 'g': case 'i': case 'k': case 'T': case 'P': case 'F': value = nicenumtoull(optarg); } switch (opt) { case 'v': zo->zo_vdevs = value; break; case 's': zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); break; case 'a': zo->zo_ashift = value; break; case 'm': zo->zo_mirrors = value; break; case 'r': zo->zo_raidz = MAX(1, value); break; case 'R': zo->zo_raidz_parity = MIN(MAX(value, 1), 3); break; case 'd': zo->zo_datasets = MAX(1, value); break; case 't': zo->zo_threads = MAX(1, value); break; case 'g': zo->zo_metaslab_force_ganging = MAX(SPA_MINBLOCKSIZE << 1, value); break; case 'i': zo->zo_init = value; break; case 'k': zo->zo_killrate = value; break; case 'p': (void) strlcpy(zo->zo_pool, optarg, sizeof (zo->zo_pool)); break; case 'f': path = realpath(optarg, NULL); if (path == NULL) { (void) fprintf(stderr, "error: %s: %s\n", optarg, strerror(errno)); usage(B_FALSE); } else { (void) strlcpy(zo->zo_dir, path, sizeof (zo->zo_dir)); } break; case 'V': zo->zo_verbose++; break; case 'E': zo->zo_init = 0; break; case 'T': zo->zo_time = value; break; case 'P': zo->zo_passtime = MAX(1, value); break; case 'F': zo->zo_maxloops = MAX(1, value); break; case 'B': (void) strlcpy(altdir, optarg, sizeof (altdir)); break; case 'o': if (set_global_var(optarg) != 0) usage(B_FALSE); break; case 'h': usage(B_TRUE); break; case '?': default: usage(B_FALSE); break; } } zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); zo->zo_vdevtime = (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : UINT64_MAX >> 2); if (strlen(altdir) > 0) { char *cmd; char *realaltdir; char *bin; char *ztest; char *isa; int isalen; cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); VERIFY(NULL != realpath(getexecname(), cmd)); if (0 != access(altdir, F_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate ztest path: %s", altdir); } VERIFY(NULL != realpath(altdir, realaltdir)); /* * 'cmd' should be of the form "/usr/bin//ztest". * We want to extract to determine if we should use * 32 or 64 bit binaries. */ bin = strstr(cmd, "/usr/bin/"); ztest = strstr(bin, "/ztest"); isa = bin + 9; isalen = ztest - isa; (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), "%s/usr/lib/%.*s", realaltdir, isalen, isa); if (0 != access(zo->zo_alt_ztest, X_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate ztest: %s", zo->zo_alt_ztest); } else if (0 != access(zo->zo_alt_libpath, X_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate lib directory %s", zo->zo_alt_libpath); } umem_free(cmd, MAXPATHLEN); umem_free(realaltdir, MAXPATHLEN); } } static void ztest_kill(ztest_shared_t *zs) { zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); /* * Before we kill off ztest, make sure that the config is updated. * See comment above spa_write_cachefile(). */ mutex_enter(&spa_namespace_lock); spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); mutex_exit(&spa_namespace_lock); zfs_dbgmsg_print(FTAG); (void) kill(getpid(), SIGKILL); } static uint64_t ztest_random(uint64_t range) { uint64_t r; ASSERT3S(ztest_fd_rand, >=, 0); if (range == 0) return (0); if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) fatal(1, "short read from /dev/urandom"); return (r % range); } /* ARGSUSED */ static void ztest_record_enospc(const char *s) { ztest_shared->zs_enospc_count++; } static uint64_t ztest_get_ashift(void) { if (ztest_opts.zo_ashift == 0) return (SPA_MINBLOCKSHIFT + ztest_random(5)); return (ztest_opts.zo_ashift); } static nvlist_t * make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) { char pathbuf[MAXPATHLEN]; uint64_t vdev; nvlist_t *file; if (ashift == 0) ashift = ztest_get_ashift(); if (path == NULL) { path = pathbuf; if (aux != NULL) { vdev = ztest_shared->zs_vdev_aux; (void) snprintf(path, sizeof (pathbuf), ztest_aux_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, aux, vdev); } else { vdev = ztest_shared->zs_vdev_next_leaf++; (void) snprintf(path, sizeof (pathbuf), ztest_dev_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, vdev); } } if (size != 0) { int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd == -1) fatal(1, "can't open %s", path); if (ftruncate(fd, size) != 0) fatal(1, "can't ftruncate %s", path); (void) close(fd); } VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); return (file); } static nvlist_t * make_vdev_raidz(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int r) { nvlist_t *raidz, **child; int c; if (r < 2) return (make_vdev_file(path, aux, pool, size, ashift)); child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < r; c++) child[c] = make_vdev_file(path, aux, pool, size, ashift); VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, VDEV_TYPE_RAIDZ) == 0); VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, ztest_opts.zo_raidz_parity) == 0); VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, child, r) == 0); for (c = 0; c < r; c++) nvlist_free(child[c]); umem_free(child, r * sizeof (nvlist_t *)); return (raidz); } static nvlist_t * make_vdev_mirror(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int r, int m) { nvlist_t *mirror, **child; int c; if (m < 1) return (make_vdev_raidz(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR) == 0); VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, child, m) == 0); for (c = 0; c < m; c++) nvlist_free(child[c]); umem_free(child, m * sizeof (nvlist_t *)); return (mirror); } static nvlist_t * make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int log, int r, int m, int t) { nvlist_t *root, **child; int c; ASSERT(t > 0); child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < t; c++) { child[c] = make_vdev_mirror(path, aux, pool, size, ashift, r, m); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log) == 0); } VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, child, t) == 0); for (c = 0; c < t; c++) nvlist_free(child[c]); umem_free(child, t * sizeof (nvlist_t *)); return (root); } /* * Find a random spa version. Returns back a random spa version in the * range [initial_version, SPA_VERSION_FEATURES]. */ static uint64_t ztest_random_spa_version(uint64_t initial_version) { uint64_t version = initial_version; if (version <= SPA_VERSION_BEFORE_FEATURES) { version = version + ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); } if (version > SPA_VERSION_BEFORE_FEATURES) version = SPA_VERSION_FEATURES; ASSERT(SPA_VERSION_IS_SUPPORTED(version)); return (version); } static int ztest_random_blocksize(void) { uint64_t block_shift; /* * Choose a block size >= the ashift. * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. */ int maxbs = SPA_OLD_MAXBLOCKSHIFT; if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) maxbs = 20; block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } static int ztest_random_dnodesize(void) { int slots; int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; if (max_slots == DNODE_MIN_SLOTS) return (DNODE_MIN_SIZE); /* * Weight the random distribution more heavily toward smaller * dnode sizes since that is more likely to reflect real-world * usage. */ ASSERT3U(max_slots, >, 4); switch (ztest_random(10)) { case 0: slots = 5 + ztest_random(max_slots - 4); break; case 1 ... 4: slots = 2 + ztest_random(3); break; default: slots = 1; break; } return (slots << DNODE_SHIFT); } static int ztest_random_ibshift(void) { return (DN_MIN_INDBLKSHIFT + ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); } static uint64_t ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) { uint64_t top; vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); do { top = ztest_random(rvd->vdev_children); tvd = rvd->vdev_child[top]; } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); return (top); } static uint64_t ztest_random_dsl_prop(zfs_prop_t prop) { uint64_t value; do { value = zfs_prop_random_value(prop, ztest_random(-1ULL)); } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); return (value); } static int ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, boolean_t inherit) { const char *propname = zfs_prop_to_name(prop); const char *valname; char setpoint[MAXPATHLEN]; uint64_t curval; int error; error = dsl_prop_set_int(osname, propname, (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (error); } ASSERT0(error); VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); if (ztest_opts.zo_verbose >= 6) { VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0); (void) printf("%s %s = %s at '%s'\n", osname, propname, valname, setpoint); } return (error); } static int ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) { spa_t *spa = ztest_spa; nvlist_t *props = NULL; int error; VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); error = spa_prop_set(spa, props); nvlist_free(props); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (error); } ASSERT0(error); return (error); } static void ztest_rll_init(rll_t *rll) { rll->rll_writer = NULL; rll->rll_readers = 0; mutex_init(&rll->rll_lock, NULL, USYNC_THREAD, NULL); cv_init(&rll->rll_cv, NULL, USYNC_THREAD, NULL); } static void ztest_rll_destroy(rll_t *rll) { ASSERT(rll->rll_writer == NULL); ASSERT(rll->rll_readers == 0); mutex_destroy(&rll->rll_lock); cv_destroy(&rll->rll_cv); } static void ztest_rll_lock(rll_t *rll, rl_type_t type) { mutex_enter(&rll->rll_lock); if (type == RL_READER) { while (rll->rll_writer != NULL) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_readers++; } else { while (rll->rll_writer != NULL || rll->rll_readers) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_writer = curthread; } mutex_exit(&rll->rll_lock); } static void ztest_rll_unlock(rll_t *rll) { mutex_enter(&rll->rll_lock); if (rll->rll_writer) { ASSERT(rll->rll_readers == 0); rll->rll_writer = NULL; } else { ASSERT(rll->rll_readers != 0); ASSERT(rll->rll_writer == NULL); rll->rll_readers--; } if (rll->rll_writer == NULL && rll->rll_readers == 0) cv_broadcast(&rll->rll_cv); mutex_exit(&rll->rll_lock); } static void ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) { rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; ztest_rll_lock(rll, type); } static void ztest_object_unlock(ztest_ds_t *zd, uint64_t object) { rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; ztest_rll_unlock(rll); } static rl_t * ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, rl_type_t type) { uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; rl_t *rl; rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); rl->rl_object = object; rl->rl_offset = offset; rl->rl_size = size; rl->rl_lock = rll; ztest_rll_lock(rll, type); return (rl); } static void ztest_range_unlock(rl_t *rl) { rll_t *rll = rl->rl_lock; ztest_rll_unlock(rll); umem_free(rl, sizeof (*rl)); } static void ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) { zd->zd_os = os; zd->zd_zilog = dmu_objset_zil(os); zd->zd_shared = szd; dmu_objset_name(os, zd->zd_name); if (zd->zd_shared != NULL) zd->zd_shared->zd_seq = 0; rw_init(&zd->zd_zilog_lock, NULL, USYNC_THREAD, NULL); mutex_init(&zd->zd_dirobj_lock, NULL, USYNC_THREAD, NULL); for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) ztest_rll_init(&zd->zd_object_lock[l]); for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) ztest_rll_init(&zd->zd_range_lock[l]); } static void ztest_zd_fini(ztest_ds_t *zd) { mutex_destroy(&zd->zd_dirobj_lock); for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) ztest_rll_destroy(&zd->zd_object_lock[l]); for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) ztest_rll_destroy(&zd->zd_range_lock[l]); } #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) static uint64_t ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) { uint64_t txg; int error; /* * Attempt to assign tx to some transaction group. */ error = dmu_tx_assign(tx, txg_how); if (error) { if (error == ERESTART) { ASSERT(txg_how == TXG_NOWAIT); dmu_tx_wait(tx); } else { ASSERT3U(error, ==, ENOSPC); ztest_record_enospc(tag); } dmu_tx_abort(tx); return (0); } txg = dmu_tx_get_txg(tx); ASSERT(txg != 0); return (txg); } static void ztest_pattern_set(void *buf, uint64_t size, uint64_t value) { uint64_t *ip = buf; uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); while (ip < ip_end) *ip++ = value; } static boolean_t ztest_pattern_match(void *buf, uint64_t size, uint64_t value) { uint64_t *ip = buf; uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); uint64_t diff = 0; while (ip < ip_end) diff |= (value - *ip++); return (diff == 0); } static void ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { bt->bt_magic = BT_MAGIC; bt->bt_objset = dmu_objset_id(os); bt->bt_object = object; bt->bt_dnodesize = dnodesize; bt->bt_offset = offset; bt->bt_gen = gen; bt->bt_txg = txg; bt->bt_crtxg = crtxg; } static void ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { ASSERT3U(bt->bt_magic, ==, BT_MAGIC); ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); ASSERT3U(bt->bt_object, ==, object); ASSERT3U(bt->bt_dnodesize, ==, dnodesize); ASSERT3U(bt->bt_offset, ==, offset); ASSERT3U(bt->bt_gen, <=, gen); ASSERT3U(bt->bt_txg, <=, txg); ASSERT3U(bt->bt_crtxg, ==, crtxg); } static ztest_block_tag_t * ztest_bt_bonus(dmu_buf_t *db) { dmu_object_info_t doi; ztest_block_tag_t *bt; dmu_object_info_from_db(db, &doi); ASSERT3U(doi.doi_bonus_size, <=, db->db_size); ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); return (bt); } /* * Generate a token to fill up unused bonus buffer space. Try to make * it unique to the object, generation, and offset to verify that data * is not getting overwritten by data from other dnodes. */ -#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ +#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) /* * Fill up the unused bonus buffer region before the block tag with a * verifiable pattern. Filling the whole bonus area with non-zero data * helps ensure that all dnode traversal code properly skips the * interior regions of large dnodes. */ void ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { uint64_t *bonusp; ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), gen, bonusp - (uint64_t *)db->db_data); *bonusp = token; } } /* * Verify that the unused area of a bonus buffer is filled with the * expected tokens. */ void ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { uint64_t *bonusp; for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), gen, bonusp - (uint64_t *)db->db_data); VERIFY3U(*bonusp, ==, token); } } /* * ZIL logging ops */ #define lrz_type lr_mode #define lrz_blocksize lr_uid #define lrz_ibshift lr_gid #define lrz_bonustype lr_rdev #define lrz_dnodesize lr_crtime[1] static void ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) { char *name = (void *)(lr + 1); /* name follows lr */ size_t namesize = strlen(name) + 1; itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) + namesize - sizeof (lr_t)); zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) { char *name = (void *)(lr + 1); /* name follows lr */ size_t namesize = strlen(name) + 1; itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) + namesize - sizeof (lr_t)); itx->itx_oid = object; zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) { itx_t *itx; itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); if (zil_replaying(zd->zd_zilog, tx)) return; if (lr->lr_length > ZIL_MAX_LOG_DATA) write_state = WR_INDIRECT; itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); if (write_state == WR_COPIED && dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); write_state = WR_NEED_COPY; } itx->itx_private = zd; itx->itx_wr_state = write_state; itx->itx_sync = (ztest_random(8) == 0); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); itx->itx_sync = B_FALSE; zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); itx->itx_sync = B_FALSE; zil_itx_assign(zd->zd_zilog, itx, tx); } /* * ZIL replay ops */ static int ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_create_t *lr = arg2; char *name = (void *)(lr + 1); /* name follows lr */ objset_t *os = zd->zd_os; ztest_block_tag_t *bbt; dmu_buf_t *db; dmu_tx_t *tx; uint64_t txg; int error = 0; int bonuslen; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT(lr->lr_doid == ZTEST_DIROBJ); ASSERT(name[0] != '\0'); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } else { dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); } txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) return (ENOSPC); ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { if (lr->lr_foid == 0) { lr->lr_foid = zap_create_dnsize(os, lr->lrz_type, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } else { error = zap_create_claim_dnsize(os, lr->lr_foid, lr->lrz_type, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } } else { if (lr->lr_foid == 0) { lr->lr_foid = dmu_object_alloc_dnsize(os, lr->lrz_type, 0, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } else { error = dmu_object_claim_dnsize(os, lr->lr_foid, lr->lrz_type, 0, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } } if (error) { ASSERT3U(error, ==, EEXIST); ASSERT(zd->zd_zilog->zl_replay); dmu_tx_commit(tx); return (error); } ASSERT(lr->lr_foid != 0); if (lr->lrz_type != DMU_OT_ZAP_OTHER) VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, lr->lrz_blocksize, lr->lrz_ibshift, tx)); VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); bbt = ztest_bt_bonus(db); dmu_buf_will_dirty(db, tx); ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, lr->lr_gen, txg, txg); ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); dmu_buf_rele(db, FTAG); VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, &lr->lr_foid, tx)); (void) ztest_log_create(zd, tx, lr); dmu_tx_commit(tx); return (0); } static int ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_remove_t *lr = arg2; char *name = (void *)(lr + 1); /* name follows lr */ objset_t *os = zd->zd_os; dmu_object_info_t doi; dmu_tx_t *tx; uint64_t object, txg; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT(lr->lr_doid == ZTEST_DIROBJ); ASSERT(name[0] != '\0'); VERIFY3U(0, ==, zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); ASSERT(object != 0); ztest_object_lock(zd, object, RL_WRITER); VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { ztest_object_unlock(zd, object); return (ENOSPC); } if (doi.doi_type == DMU_OT_ZAP_OTHER) { VERIFY3U(0, ==, zap_destroy(os, object, tx)); } else { VERIFY3U(0, ==, dmu_object_free(os, object, tx)); } VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); (void) ztest_log_remove(zd, tx, lr, object); dmu_tx_commit(tx); ztest_object_unlock(zd, object); return (0); } static int ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_write_t *lr = arg2; objset_t *os = zd->zd_os; void *data = lr + 1; /* data follows lr */ uint64_t offset, length; ztest_block_tag_t *bt = data; ztest_block_tag_t *bbt; uint64_t gen, txg, lrtxg, crtxg; dmu_object_info_t doi; dmu_tx_t *tx; dmu_buf_t *db; arc_buf_t *abuf = NULL; rl_t *rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } if (bt->bt_magic == BSWAP_64(BT_MAGIC)) byteswap_uint64_array(bt, sizeof (*bt)); if (bt->bt_magic != BT_MAGIC) bt = NULL; ztest_object_lock(zd, lr->lr_foid, RL_READER); rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); dmu_object_info_from_db(db, &doi); bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); gen = bbt->bt_gen; crtxg = bbt->bt_crtxg; lrtxg = lr->lr_common.lrc_txg; tx = dmu_tx_create(os); dmu_tx_hold_write(tx, lr->lr_foid, offset, length); if (ztest_random(8) == 0 && length == doi.doi_data_block_size && P2PHASE(offset, length) == 0) abuf = dmu_request_arcbuf(db, length); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { if (abuf != NULL) dmu_return_arcbuf(abuf); dmu_buf_rele(db, FTAG); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } if (bt != NULL) { /* * Usually, verify the old data before writing new data -- * but not always, because we also want to verify correct * behavior when the data was not recently read into cache. */ ASSERT(offset % doi.doi_data_block_size == 0); if (ztest_random(4) != 0) { int prefetch = ztest_random(2) ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; ztest_block_tag_t rbt; VERIFY(dmu_read(os, lr->lr_foid, offset, sizeof (rbt), &rbt, prefetch) == 0); if (rbt.bt_magic == BT_MAGIC) { ztest_bt_verify(&rbt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); } } /* * Writes can appear to be newer than the bonus buffer because * the ztest_get_data() callback does a dmu_read() of the * open-context data, which may be different than the data * as it was when the write was generated. */ if (zd->zd_zilog->zl_replay) { ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, MAX(gen, bt->bt_gen), MAX(txg, lrtxg), bt->bt_crtxg); } /* * Set the bt's gen/txg to the bonus buffer's gen/txg * so that all of the usual ASSERTs will work. */ ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); } if (abuf == NULL) { dmu_write(os, lr->lr_foid, offset, length, data, tx); } else { bcopy(data, abuf->b_data, length); dmu_assign_arcbuf(db, offset, abuf, tx); } (void) ztest_log_write(zd, tx, lr); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (0); } static int ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_truncate_t *lr = arg2; objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; rl_t *rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ztest_object_lock(zd, lr->lr_foid, RL_READER); rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, RL_WRITER); tx = dmu_tx_create(os); dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, lr->lr_length, tx) == 0); (void) ztest_log_truncate(zd, tx, lr); dmu_tx_commit(tx); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (0); } static int ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_setattr_t *lr = arg2; objset_t *os = zd->zd_os; dmu_tx_t *tx; dmu_buf_t *db; ztest_block_tag_t *bbt; uint64_t txg, lrtxg, crtxg, dnodesize; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ztest_object_lock(zd, lr->lr_foid, RL_WRITER); VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, lr->lr_foid); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); crtxg = bbt->bt_crtxg; lrtxg = lr->lr_common.lrc_txg; dnodesize = bbt->bt_dnodesize; if (zd->zd_zilog->zl_replay) { ASSERT(lr->lr_size != 0); ASSERT(lr->lr_mode != 0); ASSERT(lrtxg != 0); } else { /* * Randomly change the size and increment the generation. */ lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * sizeof (*bbt); lr->lr_mode = bbt->bt_gen + 1; ASSERT(lrtxg == 0); } /* * Verify that the current bonus buffer is not newer than our txg. */ ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, MAX(txg, lrtxg), crtxg); dmu_buf_will_dirty(db, tx); ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); ASSERT3U(lr->lr_size, <=, db->db_size); VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); bbt = ztest_bt_bonus(db); ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, txg, crtxg); ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); - dmu_buf_rele(db, FTAG); (void) ztest_log_setattr(zd, tx, lr); dmu_tx_commit(tx); ztest_object_unlock(zd, lr->lr_foid); return (0); } zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { NULL, /* 0 no such transaction type */ ztest_replay_create, /* TX_CREATE */ NULL, /* TX_MKDIR */ NULL, /* TX_MKXATTR */ NULL, /* TX_SYMLINK */ ztest_replay_remove, /* TX_REMOVE */ NULL, /* TX_RMDIR */ NULL, /* TX_LINK */ NULL, /* TX_RENAME */ ztest_replay_write, /* TX_WRITE */ ztest_replay_truncate, /* TX_TRUNCATE */ ztest_replay_setattr, /* TX_SETATTR */ NULL, /* TX_ACL */ NULL, /* TX_CREATE_ACL */ NULL, /* TX_CREATE_ATTR */ NULL, /* TX_CREATE_ACL_ATTR */ NULL, /* TX_MKDIR_ACL */ NULL, /* TX_MKDIR_ATTR */ NULL, /* TX_MKDIR_ACL_ATTR */ NULL, /* TX_WRITE2 */ }; /* * ZIL get_data callbacks */ /* ARGSUSED */ static void ztest_get_done(zgd_t *zgd, int error) { ztest_ds_t *zd = zgd->zgd_private; uint64_t object = zgd->zgd_rl->rl_object; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); ztest_range_unlock(zgd->zgd_rl); ztest_object_unlock(zd, object); umem_free(zgd, sizeof (*zgd)); } static int ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { ztest_ds_t *zd = arg; objset_t *os = zd->zd_os; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; uint64_t txg = lr->lr_common.lrc_txg; uint64_t crtxg; dmu_object_info_t doi; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); ztest_object_lock(zd, object, RL_READER); error = dmu_bonus_hold(os, object, FTAG, &db); if (error) { ztest_object_unlock(zd, object); return (error); } crtxg = ztest_bt_bonus(db)->bt_crtxg; if (crtxg == 0 || crtxg > txg) { dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, object); return (ENOENT); } dmu_object_info_from_db(db, &doi); dmu_buf_rele(db, FTAG); db = NULL; zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); zgd->zgd_lwb = lwb; zgd->zgd_private = zd; if (buf != NULL) { /* immediate write */ zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, RL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); ASSERT(error == 0); } else { size = doi.doi_data_block_size; if (ISP2(size)) { offset = P2ALIGN(offset, size); } else { ASSERT(offset < size); offset = 0; } zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, RL_READER); error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, ztest_get_done, zgd); if (error == 0) return (0); } } ztest_get_done(zgd, error); return (error); } static void * ztest_lr_alloc(size_t lrsize, char *name) { char *lr; size_t namesize = name ? strlen(name) + 1 : 0; lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); if (name) bcopy(name, lr + lrsize, namesize); return (lr); } void ztest_lr_free(void *lr, size_t lrsize, char *name) { size_t namesize = name ? strlen(name) + 1 : 0; umem_free(lr, lrsize + namesize); } /* * Lookup a bunch of objects. Returns the number of objects not found. */ static int ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int error; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); for (int i = 0; i < count; i++, od++) { od->od_object = 0; error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, sizeof (uint64_t), 1, &od->od_object); if (error) { ASSERT(error == ENOENT); ASSERT(od->od_object == 0); missing++; } else { dmu_buf_t *db; ztest_block_tag_t *bbt; dmu_object_info_t doi; ASSERT(od->od_object != 0); ASSERT(missing == 0); /* there should be no gaps */ ztest_object_lock(zd, od->od_object, RL_READER); VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, od->od_object, FTAG, &db)); dmu_object_info_from_db(db, &doi); bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); od->od_type = doi.doi_type; od->od_blocksize = doi.doi_data_block_size; od->od_gen = bbt->bt_gen; dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, od->od_object); } } return (missing); } static int ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); for (int i = 0; i < count; i++, od++) { if (missing) { od->od_object = 0; missing++; continue; } lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); lr->lr_doid = od->od_dir; lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ lr->lrz_type = od->od_crtype; lr->lrz_blocksize = od->od_crblocksize; lr->lrz_ibshift = ztest_random_ibshift(); lr->lrz_bonustype = DMU_OT_UINT64_OTHER; lr->lrz_dnodesize = od->od_crdnodesize; lr->lr_gen = od->od_crgen; lr->lr_crtime[0] = time(NULL); if (ztest_replay_create(zd, lr, B_FALSE) != 0) { ASSERT(missing == 0); od->od_object = 0; missing++; } else { od->od_object = lr->lr_foid; od->od_type = od->od_crtype; od->od_blocksize = od->od_crblocksize; od->od_gen = od->od_crgen; ASSERT(od->od_object != 0); } ztest_lr_free(lr, sizeof (*lr), od->od_name); } return (missing); } static int ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int error; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); od += count - 1; for (int i = count - 1; i >= 0; i--, od--) { if (missing) { missing++; continue; } /* * No object was found. */ if (od->od_object == 0) continue; lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); lr->lr_doid = od->od_dir; if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { ASSERT3U(error, ==, ENOSPC); missing++; } else { od->od_object = 0; } ztest_lr_free(lr, sizeof (*lr), od->od_name); } return (missing); } static int ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, void *data) { lr_write_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); lr->lr_foid = object; lr->lr_offset = offset; lr->lr_length = size; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); bcopy(data, lr + 1, size); error = ztest_replay_write(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr) + size, NULL); return (error); } static int ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) { lr_truncate_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr), NULL); lr->lr_foid = object; lr->lr_offset = offset; lr->lr_length = size; error = ztest_replay_truncate(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr), NULL); return (error); } static int ztest_setattr(ztest_ds_t *zd, uint64_t object) { lr_setattr_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr), NULL); lr->lr_foid = object; lr->lr_size = 0; lr->lr_mode = 0; error = ztest_replay_setattr(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr), NULL); return (error); } static void ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) { objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; rl_t *rl; txg_wait_synced(dmu_objset_pool(os), 0); ztest_object_lock(zd, object, RL_READER); rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, object, offset, size); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg != 0) { dmu_prealloc(os, object, offset, size, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); } else { (void) dmu_free_long_range(os, object, offset, size); } ztest_range_unlock(rl); ztest_object_unlock(zd, object); } static void ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) { int err; ztest_block_tag_t wbt; dmu_object_info_t doi; enum ztest_io_type io_type; uint64_t blocksize; void *data; VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); blocksize = doi.doi_data_block_size; data = umem_alloc(blocksize, UMEM_NOFAIL); /* * Pick an i/o type at random, biased toward writing block tags. */ io_type = ztest_random(ZTEST_IO_TYPES); if (ztest_random(2) == 0) io_type = ZTEST_IO_WRITE_TAG; rw_enter(&zd->zd_zilog_lock, RW_READER); switch (io_type) { case ZTEST_IO_WRITE_TAG: ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, offset, 0, 0, 0); (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); break; case ZTEST_IO_WRITE_PATTERN: (void) memset(data, 'a' + (object + offset) % 5, blocksize); if (ztest_random(2) == 0) { /* * Induce fletcher2 collisions to ensure that * zio_ddt_collision() detects and resolves them * when using fletcher2-verify for deduplication. */ ((uint64_t *)data)[0] ^= 1ULL << 63; ((uint64_t *)data)[4] ^= 1ULL << 63; } (void) ztest_write(zd, object, offset, blocksize, data); break; case ZTEST_IO_WRITE_ZEROES: bzero(data, blocksize); (void) ztest_write(zd, object, offset, blocksize, data); break; case ZTEST_IO_TRUNCATE: (void) ztest_truncate(zd, object, offset, blocksize); break; case ZTEST_IO_SETATTR: (void) ztest_setattr(zd, object); break; case ZTEST_IO_REWRITE: rw_enter(&ztest_name_lock, RW_READER); err = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), B_FALSE); VERIFY(err == 0 || err == ENOSPC); err = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COMPRESSION, ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), B_FALSE); VERIFY(err == 0 || err == ENOSPC); rw_exit(&ztest_name_lock); VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, DMU_READ_NO_PREFETCH)); (void) ztest_write(zd, object, offset, blocksize, data); break; } rw_exit(&zd->zd_zilog_lock); umem_free(data, blocksize); } /* * Initialize an object description template. */ static void ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, uint64_t gen) { od->od_dir = ZTEST_DIROBJ; od->od_object = 0; od->od_crtype = type; od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); od->od_crgen = gen; od->od_type = DMU_OT_NONE; od->od_blocksize = 0; od->od_gen = 0; (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", tag, (int64_t)id, index); } /* * Lookup or create the objects for a test using the od template. * If the objects do not all exist, or if 'remove' is specified, * remove any existing objects and create new ones. Otherwise, * use the existing objects. */ static int ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) { int count = size / sizeof (*od); int rv = 0; mutex_enter(&zd->zd_dirobj_lock); if ((ztest_lookup(zd, od, count) != 0 || remove) && (ztest_remove(zd, od, count) != 0 || ztest_create(zd, od, count) != 0)) rv = -1; zd->zd_od = od; mutex_exit(&zd->zd_dirobj_lock); return (rv); } /* ARGSUSED */ void ztest_zil_commit(ztest_ds_t *zd, uint64_t id) { zilog_t *zilog = zd->zd_zilog; rw_enter(&zd->zd_zilog_lock, RW_READER); zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); /* * Remember the committed values in zd, which is in parent/child * shared memory. If we die, the next iteration of ztest_run() * will verify that the log really does contain this record. */ mutex_enter(&zilog->zl_lock); ASSERT(zd->zd_shared != NULL); ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; mutex_exit(&zilog->zl_lock); rw_exit(&zd->zd_zilog_lock); } /* * This function is designed to simulate the operations that occur during a * mount/unmount operation. We hold the dataset across these operations in an * attempt to expose any implicit assumptions about ZIL management. */ /* ARGSUSED */ void ztest_zil_remount(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; /* * We grab the zd_dirobj_lock to ensure that no other thread is * updating the zil (i.e. adding in-memory log records) and the * zd_zilog_lock to block any I/O. */ mutex_enter(&zd->zd_dirobj_lock); rw_enter(&zd->zd_zilog_lock, RW_WRITER); /* zfsvfs_teardown() */ zil_close(zd->zd_zilog); /* zfsvfs_setup() */ VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog); zil_replay(os, zd, ztest_replay_vector); rw_exit(&zd->zd_zilog_lock); mutex_exit(&zd->zd_dirobj_lock); } /* * Verify that we can't destroy an active pool, create an existing pool, * or create a pool with a bad vdev spec. */ /* ARGSUSED */ void ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) { ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa; nvlist_t *nvroot; /* * Attempt to create using a bad file. */ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_file", nvroot, NULL, NULL)); nvlist_free(nvroot); /* * Attempt to create using a bad mirror. */ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_mirror", nvroot, NULL, NULL)); nvlist_free(nvroot); /* * Attempt to create an existing pool. It shouldn't matter * what's in the nvroot; we should fail with EEXIST. */ rw_enter(&ztest_name_lock, RW_READER); nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL)); nvlist_free(nvroot); VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool)); spa_close(spa, FTAG); rw_exit(&ztest_name_lock); } /* ARGSUSED */ void ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) { spa_t *spa; uint64_t initial_version = SPA_VERSION_INITIAL; uint64_t version, newversion; nvlist_t *nvroot, *props; char *name; mutex_enter(&ztest_vdev_lock); name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); /* * Clean up from previous runs. */ (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the * the initial version is capable of supporting that feature. */ switch (ztest_opts.zo_raidz_parity) { case 0: case 1: initial_version = SPA_VERSION_INITIAL; break; case 2: initial_version = SPA_VERSION_RAIDZ2; break; case 3: initial_version = SPA_VERSION_RAIDZ3; break; } /* * Create a pool with a spa version that can be upgraded. Pick * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. */ do { version = ztest_random_spa_version(initial_version); } while (version > SPA_VERSION_BEFORE_FEATURES); props = fnvlist_alloc(); fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), version); VERIFY0(spa_create(name, nvroot, props, NULL)); fnvlist_free(nvroot); fnvlist_free(props); VERIFY0(spa_open(name, &spa, FTAG)); VERIFY3U(spa_version(spa), ==, version); newversion = ztest_random_spa_version(version + 1); if (ztest_opts.zo_verbose >= 4) { (void) printf("upgrading spa version from %llu to %llu\n", (u_longlong_t)version, (u_longlong_t)newversion); } spa_upgrade(spa, newversion); VERIFY3U(spa_version(spa), >, version); VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, zpool_prop_to_name(ZPOOL_PROP_VERSION))); spa_close(spa, FTAG); strfree(name); mutex_exit(&ztest_vdev_lock); } static void ztest_spa_checkpoint(spa_t *spa) { ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); int error = spa_checkpoint(spa->spa_name); switch (error) { case 0: case ZFS_ERR_DEVRM_IN_PROGRESS: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_CHECKPOINT_EXISTS: break; case ENOSPC: ztest_record_enospc(FTAG); break; default: fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error); } } static void ztest_spa_discard_checkpoint(spa_t *spa) { ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); int error = spa_checkpoint_discard(spa->spa_name); switch (error) { case 0: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_NO_CHECKPOINT: break; default: fatal(0, "spa_discard_checkpoint(%s) = %d", spa->spa_name, error); } } /* ARGSUSED */ void ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; mutex_enter(&ztest_checkpoint_lock); if (ztest_random(2) == 0) { ztest_spa_checkpoint(spa); } else { ztest_spa_discard_checkpoint(spa); } mutex_exit(&ztest_checkpoint_lock); } static vdev_t * vdev_lookup_by_path(vdev_t *vd, const char *path) { vdev_t *mvd; if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != NULL) return (mvd); return (NULL); } /* * Find the first available hole which can be used as a top-level. */ int find_vdev_hole(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); for (c = 0; c < rvd->vdev_children; c++) { vdev_t *cvd = rvd->vdev_child[c]; if (cvd->vdev_ishole) break; } return (c); } /* * Verify that vdev_add() works as expected. */ /* ARGSUSED */ void ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; nvlist_t *nvroot; int error; mutex_enter(&ztest_vdev_lock); leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; /* * If we have slogs then remove them 1/4 of the time. */ if (spa_has_slogs(spa) && ztest_random(4) == 0) { /* * Grab the guid from the head of the log class rotor. */ guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); /* * We have to grab the zs_name_lock as writer to * prevent a race between removing a slog (dmu_objset_find) * and destroying a dataset. Removing the slog will * grab a reference on the dataset which may cause * dmu_objset_destroy() to fail with EBUSY thus * leaving the dataset in an inconsistent state. */ rw_enter(&ztest_name_lock, RW_WRITER); error = spa_vdev_remove(spa, guid, B_FALSE); rw_exit(&ztest_name_lock); switch (error) { case 0: case EEXIST: case ZFS_ERR_CHECKPOINT_EXISTS: case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: fatal(0, "spa_vdev_remove() = %d", error); } } else { spa_config_exit(spa, SCL_VDEV, FTAG); /* * Make 1/4 of the devices be log devices. */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, ztest_random(4) == 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); switch (error) { case 0: break; case ENOSPC: ztest_record_enospc("spa_vdev_add"); break; default: fatal(0, "spa_vdev_add() = %d", error); } } mutex_exit(&ztest_vdev_lock); } /* * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. */ /* ARGSUSED */ void ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; spa_aux_vdev_t *sav; char *aux; uint64_t guid = 0; int error; if (ztest_random(2) == 0) { sav = &spa->spa_spares; aux = ZPOOL_CONFIG_SPARES; } else { sav = &spa->spa_l2cache; aux = ZPOOL_CONFIG_L2CACHE; } mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); if (sav->sav_count != 0 && ztest_random(4) == 0) { /* * Pick a random device to remove. */ guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; } else { /* * Find an unused device we can add. */ zs->zs_vdev_aux = 0; for (;;) { char path[MAXPATHLEN]; int c; (void) snprintf(path, sizeof (path), ztest_aux_template, ztest_opts.zo_dir, ztest_opts.zo_pool, aux, zs->zs_vdev_aux); for (c = 0; c < sav->sav_count; c++) if (strcmp(sav->sav_vdevs[c]->vdev_path, path) == 0) break; if (c == sav->sav_count && vdev_lookup_by_path(rvd, path) == NULL) break; zs->zs_vdev_aux++; } } spa_config_exit(spa, SCL_VDEV, FTAG); if (guid == 0) { /* * Add a new device. */ nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1); error = spa_vdev_add(spa, nvroot); switch (error) { case 0: break; default: fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); } nvlist_free(nvroot); } else { /* * Remove an existing device. Sometimes, dirty its * vdev state first to make sure we handle removal * of devices that have pending state changes. */ if (ztest_random(2) == 0) (void) vdev_online(spa, guid, 0, NULL); error = spa_vdev_remove(spa, guid, B_FALSE); switch (error) { case 0: case EBUSY: case ZFS_ERR_CHECKPOINT_EXISTS: case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); } } mutex_exit(&ztest_vdev_lock); } /* * split a pool if it has mirror tlvdevs */ /* ARGSUSED */ void ztest_split_pool(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; nvlist_t *tree, **child, *config, *split, **schild; uint_t c, children, schildren = 0, lastlogid = 0; int error = 0; mutex_enter(&ztest_vdev_lock); /* ensure we have a useable config; mirrors of raidz aren't supported */ if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { mutex_exit(&ztest_vdev_lock); return; } /* clean up the old pool, if any */ (void) spa_destroy("splitp"); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* generate a config from the existing config */ mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, &tree) == 0); mutex_exit(&spa->spa_props_lock); VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); for (c = 0; c < children; c++) { vdev_t *tvd = rvd->vdev_child[c]; nvlist_t **mchild; uint_t mchildren; if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(schild[schildren], ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); VERIFY(nvlist_add_uint64(schild[schildren], ZPOOL_CONFIG_IS_HOLE, 1) == 0); if (lastlogid == 0) lastlogid = schildren; ++schildren; continue; } lastlogid = 0; VERIFY(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); } /* OK, create a config that can be used to split */ VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, lastlogid != 0 ? lastlogid : schildren) == 0); VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); for (c = 0; c < schildren; c++) nvlist_free(schild[c]); free(schild); nvlist_free(split); spa_config_exit(spa, SCL_VDEV, FTAG); rw_enter(&ztest_name_lock, RW_WRITER); error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); rw_exit(&ztest_name_lock); nvlist_free(config); if (error == 0) { (void) printf("successful split - results:\n"); mutex_enter(&spa_namespace_lock); show_pool_stats(spa); show_pool_stats(spa_lookup("splitp")); mutex_exit(&spa_namespace_lock); ++zs->zs_splits; --zs->zs_mirrors; } mutex_exit(&ztest_vdev_lock); } /* * Verify that we can attach and detach devices. */ /* ARGSUSED */ void ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; spa_aux_vdev_t *sav = &spa->spa_spares; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *pvd; nvlist_t *root; uint64_t leaves; uint64_t leaf, top; uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; uint64_t oldsize, newsize; char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; int replacing; int oldvd_has_siblings = B_FALSE; int newvd_is_spare = B_FALSE; int oldvd_is_log; int error, expected_error; mutex_enter(&ztest_vdev_lock); leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * If a vdev is in the process of being removed, its removal may * finish while we are in progress, leading to an unexpected error * value. Don't bother trying to attach while we are in the middle * of removal. */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&ztest_vdev_lock); return; } /* * Decide whether to do an attach or a replace. */ replacing = ztest_random(2); /* * Pick a random top-level vdev. */ top = ztest_random_vdev_top(spa, B_TRUE); /* * Pick a random leaf within it. */ leaf = ztest_random(leaves); /* * Locate this vdev. */ oldvd = rvd->vdev_child[top]; if (zs->zs_mirrors >= 1) { ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); ASSERT(oldvd->vdev_children >= zs->zs_mirrors); oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; } if (ztest_opts.zo_raidz > 1) { ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; } /* * If we're already doing an attach or replace, oldvd may be a * mirror vdev -- in which case, pick a random child. */ while (oldvd->vdev_children != 0) { oldvd_has_siblings = B_TRUE; ASSERT(oldvd->vdev_children >= 2); oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; } oldguid = oldvd->vdev_guid; oldsize = vdev_get_min_asize(oldvd); oldvd_is_log = oldvd->vdev_top->vdev_islog; (void) strcpy(oldpath, oldvd->vdev_path); pvd = oldvd->vdev_parent; pguid = pvd->vdev_guid; /* * If oldvd has siblings, then half of the time, detach it. */ if (oldvd_has_siblings && ztest_random(2) == 0) { spa_config_exit(spa, SCL_ALL, FTAG); error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); if (error != 0 && error != ENODEV && error != EBUSY && error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && error != ZFS_ERR_DISCARDING_CHECKPOINT) fatal(0, "detach (%s) returned %d", oldpath, error); mutex_exit(&ztest_vdev_lock); return; } /* * For the new vdev, choose with equal probability between the two * standard paths (ending in either 'a' or 'b') or a random hot spare. */ if (sav->sav_count != 0 && ztest_random(3) == 0) { newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; newvd_is_spare = B_TRUE; (void) strcpy(newpath, newvd->vdev_path); } else { (void) snprintf(newpath, sizeof (newpath), ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + leaf); if (ztest_random(2) == 0) newpath[strlen(newpath) - 1] = 'b'; newvd = vdev_lookup_by_path(rvd, newpath); } if (newvd) { /* * Reopen to ensure the vdev's asize field isn't stale. */ vdev_reopen(newvd); newsize = vdev_get_min_asize(newvd); } else { /* * Make newsize a little bigger or smaller than oldsize. * If it's smaller, the attach should fail. * If it's larger, and we're doing a replace, * we should get dynamic LUN growth when we're done. */ newsize = 10 * oldsize / (9 + ztest_random(3)); } /* * If pvd is not a mirror or root, the attach should fail with ENOTSUP, * unless it's a replace; in that case any non-replacing parent is OK. * * If newvd is already part of the pool, it should fail with EBUSY. * * If newvd is too small, it should fail with EOVERFLOW. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || pvd->vdev_ops == &vdev_replacing_ops || pvd->vdev_ops == &vdev_spare_ops)) expected_error = ENOTSUP; else if (newvd_is_spare && (!replacing || oldvd_is_log)) expected_error = ENOTSUP; else if (newvd == oldvd) expected_error = replacing ? 0 : EBUSY; else if (vdev_lookup_by_path(rvd, newpath) != NULL) expected_error = EBUSY; else if (newsize < oldsize) expected_error = EOVERFLOW; else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; else expected_error = 0; spa_config_exit(spa, SCL_ALL, FTAG); /* * Build the nvlist describing newpath. */ root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, ashift, 0, 0, 0, 1); error = spa_vdev_attach(spa, oldguid, root, replacing); nvlist_free(root); /* * If our parent was the replacing vdev, but the replace completed, * then instead of failing with ENOTSUP we may either succeed, * fail with ENODEV, or fail with EOVERFLOW. */ if (expected_error == ENOTSUP && (error == 0 || error == ENODEV || error == EOVERFLOW)) expected_error = error; /* * If someone grew the LUN, the replacement may be too small. */ if (error == EOVERFLOW || error == EBUSY) expected_error = error; if (error == ZFS_ERR_CHECKPOINT_EXISTS || error == ZFS_ERR_DISCARDING_CHECKPOINT) expected_error = error; /* XXX workaround 6690467 */ if (error != expected_error && expected_error != EBUSY) { fatal(0, "attach (%s %llu, %s %llu, %d) " "returned %d, expected %d", oldpath, oldsize, newpath, newsize, replacing, error, expected_error); } mutex_exit(&ztest_vdev_lock); } /* ARGSUSED */ void ztest_device_removal(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; vdev_t *vd; uint64_t guid; int error; mutex_enter(&ztest_vdev_lock); if (ztest_device_removal_active) { mutex_exit(&ztest_vdev_lock); return; } /* * Remove a random top-level vdev and wait for removal to finish. */ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); guid = vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); error = spa_vdev_remove(spa, guid, B_FALSE); if (error == 0) { ztest_device_removal_active = B_TRUE; mutex_exit(&ztest_vdev_lock); while (spa->spa_vdev_removal != NULL) txg_wait_synced(spa_get_dsl(spa), 0); } else { mutex_exit(&ztest_vdev_lock); return; } /* * The pool needs to be scrubbed after completing device removal. * Failure to do so may result in checksum errors due to the * strategy employed by ztest_fault_inject() when selecting which * offset are redundant and can be damaged. */ error = spa_scan(spa, POOL_SCAN_SCRUB); if (error == 0) { while (dsl_scan_scrubbing(spa_get_dsl(spa))) txg_wait_synced(spa_get_dsl(spa), 0); } mutex_enter(&ztest_vdev_lock); ztest_device_removal_active = B_FALSE; mutex_exit(&ztest_vdev_lock); } /* * Callback function which expands the physical size of the vdev. */ vdev_t * grow_vdev(vdev_t *vd, void *arg) { spa_t *spa = vd->vdev_spa; size_t *newsize = arg; size_t fsize; int fd; ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); if ((fd = open(vd->vdev_path, O_RDWR)) == -1) return (vd); fsize = lseek(fd, 0, SEEK_END); (void) ftruncate(fd, *newsize); if (ztest_opts.zo_verbose >= 6) { (void) printf("%s grew from %lu to %lu bytes\n", vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); } (void) close(fd); return (NULL); } /* * Callback function which expands a given vdev by calling vdev_online(). */ /* ARGSUSED */ vdev_t * online_vdev(vdev_t *vd, void *arg) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint64_t guid = vd->vdev_guid; uint64_t generation = spa->spa_config_generation + 1; vdev_state_t newstate = VDEV_STATE_UNKNOWN; int error; ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); /* Calling vdev_online will initialize the new metaslabs */ spa_config_exit(spa, SCL_STATE, spa); error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* * If vdev_online returned an error or the underlying vdev_open * failed then we abort the expand. The only way to know that * vdev_open fails is by checking the returned newstate. */ if (error || newstate != VDEV_STATE_HEALTHY) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Unable to expand vdev, state %llu, " "error %d\n", (u_longlong_t)newstate, error); } return (vd); } ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); /* * Since we dropped the lock we need to ensure that we're * still talking to the original vdev. It's possible this * vdev may have been detached/replaced while we were * trying to online it. */ if (generation != spa->spa_config_generation) { if (ztest_opts.zo_verbose >= 5) { (void) printf("vdev configuration has changed, " "guid %llu, state %llu, expected gen %llu, " "got gen %llu\n", (u_longlong_t)guid, (u_longlong_t)tvd->vdev_state, (u_longlong_t)generation, (u_longlong_t)spa->spa_config_generation); } return (vd); } return (NULL); } /* * Traverse the vdev tree calling the supplied function. * We continue to walk the tree until we either have walked all * children or we receive a non-NULL return from the callback. * If a NULL callback is passed, then we just return back the first * leaf vdev we encounter. */ vdev_t * vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) { if (vd->vdev_ops->vdev_op_leaf) { if (func == NULL) return (vd); else return (func(vd, arg)); } for (uint_t c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) return (cvd); } return (NULL); } /* * Verify that dynamic LUN growth works as expected. */ /* ARGSUSED */ void ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; vdev_t *vd, *tvd; metaslab_class_t *mc; metaslab_group_t *mg; size_t psize, newsize; uint64_t top; uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; mutex_enter(&ztest_checkpoint_lock); mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* * If there is a vdev removal in progress, it could complete while * we are running, in which case we would not be able to verify * that the metaslab_class space increased (because it decreases * when the device removal completes). */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } top = ztest_random_vdev_top(spa, B_TRUE); tvd = spa->spa_root_vdev->vdev_child[top]; mg = tvd->vdev_mg; mc = mg->mg_class; old_ms_count = tvd->vdev_ms_count; old_class_space = metaslab_class_get_space(mc); /* * Determine the size of the first leaf vdev associated with * our top-level device. */ vd = vdev_walk_tree(tvd, NULL, NULL); ASSERT3P(vd, !=, NULL); ASSERT(vd->vdev_ops->vdev_op_leaf); psize = vd->vdev_psize; /* * We only try to expand the vdev if it's healthy, less than 4x its * original size, and it has a valid psize. */ if (tvd->vdev_state != VDEV_STATE_HEALTHY || psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } ASSERT(psize > 0); newsize = psize + psize / 8; ASSERT3U(newsize, >, psize); if (ztest_opts.zo_verbose >= 6) { (void) printf("Expanding LUN %s from %lu to %lu\n", vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); } /* * Growing the vdev is a two step process: * 1). expand the physical size (i.e. relabel) * 2). online the vdev to create the new metaslabs */ if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || vdev_walk_tree(tvd, online_vdev, NULL) != NULL || tvd->vdev_state != VDEV_STATE_HEALTHY) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not expand LUN because " "the vdev configuration changed.\n"); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } spa_config_exit(spa, SCL_STATE, spa); /* * Expanding the LUN will update the config asynchronously, * thus we must wait for the async thread to complete any * pending tasks before proceeding. */ for (;;) { boolean_t done; mutex_enter(&spa->spa_async_lock); done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); mutex_exit(&spa->spa_async_lock); if (done) break; txg_wait_synced(spa_get_dsl(spa), 0); (void) poll(NULL, 0, 100); } spa_config_enter(spa, SCL_STATE, spa, RW_READER); tvd = spa->spa_root_vdev->vdev_child[top]; new_ms_count = tvd->vdev_ms_count; new_class_space = metaslab_class_get_space(mc); if (tvd->vdev_mg != mg || mg->mg_class != mc) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not verify LUN expansion due to " "intervening vdev offline or remove.\n"); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } /* * Make sure we were able to grow the vdev. */ if (new_ms_count <= old_ms_count) { fatal(0, "LUN expansion failed: ms_count %llu < %llu\n", old_ms_count, new_ms_count); } /* * Make sure we were able to grow the pool. */ if (new_class_space <= old_class_space) { fatal(0, "LUN expansion failed: class_space %llu < %llu\n", old_class_space, new_class_space); } if (ztest_opts.zo_verbose >= 5) { char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); (void) printf("%s grew from %s to %s\n", spa->spa_name, oldnumbuf, newnumbuf); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); } /* * Verify that dmu_objset_{create,destroy,open,close} work as expected. */ /* ARGSUSED */ static void ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { /* * Create the objects common to all ztest datasets. */ VERIFY(zap_create_claim(os, ZTEST_DIROBJ, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); } static int ztest_dataset_create(char *dsname) { uint64_t zilset = ztest_random(100); int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, ztest_objset_create_cb, NULL); if (err || zilset < 80) return (err); if (ztest_opts.zo_verbose >= 6) (void) printf("Setting dataset %s to sync always\n", dsname); return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, ZFS_SYNC_ALWAYS, B_FALSE)); } /* ARGSUSED */ static int ztest_objset_destroy_cb(const char *name, void *arg) { objset_t *os; dmu_object_info_t doi; int error; /* * Verify that the dataset contains a directory object. */ VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os)); error = dmu_object_info(os, ZTEST_DIROBJ, &doi); if (error != ENOENT) { /* We could have crashed in the middle of destroying it */ ASSERT0(error); ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); ASSERT3S(doi.doi_physical_blocks_512, >=, 0); } dmu_objset_disown(os, FTAG); /* * Destroy the dataset. */ if (strchr(name, '@') != NULL) { VERIFY0(dsl_destroy_snapshot(name, B_FALSE)); } else { VERIFY0(dsl_destroy_head(name)); } return (0); } static boolean_t ztest_snapshot_create(char *osname, uint64_t id) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); error = dmu_objset_snapshot_one(osname, snapname); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (B_FALSE); } if (error != 0 && error != EEXIST) { fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, snapname, error); } return (B_TRUE); } static boolean_t ztest_snapshot_destroy(char *osname, uint64_t id) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, (u_longlong_t)id); error = dsl_destroy_snapshot(snapname, B_FALSE); if (error != 0 && error != ENOENT) fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); return (B_TRUE); } /* ARGSUSED */ void ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) { ztest_ds_t zdtmp; int iters; int error; objset_t *os, *os2; char name[ZFS_MAX_DATASET_NAME_LEN]; zilog_t *zilog; rw_enter(&ztest_name_lock, RW_READER); (void) snprintf(name, sizeof (name), "%s/temp_%llu", ztest_opts.zo_pool, (u_longlong_t)id); /* * If this dataset exists from a previous run, process its replay log * half of the time. If we don't replay it, then dmu_objset_destroy() * (invoked from ztest_objset_destroy_cb()) should just throw it away. */ if (ztest_random(2) == 0 && dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) { ztest_zd_init(&zdtmp, NULL, os); zil_replay(os, &zdtmp, ztest_replay_vector); ztest_zd_fini(&zdtmp); dmu_objset_disown(os, FTAG); } /* * There may be an old instance of the dataset we're about to * create lying around from a previous run. If so, destroy it * and all of its snapshots. */ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); /* * Verify that the destroyed dataset is no longer in the namespace. */ VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os)); /* * Verify that we can create a new dataset. */ error = ztest_dataset_create(name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); rw_exit(&ztest_name_lock); return; } fatal(0, "dmu_objset_create(%s) = %d", name, error); } VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); ztest_zd_init(&zdtmp, NULL, os); /* * Open the intent log for it. */ zilog = zil_open(os, ztest_get_data); /* * Put some objects in there, do a little I/O to them, * and randomly take a couple of snapshots along the way. */ iters = ztest_random(5); for (int i = 0; i < iters; i++) { ztest_dmu_object_alloc_free(&zdtmp, id); if (ztest_random(iters) == 0) (void) ztest_snapshot_create(name, i); } /* * Verify that we cannot create an existing dataset. */ VERIFY3U(EEXIST, ==, dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL)); /* * Verify that we can hold an objset that is also owned. */ VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); dmu_objset_rele(os2, FTAG); /* * Verify that we cannot own an objset that is already owned. */ VERIFY3U(EBUSY, ==, dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2)); zil_close(zilog); dmu_objset_disown(os, FTAG); ztest_zd_fini(&zdtmp); rw_exit(&ztest_name_lock); } /* * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. */ void ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) { rw_enter(&ztest_name_lock, RW_READER); (void) ztest_snapshot_destroy(zd->zd_name, id); (void) ztest_snapshot_create(zd->zd_name, id); rw_exit(&ztest_name_lock); } /* * Cleanup non-standard snapshots and clones. */ void ztest_dsl_dataset_cleanup(char *osname, uint64_t id) { char snap1name[ZFS_MAX_DATASET_NAME_LEN]; char clone1name[ZFS_MAX_DATASET_NAME_LEN]; char snap2name[ZFS_MAX_DATASET_NAME_LEN]; char clone2name[ZFS_MAX_DATASET_NAME_LEN]; char snap3name[ZFS_MAX_DATASET_NAME_LEN]; int error; (void) snprintf(snap1name, sizeof (snap1name), "%s@s1_%llu", osname, id); (void) snprintf(clone1name, sizeof (clone1name), "%s/c1_%llu", osname, id); (void) snprintf(snap2name, sizeof (snap2name), "%s@s2_%llu", clone1name, id); (void) snprintf(clone2name, sizeof (clone2name), "%s/c2_%llu", osname, id); (void) snprintf(snap3name, sizeof (snap3name), "%s@s3_%llu", clone1name, id); error = dsl_destroy_head(clone2name); if (error && error != ENOENT) fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); error = dsl_destroy_snapshot(snap3name, B_FALSE); if (error && error != ENOENT) fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); error = dsl_destroy_snapshot(snap2name, B_FALSE); if (error && error != ENOENT) fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); error = dsl_destroy_head(clone1name); if (error && error != ENOENT) fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); error = dsl_destroy_snapshot(snap1name, B_FALSE); if (error && error != ENOENT) fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); } /* * Verify dsl_dataset_promote handles EBUSY */ void ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) { objset_t *os; char snap1name[ZFS_MAX_DATASET_NAME_LEN]; char clone1name[ZFS_MAX_DATASET_NAME_LEN]; char snap2name[ZFS_MAX_DATASET_NAME_LEN]; char clone2name[ZFS_MAX_DATASET_NAME_LEN]; char snap3name[ZFS_MAX_DATASET_NAME_LEN]; char *osname = zd->zd_name; int error; rw_enter(&ztest_name_lock, RW_READER); ztest_dsl_dataset_cleanup(osname, id); (void) snprintf(snap1name, sizeof (snap1name), "%s@s1_%llu", osname, id); (void) snprintf(clone1name, sizeof (clone1name), "%s/c1_%llu", osname, id); (void) snprintf(snap2name, sizeof (snap2name), "%s@s2_%llu", clone1name, id); (void) snprintf(clone2name, sizeof (clone2name), "%s/c2_%llu", osname, id); (void) snprintf(snap3name, sizeof (snap3name), "%s@s3_%llu", clone1name, id); error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); } error = dmu_objset_clone(clone1name, snap1name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); } error = dmu_objset_clone(clone2name, snap3name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); } error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os); if (error) fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); error = dsl_dataset_promote(clone2name, NULL); if (error == ENOSPC) { dmu_objset_disown(os, FTAG); ztest_record_enospc(FTAG); goto out; } if (error != EBUSY) fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, error); dmu_objset_disown(os, FTAG); out: ztest_dsl_dataset_cleanup(osname, id); rw_exit(&ztest_name_lock); } /* * Verify that dmu_object_{alloc,free} work as expected. */ void ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) { ztest_od_t od[4]; int batchsize = sizeof (od) / sizeof (od[0]); - for (int b = 0; b < batchsize; b++) { - ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, - 0, 0, 0); - } + for (int b = 0; b < batchsize; b++) + ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0, 0); /* * Destroy the previous batch of objects, create a new batch, * and do some I/O on the new objects. */ if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0) return; while (ztest_random(4 * batchsize) != 0) ztest_io(zd, od[ztest_random(batchsize)].od_object, ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); } /* - * Rewind the global allocator to verify object allocation backfilling. - */ -void -ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) -{ - objset_t *os = zd->zd_os; - int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; - uint64_t object; - - /* - * Rewind the global allocator randomly back to a lower object number - * to force backfilling and reclamation of recently freed dnodes. - */ - mutex_enter(&os->os_obj_lock); - object = ztest_random(os->os_obj_next_chunk); - os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); - mutex_exit(&os->os_obj_lock); -} - -/* * Verify that dmu_{read,write} work as expected. */ void ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t od[2]; dmu_tx_t *tx; int i, freeit, error; uint64_t n, s, txg; bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 40; int free_percent = 5; /* * This test uses two objects, packobj and bigobj, that are always * updated together (i.e. in the same tx) so that their contents are * in sync and can be compared. Their contents relate to each other * in a simple way: packobj is a dense array of 'bufwad' structures, * while bigobj is a sparse array of the same bufwads. Specifically, * for any index n, there are three bufwads that should be identical: * * packobj, at offset n * sizeof (bufwad_t) * bigobj, at the head of the nth chunk * bigobj, at the tail of the nth chunk * * The chunk size is arbitrary. It doesn't have to be a power of two, * and it doesn't have any relation to the object blocksize. * The only requirement is that it can hold at least two bufwads. * * Normally, we write the bufwad to each of these locations. * However, free_percent of the time we instead write zeroes to * packobj and perform a dmu_free_range() on bigobj. By comparing * bigobj to packobj, we can verify that the DMU is correctly * tracking which parts of an object are allocated and free, * and that the contents of the allocated blocks are correct. */ /* * Read the directory info. If it's the first time, set things up. */ - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, - chunksize); - ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, - chunksize); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); + ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; bigobj = od[0].od_object; packobj = od[1].od_object; chunksize = od[0].od_gen; ASSERT(chunksize == od[1].od_gen); /* * Prefetch a random chunk of the big object. * Our aim here is to get some async reads in flight * for blocks that we may free below; the DMU should * handle this race correctly. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(2 * width - 1); dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, ZIO_PRIORITY_SYNC_READ); /* * Pick a random index and compute the offsets into packobj and bigobj. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(width - 1); packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); bigoff = n * chunksize; bigsize = s * chunksize; packbuf = umem_alloc(packsize, UMEM_NOFAIL); bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); /* * free_percent of the time, free a range of bigobj rather than * overwriting it. */ freeit = (ztest_random(100) < free_percent); /* * Read the current contents of our objects. */ error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); ASSERT0(error); /* * Get a tx for the mods to both packobj and bigobj. */ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, packobj, packoff, packsize); if (freeit) dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); else dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); /* This accounts for setting the checksum/compression. */ dmu_tx_hold_bonus(tx, bigobj); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); return; } enum zio_checksum cksum; do { cksum = (enum zio_checksum) ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); dmu_object_set_checksum(os, bigobj, cksum, tx); enum zio_compress comp; do { comp = (enum zio_compress) ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); dmu_object_set_compress(os, bigobj, comp, tx); /* * For each index from n to n + s, verify that the existing bufwad * in packobj matches the bufwads at the head and tail of the * corresponding chunk in bigobj. Then update all three bufwads * with the new values we want to write out. */ for (i = 0; i < s; i++) { /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); if (pack->bw_txg > txg) fatal(0, "future leak: got %llx, open txg is %llx", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) fatal(0, "wrong index: got %llx, wanted %llx+%llx", pack->bw_index, n, i); if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); if (freeit) { bzero(pack, sizeof (bufwad_t)); } else { pack->bw_index = n + i; pack->bw_txg = txg; pack->bw_data = 1 + ztest_random(-2ULL); } *bigH = *pack; *bigT = *pack; } /* * We've verified all the old bufwads, and made new ones. * Now write them out. */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (freeit) { if (ztest_opts.zo_verbose >= 7) { (void) printf("freeing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); } else { if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); } dmu_tx_commit(tx); /* * Sanity check the stuff we just wrote. */ { void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY(0 == dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); VERIFY(0 == dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT(bcmp(packbuf, packcheck, packsize) == 0); ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); } umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); } void compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) { uint64_t i; bufwad_t *pack; bufwad_t *bigH; bufwad_t *bigT; /* * For each index from n to n + s, verify that the existing bufwad * in packobj matches the bufwads at the head and tail of the * corresponding chunk in bigobj. Then update all three bufwads * with the new values we want to write out. */ for (i = 0; i < s; i++) { /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); if (pack->bw_txg > txg) fatal(0, "future leak: got %llx, open txg is %llx", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) fatal(0, "wrong index: got %llx, wanted %llx+%llx", pack->bw_index, n, i); if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); pack->bw_index = n + i; pack->bw_txg = txg; pack->bw_data = 1 + ztest_random(-2ULL); *bigH = *pack; *bigT = *pack; } } void ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t od[2]; dmu_tx_t *tx; uint64_t i; int error; uint64_t n, s, txg; bufwad_t *packbuf, *bigbuf; uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; uint64_t blocksize = ztest_random_blocksize(); uint64_t chunksize = blocksize; uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 9; dmu_buf_t *bonus_db; arc_buf_t **bigbuf_arcbufs; dmu_object_info_t doi; /* * This test uses two objects, packobj and bigobj, that are always * updated together (i.e. in the same tx) so that their contents are * in sync and can be compared. Their contents relate to each other * in a simple way: packobj is a dense array of 'bufwad' structures, * while bigobj is a sparse array of the same bufwads. Specifically, * for any index n, there are three bufwads that should be identical: * * packobj, at offset n * sizeof (bufwad_t) * bigobj, at the head of the nth chunk * bigobj, at the tail of the nth chunk * * The chunk size is set equal to bigobj block size so that * dmu_assign_arcbuf() can be tested for object updates. */ /* * Read the directory info. If it's the first time, set things up. */ - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, - 0, 0); - ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, - chunksize); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); + ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; bigobj = od[0].od_object; packobj = od[1].od_object; blocksize = od[0].od_blocksize; chunksize = blocksize; ASSERT(chunksize == od[1].od_gen); VERIFY(dmu_object_info(os, bigobj, &doi) == 0); VERIFY(ISP2(doi.doi_data_block_size)); VERIFY(chunksize == doi.doi_data_block_size); VERIFY(chunksize >= 2 * sizeof (bufwad_t)); /* * Pick a random index and compute the offsets into packobj and bigobj. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(width - 1); packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); bigoff = n * chunksize; bigsize = s * chunksize; packbuf = umem_zalloc(packsize, UMEM_NOFAIL); bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); /* * Iteration 0 test zcopy for DB_UNCACHED dbufs. * Iteration 1 test zcopy to already referenced dbufs. * Iteration 2 test zcopy to dirty dbuf in the same txg. * Iteration 3 test zcopy to dbuf dirty in previous txg. * Iteration 4 test zcopy when dbuf is no longer dirty. * Iteration 5 test zcopy when it can't be done. * Iteration 6 one more zcopy write. */ for (i = 0; i < 7; i++) { uint64_t j; uint64_t off; /* * In iteration 5 (i == 5) use arcbufs * that don't match bigobj blksz to test * dmu_assign_arcbuf() when it can't directly * assign an arcbuf to a dbuf. */ for (j = 0; j < s; j++) { if (i != 5) { bigbuf_arcbufs[j] = dmu_request_arcbuf(bonus_db, chunksize); } else { bigbuf_arcbufs[2 * j] = dmu_request_arcbuf(bonus_db, chunksize / 2); bigbuf_arcbufs[2 * j + 1] = dmu_request_arcbuf(bonus_db, chunksize / 2); } } /* * Get a tx for the mods to both packobj and bigobj. */ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, packobj, packoff, packsize); dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); for (j = 0; j < s; j++) { if (i != 5) { dmu_return_arcbuf(bigbuf_arcbufs[j]); } else { dmu_return_arcbuf( bigbuf_arcbufs[2 * j]); dmu_return_arcbuf( bigbuf_arcbufs[2 * j + 1]); } } umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); dmu_buf_rele(bonus_db, FTAG); return; } /* * 50% of the time don't read objects in the 1st iteration to * test dmu_assign_arcbuf() for the case when there're no * existing dbufs for the specified offsets. */ if (i != 0 || ztest_random(2) != 0) { error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); ASSERT0(error); } compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, n, chunksize, txg); /* * We've verified all the old bufwads, and made new ones. * Now write them out. */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; if (i != 5) { bcopy((caddr_t)bigbuf + (off - bigoff), bigbuf_arcbufs[j]->b_data, chunksize); } else { bcopy((caddr_t)bigbuf + (off - bigoff), bigbuf_arcbufs[2 * j]->b_data, chunksize / 2); bcopy((caddr_t)bigbuf + (off - bigoff) + chunksize / 2, bigbuf_arcbufs[2 * j + 1]->b_data, chunksize / 2); } if (i == 1) { VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } if (i != 5) { dmu_assign_arcbuf(bonus_db, off, bigbuf_arcbufs[j], tx); } else { dmu_assign_arcbuf(bonus_db, off, bigbuf_arcbufs[2 * j], tx); dmu_assign_arcbuf(bonus_db, off + chunksize / 2, bigbuf_arcbufs[2 * j + 1], tx); } if (i == 1) { dmu_buf_rele(dbt, FTAG); } } dmu_tx_commit(tx); /* * Sanity check the stuff we just wrote. */ { void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY(0 == dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); VERIFY(0 == dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT(bcmp(packbuf, packcheck, packsize) == 0); ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); } if (i == 2) { txg_wait_open(dmu_objset_pool(os), 0); } else if (i == 3) { txg_wait_synced(dmu_objset_pool(os), 0); } } dmu_buf_rele(bonus_db, FTAG); umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); } /* ARGSUSED */ void ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) { ztest_od_t od[1]; uint64_t offset = (1ULL << (ztest_random(20) + 43)) + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); /* * Have multiple threads write to large offsets in an object * to verify that parallel writes to an object -- even to the * same blocks within the object -- doesn't cause any trouble. */ - ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, - 0, 0, 0); + ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; while (ztest_random(10) != 0) ztest_io(zd, od[0].od_object, offset); } void ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) { ztest_od_t od[1]; uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); uint64_t count = ztest_random(20) + 1; uint64_t blocksize = ztest_random_blocksize(); void *data; - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, - 0, 0); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0) return; ztest_prealloc(zd, od[0].od_object, offset, count * blocksize); data = umem_zalloc(blocksize, UMEM_NOFAIL); while (ztest_random(count) != 0) { uint64_t randoff = offset + (ztest_random(count) * blocksize); if (ztest_write(zd, od[0].od_object, randoff, blocksize, data) != 0) break; while (ztest_random(4) != 0) ztest_io(zd, od[0].od_object, randoff); } umem_free(data, blocksize); } /* * Verify that zap_{create,destroy,add,remove,update} work as expected. */ #define ZTEST_ZAP_MIN_INTS 1 #define ZTEST_ZAP_MAX_INTS 4 #define ZTEST_ZAP_MAX_PROPS 1000 void ztest_zap(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t od[1]; uint64_t object; uint64_t txg, last_txg; uint64_t value[ZTEST_ZAP_MAX_INTS]; uint64_t zl_ints, zl_intsize, prop; int i, ints; dmu_tx_t *tx; char propname[100], txgname[100]; int error; char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; object = od[0].od_object; /* * Generate a known hash collision, and verify that * we can lookup and remove both entries. */ tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) return; for (i = 0; i < 2; i++) { value[i] = i; VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); } for (i = 0; i < 2; i++) { VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); VERIFY3U(0, ==, zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); } for (i = 0; i < 2; i++) { VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); } dmu_tx_commit(tx); /* * Generate a buch of random entries. */ ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); prop = ztest_random(ZTEST_ZAP_MAX_PROPS); (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); bzero(value, sizeof (value)); last_txg = 0; /* * If these zap entries already exist, validate their contents. */ error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); if (error == 0) { ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); VERIFY(zap_lookup(os, object, txgname, zl_intsize, zl_ints, &last_txg) == 0); VERIFY(zap_length(os, object, propname, &zl_intsize, &zl_ints) == 0); ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, ints); VERIFY(zap_lookup(os, object, propname, zl_intsize, zl_ints, value) == 0); for (i = 0; i < ints; i++) { ASSERT3U(value[i], ==, last_txg + object + i); } } else { ASSERT3U(error, ==, ENOENT); } /* * Atomically update two entries in our zap object. * The first is named txg_%llu, and contains the txg * in which the property was last updated. The second * is named prop_%llu, and the nth element of its value * should be txg + object + n. */ tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) return; if (last_txg > txg) fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); for (i = 0; i < ints; i++) value[i] = txg + object + i; VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx)); VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), ints, value, tx)); dmu_tx_commit(tx); /* * Remove a random pair of entries. */ prop = ztest_random(ZTEST_ZAP_MAX_PROPS); (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); if (error == ENOENT) return; ASSERT0(error); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) return; VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); dmu_tx_commit(tx); } /* * Testcase to test the upgrading of a microzap to fatzap. */ void ztest_fzap(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t od[1]; uint64_t object, txg; ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; object = od[0].od_object; /* * Add entries to this ZAP and make sure it spills over * and gets upgraded to a fatzap. Also, since we are adding * 2050 entries we should see ptrtbl growth and leaf-block split. */ for (int i = 0; i < 2050; i++) { char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t value = i; dmu_tx_t *tx; int error; (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", id, value); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, name); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) return; error = zap_add(os, object, name, sizeof (uint64_t), 1, &value, tx); ASSERT(error == 0 || error == EEXIST); dmu_tx_commit(tx); } } /* ARGSUSED */ void ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t od[1]; uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; dmu_tx_t *tx; int i, namelen, error; int micro = ztest_random(2); char name[20], string_value[20]; void *data; - ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, - 0, 0, 0); + ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; object = od[0].od_object; /* * Generate a random name of the form 'xxx.....' where each * x is a random printable character and the dots are dots. * There are 94 such characters, and the name length goes from * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. */ namelen = ztest_random(sizeof (name) - 5) + 5 + 1; for (i = 0; i < 3; i++) name[i] = '!' + ztest_random('~' - '!' + 1); for (; i < namelen - 1; i++) name[i] = '.'; name[i] = '\0'; if ((namelen & 1) || micro) { wsize = sizeof (txg); wc = 1; data = &txg; } else { wsize = 1; wc = namelen; data = string_value; } count = -1ULL; VERIFY0(zap_count(os, object, &count)); ASSERT(count != -1ULL); /* * Select an operation: length, lookup, add, update, remove. */ i = ztest_random(5); if (i >= 2) { tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) return; bcopy(name, string_value, namelen); } else { tx = NULL; txg = 0; bzero(string_value, namelen); } switch (i) { case 0: error = zap_length(os, object, name, &zl_wsize, &zl_wc); if (error == 0) { ASSERT3U(wsize, ==, zl_wsize); ASSERT3U(wc, ==, zl_wc); } else { ASSERT3U(error, ==, ENOENT); } break; case 1: error = zap_lookup(os, object, name, wsize, wc, data); if (error == 0) { if (data == string_value && bcmp(name, data, namelen) != 0) fatal(0, "name '%s' != val '%s' len %d", name, data, namelen); } else { ASSERT3U(error, ==, ENOENT); } break; case 2: error = zap_add(os, object, name, wsize, wc, data, tx); ASSERT(error == 0 || error == EEXIST); break; case 3: VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); break; case 4: error = zap_remove(os, object, name, tx); ASSERT(error == 0 || error == ENOENT); break; } if (tx != NULL) dmu_tx_commit(tx); } /* * Commit callback data. */ typedef struct ztest_cb_data { list_node_t zcd_node; uint64_t zcd_txg; int zcd_expected_err; boolean_t zcd_added; boolean_t zcd_called; spa_t *zcd_spa; } ztest_cb_data_t; /* This is the actual commit callback function */ static void ztest_commit_callback(void *arg, int error) { ztest_cb_data_t *data = arg; uint64_t synced_txg; VERIFY(data != NULL); VERIFY3S(data->zcd_expected_err, ==, error); VERIFY(!data->zcd_called); synced_txg = spa_last_synced_txg(data->zcd_spa); if (data->zcd_txg > synced_txg) fatal(0, "commit callback of txg %" PRIu64 " called prematurely" ", last synced txg = %" PRIu64 "\n", data->zcd_txg, synced_txg); data->zcd_called = B_TRUE; if (error == ECANCELED) { ASSERT0(data->zcd_txg); ASSERT(!data->zcd_added); /* * The private callback data should be destroyed here, but * since we are going to check the zcd_called field after * dmu_tx_abort(), we will destroy it there. */ return; } /* Was this callback added to the global callback list? */ if (!data->zcd_added) goto out; ASSERT3U(data->zcd_txg, !=, 0); /* Remove our callback from the list */ mutex_enter(&zcl.zcl_callbacks_lock); list_remove(&zcl.zcl_callbacks, data); mutex_exit(&zcl.zcl_callbacks_lock); out: umem_free(data, sizeof (ztest_cb_data_t)); } /* Allocate and initialize callback data structure */ static ztest_cb_data_t * ztest_create_cb_data(objset_t *os, uint64_t txg) { ztest_cb_data_t *cb_data; cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); cb_data->zcd_txg = txg; cb_data->zcd_spa = dmu_objset_spa(os); return (cb_data); } /* * If a number of txgs equal to this threshold have been created after a commit * callback has been registered but not called, then we assume there is an * implementation bug. */ #define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2) /* * Commit callback test. */ void ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t od[1]; dmu_tx_t *tx; ztest_cb_data_t *cb_data[3], *tmp_cb; uint64_t old_txg, txg; int i, error; ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; tx = dmu_tx_create(os); cb_data[0] = ztest_create_cb_data(os, 0); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t)); /* Every once in a while, abort the transaction on purpose */ if (ztest_random(100) == 0) error = -1; if (!error) error = dmu_tx_assign(tx, TXG_NOWAIT); txg = error ? 0 : dmu_tx_get_txg(tx); cb_data[0]->zcd_txg = txg; cb_data[1] = ztest_create_cb_data(os, txg); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); if (error) { /* * It's not a strict requirement to call the registered * callbacks from inside dmu_tx_abort(), but that's what * it's supposed to happen in the current implementation * so we will check for that. */ for (i = 0; i < 2; i++) { cb_data[i]->zcd_expected_err = ECANCELED; VERIFY(!cb_data[i]->zcd_called); } dmu_tx_abort(tx); for (i = 0; i < 2; i++) { VERIFY(cb_data[i]->zcd_called); umem_free(cb_data[i], sizeof (ztest_cb_data_t)); } return; } cb_data[2] = ztest_create_cb_data(os, txg); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); /* * Read existing data to make sure there isn't a future leak. */ VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t), &old_txg, DMU_READ_PREFETCH)); if (old_txg > txg) fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, old_txg, txg); dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx); mutex_enter(&zcl.zcl_callbacks_lock); /* * Since commit callbacks don't have any ordering requirement and since * it is theoretically possible for a commit callback to be called * after an arbitrary amount of time has elapsed since its txg has been * synced, it is difficult to reliably determine whether a commit * callback hasn't been called due to high load or due to a flawed * implementation. * * In practice, we will assume that if after a certain number of txgs a * commit callback hasn't been called, then most likely there's an * implementation bug.. */ tmp_cb = list_head(&zcl.zcl_callbacks); if (tmp_cb != NULL && (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) { fatal(0, "Commit callback threshold exceeded, oldest txg: %" PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); } /* * Let's find the place to insert our callbacks. * * Even though the list is ordered by txg, it is possible for the * insertion point to not be the end because our txg may already be * quiescing at this point and other callbacks in the open txg * (from other objsets) may have sneaked in. */ tmp_cb = list_tail(&zcl.zcl_callbacks); while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); /* Add the 3 callbacks to the list */ for (i = 0; i < 3; i++) { if (tmp_cb == NULL) list_insert_head(&zcl.zcl_callbacks, cb_data[i]); else list_insert_after(&zcl.zcl_callbacks, tmp_cb, cb_data[i]); cb_data[i]->zcd_added = B_TRUE; VERIFY(!cb_data[i]->zcd_called); tmp_cb = cb_data[i]; } mutex_exit(&zcl.zcl_callbacks_lock); dmu_tx_commit(tx); } /* * Visit each object in the dataset. Verify that its properties * are consistent what was stored in the block tag when it was created, * and that its unused bonus buffer space has not been overwritten. */ void ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; uint64_t obj; int err = 0; for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { ztest_block_tag_t *bt = NULL; dmu_object_info_t doi; dmu_buf_t *db; if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) continue; dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_size >= sizeof (*bt)) bt = ztest_bt_bonus(db); if (bt && bt->bt_magic == BT_MAGIC) { ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, bt->bt_offset, bt->bt_gen, bt->bt_txg, bt->bt_crtxg); ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); } dmu_buf_rele(db, FTAG); } } /* ARGSUSED */ void ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) { zfs_prop_t proplist[] = { ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_COPIES, ZFS_PROP_DEDUP }; rw_enter(&ztest_name_lock, RW_READER); for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); rw_exit(&ztest_name_lock); } /* ARGSUSED */ void ztest_remap_blocks(ztest_ds_t *zd, uint64_t id) { rw_enter(&ztest_name_lock, RW_READER); int error = dmu_objset_remap_indirects(zd->zd_name); if (error == ENOSPC) error = 0; ASSERT0(error); rw_exit(&ztest_name_lock); } /* ARGSUSED */ void ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) { nvlist_t *props = NULL; rw_enter(&ztest_name_lock, RW_READER); (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO, ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); VERIFY0(spa_prop_get(ztest_spa, &props)); if (ztest_opts.zo_verbose >= 6) dump_nvlist(props, 4); nvlist_free(props); rw_exit(&ztest_name_lock); } static int user_release_one(const char *snapname, const char *holdname) { nvlist_t *snaps, *holds; int error; snaps = fnvlist_alloc(); holds = fnvlist_alloc(); fnvlist_add_boolean(holds, holdname); fnvlist_add_nvlist(snaps, snapname, holds); fnvlist_free(holds); error = dsl_dataset_user_release(snaps, NULL); fnvlist_free(snaps); return (error); } /* * Test snapshot hold/release and deferred destroy. */ void ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) { int error; objset_t *os = zd->zd_os; objset_t *origin; char snapname[100]; char fullname[100]; char clonename[100]; char tag[100]; char osname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *holds; rw_enter(&ztest_name_lock, RW_READER); dmu_objset_name(os, osname); (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id); (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%llu", osname, id); (void) snprintf(tag, sizeof (tag), "tag_%llu", id); /* * Clean up from any previous run. */ error = dsl_destroy_head(clonename); if (error != ENOENT) ASSERT0(error); error = user_release_one(fullname, tag); if (error != ESRCH && error != ENOENT) ASSERT0(error); error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != ENOENT) ASSERT0(error); /* * Create snapshot, clone it, mark snap for deferred destroy, * destroy clone, verify snap was also destroyed. */ error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); goto out; } fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); } error = dmu_objset_clone(clonename, fullname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_clone"); goto out; } fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = dsl_destroy_head(clonename); if (error) fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); error = dmu_objset_hold(fullname, FTAG, &origin); if (error != ENOENT) fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); /* * Create snapshot, add temporary hold, verify that we can't * destroy a held snapshot, mark for deferred destroy, * release hold, verify snapshot was destroyed. */ error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); goto out; } fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); } holds = fnvlist_alloc(); fnvlist_add_string(holds, fullname, tag); error = dsl_dataset_user_hold(holds, 0, NULL); fnvlist_free(holds); if (error == ENOSPC) { ztest_record_enospc("dsl_dataset_user_hold"); goto out; } else if (error) { fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", fullname, tag, error); } error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != EBUSY) { fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", fullname, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = user_release_one(fullname, tag); if (error) fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); out: rw_exit(&ztest_name_lock); } /* * Inject random faults into the on-disk data. */ /* ARGSUSED */ void ztest_fault_inject(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; int fd; uint64_t offset; uint64_t leaves; uint64_t bad = 0x1990c0ffeedecadeULL; uint64_t top, leaf; char path0[MAXPATHLEN]; char pathrand[MAXPATHLEN]; size_t fsize; int bshift = SPA_MAXBLOCKSHIFT + 2; int iters = 1000; int maxfaults; int mirror_save; vdev_t *vd0 = NULL; uint64_t guid0 = 0; boolean_t islog = B_FALSE; mutex_enter(&ztest_vdev_lock); /* * Device removal is in progress, fault injection must be disabled * until it completes and the pool is scrubbed. The fault injection * strategy for damaging blocks does not take in to account evacuated * blocks which may have already been damaged. */ if (ztest_device_removal_active) { mutex_exit(&ztest_vdev_lock); return; } maxfaults = MAXFAULTS(); leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; mirror_save = zs->zs_mirrors; mutex_exit(&ztest_vdev_lock); ASSERT(leaves >= 1); /* * Grab the name lock as reader. There are some operations * which don't like to have their vdevs changed while * they are in progress (i.e. spa_change_guid). Those * operations will have grabbed the name lock as writer. */ rw_enter(&ztest_name_lock, RW_READER); /* * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (ztest_random(2) == 0) { /* * Inject errors on a normal data device or slog device. */ top = ztest_random_vdev_top(spa, B_TRUE); leaf = ztest_random(leaves) + zs->zs_splits; /* * Generate paths to the first leaf in this top-level vdev, * and to the random leaf we selected. We'll induce transient * write failures and random online/offline activity on leaf 0, * and we'll write random garbage to the randomly chosen leaf. */ (void) snprintf(path0, sizeof (path0), ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + zs->zs_splits); (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + leaf); vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); if (vd0 != NULL && vd0->vdev_top->vdev_islog) islog = B_TRUE; /* * If the top-level vdev needs to be resilvered * then we only allow faults on the device that is * resilvering. */ if (vd0 != NULL && maxfaults != 1 && (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || vd0->vdev_resilver_txg != 0)) { /* * Make vd0 explicitly claim to be unreadable, * or unwriteable, or reach behind its back * and close the underlying fd. We can do this if * maxfaults == 0 because we'll fail and reexecute, * and we can do it if maxfaults >= 2 because we'll * have enough redundancy. If maxfaults == 1, the * combination of this with injection of random data * corruption below exceeds the pool's fault tolerance. */ vdev_file_t *vf = vd0->vdev_tsd; zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", (long long)vd0->vdev_id, (int)maxfaults); if (vf != NULL && ztest_random(3) == 0) { (void) close(vf->vf_vnode->v_fd); vf->vf_vnode->v_fd = -1; } else if (ztest_random(2) == 0) { vd0->vdev_cant_read = B_TRUE; } else { vd0->vdev_cant_write = B_TRUE; } guid0 = vd0->vdev_guid; } } else { /* * Inject errors on an l2cache device. */ spa_aux_vdev_t *sav = &spa->spa_l2cache; if (sav->sav_count == 0) { spa_config_exit(spa, SCL_STATE, FTAG); rw_exit(&ztest_name_lock); return; } vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; guid0 = vd0->vdev_guid; (void) strcpy(path0, vd0->vdev_path); (void) strcpy(pathrand, vd0->vdev_path); leaf = 0; leaves = 1; maxfaults = INT_MAX; /* no limit on cache devices */ } spa_config_exit(spa, SCL_STATE, FTAG); rw_exit(&ztest_name_lock); /* * If we can tolerate two or more faults, or we're dealing * with a slog, randomly online/offline vd0. */ if ((maxfaults >= 2 || islog) && guid0 != 0) { if (ztest_random(10) < 6) { int flags = (ztest_random(2) == 0 ? ZFS_OFFLINE_TEMPORARY : 0); /* * We have to grab the zs_name_lock as writer to * prevent a race between offlining a slog and * destroying a dataset. Offlining the slog will * grab a reference on the dataset which may cause * dmu_objset_destroy() to fail with EBUSY thus * leaving the dataset in an inconsistent state. */ if (islog) rw_enter(&ztest_name_lock, RW_WRITER); VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); if (islog) rw_exit(&ztest_name_lock); } else { /* * Ideally we would like to be able to randomly * call vdev_[on|off]line without holding locks * to force unpredictable failures but the side * effects of vdev_[on|off]line prevent us from * doing so. We grab the ztest_vdev_lock here to * prevent a race between injection testing and * aux_vdev removal. */ mutex_enter(&ztest_vdev_lock); (void) vdev_online(spa, guid0, 0, NULL); mutex_exit(&ztest_vdev_lock); } } if (maxfaults == 0) return; /* * We have at least single-fault tolerance, so inject data corruption. */ fd = open(pathrand, O_RDWR); if (fd == -1) /* we hit a gap in the device namespace */ return; fsize = lseek(fd, 0, SEEK_END); while (--iters != 0) { /* * The offset must be chosen carefully to ensure that * we do not inject a given logical block with errors * on two different leaf devices, because ZFS can not * tolerate that (if maxfaults==1). * * We divide each leaf into chunks of size * (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk * there is a series of ranges to which we can inject errors. * Each range can accept errors on only a single leaf vdev. * The error injection ranges are separated by ranges * which we will not inject errors on any device (DMZs). * Each DMZ must be large enough such that a single block * can not straddle it, so that a single block can not be * a target in two different injection ranges (on different * leaf vdevs). * * For example, with 3 leaves, each chunk looks like: * 0 to 32M: injection range for leaf 0 * 32M to 64M: DMZ - no injection allowed * 64M to 96M: injection range for leaf 1 * 96M to 128M: DMZ - no injection allowed * 128M to 160M: injection range for leaf 2 * 160M to 192M: DMZ - no injection allowed */ offset = ztest_random(fsize / (leaves << bshift)) * (leaves << bshift) + (leaf << bshift) + (ztest_random(1ULL << (bshift - 1)) & -8ULL); /* * Only allow damage to the labels at one end of the vdev. * * If all labels are damaged, the device will be totally * inaccessible, which will result in loss of data, * because we also damage (parts of) the other side of * the mirror/raidz. * * Additionally, we will always have both an even and an * odd label, so that we can handle crashes in the * middle of vdev_config_sync(). */ if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) continue; /* * The two end labels are stored at the "end" of the disk, but * the end of the disk (vdev_psize) is aligned to * sizeof (vdev_label_t). */ uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); if ((leaf & 1) == 1 && offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; mutex_enter(&ztest_vdev_lock); if (mirror_save != zs->zs_mirrors) { mutex_exit(&ztest_vdev_lock); (void) close(fd); return; } if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) fatal(1, "can't inject bad word at 0x%llx in %s", offset, pathrand); mutex_exit(&ztest_vdev_lock); if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," " offset 0x%llx\n", pathrand, (u_longlong_t)offset); } (void) close(fd); } /* * Verify that DDT repair works as expected. */ void ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; objset_t *os = zd->zd_os; ztest_od_t od[1]; uint64_t object, blocksize, txg, pattern, psize; enum zio_checksum checksum = spa_dedup_checksum(spa); dmu_buf_t *db; dmu_tx_t *tx; abd_t *abd; blkptr_t blk; int copies = 2 * ZIO_DEDUPDITTO_MIN; blocksize = ztest_random_blocksize(); blocksize = MIN(blocksize, 2048); /* because we write so many */ - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, - 0, 0); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; /* * Take the name lock as writer to prevent anyone else from changing * the pool and dataset properies we need to maintain during this test. */ rw_enter(&ztest_name_lock, RW_WRITER); if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, B_FALSE) != 0 || ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, B_FALSE) != 0) { rw_exit(&ztest_name_lock); return; } dmu_objset_stats_t dds; dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); object = od[0].od_object; blocksize = od[0].od_blocksize; pattern = zs->zs_guid ^ dds.dds_guid; ASSERT(object != 0); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, object, 0, copies * blocksize); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { rw_exit(&ztest_name_lock); return; } /* * Write all the copies of our block. */ for (int i = 0; i < copies; i++) { uint64_t offset = i * blocksize; int error = dmu_buf_hold(os, object, offset, FTAG, &db, DMU_READ_NO_PREFETCH); if (error != 0) { fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u", os, (long long)object, (long long) offset, error); } ASSERT(db->db_offset == offset); ASSERT(db->db_size == blocksize); ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || ztest_pattern_match(db->db_data, db->db_size, 0ULL)); dmu_buf_will_fill(db, tx); ztest_pattern_set(db->db_data, db->db_size, pattern); dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); txg_wait_synced(spa_get_dsl(spa), txg); /* * Find out what block we got. */ VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); blk = *((dmu_buf_impl_t *)db)->db_blkptr; dmu_buf_rele(db, FTAG); /* * Damage the block. Dedup-ditto will save us when we read it later. */ psize = BP_GET_PSIZE(&blk); abd = abd_alloc_linear(psize, B_TRUE); ztest_pattern_set(abd_to_buf(abd), psize, ~pattern); (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); abd_free(abd); rw_exit(&ztest_name_lock); } /* * Scrub the pool. */ /* ARGSUSED */ void ztest_scrub(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; /* * Scrub in progress by device removal. */ if (ztest_device_removal_active) return; (void) spa_scan(spa, POOL_SCAN_SCRUB); (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ (void) spa_scan(spa, POOL_SCAN_SCRUB); } /* * Change the guid for the pool. */ /* ARGSUSED */ void ztest_reguid(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; uint64_t orig, load; int error; orig = spa_guid(spa); load = spa_load_guid(spa); rw_enter(&ztest_name_lock, RW_WRITER); error = spa_change_guid(spa); rw_exit(&ztest_name_lock); if (error != 0) return; if (ztest_opts.zo_verbose >= 4) { (void) printf("Changed guid old %llu -> %llu\n", (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); } VERIFY3U(orig, !=, spa_guid(spa)); VERIFY3U(load, ==, spa_load_guid(spa)); } static vdev_t * ztest_random_concrete_vdev_leaf(vdev_t *vd) { if (vd == NULL) return (NULL); if (vd->vdev_children == 0) return (vd); vdev_t *eligible[vd->vdev_children]; int eligible_idx = 0, i; for (i = 0; i < vd->vdev_children; i++) { vdev_t *cvd = vd->vdev_child[i]; if (cvd->vdev_top->vdev_removing) continue; if (cvd->vdev_children > 0 || (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { eligible[eligible_idx++] = cvd; } } VERIFY(eligible_idx > 0); uint64_t child_no = ztest_random(eligible_idx); return (ztest_random_concrete_vdev_leaf(eligible[child_no])); } /* ARGSUSED */ void ztest_initialize(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; int error = 0; mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* Random leaf vdev */ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); if (rand_vd == NULL) { spa_config_exit(spa, SCL_VDEV, FTAG); mutex_exit(&ztest_vdev_lock); return; } /* * The random vdev we've selected may change as soon as we * drop the spa_config_lock. We create local copies of things * we're interested in. */ uint64_t guid = rand_vd->vdev_guid; char *path = strdup(rand_vd->vdev_path); boolean_t active = rand_vd->vdev_initialize_thread != NULL; zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); spa_config_exit(spa, SCL_VDEV, FTAG); uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); error = spa_vdev_initialize(spa, guid, cmd); switch (cmd) { case POOL_INITIALIZE_CANCEL: if (ztest_opts.zo_verbose >= 4) { (void) printf("Cancel initialize %s", path); if (!active) (void) printf(" failed (no initialize active)"); (void) printf("\n"); } break; case POOL_INITIALIZE_DO: if (ztest_opts.zo_verbose >= 4) { (void) printf("Start initialize %s", path); if (active && error == 0) (void) printf(" failed (already active)"); else if (error != 0) (void) printf(" failed (error %d)", error); (void) printf("\n"); } break; case POOL_INITIALIZE_SUSPEND: if (ztest_opts.zo_verbose >= 4) { (void) printf("Suspend initialize %s", path); if (!active) (void) printf(" failed (no initialize active)"); (void) printf("\n"); } break; } free(path); mutex_exit(&ztest_vdev_lock); } /* * Verify pool integrity by running zdb. */ static void ztest_run_zdb(char *pool) { int status; char zdb[MAXPATHLEN + MAXNAMELEN + 20]; char zbuf[1024]; char *bin; char *ztest; char *isa; int isalen; FILE *fp; strlcpy(zdb, "/usr/bin/ztest", sizeof(zdb)); /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */ bin = strstr(zdb, "/usr/bin/"); ztest = strstr(bin, "/ztest"); isa = bin + 8; isalen = ztest - isa; isa = strdup(isa); /* LINTED */ (void) sprintf(bin, "/usr/sbin%.*s/zdb -bcc%s%s -G -d -U %s %s", isalen, isa, ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 4 ? "v" : "", spa_config_path, pool); free(isa); if (ztest_opts.zo_verbose >= 5) (void) printf("Executing %s\n", strstr(zdb, "zdb ")); fp = popen(zdb, "r"); assert(fp != NULL); while (fgets(zbuf, sizeof (zbuf), fp) != NULL) if (ztest_opts.zo_verbose >= 3) (void) printf("%s", zbuf); status = pclose(fp); if (status == 0) return; ztest_dump_core = 0; if (WIFEXITED(status)) fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); else fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); } static void ztest_walk_pool_directory(char *header) { spa_t *spa = NULL; if (ztest_opts.zo_verbose >= 6) (void) printf("%s\n", header); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) if (ztest_opts.zo_verbose >= 6) (void) printf("\t%s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); } static void ztest_spa_import_export(char *oldname, char *newname) { nvlist_t *config, *newconfig; uint64_t pool_guid; spa_t *spa; int error; if (ztest_opts.zo_verbose >= 4) { (void) printf("import/export: old = %s, new = %s\n", oldname, newname); } /* * Clean up from previous runs. */ (void) spa_destroy(newname); /* * Get the pool's configuration and guid. */ VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); /* * Kick off a scrub to tickle scrub/export races. */ if (ztest_random(2) == 0) (void) spa_scan(spa, POOL_SCAN_SCRUB); pool_guid = spa_guid(spa); spa_close(spa, FTAG); ztest_walk_pool_directory("pools before export"); /* * Export it. */ VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); ztest_walk_pool_directory("pools after export"); /* * Try to import it. */ newconfig = spa_tryimport(config); ASSERT(newconfig != NULL); nvlist_free(newconfig); /* * Import it under the new name. */ error = spa_import(newname, config, NULL, 0); if (error != 0) { dump_nvlist(config, 0); fatal(B_FALSE, "couldn't import pool %s as %s: error %u", oldname, newname, error); } ztest_walk_pool_directory("pools after import"); /* * Try to import it again -- should fail with EEXIST. */ VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); /* * Try to import it under a different name -- should fail with EEXIST. */ VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); /* * Verify that the pool is no longer visible under the old name. */ VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); /* * Verify that we can open and close the pool using the new name. */ VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); ASSERT(pool_guid == spa_guid(spa)); spa_close(spa, FTAG); nvlist_free(config); } static void ztest_resume(spa_t *spa) { if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) (void) printf("resuming from suspended state\n"); spa_vdev_state_enter(spa, SCL_NONE); vdev_clear(spa, NULL); (void) spa_vdev_state_exit(spa, NULL, 0); (void) zio_resume(spa); } static void * ztest_resume_thread(void *arg) { spa_t *spa = arg; while (!ztest_exiting) { if (spa_suspended(spa)) ztest_resume(spa); (void) poll(NULL, 0, 100); /* * Periodically change the zfs_compressed_arc_enabled setting. */ if (ztest_random(10) == 0) zfs_compressed_arc_enabled = ztest_random(2); /* * Periodically change the zfs_abd_scatter_enabled setting. */ if (ztest_random(10) == 0) zfs_abd_scatter_enabled = ztest_random(2); } return (NULL); } static void * ztest_deadman_thread(void *arg) { ztest_shared_t *zs = arg; spa_t *spa = ztest_spa; hrtime_t delta, total = 0; for (;;) { delta = zs->zs_thread_stop - zs->zs_thread_start + MSEC2NSEC(zfs_deadman_synctime_ms); (void) poll(NULL, 0, (int)NSEC2MSEC(delta)); /* * If the pool is suspended then fail immediately. Otherwise, * check to see if the pool is making any progress. If * vdev_deadman() discovers that there hasn't been any recent * I/Os then it will end up aborting the tests. */ if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { fatal(0, "aborting test after %llu seconds because " "pool has transitioned to a suspended state.", zfs_deadman_synctime_ms / 1000); return (NULL); } vdev_deadman(spa->spa_root_vdev); total += zfs_deadman_synctime_ms/1000; (void) printf("ztest has been running for %lld seconds\n", total); } } static void ztest_execute(int test, ztest_info_t *zi, uint64_t id) { ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); hrtime_t functime = gethrtime(); for (int i = 0; i < zi->zi_iters; i++) zi->zi_func(zd, id); functime = gethrtime() - functime; atomic_add_64(&zc->zc_count, 1); atomic_add_64(&zc->zc_time, functime); if (ztest_opts.zo_verbose >= 4) { Dl_info dli; (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%6.2f sec in %s\n", (double)functime / NANOSEC, dli.dli_sname); } } static void * ztest_thread(void *arg) { int rand; uint64_t id = (uintptr_t)arg; ztest_shared_t *zs = ztest_shared; uint64_t call_next; hrtime_t now; ztest_info_t *zi; ztest_shared_callstate_t *zc; while ((now = gethrtime()) < zs->zs_thread_stop) { /* * See if it's time to force a crash. */ if (now > zs->zs_thread_kill) ztest_kill(zs); /* * If we're getting ENOSPC with some regularity, stop. */ if (zs->zs_enospc_count > 10) break; /* * Pick a random function to execute. */ rand = ztest_random(ZTEST_FUNCS); zi = &ztest_info[rand]; zc = ZTEST_GET_SHARED_CALLSTATE(rand); call_next = zc->zc_next; if (now >= call_next && atomic_cas_64(&zc->zc_next, call_next, call_next + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { ztest_execute(rand, zi, id); } } return (NULL); } static void ztest_dataset_name(char *dsname, char *pool, int d) { (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); } static void ztest_dataset_destroy(int d) { char name[ZFS_MAX_DATASET_NAME_LEN]; ztest_dataset_name(name, ztest_opts.zo_pool, d); if (ztest_opts.zo_verbose >= 3) (void) printf("Destroying %s to free up space\n", name); /* * Cleanup any non-standard clones and snapshots. In general, * ztest thread t operates on dataset (t % zopt_datasets), * so there may be more than one thing to clean up. */ for (int t = d; t < ztest_opts.zo_threads; t += ztest_opts.zo_datasets) { ztest_dsl_dataset_cleanup(name, t); } (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); } static void ztest_dataset_dirobj_verify(ztest_ds_t *zd) { uint64_t usedobjs, dirobjs, scratch; /* * ZTEST_DIROBJ is the object directory for the entire dataset. * Therefore, the number of objects in use should equal the * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. * If not, we have an object leak. * * Note that we can only check this in ztest_dataset_open(), * when the open-context and syncing-context values agree. * That's because zap_count() returns the open-context value, * while dmu_objset_space() returns the rootbp fill count. */ VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); ASSERT3U(dirobjs + 1, ==, usedobjs); } static int ztest_dataset_open(int d) { ztest_ds_t *zd = &ztest_ds[d]; uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; objset_t *os; zilog_t *zilog; char name[ZFS_MAX_DATASET_NAME_LEN]; int error; ztest_dataset_name(name, ztest_opts.zo_pool, d); rw_enter(&ztest_name_lock, RW_READER); error = ztest_dataset_create(name); if (error == ENOSPC) { rw_exit(&ztest_name_lock); ztest_record_enospc(FTAG); return (error); } ASSERT(error == 0 || error == EEXIST); VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os)); rw_exit(&ztest_name_lock); ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); zilog = zd->zd_zilog; if (zilog->zl_header->zh_claim_lr_seq != 0 && zilog->zl_header->zh_claim_lr_seq < committed_seq) fatal(0, "missing log records: claimed %llu < committed %llu", zilog->zl_header->zh_claim_lr_seq, committed_seq); ztest_dataset_dirobj_verify(zd); zil_replay(os, zd, ztest_replay_vector); ztest_dataset_dirobj_verify(zd); if (ztest_opts.zo_verbose >= 6) (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", zd->zd_name, (u_longlong_t)zilog->zl_parse_blk_count, (u_longlong_t)zilog->zl_parse_lr_count, (u_longlong_t)zilog->zl_replaying_seq); zilog = zil_open(os, ztest_get_data); if (zilog->zl_replaying_seq != 0 && zilog->zl_replaying_seq < committed_seq) fatal(0, "missing log records: replayed %llu < committed %llu", zilog->zl_replaying_seq, committed_seq); return (0); } static void ztest_dataset_close(int d) { ztest_ds_t *zd = &ztest_ds[d]; zil_close(zd->zd_zilog); dmu_objset_disown(zd->zd_os, zd); ztest_zd_fini(zd); } /* * Kick off threads to run tests on all datasets in parallel. */ static void ztest_run(ztest_shared_t *zs) { thread_t *tid; spa_t *spa; objset_t *os; thread_t resume_tid; int error; ztest_exiting = B_FALSE; /* * Initialize parent/child shared state. */ mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL); mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL); rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); zs->zs_thread_start = gethrtime(); zs->zs_thread_stop = zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); zs->zs_thread_kill = zs->zs_thread_stop; if (ztest_random(100) < ztest_opts.zo_killrate) { zs->zs_thread_kill -= ztest_random(ztest_opts.zo_passtime * NANOSEC); } mutex_init(&zcl.zcl_callbacks_lock, NULL, USYNC_THREAD, NULL); list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), offsetof(ztest_cb_data_t, zcd_node)); /* * Open our pool. */ kernel_init(FREAD | FWRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; dmu_objset_stats_t dds; VERIFY0(dmu_objset_own(ztest_opts.zo_pool, DMU_OST_ANY, B_TRUE, FTAG, &os)); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); zs->zs_guid = dds.dds_guid; dmu_objset_disown(os, FTAG); spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; /* * We don't expect the pool to suspend unless maxfaults == 0, * in which case ztest_fault_inject() temporarily takes away * the only valid replica. */ if (MAXFAULTS() == 0) spa->spa_failmode = ZIO_FAILURE_MODE_WAIT; else spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; /* * Create a thread to periodically resume suspended I/O. */ VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND, &resume_tid) == 0); /* * Create a deadman thread to abort() if we hang. */ VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND, NULL) == 0); /* * Verify that we can safely inquire about any object, * whether it's allocated or not. To make it interesting, * we probe a 5-wide window around each power of two. * This hits all edge cases, including zero and the max. */ for (int t = 0; t < 64; t++) { for (int d = -5; d <= 5; d++) { error = dmu_object_info(spa->spa_meta_objset, (1ULL << t) + d, NULL); ASSERT(error == 0 || error == ENOENT || error == EINVAL); } } /* * If we got any ENOSPC errors on the previous run, destroy something. */ if (zs->zs_enospc_count != 0) { int d = ztest_random(ztest_opts.zo_datasets); ztest_dataset_destroy(d); } zs->zs_enospc_count = 0; tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t), UMEM_NOFAIL); if (ztest_opts.zo_verbose >= 4) (void) printf("starting main threads...\n"); /* * Kick off all the tests that run in parallel. */ for (int t = 0; t < ztest_opts.zo_threads; t++) { if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) return; VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, THR_BOUND, &tid[t]) == 0); } /* * Wait for all of the tests to complete. We go in reverse order * so we don't close datasets while threads are still using them. */ for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) { VERIFY(thr_join(tid[t], NULL, NULL) == 0); if (t < ztest_opts.zo_datasets) ztest_dataset_close(t); } txg_wait_synced(spa_get_dsl(spa), 0); zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); zfs_dbgmsg_print(FTAG); umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t)); /* Kill the resume thread */ ztest_exiting = B_TRUE; VERIFY(thr_join(resume_tid, NULL, NULL) == 0); ztest_resume(spa); /* * Right before closing the pool, kick off a bunch of async I/O; * spa_close() should wait for it to complete. */ for (uint64_t object = 1; object < 50; object++) { dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, ZIO_PRIORITY_SYNC_READ); } spa_close(spa, FTAG); /* * Verify that we can loop over all pools. */ mutex_enter(&spa_namespace_lock); for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) if (ztest_opts.zo_verbose > 3) (void) printf("spa_next: found %s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); /* * Verify that we can export the pool and reimport it under a * different name. */ if (ztest_random(2) == 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s_import", ztest_opts.zo_pool); ztest_spa_import_export(ztest_opts.zo_pool, name); ztest_spa_import_export(name, ztest_opts.zo_pool); } kernel_fini(); list_destroy(&zcl.zcl_callbacks); mutex_destroy(&zcl.zcl_callbacks_lock); rw_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } static void ztest_freeze(void) { ztest_ds_t *zd = &ztest_ds[0]; spa_t *spa; int numloops = 0; if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); kernel_init(FREAD | FWRITE); VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY3U(0, ==, ztest_dataset_open(0)); ztest_spa = spa; /* * Force the first log block to be transactionally allocated. * We have to do this before we freeze the pool -- otherwise * the log chain won't be anchored. */ while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { ztest_dmu_object_alloc_free(zd, 0); zil_commit(zd->zd_zilog, 0); } txg_wait_synced(spa_get_dsl(spa), 0); /* * Freeze the pool. This stops spa_sync() from doing anything, * so that the only way to record changes from now on is the ZIL. */ spa_freeze(spa); /* * Because it is hard to predict how much space a write will actually * require beforehand, we leave ourselves some fudge space to write over * capacity. */ uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; /* * Run tests that generate log records but don't alter the pool config * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). * We do a txg_wait_synced() after each iteration to force the txg * to increase well beyond the last synced value in the uberblock. * The ZIL should be OK with that. * * Run a random number of times less than zo_maxloops and ensure we do * not run out of space on the pool. */ while (ztest_random(10) != 0 && numloops++ < ztest_opts.zo_maxloops && metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { ztest_od_t od; ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); ztest_io(zd, od.od_object, ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); txg_wait_synced(spa_get_dsl(spa), 0); } /* * Commit all of the changes we just generated. */ zil_commit(zd->zd_zilog, 0); txg_wait_synced(spa_get_dsl(spa), 0); /* * Close our dataset and close the pool. */ ztest_dataset_close(0); spa_close(spa, FTAG); kernel_fini(); /* * Open and close the pool and dataset to induce log replay. */ kernel_init(FREAD | FWRITE); VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); ASSERT(spa_freeze_txg(spa) == UINT64_MAX); VERIFY3U(0, ==, ztest_dataset_open(0)); ztest_dataset_close(0); ztest_spa = spa; txg_wait_synced(spa_get_dsl(spa), 0); ztest_reguid(NULL, 0); spa_close(spa, FTAG); kernel_fini(); } void print_time(hrtime_t t, char *timebuf) { hrtime_t s = t / NANOSEC; hrtime_t m = s / 60; hrtime_t h = m / 60; hrtime_t d = h / 24; s -= m * 60; m -= h * 60; h -= d * 24; timebuf[0] = '\0'; if (d) (void) sprintf(timebuf, "%llud%02lluh%02llum%02llus", d, h, m, s); else if (h) (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); else if (m) (void) sprintf(timebuf, "%llum%02llus", m, s); else (void) sprintf(timebuf, "%llus", s); } static nvlist_t * make_random_props() { nvlist_t *props; VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); if (ztest_random(2) == 0) return (props); VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); return (props); } /* * Create a storage pool with the given name and initial vdev size. * Then test spa_freeze() functionality. */ static void ztest_init(ztest_shared_t *zs) { spa_t *spa; nvlist_t *nvroot, *props; mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL); mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL); rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); kernel_init(FREAD | FWRITE); /* * Create the storage pool. */ (void) spa_destroy(ztest_opts.zo_pool); ztest_shared->zs_vdev_next_leaf = 0; zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1); props = make_random_props(); for (int i = 0; i < SPA_FEATURES; i++) { char buf[1024]; (void) snprintf(buf, sizeof (buf), "feature@%s", spa_feature_table[i].fi_uname); VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); } VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL)); nvlist_free(nvroot); nvlist_free(props); VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); kernel_fini(); ztest_run_zdb(ztest_opts.zo_pool); ztest_freeze(); ztest_run_zdb(ztest_opts.zo_pool); rw_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } static void setup_data_fd(void) { static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; ztest_fd_data = mkstemp(ztest_name_data); ASSERT3S(ztest_fd_data, >=, 0); (void) unlink(ztest_name_data); } static int shared_data_size(ztest_shared_hdr_t *hdr) { int size; size = hdr->zh_hdr_size; size += hdr->zh_opts_size; size += hdr->zh_size; size += hdr->zh_stats_size * hdr->zh_stats_count; size += hdr->zh_ds_size * hdr->zh_ds_count; return (size); } static void setup_hdr(void) { int size; ztest_shared_hdr_t *hdr; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT(hdr != MAP_FAILED); VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); hdr->zh_opts_size = sizeof (ztest_shared_opts_t); hdr->zh_size = sizeof (ztest_shared_t); hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); hdr->zh_stats_count = ZTEST_FUNCS; hdr->zh_ds_size = sizeof (ztest_shared_ds_t); hdr->zh_ds_count = ztest_opts.zo_datasets; size = shared_data_size(hdr); VERIFY3U(0, ==, ftruncate(ztest_fd_data, size)); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); } static void setup_data(void) { int size, offset; ztest_shared_hdr_t *hdr; uint8_t *buf; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ, MAP_SHARED, ztest_fd_data, 0); ASSERT(hdr != MAP_FAILED); size = shared_data_size(hdr); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT(hdr != MAP_FAILED); buf = (uint8_t *)hdr; offset = hdr->zh_hdr_size; ztest_shared_opts = (void *)&buf[offset]; offset += hdr->zh_opts_size; ztest_shared = (void *)&buf[offset]; offset += hdr->zh_size; ztest_shared_callstate = (void *)&buf[offset]; offset += hdr->zh_stats_size * hdr->zh_stats_count; ztest_shared_ds = (void *)&buf[offset]; } static boolean_t exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) { pid_t pid; int status; char *cmdbuf = NULL; pid = fork(); if (cmd == NULL) { cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); cmd = cmdbuf; } if (pid == -1) fatal(1, "fork failed"); if (pid == 0) { /* child */ char *emptyargv[2] = { cmd, NULL }; char fd_data_str[12]; struct rlimit rl = { 1024, 1024 }; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) close(ztest_fd_rand); VERIFY3U(11, >=, snprintf(fd_data_str, 12, "%d", ztest_fd_data)); VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); (void) enable_extended_FILE_stdio(-1, -1); if (libpath != NULL) VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1)); #ifdef illumos (void) execv(cmd, emptyargv); #else (void) execvp(cmd, emptyargv); #endif ztest_dump_core = B_FALSE; fatal(B_TRUE, "exec failed: %s", cmd); } if (cmdbuf != NULL) { umem_free(cmdbuf, MAXPATHLEN); cmd = NULL; } while (waitpid(pid, &status, 0) != pid) continue; if (statusp != NULL) *statusp = status; if (WIFEXITED(status)) { if (WEXITSTATUS(status) != 0) { (void) fprintf(stderr, "child exited with code %d\n", WEXITSTATUS(status)); exit(2); } return (B_FALSE); } else if (WIFSIGNALED(status)) { if (!ignorekill || WTERMSIG(status) != SIGKILL) { (void) fprintf(stderr, "child died with signal %d\n", WTERMSIG(status)); exit(3); } return (B_TRUE); } else { (void) fprintf(stderr, "something strange happened to child\n"); exit(4); /* NOTREACHED */ } } static void ztest_run_init(void) { ztest_shared_t *zs = ztest_shared; ASSERT(ztest_opts.zo_init != 0); /* * Blow away any existing copy of zpool.cache */ (void) remove(spa_config_path); /* * Create and initialize our storage pool. */ for (int i = 1; i <= ztest_opts.zo_init; i++) { bzero(zs, sizeof (ztest_shared_t)); if (ztest_opts.zo_verbose >= 3 && ztest_opts.zo_init != 1) { (void) printf("ztest_init(), pass %d\n", i); } ztest_init(zs); } } int main(int argc, char **argv) { int kills = 0; int iters = 0; int older = 0; int newer = 0; ztest_shared_t *zs; ztest_info_t *zi; ztest_shared_callstate_t *zc; char timebuf[100]; char numbuf[NN_NUMBUF_SZ]; char *cmd; boolean_t hasalt; char *fd_data_str = getenv("ZTEST_FD_DATA"); (void) setvbuf(stdout, NULL, _IOLBF, 0); dprintf_setup(&argc, argv); zfs_deadman_synctime_ms = 300000; /* * As two-word space map entries may not come up often (especially * if pool and vdev sizes are small) we want to force at least some * of them so the feature get tested. */ zfs_force_some_double_word_sm_entries = B_TRUE; ztest_fd_rand = open("/dev/urandom", O_RDONLY); ASSERT3S(ztest_fd_rand, >=, 0); if (!fd_data_str) { process_options(argc, argv); setup_data_fd(); setup_hdr(); setup_data(); bcopy(&ztest_opts, ztest_shared_opts, sizeof (*ztest_shared_opts)); } else { ztest_fd_data = atoi(fd_data_str); setup_data(); bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); } ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); /* Override location of zpool.cache */ VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache", ztest_opts.zo_dir), !=, -1); ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), UMEM_NOFAIL); zs = ztest_shared; if (fd_data_str) { metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; if (zs->zs_do_init) ztest_run_init(); else ztest_run(zs); exit(0); } hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); if (ztest_opts.zo_verbose >= 1) { (void) printf("%llu vdevs, %d datasets, %d threads," " %llu seconds...\n", (u_longlong_t)ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, (u_longlong_t)ztest_opts.zo_time); } cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); (void) strlcpy(cmd, getexecname(), MAXNAMELEN); zs->zs_do_init = B_TRUE; if (strlen(ztest_opts.zo_alt_ztest) != 0) { if (ztest_opts.zo_verbose >= 1) { (void) printf("Executing older ztest for " "initialization: %s\n", ztest_opts.zo_alt_ztest); } VERIFY(!exec_child(ztest_opts.zo_alt_ztest, ztest_opts.zo_alt_libpath, B_FALSE, NULL)); } else { VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); } zs->zs_do_init = B_FALSE; zs->zs_proc_start = gethrtime(); zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; for (int f = 0; f < ZTEST_FUNCS; f++) { zi = &ztest_info[f]; zc = ZTEST_GET_SHARED_CALLSTATE(f); if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) zc->zc_next = UINT64_MAX; else zc->zc_next = zs->zs_proc_start + ztest_random(2 * zi->zi_interval[0] + 1); } /* * Run the tests in a loop. These tests include fault injection * to verify that self-healing data works, and forced crashes * to verify that we never lose on-disk consistency. */ while (gethrtime() < zs->zs_proc_stop) { int status; boolean_t killed; /* * Initialize the workload counters for each function. */ for (int f = 0; f < ZTEST_FUNCS; f++) { zc = ZTEST_GET_SHARED_CALLSTATE(f); zc->zc_count = 0; zc->zc_time = 0; } /* Set the allocation switch size */ zs->zs_metaslab_df_alloc_threshold = ztest_random(zs->zs_metaslab_sz / 4) + 1; if (!hasalt || ztest_random(2) == 0) { if (hasalt && ztest_opts.zo_verbose >= 1) { (void) printf("Executing newer ztest: %s\n", cmd); } newer++; killed = exec_child(cmd, NULL, B_TRUE, &status); } else { if (hasalt && ztest_opts.zo_verbose >= 1) { (void) printf("Executing older ztest: %s\n", ztest_opts.zo_alt_ztest); } older++; killed = exec_child(ztest_opts.zo_alt_ztest, ztest_opts.zo_alt_libpath, B_TRUE, &status); } if (killed) kills++; iters++; if (ztest_opts.zo_verbose >= 1) { hrtime_t now = gethrtime(); now = MIN(now, zs->zs_proc_stop); print_time(zs->zs_proc_stop - now, timebuf); nicenum(zs->zs_space, numbuf, sizeof (numbuf)); (void) printf("Pass %3d, %8s, %3llu ENOSPC, " "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", iters, WIFEXITED(status) ? "Complete" : "SIGKILL", (u_longlong_t)zs->zs_enospc_count, 100.0 * zs->zs_alloc / zs->zs_space, numbuf, 100.0 * (now - zs->zs_proc_start) / (ztest_opts.zo_time * NANOSEC), timebuf); } if (ztest_opts.zo_verbose >= 2) { (void) printf("\nWorkload summary:\n\n"); (void) printf("%7s %9s %s\n", "Calls", "Time", "Function"); (void) printf("%7s %9s %s\n", "-----", "----", "--------"); for (int f = 0; f < ZTEST_FUNCS; f++) { Dl_info dli; zi = &ztest_info[f]; zc = ZTEST_GET_SHARED_CALLSTATE(f); print_time(zc->zc_time, timebuf); (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%7llu %9s %s\n", (u_longlong_t)zc->zc_count, timebuf, dli.dli_sname); } (void) printf("\n"); } ztest_run_zdb(ztest_opts.zo_pool); } if (ztest_opts.zo_verbose >= 1) { if (hasalt) { (void) printf("%d runs of older ztest: %s\n", older, ztest_opts.zo_alt_ztest); (void) printf("%d runs of newer ztest: %s\n", newer, cmd); } (void) printf("%d killed, %d completed, %.0f%% kill rate\n", kills, iters - kills, (100.0 * kills) / MAX(1, iters)); } umem_free(cmd, MAXNAMELEN); return (0); } Index: head/cddl/contrib/opensolaris =================================================================== --- head/cddl/contrib/opensolaris (revision 351076) +++ head/cddl/contrib/opensolaris (revision 351077) Property changes on: head/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,1 +0,0 ## Reverse-merged /vendor/illumos/dist:r350898 Index: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c =================================================================== --- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c (revision 351077) @@ -1,715 +1,714 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_deleg.h" #if defined(_KERNEL) #include #else #include #include #include #endif static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; /* Note this is indexed by zfs_userquota_prop_t, keep the order the same */ const char *zfs_userquota_prop_prefixes[] = { "userused@", "userquota@", "groupused@", "groupquota@" }; zprop_desc_t * zfs_prop_get_table(void) { return (zfs_prop_table); } void zfs_prop_init(void) { static zprop_index_t checksum_table[] = { { "on", ZIO_CHECKSUM_ON }, { "off", ZIO_CHECKSUM_OFF }, { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, { "sha256", ZIO_CHECKSUM_SHA256 }, { "noparity", ZIO_CHECKSUM_NOPARITY }, { "sha512", ZIO_CHECKSUM_SHA512 }, { "skein", ZIO_CHECKSUM_SKEIN }, #ifdef illumos { "edonr", ZIO_CHECKSUM_EDONR }, #endif { NULL } }; static zprop_index_t dedup_table[] = { { "on", ZIO_CHECKSUM_ON }, { "off", ZIO_CHECKSUM_OFF }, { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY }, { "sha256", ZIO_CHECKSUM_SHA256 }, { "sha256,verify", ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY }, { "sha512", ZIO_CHECKSUM_SHA512 }, { "sha512,verify", ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY }, { "skein", ZIO_CHECKSUM_SKEIN }, { "skein,verify", ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY }, #ifdef illumos { "edonr,verify", ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY }, #endif { NULL } }; static zprop_index_t compress_table[] = { { "on", ZIO_COMPRESS_ON }, { "off", ZIO_COMPRESS_OFF }, { "lzjb", ZIO_COMPRESS_LZJB }, { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */ { "gzip-1", ZIO_COMPRESS_GZIP_1 }, { "gzip-2", ZIO_COMPRESS_GZIP_2 }, { "gzip-3", ZIO_COMPRESS_GZIP_3 }, { "gzip-4", ZIO_COMPRESS_GZIP_4 }, { "gzip-5", ZIO_COMPRESS_GZIP_5 }, { "gzip-6", ZIO_COMPRESS_GZIP_6 }, { "gzip-7", ZIO_COMPRESS_GZIP_7 }, { "gzip-8", ZIO_COMPRESS_GZIP_8 }, { "gzip-9", ZIO_COMPRESS_GZIP_9 }, { "zle", ZIO_COMPRESS_ZLE }, { "lz4", ZIO_COMPRESS_LZ4 }, { NULL } }; static zprop_index_t snapdir_table[] = { { "hidden", ZFS_SNAPDIR_HIDDEN }, { "visible", ZFS_SNAPDIR_VISIBLE }, { NULL } }; static zprop_index_t acl_mode_table[] = { { "discard", ZFS_ACL_DISCARD }, { "groupmask", ZFS_ACL_GROUPMASK }, { "passthrough", ZFS_ACL_PASSTHROUGH }, { "restricted", ZFS_ACL_RESTRICTED }, { NULL } }; static zprop_index_t acl_inherit_table[] = { { "discard", ZFS_ACL_DISCARD }, { "noallow", ZFS_ACL_NOALLOW }, { "restricted", ZFS_ACL_RESTRICTED }, { "passthrough", ZFS_ACL_PASSTHROUGH }, { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatability */ { "passthrough-x", ZFS_ACL_PASSTHROUGH_X }, { NULL } }; static zprop_index_t case_table[] = { { "sensitive", ZFS_CASE_SENSITIVE }, { "insensitive", ZFS_CASE_INSENSITIVE }, { "mixed", ZFS_CASE_MIXED }, { NULL } }; static zprop_index_t copies_table[] = { { "1", 1 }, { "2", 2 }, { "3", 3 }, { NULL } }; /* * Use the unique flags we have to send to u8_strcmp() and/or * u8_textprep() to represent the various normalization property * values. */ static zprop_index_t normalize_table[] = { { "none", 0 }, { "formD", U8_TEXTPREP_NFD }, { "formKC", U8_TEXTPREP_NFKC }, { "formC", U8_TEXTPREP_NFC }, { "formKD", U8_TEXTPREP_NFKD }, { NULL } }; static zprop_index_t version_table[] = { { "1", 1 }, { "2", 2 }, { "3", 3 }, { "4", 4 }, { "5", 5 }, { "current", ZPL_VERSION }, { NULL } }; static zprop_index_t boolean_table[] = { { "off", 0 }, { "on", 1 }, { NULL } }; static zprop_index_t logbias_table[] = { { "latency", ZFS_LOGBIAS_LATENCY }, { "throughput", ZFS_LOGBIAS_THROUGHPUT }, { NULL } }; static zprop_index_t canmount_table[] = { { "off", ZFS_CANMOUNT_OFF }, { "on", ZFS_CANMOUNT_ON }, { "noauto", ZFS_CANMOUNT_NOAUTO }, { NULL } }; static zprop_index_t cache_table[] = { { "none", ZFS_CACHE_NONE }, { "metadata", ZFS_CACHE_METADATA }, { "all", ZFS_CACHE_ALL }, { NULL } }; static zprop_index_t sync_table[] = { { "standard", ZFS_SYNC_STANDARD }, { "always", ZFS_SYNC_ALWAYS }, { "disabled", ZFS_SYNC_DISABLED }, { NULL } }; static zprop_index_t volmode_table[] = { { "default", ZFS_VOLMODE_DEFAULT }, { "geom", ZFS_VOLMODE_GEOM }, { "dev", ZFS_VOLMODE_DEV }, { "none", ZFS_VOLMODE_NONE }, { NULL } }; static zprop_index_t dnsize_table[] = { { "legacy", ZFS_DNSIZE_LEGACY }, { "auto", ZFS_DNSIZE_AUTO }, { "1k", ZFS_DNSIZE_1K }, { "2k", ZFS_DNSIZE_2K }, { "4k", ZFS_DNSIZE_4K }, { "8k", ZFS_DNSIZE_8K }, { "16k", ZFS_DNSIZE_16K }, { NULL } }; static zprop_index_t redundant_metadata_table[] = { { "all", ZFS_REDUNDANT_METADATA_ALL }, { "most", ZFS_REDUNDANT_METADATA_MOST }, { NULL } }; /* inherit index properties */ zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata", ZFS_REDUNDANT_METADATA_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "all | most", "REDUND_MD", redundant_metadata_table); zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "standard | always | disabled", "SYNC", sync_table); zprop_register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | fletcher2 | fletcher4 | sha256 | sha512 | " "skein", "CHECKSUM", checksum_table); zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | verify | sha256[,verify], sha512[,verify], " "skein[,verify]", "DEDUP", dedup_table); zprop_register_index(ZFS_PROP_COMPRESSION, "compression", ZIO_COMPRESS_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4", "COMPRESS", compress_table); zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "hidden | visible", "SNAPDIR", snapdir_table); zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | groupmask | passthrough | restricted", "ACLMODE", acl_mode_table); zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | noallow | restricted | passthrough | passthrough-x", "ACLINHERIT", acl_inherit_table); zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "1 | 2 | 3", "COPIES", copies_table); zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache", ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "PRIMARYCACHE", cache_table); zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache", ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "SECONDARYCACHE", cache_table); zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "latency | throughput", "LOGBIAS", logbias_table); zprop_register_index(ZFS_PROP_VOLMODE, "volmode", ZFS_VOLMODE_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "default | geom | dev | none", "VOLMODE", volmode_table); - zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize", ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table); - + /* inherit index (boolean) properties */ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES", boolean_table); zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC", boolean_table); zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", boolean_table); zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY", boolean_table); zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table); zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR", boolean_table); zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table); zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", boolean_table); /* default index properties */ zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table); zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", "CANMOUNT", canmount_table); /* readonly index (boolean) properties */ zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", boolean_table); /* set once index properties */ zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "none | formC | formD | formKC | formKD", "NORMALIZATION", normalize_table); zprop_register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "sensitive | insensitive | mixed", "CASE", case_table); /* set once index (boolean) properties */ zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "UTF8ONLY", boolean_table); /* string properties */ zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "ORIGIN"); zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "[,...]", "CLONES"); zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, " | legacy | none", "MOUNTPOINT"); zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", "SHARENFS"); zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "filesystem | volume | snapshot | bookmark", "TYPE"); zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB"); zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, "", "MLSLABEL"); zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN, "receive_resume_token", NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "RESUMETOK"); /* readonly number properties */ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "USED"); zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "AVAIL"); zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "REFER"); zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "RATIO"); zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "REFRATIO"); zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDSNAP"); zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDDS"); zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDCHILD"); zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "", "USERREFS"); zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "WRITTEN"); zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "LUSED"); zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "LREFER"); /* default number properties */ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "QUOTA"); zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "RESERV"); zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, ZFS_TYPE_VOLUME, "", "VOLSIZE"); zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "REFQUOTA"); zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "REFRESERV"); zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "FSLIMIT"); zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "SSLIMIT"); zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "", "FSCOUNT"); zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "SSCOUNT"); zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "GUID"); zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "CREATETXG"); /* inherit number properties */ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); /* hidden properties */ zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME"); zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "STMF_SBD_LU"); zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "USERACCOUNTING"); zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE"); zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT"); zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP"); /* oddball properties */ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "CREATION", B_FALSE, B_TRUE, NULL); } boolean_t zfs_prop_delegatable(zfs_prop_t prop) { zprop_desc_t *pd = &zfs_prop_table[prop]; /* The mlslabel property is never delegatable. */ if (prop == ZFS_PROP_MLSLABEL) return (B_FALSE); return (pd->pd_attr != PROP_READONLY); } /* * Given a zfs dataset property name, returns the corresponding property ID. */ zfs_prop_t zfs_name_to_prop(const char *propname) { return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); } /* * For user property names, we allow all lowercase alphanumeric characters, plus * a few useful punctuation characters. */ static int valid_char(char c) { return ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' || c == '_' || c == '.' || c == ':'); } /* * Returns true if this is a valid user-defined property (one with a ':'). */ boolean_t zfs_prop_user(const char *name) { int i; char c; boolean_t foundsep = B_FALSE; for (i = 0; i < strlen(name); i++) { c = name[i]; if (!valid_char(c)) return (B_FALSE); if (c == ':') foundsep = B_TRUE; } if (!foundsep) return (B_FALSE); return (B_TRUE); } /* * Returns true if this is a valid userspace-type property (one with a '@'). * Note that after the @, any character is valid (eg, another @, for SID * user@domain). */ boolean_t zfs_prop_userquota(const char *name) { zfs_userquota_prop_t prop; for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) { if (strncmp(name, zfs_userquota_prop_prefixes[prop], strlen(zfs_userquota_prop_prefixes[prop])) == 0) { return (B_TRUE); } } return (B_FALSE); } /* * Returns true if this is a valid written@ property. * Note that after the @, any character is valid (eg, another @, for * written@pool/fs@origin). */ boolean_t zfs_prop_written(const char *name) { static const char *prefix = "written@"; return (strncmp(name, prefix, strlen(prefix)) == 0); } /* * Tables of index types, plus functions to convert between the user view * (strings) and internal representation (uint64_t). */ int zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index) { return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET)); } int zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) { return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET)); } uint64_t zfs_prop_random_value(zfs_prop_t prop, uint64_t seed) { return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET)); } /* * Returns TRUE if the property applies to any of the given dataset types. */ boolean_t zfs_prop_valid_for_type(int prop, zfs_type_t types) { return (zprop_valid_for_type(prop, types)); } zprop_type_t zfs_prop_get_type(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_proptype); } /* * Returns TRUE if the property is readonly. */ boolean_t zfs_prop_readonly(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_READONLY || zfs_prop_table[prop].pd_attr == PROP_ONETIME); } /* * Returns TRUE if the property is visible (not hidden). */ boolean_t zfs_prop_visible(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_visible); } /* * Returns TRUE if the property is only allowed to be set once. */ boolean_t zfs_prop_setonce(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_ONETIME); } const char * zfs_prop_default_string(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_strdefault); } uint64_t zfs_prop_default_numeric(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_numdefault); } /* * Given a dataset property ID, returns the corresponding name. * Assuming the zfs dataset property ID is valid. */ const char * zfs_prop_to_name(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_name); } /* * Returns TRUE if the property is inheritable. */ boolean_t zfs_prop_inheritable(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_INHERIT || zfs_prop_table[prop].pd_attr == PROP_ONETIME); } #ifndef _KERNEL /* * Returns a string describing the set of acceptable values for the given * zfs property, or NULL if it cannot be set. */ const char * zfs_prop_values(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_values); } /* * Returns TRUE if this property is a string type. Note that index types * (compression, checksum) are treated as strings in userland, even though they * are stored numerically on disk. */ int zfs_prop_is_string(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING || zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX); } /* * Returns the column header for the given property. Used only in * 'zfs list -o', but centralized here with the other property information. */ const char * zfs_prop_column_name(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_colname); } /* * Returns whether the given property should be displayed right-justified for * 'zfs list'. */ boolean_t zfs_prop_align_right(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_rightalign); } #endif Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 351077) @@ -1,4267 +1,4266 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include kstat_t *dbuf_ksp; typedef struct dbuf_stats { /* * Various statistics about the size of the dbuf cache. */ kstat_named_t cache_count; kstat_named_t cache_size_bytes; kstat_named_t cache_size_bytes_max; /* * Statistics regarding the bounds on the dbuf cache size. */ kstat_named_t cache_target_bytes; kstat_named_t cache_lowater_bytes; kstat_named_t cache_hiwater_bytes; /* * Total number of dbuf cache evictions that have occurred. */ kstat_named_t cache_total_evicts; /* * The distribution of dbuf levels in the dbuf cache and * the total size of all dbufs at each level. */ kstat_named_t cache_levels[DN_MAX_LEVELS]; kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; /* * Statistics about the dbuf hash table. */ kstat_named_t hash_hits; kstat_named_t hash_misses; kstat_named_t hash_collisions; kstat_named_t hash_elements; kstat_named_t hash_elements_max; /* * Number of sublists containing more than one dbuf in the dbuf * hash table. Keep track of the longest hash chain. */ kstat_named_t hash_chains; kstat_named_t hash_chain_max; /* * Number of times a dbuf_create() discovers that a dbuf was * already created and in the dbuf hash table. */ kstat_named_t hash_insert_race; /* * Statistics about the size of the metadata dbuf cache. */ kstat_named_t metadata_cache_count; kstat_named_t metadata_cache_size_bytes; kstat_named_t metadata_cache_size_bytes_max; /* * For diagnostic purposes, this is incremented whenever we can't add * something to the metadata cache because it's full, and instead put * the data in the regular dbuf cache. */ kstat_named_t metadata_cache_overflow; } dbuf_stats_t; dbuf_stats_t dbuf_stats = { { "cache_count", KSTAT_DATA_UINT64 }, { "cache_size_bytes", KSTAT_DATA_UINT64 }, { "cache_size_bytes_max", KSTAT_DATA_UINT64 }, { "cache_target_bytes", KSTAT_DATA_UINT64 }, { "cache_lowater_bytes", KSTAT_DATA_UINT64 }, { "cache_hiwater_bytes", KSTAT_DATA_UINT64 }, { "cache_total_evicts", KSTAT_DATA_UINT64 }, { { "cache_levels_N", KSTAT_DATA_UINT64 } }, { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, { "hash_hits", KSTAT_DATA_UINT64 }, { "hash_misses", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_insert_race", KSTAT_DATA_UINT64 }, { "metadata_cache_count", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, { "metadata_cache_overflow", KSTAT_DATA_UINT64 } }; #define DBUF_STAT_INCR(stat, val) \ atomic_add_64(&dbuf_stats.stat.value.ui64, (val)); #define DBUF_STAT_DECR(stat, val) \ DBUF_STAT_INCR(stat, -(val)); #define DBUF_STAT_BUMP(stat) \ DBUF_STAT_INCR(stat, 1); #define DBUF_STAT_BUMPDOWN(stat) \ DBUF_STAT_INCR(stat, -1); #define DBUF_STAT_MAX(stat, v) { \ uint64_t _m; \ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\ continue; \ } struct dbuf_hold_impl_data { /* Function arguments */ dnode_t *dh_dn; uint8_t dh_level; uint64_t dh_blkid; boolean_t dh_fail_sparse; boolean_t dh_fail_uncached; void *dh_tag; dmu_buf_impl_t **dh_dbp; /* Local variables */ dmu_buf_impl_t *dh_db; dmu_buf_impl_t *dh_parent; blkptr_t *dh_bp; int dh_err; dbuf_dirty_record_t *dh_dr; int dh_depth; }; static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp, int depth); static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); #ifndef __lint extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp); #endif /* ! __lint */ /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; static kthread_t *dbuf_cache_evict_thread; static kmutex_t dbuf_evict_lock; static kcondvar_t dbuf_evict_cv; static boolean_t dbuf_evict_thread_exit; /* * There are two dbuf caches; each dbuf can only be in one of them at a time. * * 1. Cache of metadata dbufs, to help make read-heavy administrative commands * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs * that represent the metadata that describes filesystems/snapshots/ * bookmarks/properties/etc. We only evict from this cache when we export a * pool, to short-circuit as much I/O as possible for all administrative * commands that need the metadata. There is no eviction policy for this * cache, because we try to only include types in it which would occupy a * very small amount of space per object but create a large impact on the * performance of these commands. Instead, after it reaches a maximum size * (which should only happen on very small memory systems with a very large * number of filesystem objects), we stop taking new dbufs into the * metadata cache, instead putting them in the normal dbuf cache. * * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that * are not currently held but have been recently released. These dbufs * are not eligible for arc eviction until they are aged out of the cache. * Dbufs that are aged out of the cache will be immediately destroyed and * become eligible for arc eviction. * * Dbufs are added to these caches once the last hold is released. If a dbuf is * later accessed and still exists in the dbuf cache, then it will be removed * from the cache and later re-added to the head of the cache. * * If a given dbuf meets the requirements for the metadata cache, it will go * there, otherwise it will be considered for the generic LRU dbuf cache. The * caches and the refcounts tracking their sizes are stored in an array indexed * by those caches' matching enum values (from dbuf_cached_state_t). */ typedef struct dbuf_cache { multilist_t *cache; refcount_t size; } dbuf_cache_t; dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; /* Size limits for the caches */ uint64_t dbuf_cache_max_bytes = 0; uint64_t dbuf_metadata_cache_max_bytes = 0; /* Set the default sizes of the caches to log2 fraction of arc size */ int dbuf_cache_shift = 5; int dbuf_metadata_cache_shift = 6; /* * For diagnostic purposes, this is incremented whenever we can't add * something to the metadata cache because it's full, and instead put * the data in the regular dbuf cache. */ uint64_t dbuf_metadata_cache_overflow; /* * The LRU dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread * should stop evicting from the dbuf cache. * - When we reach the maximum size (aka mid water mark), we * signal the eviction thread to run. * - The high water mark indicates when the eviction thread * is unable to keep up with the incoming load and eviction must * happen in the context of the calling thread. * * The dbuf cache: * (max size) * low water mid water hi water * +----------------------------------------+----------+----------+ * | | | | * | | | | * | | | | * | | | | * +----------------------------------------+----------+----------+ * stop signal evict * evicting eviction directly * thread * * The high and low water marks indicate the operating range for the eviction * thread. The low water mark is, by default, 90% of the total size of the * cache and the high water mark is at 110% (both of these percentages can be * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, * respectively). The eviction thread will try to ensure that the cache remains * within this range by waking up every second and checking if the cache is * above the low water mark. The thread can also be woken up by callers adding * elements into the cache if the cache is larger than the mid water (i.e max * cache size). Once the eviction thread is woken up and eviction is required, * it will continue evicting buffers until it's able to reduce the cache size * to the low water mark. If the cache size continues to grow and hits the high * water mark, then callers adding elments to the cache will begin to evict * directly from the cache until the cache is no longer above the high water * mark. */ /* * The percentage above and below the maximum cache size. */ uint_t dbuf_cache_hiwater_pct = 10; uint_t dbuf_cache_lowater_pct = 10; SYSCTL_DECL(_vfs_zfs); SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN, &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes"); SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_max_bytes, CTLFLAG_RWTUN, &dbuf_metadata_cache_max_bytes, 0, "dbuf metadata cache size in bytes"); SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN, &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC"); SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_shift, CTLFLAG_RDTUN, &dbuf_metadata_cache_shift, 0, "dbuf metadata cache size as log2 fraction of ARC"); SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_overflow, CTLFLAG_RD, &dbuf_metadata_cache_overflow, 0, "dbuf metadata cache overflow"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN, &dbuf_cache_hiwater_pct, 0, "max percents above the dbuf cache size"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN, &dbuf_cache_lowater_pct, 0, "max percents below the dbuf cache size"); /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) { dmu_buf_impl_t *db = vdb; bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); refcount_create(&db->db_holds); return (0); } /* ARGSUSED */ static void dbuf_dest(void *vdb, void *unused) { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); ASSERT(!multilist_link_active(&db->db_cache_link)); refcount_destroy(&db->db_holds); } /* * dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; static uint64_t dbuf_hash_count; /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) { return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); } #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ (dbuf)->db_level == (level) && \ (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv = dbuf_hash(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *db; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } static dmu_buf_impl_t * dbuf_find_bonus(objset_t *os, uint64_t object) { dnode_t *dn; dmu_buf_impl_t *db = NULL; if (dnode_hold(os, object, FTAG, &dn) == 0) { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus != NULL) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } return (db); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. */ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid, hv, idx; dmu_buf_impl_t *dbf; uint32_t i; blkid = db->db_blkid; hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; dbf = dbf->db_hash_next, i++) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); } } if (i > 0) { DBUF_STAT_BUMP(hash_collisions); if (i == 1) DBUF_STAT_BUMP(hash_chains); DBUF_STAT_MAX(hash_chain_max, i); } mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_inc_64(&dbuf_hash_count); DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count); return (NULL); } /* * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv, idx; dmu_buf_impl_t *dbf, **dbp; hv = dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); idx = hv & h->hash_table_mask; /* * We mustn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; ASSERT(dbf != NULL); } *dbp = db->db_hash_next; db->db_hash_next = NULL; if (h->hash_table[idx] && h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_hash_count); } typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING } dbvu_verify_type_t; static void dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) { #ifdef ZFS_DEBUG int64_t holds; if (db->db_user == NULL) return; /* Only data blocks support the attachment of user data. */ ASSERT(db->db_level == 0); /* Clients must resolve a dbuf before attaching user data. */ ASSERT(db->db.db_data != NULL); ASSERT3U(db->db_state, ==, DB_CACHED); holds = refcount_count(&db->db_holds); if (verify_type == DBVU_EVICTING) { /* * Immediate eviction occurs when holds == dirtycnt. * For normal eviction buffers, holds is zero on * eviction, except when dbuf_fix_old_data() calls * dbuf_clear_data(). However, the hold count can grow * during eviction even though db_mtx is held (see * dmu_bonus_hold() for an example), so we can only * test the generic invariant that holds >= dirtycnt. */ ASSERT3U(holds, >=, db->db_dirtycnt); } else { if (db->db_user_immediate_evict == TRUE) ASSERT3U(holds, >=, db->db_dirtycnt); else ASSERT3U(holds, >, 0); } #endif } static void dbuf_evict_user(dmu_buf_impl_t *db) { dmu_buf_user_t *dbu = db->db_user; ASSERT(MUTEX_HELD(&db->db_mtx)); if (dbu == NULL) return; dbuf_verify_user(db, DBVU_EVICTING); db->db_user = NULL; #ifdef ZFS_DEBUG if (dbu->dbu_clear_on_evict_dbufp != NULL) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif /* * There are two eviction callbacks - one that we call synchronously * and one that we invoke via a taskq. The async one is useful for * avoiding lock order reversals and limiting stack depth. * * Note that if we have a sync callback but no async callback, * it's likely that the sync callback will free the structure * containing the dbu. In that case we need to take care to not * dereference dbu after calling the sync evict func. */ boolean_t has_async = (dbu->dbu_evict_func_async != NULL); if (dbu->dbu_evict_func_sync != NULL) dbu->dbu_evict_func_sync(dbu); if (has_async) { taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, dbu, 0, &dbu->dbu_tqent); } } boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { if (db->db_level > 0) { return (B_TRUE); } else { boolean_t is_metadata; DB_DNODE_ENTER(db); is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); } } /* * This returns whether this dbuf should be stored in the metadata cache, which * is based on whether it's from one of the dnode types that store data related * to traversing dataset hierarchies. */ static boolean_t dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) { DB_DNODE_ENTER(db); dmu_object_type_t type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); /* Check if this dbuf is one of the types we care about */ if (DMU_OT_IS_METADATA_CACHED(type)) { /* If we hit this, then we set something up wrong in dmu_ot */ ASSERT(DMU_OT_IS_METADATA(type)); /* * Sanity check for small-memory systems: don't allocate too * much memory for this purpose. */ if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) > dbuf_metadata_cache_max_bytes) { dbuf_metadata_cache_overflow++; DTRACE_PROBE1(dbuf__metadata__cache__overflow, dmu_buf_impl_t *, db); return (B_FALSE); } return (B_TRUE); } return (B_FALSE); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the dbuf eviction * code is laid out; dbuf_evict_thread() assumes dbufs are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ unsigned int dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { dmu_buf_impl_t *db = obj; /* * The assumption here, is the hash value for a given * dmu_buf_impl_t will remain constant throughout it's lifetime * (i.e. it's objset, object, level and blkid fields don't change). * Thus, we don't need to store the dbuf's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. */ return (dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid) % multilist_get_num_sublists(ml)); } static inline unsigned long dbuf_cache_target_bytes(void) { return MIN(dbuf_cache_max_bytes, arc_max_bytes() >> dbuf_cache_shift); } static inline uint64_t dbuf_cache_hiwater_bytes(void) { uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); return (dbuf_cache_target + (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100); } static inline uint64_t dbuf_cache_lowater_bytes(void) { uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); return (dbuf_cache_target - (dbuf_cache_target * dbuf_cache_lowater_pct) / 100); } static inline boolean_t dbuf_cache_above_hiwater(void) { return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_hiwater_bytes()); } static inline boolean_t dbuf_cache_above_lowater(void) { return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_lowater_bytes()); } /* * Evict the oldest eligible dbuf from the dbuf cache. */ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); multilist_sublist_t *mls = multilist_sublist_lock( dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); dmu_buf_impl_t *db = multilist_sublist_tail(mls); while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { db = multilist_sublist_prev(mls, db); } DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, multilist_sublist_t *, mls); if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); (void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); DBUF_STAT_MAX(cache_size_bytes_max, refcount_count(&dbuf_caches[DB_DBUF_CACHE].size)); DBUF_STAT_BUMP(cache_total_evicts); } else { multilist_sublist_unlock(mls); } } /* * The dbuf evict thread is responsible for aging out dbufs from the * cache. Once the cache has reached it's maximum size, dbufs are removed * and destroyed. The eviction thread will continue running until the size * of the dbuf cache is at or below the maximum size. Once the dbuf is aged * out of the cache it is destroyed and becomes eligible for arc eviction. */ /* ARGSUSED */ static void dbuf_evict_thread(void *unused __unused) { callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); mutex_enter(&dbuf_evict_lock); while (!dbuf_evict_thread_exit) { while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_hires(&dbuf_evict_cv, &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); /* * Keep evicting as long as we're above the low water mark * for the cache. We do this without holding the locks to * minimize lock contention. */ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { dbuf_evict_one(); } mutex_enter(&dbuf_evict_lock); } dbuf_evict_thread_exit = B_FALSE; cv_broadcast(&dbuf_evict_cv); CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ thread_exit(); } /* * Wake up the dbuf eviction thread if the dbuf cache is at its max size. * If the dbuf cache is at its high water mark, then evict a dbuf from the * dbuf cache using the callers context. */ static void dbuf_evict_notify(void) { /* * We check if we should evict without holding the dbuf_evict_lock, * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. */ if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_max_bytes) { if (dbuf_cache_above_hiwater()) dbuf_evict_one(); cv_signal(&dbuf_evict_cv); } } static int dbuf_kstat_update(kstat_t *ksp, int rw) { dbuf_stats_t *ds = ksp->ks_data; if (rw == KSTAT_WRITE) { return (SET_ERROR(EACCES)); } else { ds->metadata_cache_size_bytes.value.ui64 = refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size); ds->cache_size_bytes.value.ui64 = refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); ds->hash_elements.value.ui64 = dbuf_hash_count; } return (0); } void dbuf_init(void) { uint64_t hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; int i; /* * The hash table is big enough to fill all of physical memory * with an average 4K block size. The table will take up * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). */ while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) hsize <<= 1; retry: h->hash_table_mask = hsize - 1; h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); if (h->hash_table == NULL) { /* XXX - we should really return an error instead of assert */ ASSERT(hsize > (1ULL << 10)); hsize >>= 1; goto retry; } dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); /* * Setup the parameters for the dbuf caches. We set the sizes of the * dbuf cache and the metadata cache to 1/32nd and 1/16th (default) * of the size of the ARC, respectively. If the values are set in * /etc/system and they're not greater than the size of the ARC, then * we honor that value. */ if (dbuf_cache_max_bytes == 0 || dbuf_cache_max_bytes >= arc_max_bytes()) { dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift; } if (dbuf_metadata_cache_max_bytes == 0 || dbuf_metadata_cache_max_bytes >= arc_max_bytes()) { dbuf_metadata_cache_max_bytes = arc_max_bytes() >> dbuf_metadata_cache_shift; } /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. */ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { dbuf_caches[dcs].cache = multilist_create(sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_cache_link), dbuf_cache_multilist_index_func); refcount_create(&dbuf_caches[dcs].size); } dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, NULL, 0, &p0, TS_RUN, minclsyspri); #ifdef __linux__ /* * XXX FreeBSD's SPL lacks KSTAT_TYPE_NAMED support - TODO */ dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc", KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dbuf_ksp != NULL) { dbuf_ksp->ks_data = &dbuf_stats; dbuf_ksp->ks_update = dbuf_kstat_update; kstat_install(dbuf_ksp); for (i = 0; i < DN_MAX_LEVELS; i++) { snprintf(dbuf_stats.cache_levels[i].name, KSTAT_STRLEN, "cache_level_%d", i); dbuf_stats.cache_levels[i].data_type = KSTAT_DATA_UINT64; snprintf(dbuf_stats.cache_levels_bytes[i].name, KSTAT_STRLEN, "cache_level_%d_bytes", i); dbuf_stats.cache_levels_bytes[i].data_type = KSTAT_DATA_UINT64; } } #endif } void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; int i; dbuf_stats_destroy(); for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); mutex_enter(&dbuf_evict_lock); dbuf_evict_thread_exit = B_TRUE; while (dbuf_evict_thread_exit) { cv_signal(&dbuf_evict_cv); cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { refcount_destroy(&dbuf_caches[dcs].size); multilist_destroy(dbuf_caches[dcs].cache); } if (dbuf_ksp != NULL) { kstat_delete(dbuf_ksp); dbuf_ksp = NULL; } } /* * Other stuff. */ #ifdef ZFS_DEBUG static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) return; ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. */ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ if (db->db_blkptr) { if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't * have the struct_rwlock. XXX indblksz no longer * grows. safe to do this now? */ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); } } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. * * There is an exception to this rule for indirect blocks; in * this case, if the indirect block is a hole, we fill in a few * fields on each of the child blocks (importantly, birth time) * to prevent hole birth times from being lost when you * partially fill in a hole. */ if (db->db_dirtycnt == 0) { if (db->db_level == 0) { uint64_t *buf = db->db.db_data; int i; for (i = 0; i < db->db.db_size >> 3; i++) { ASSERT(buf[i] == 0); } } else { blkptr_t *bps = db->db.db_data; ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, db->db.db_size); /* * We want to verify that all the blkptrs in the * indirect block are holes, but we may have * automatically set up a few fields for them. * We iterate through each blkptr and verify * they only have those fields set. */ for (int i = 0; i < db->db.db_size / sizeof (blkptr_t); i++) { blkptr_t *bp = &bps[i]; ASSERT(ZIO_CHECKSUM_IS_ZERO( &bp->blk_cksum)); ASSERT( DVA_IS_EMPTY(&bp->blk_dva[0]) && DVA_IS_EMPTY(&bp->blk_dva[1]) && DVA_IS_EMPTY(&bp->blk_dva[2])); ASSERT0(bp->blk_fill); ASSERT0(bp->blk_pad[0]); ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); ASSERT0(bp->blk_phys_birth); } } } } DB_DNODE_EXIT(db); } #endif static void dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; } static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; } /* * Loan out an arc_buf for read. Return the loaned arc_buf. */ arc_buf_t * dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, B_FALSE, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); } /* * Calculate which level n block references the data at the level 0 offset * provided. */ uint64_t dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) { if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { /* * The level n blkid is equal to the level 0 blkid divided by * the number of level 0s in a level n block. * * The level 0 blkid is offset >> datablkshift = * offset / 2^datablkshift. * * The number of level 0s in a level n is the number of block * pointers in an indirect block, raised to the power of level. * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). * * Thus, the level n blkid is: offset / * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) * = offset / 2^(datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) * = offset >> (datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) */ return (offset >> (dn->dn_datablkshift + level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); } } static void dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); if (buf == NULL) { /* i/o error */ ASSERT(zio == NULL || zio->io_error != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); db->db_state = DB_UNCACHED; } else if (db->db_level == 0 && db->db_freed_in_flight) { /* freed in flight */ ASSERT(zio == NULL || zio->io_error == 0); if (buf == NULL) { buf = arc_alloc_buf(db->db_objset->os_spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); } arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { /* success */ ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); db->db_state = DB_CACHED; } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL, B_FALSE); } static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { dnode_t *dn; zbookmark_phys_t zb; arc_flags_t aflags = ARC_FLAG_NOWAIT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. */ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DMU_BONUS_BLKID) { /* * The bonus length stored in the dnode may be less than * the maximum available space in the bonus buffer. */ int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(max_bonuslen); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) bzero(db->db.db_data, max_bonuslen); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } /* * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type, db->db.db_size)); bzero(db->db.db_data, db->db.db_size); if (db->db_blkptr != NULL && db->db_level > 0 && BP_IS_HOLE(db->db_blkptr) && db->db_blkptr->blk_birth != 0) { blkptr_t *bps = db->db.db_data; for (int i = 0; i < ((1 << DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); i++) { blkptr_t *bp = &bps[i]; ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 1 << dn->dn_indblkshift); BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ? dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr)); BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1); BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); } } DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } DB_DNODE_EXIT(db); db->db_state = DB_READ; mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); } /* * This is our just-in-time copy function. It makes a copy of buffers that * have been modified in a previous transaction group before we access them in * the current active group. * * This function is used in three places: when we are dirtying a buffer for the * first time in a txg, when we are freeing a range in a dnode that includes * this buffer, and when we are accessing a buffer which was received compressed * and later referenced in a WRITE_BYREF record. * * Note that when we are called from dbuf_free_range() we do not put a hold on * the buffer, we just traverse the active dbuf list for the dnode. */ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); ASSERT(db->db_level == 0); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || (dr->dt.dl.dr_data != ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. */ ASSERT(dr->dr_txg >= txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dnode_t *dn = DB_DNODE(db); int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); dr->dt.dl.dr_data = zio_buf_alloc(bonuslen); arc_space_consume(bonuslen, ARC_SPACE_BONUS); bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = arc_buf_size(db->db_buf); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; enum zio_compress compress_type = arc_get_compression(db->db_buf); if (compress_type == ZIO_COMPRESS_OFF) { dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); } else { ASSERT3U(type, ==, ARC_BUFC_DATA); dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, size, arc_buf_lsize(db->db_buf), compress_type); } bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { db->db_buf = NULL; dbuf_clear_data(db); } } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; boolean_t prefetch; dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it * can't be freed while we have a hold on the buffer. */ ASSERT(!refcount_is_zero(&db->db_holds)); if (db->db_state == DB_NOFILL) return (SET_ERROR(EIO)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { /* * If the arc buf is compressed, we need to decompress it to * read the data. This could happen during the "zfs receive" of * a stream which is compressed and deduplicated. */ if (db->db_buf != NULL && arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) { dbuf_fix_old_data(db, spa_syncing_txg(dmu_objset_spa(db->db_objset))); err = arc_decompress(db->db_buf); dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_hits); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; boolean_t need_wait = B_FALSE; if (zio == NULL && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } dbuf_read_impl(db, zio, flags); /* dbuf_read_impl has dropped db_mtx for us */ if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); if (need_wait) err = zio_wait(zio); } else { /* * Another reader came in while the dbuf was in flight * between UNCACHED and CACHED. Either a writer will finish * writing the buffer (sending the dbuf to CACHED) or the * first reader's request will reach the read_done callback * and send the dbuf to CACHED. Otherwise, a failure * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); /* Skip the wait per the caller's request. */ mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, zio_t *, zio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); } mutex_exit(&db->db_mtx); } return (err); } static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } mutex_exit(&db->db_mtx); } void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); /* * This assert is valid because dmu_sync() expects to be called by * a zilog's get_data while holding a range lock. This call only * comes from dbuf_dirty() callers who must also hold a range lock. */ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; ASSERT(db->db_data_pending != dr); /* free this block */ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do * another (redundant) arc_release(). Therefore, leave * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ arc_release(dr->dt.dl.dr_data, db); } /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find * empty blocks. */ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { dmu_buf_impl_t db_search; dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; if (end_blkid > dn->dn_maxblkid && !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) end_blkid = dn->dn_maxblkid; dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); db_search.db_level = 0; db_search.db_blkid = start_blkid; db_search.db_state = DB_SEARCH; mutex_enter(&dn->dn_dbufs_mtx); db = avl_find(&dn->dn_dbufs, &db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = db_next) { db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level != 0 || db->db_blkid > end_blkid) { break; } ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); if (dbuf_undirty(db, tx)) { /* mutex has been dropped and dbuf destroyed */ continue; } if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); continue; } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_destroy(db); continue; } /* The dbuf is referenced */ if (db->db_last_dirty != NULL) { dbuf_dirty_record_t *dr = db->db_last_dirty; if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file * size to reflect that this buffer may * contain new data when we sync. */ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { /* * This dbuf is not dirty in the open context. * Either uncache it (if its not referenced in * the open context) or reset its contents to * empty. */ dbuf_fix_old_data(db, txg); } } /* clear the contents if its cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); bzero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); } mutex_exit(&db->db_mtx); } mutex_exit(&dn->dn_dbufs_mtx); } void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *obuf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* XXX does *this* func really need the lock? */ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held * is OK, because there can be no other references to the db * when we are changing its size, so no concurrent DB_FILL can * be happening. */ /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); /* copy old block data to the new block */ obuf = db->db_buf; bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) bzero((uint8_t *)buf->b_data + osize, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); arc_buf_destroy(obuf, db); db->db.db_size = size; if (db->db_level == 0) { ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); db->db_last_dirty->dt.dl.dr_data = buf; } mutex_exit(&db->db_mtx); dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { objset_t *os = db->db_objset; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); (void) arc_release(db->db_buf, db); } /* * We already have a dirty record for this TXG, and we are being * dirtied again. */ static void dbuf_redirty(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && db->db_state != DB_NOFILL) { /* Already released on initial dirty, so just thaw. */ ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } } } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ #ifdef DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) { rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); } ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* * XXX make this true for indirects too? The problem is that * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); /* * Don't set dirtyctx to SYNC if we're just modifying this as we * initialize the objset. */ if (dn->dn_dirtyctx == DN_UNDIRTIED) { if (dn->dn_objset->os_dsl_dataset != NULL) { rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); } if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); ASSERT(dn->dn_dirtyctx_firstset == NULL); dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); } if (dn->dn_objset->os_dsl_dataset != NULL) { rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); } } mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. */ drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); dbuf_redirty(dr); mutex_exit(&db->db_mtx); return (dr); } /* * Only valid if not already dirty. */ ASSERT(dn->dn_object == 0 || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); /* * We should only be dirtying in syncing context if it's the * mos or we're initializing the os or it's a special object. * However, we are allowed to dirty in syncing context provided * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ os = dn->dn_objset; VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); #ifdef DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID) { dmu_objset_willuse_space(os, db->db.db_size, tx); } /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); list_link_init(&dr->dr_dirty_node); if (db->db_level == 0) { void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so * that we can modify it without impacting * possible other users of this cached data * block. Note that indirect blocks and * private objects are not released until the * syncing state (since they are only modified * then). */ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; dr->dr_next = *drp; *drp = dr; /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } /* * This buffer is now part of this txg */ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); db->db_dirtycnt += 1; ASSERT3U(db->db_dirtycnt, <=, 3); mutex_exit(&db->db_mtx); if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } /* * The dn_struct_rwlock prevents db_blkptr from changing * due to a write from syncing context completing * while we are running, so we want to acquire it before * looking at db_blkptr. */ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } /* * We need to hold the dn_struct_rwlock to make this assertion, * because it protects dn_phys / dn_next_nlevels from changing. */ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || dn->dn_next_nlevels[txgoff] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); /* * If we are overwriting a dedup BP, then unless it is snapshotted, * when we get to syncing context we will need to decrement its * refcount in the DDT. Prefetch the relevant DDT block so that * syncing context won't have to wait for the i/o. */ ddt_prefetch(os->os_spa, db->db_blkptr); if (db->db_level == 0) { dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); ASSERT(dn->dn_maxblkid >= db->db_blkid); } if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; parent = dbuf_hold_level(dn, db->db_level+1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level+1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); /* * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. */ if (db->db_last_dirty == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); } else { ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); /* * Due to our use of dn_nlevels below, this can only be called * in open context, unless we are operating on the MOS. * From syncing context, dn_nlevels may be different from the * dn_nlevels used when dbuf was dirtied. */ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); /* * If this buffer is not dirty, we're done. */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) return (B_FALSE); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), dr->dr_accounted, txg); *drp = dr->dr_next; /* * Note that there are three places in dbuf_dirty() * where this dirty record may be put on a list. * Make sure to do a list_remove corresponding to * every one of those list_insert calls. */ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_blkid == DMU_SPILL_BLKID || db->db_level + 1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } DB_DNODE_EXIT(db); if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } return (B_FALSE); } void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); /* * Quick check for dirtyness. For already dirty blocks, this * reduces runtime of this function by >90%, and overall performance * by 50% for some workloads (e.g. file deletion with indirect blocks * cached). */ mutex_enter(&db->db_mtx); dbuf_dirty_record_t *dr; for (dr = db->db_last_dirty; dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { /* * It's possible that it is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { /* This dbuf is already dirty and cached. */ dbuf_redirty(dr); mutex_exit(&db->db_mtx); return; } } mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); (void) dbuf_dirty(db, tx); } void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_state = DB_NOFILL; dmu_buf_will_fill(db_fake, tx); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); dbuf_noread(db); (void) dbuf_dirty(db, tx); } #pragma weak dmu_buf_fill_done = dbuf_fill_done /* ARGSUSED */ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) { mutex_enter(&db->db_mtx); DBUF_VERIFY(db); if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; } db->db_state = DB_CACHED; cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); } void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; if (etype == BP_EMBEDDED_TYPE_DATA) { ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), SPA_FEATURE_EMBEDDED_DATA)); } DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); dmu_buf_will_not_fill(dbuf, tx); ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); dl = &db->db_last_dirty->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); BP_SET_TYPE(&dl->dr_overridden_by, type); BP_SET_LEVEL(&dl->dr_overridden_by, 0); BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; } /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. */ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); ASSERT(buf != NULL); ASSERT(arc_buf_lsize(buf) == db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); ASSERT(arc_released(buf)); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); if (db->db_state == DB_CACHED && refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); arc_buf_destroy(buf, db); xuio_stat_wbuf_copied(); return; } xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); if (!arc_released(db->db_buf)) { ASSERT(dr->dt.dl.dr_override_state == DR_OVERRIDDEN); arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); db->db_state = DB_FILL; mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); dmu_buf_fill_done(&db->db, tx); } void dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; } if (db->db_blkid == DMU_BONUS_BLKID) { int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); if (db->db.db_data != NULL) { zio_buf_free(db->db.db_data, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); db->db_state = DB_UNCACHED; } } dbuf_clear_data(db); if (multilist_link_active(&db->db_cache_link)) { ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(dbuf_caches[db->db_caching_status].cache, db); (void) refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); } db->db_caching_status = DB_NO_CACHE; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; /* * Now that db_state is DB_EVICTING, nobody else can find this via * the hash table. We can now drop db_mtx, which allows us to * acquire the dn_dbufs_mtx. */ mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID) { boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); if (needlock) mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); if (needlock) mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), * so the dnode cannot be moved until after we release the hold. * The membar_producer() ensures visibility of the decremented * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. */ mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, db, B_TRUE); db->db_dnode_handle = NULL; dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } ASSERT(refcount_is_zero(&db->db_holds)); db->db_parent = NULL; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); kmem_cache_free(dbuf_kmem_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) { mutex_enter(&parent->db_mtx); dbuf_rele_and_unlock(parent, db, B_TRUE); } } /* * Note: While bpp will always be updated if the function returns success, * parentp will not be updated if the dnode does not have dn_dbuf filled in; * this happens when the dnode is the meta-dnode, or a userused or groupused * object. */ __attribute__((always_inline)) static inline int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh) { *parentp = NULL; *bpp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); if (blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) *bpp = DN_SPILL_BLKPTR(dn->dn_phys); else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; mutex_exit(&dn->dn_mtx); return (0); } int nlevels = (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(level * epbs, <, 64); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); /* * This assertion shouldn't trip as long as the max indirect block size * is less than 1M. The reason for this is that up to that point, * the number of levels required to address an entire object with blocks * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 * (i.e. we can address the entire object), objects will all use at most * N-1 levels and the assertion won't overflow. However, once epbs is * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be * enough to address an entire object, so objects will have 5 levels, * but then this assertion will overflow. * * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we * need to redo this logic to handle overflows. */ ASSERT(level >= nlevels || ((nlevels - level - 1) * epbs) + highbit64(dn->dn_phys->dn_nblkptr) <= 64); if (level >= nlevels || blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << ((nlevels - level - 1) * epbs)) || (fail_sparse && blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err; if (dh == NULL) { err = dbuf_hold_impl(dn, level+1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp); } else { __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp, dh->dh_depth + 1); err = __dbuf_hold_impl(dh + 1); } if (err) return (err); err = dbuf_read(*parentp, NULL, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) ASSERT(BP_IS_HOLE(*bpp)); return (0); } else { /* the block is referenced from the dnode */ ASSERT3U(level, ==, nlevels-1); ASSERT(dn->dn_phys->dn_nblkptr == 0 || blkid < dn->dn_phys->dn_nblkptr); if (dn->dn_dbuf) { dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; } *bpp = &dn->dn_phys->dn_blkptr[blkid]; return (0); } } static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; db->db_user = NULL; db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; db->db_caching_status = DB_NO_CACHE; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone * trying to look up this dbuf before its added to the * dn_dbufs list. */ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); DBUF_STAT_BUMP(hash_insert_race); return (odb); } avl_add(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; db->db_caching_status = DB_NO_CACHE; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || refcount_count(&dn->dn_holds) > 0); (void) refcount_add(&dn->dn_holds, db); atomic_inc_32(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); return (db); } typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ } dbuf_prefetch_arg_t; /* * Actually issue the prefetch read for the block given. */ static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return; arc_flags_t aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &dpa->dpa_zb); } /* * Called when an indirect block above our prefetch target is read in. This * will either read in the next indirect block down the tree or issue the actual * prefetch if the next block down is our target. */ static void dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { dbuf_prefetch_arg_t *dpa = private; ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); kmem_free(dpa, sizeof (*dpa)); return; } ASSERT(zio == NULL || zio->io_error == 0); /* * The dpa_dnode is only valid if we are called with a NULL * zio. This indicates that the arc_read() returned without * first calling zio_read() to issue a physical read. Once * a physical read is made the dpa_dnode must be invalidated * as the locks guarding it may have been dropped. If the * dpa_dnode is still valid, then we want to add it to the dbuf * cache. To do so, we must hold the dbuf associated with the block * we just prefetched, read its contents so that we associate it * with an arc_buf_t, and then release it. */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); if (zio->io_flags & ZIO_FLAG_RAW) { ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); } else { ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); dpa->dpa_dnode = NULL; } else if (dpa->dpa_dnode != NULL) { uint64_t curblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, dpa->dpa_curlevel, curblkid, FTAG); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); dbuf_rele(db, FTAG); } if (abuf == NULL) { kmem_free(dpa, sizeof(*dpa)); return; } dpa->dpa_curlevel--; uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); if (BP_IS_HOLE(bp)) { kmem_free(dpa, sizeof (*dpa)); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) iter_aflags |= ARC_FLAG_L2CACHE; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } arc_buf_destroy(abuf, private); } /* * Issue prefetch reads for the given block on the given level. If the indirect * blocks above that block are not in memory, we will read them in * asynchronously. As a result, this call never blocks waiting for a read to * complete. */ void dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags) { blkptr_t bp; int epbs, nlevels, curlevel; uint64_t curblkid; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (blkid > dn->dn_maxblkid) return; if (dnode_block_freed(dn, blkid)) return; /* * This dnode hasn't been written to disk yet, so there's nothing to * prefetch. */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) return; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) return; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db != NULL) { mutex_exit(&db->db_mtx); /* * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. */ return; } /* * Find the closest ancestor (indirect block) of the target block * that is present in the cache. In this indirect block, we will * find the bp that is at curlevel, curblkid. */ curlevel = level; curblkid = blkid; while (curlevel < nlevels - 1) { int parent_level = curlevel + 1; uint64_t parent_blkid = curblkid >> epbs; dmu_buf_impl_t *db; if (dbuf_hold_impl(dn, parent_level, parent_blkid, FALSE, TRUE, FTAG, &db) == 0) { blkptr_t *bpp = db->db_buf->b_data; bp = bpp[P2PHASE(curblkid, 1 << epbs)]; dbuf_rele(db, FTAG); break; } curlevel = parent_level; curblkid = parent_blkid; } if (curlevel == nlevels - 1) { /* No cached indirect blocks found. */ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); bp = dn->dn_phys->dn_blkptr[curblkid]; } if (BP_IS_HOLE(&bp)) return; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, level, blkid); dpa->dpa_curlevel = curlevel; dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* * If we have the indirect just above us, no need to do the asynchronous * prefetch chain; we'll just run the last step ourselves. If we're at * a higher level, though, we want to issue the prefetches for all the * indirect blocks asynchronously, so we can go on with whatever we were * doing. */ if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) iter_aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, curlevel, curblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, &bp, dbuf_prefetch_indirect_done, dpa, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } /* * We use pio here instead of dpa_zio since it's possible that * dpa may have already been freed. */ zio_nowait(pio); } #define DBUF_HOLD_IMPL_MAX_DEPTH 20 /* * Helper function for __dbuf_hold_impl() to copy a buffer. Handles * the case of encrypted, compressed and uncompressed buffers by * allocating the new buffer, respectively, with arc_alloc_raw_buf(), * arc_alloc_compressed_buf() or arc_alloc_buf().* * * NOTE: Declared noinline to avoid stack bloat in __dbuf_hold_impl(). */ noinline static void dbuf_hold_copy(struct dbuf_hold_impl_data *dh) { dnode_t *dn = dh->dh_dn; dmu_buf_impl_t *db = dh->dh_db; dbuf_dirty_record_t *dr = dh->dh_dr; arc_buf_t *data = dr->dt.dl.dr_data; enum zio_compress compress_type = arc_get_compression(data); if (compress_type != ZIO_COMPRESS_OFF) { dbuf_set_data(db, arc_alloc_compressed_buf( dn->dn_objset->os_spa, db, arc_buf_size(data), arc_buf_lsize(data), compress_type)); } else { dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); } /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) { ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH); dh->dh_parent = NULL; ASSERT(dh->dh_blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock)); ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); *(dh->dh_dbp) = NULL; /* dbuf_find() returns with db_mtx held */ dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, dh->dh_level, dh->dh_blkid); if (dh->dh_db == NULL) { dh->dh_bp = NULL; if (dh->dh_fail_uncached) return (SET_ERROR(ENOENT)); ASSERT3P(dh->dh_parent, ==, NULL); dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp, dh); if (dh->dh_fail_sparse) { if (dh->dh_err == 0 && dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) dh->dh_err = SET_ERROR(ENOENT); if (dh->dh_err) { if (dh->dh_parent) dbuf_rele(dh->dh_parent, NULL); return (dh->dh_err); } } if (dh->dh_err && dh->dh_err != ENOENT) return (dh->dh_err); dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid, dh->dh_parent, dh->dh_bp); } if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { mutex_exit(&dh->dh_db->db_mtx); return (SET_ERROR(ENOENT)); } if (dh->dh_db->db_buf != NULL) { arc_buf_access(dh->dh_db->db_buf); ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); } ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); /* * If this buffer is currently syncing out, and we are are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ if (dh->dh_db->db_level == 0 && dh->dh_db->db_blkid != DMU_BONUS_BLKID && dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT && dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) { dh->dh_dr = dh->dh_db->db_data_pending; if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) dbuf_hold_copy(dh); } if (multilist_link_active(&dh->dh_db->db_cache_link)) { ASSERT(refcount_is_zero(&dh->dh_db->db_holds)); ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE || dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove( dbuf_caches[dh->dh_db->db_caching_status].cache, dh->dh_db); (void) refcount_remove_many( &dbuf_caches[dh->dh_db->db_caching_status].size, dh->dh_db->db.db_size, dh->dh_db); if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level], dh->dh_db->db.db_size); } dh->dh_db->db_caching_status = DB_NO_CACHE; } (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag); DBUF_VERIFY(dh->dh_db); mutex_exit(&dh->dh_db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (dh->dh_parent) dbuf_rele(dh->dh_parent, NULL); ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn); ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid); ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level); *(dh->dh_dbp) = dh->dh_db; return (0); } /* * The following code preserves the recursive function dbuf_hold_impl() * but moves the local variables AND function arguments to the heap to * minimize the stack frame size. Enough space is initially allocated * on the stack for 20 levels of recursion. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp) { struct dbuf_hold_impl_data *dh; int error; dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) * DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, fail_uncached, tag, dbp, 0); error = __dbuf_hold_impl(dh); kmem_free(dh, sizeof (struct dbuf_hold_impl_data) * DBUF_HOLD_IMPL_MAX_DEPTH); return (error); } static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp, int depth) { dh->dh_dn = dn; dh->dh_level = level; dh->dh_blkid = blkid; dh->dh_fail_sparse = fail_sparse; dh->dh_fail_uncached = fail_uncached; dh->dh_tag = tag; dh->dh_dbp = dbp; dh->dh_db = NULL; dh->dh_parent = NULL; dh->dh_bp = NULL; dh->dh_err = 0; dh->dh_dr = NULL; dh->dh_depth = depth; } dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); return (err ? NULL : db); } void dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); } int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); DB_DNODE_ENTER(db); dn = DB_DNODE(db); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); return (0); } void dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) { dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = refcount_add(&db->db_holds, tag); ASSERT3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref boolean_t dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *found_db; boolean_t result = B_FALSE; if (db->db_blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else found_db = dbuf_find(os, obj, 0, blkid); if (found_db != NULL) { if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { (void) refcount_add(&db->db_holds, tag); result = B_TRUE; } mutex_exit(&db->db_mtx); } return (result); } /* * If you call dbuf_rele() you had better not be referencing the dnode handle * unless you have some other direct or indirect hold on the dnode. (An indirect * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag, B_FALSE); } void dmu_buf_rele(dmu_buf_t *db, void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. The 'evicting' * argument should be set if we are already in the dbuf-evicting code * path, in which case we don't want to recursively evict. This allows us to * avoid deeply nested stacks that would have a call flow similar to this: * * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() * ^ | * | | * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ * */ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) { int64_t holds; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus * buffer has a corresponding dnode hold. */ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); /* * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ if (db->db_buf != NULL && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn; boolean_t evict_dbuf = db->db_pending_evict; /* * If the dnode moves here, we cannot cross this * barrier until the move completes. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); atomic_dec_32(&dn->dn_dbufs_count); /* * Decrementing the dbuf count means that the bonus * buffer's dnode hold is no longer discounted in * dnode_move(). The dnode cannot move until after * the dnode_rele() below. */ DB_DNODE_EXIT(db); /* * Do not reference db after its lock is dropped. * Another thread may evict it. */ mutex_exit(&db->db_mtx); if (evict_dbuf) dnode_evict_bonus(dn); dnode_rele(dn, db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_destroy(db); } else if (arc_released(db->db_buf)) { /* * This dbuf has anonymous data associated with it. */ dbuf_destroy(db); } else { boolean_t do_arc_evict = B_FALSE; blkptr_t bp; spa_t *spa = dmu_objset_spa(db->db_objset); if (!DBUF_IS_CACHEABLE(db) && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr) && !BP_IS_EMBEDDED(db->db_blkptr)) { do_arc_evict = B_TRUE; bp = *db->db_blkptr; } if (!DBUF_IS_CACHEABLE(db) || db->db_pending_evict) { dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); dbuf_cached_state_t dcs = dbuf_include_in_metadata_cache(db) ? DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; db->db_caching_status = dcs; multilist_insert(dbuf_caches[dcs].cache, db); (void) refcount_add_many(&dbuf_caches[dcs].size, db->db.db_size, db); if (dcs == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMP(metadata_cache_count); DBUF_STAT_MAX( metadata_cache_size_bytes_max, refcount_count( &dbuf_caches[dcs].size)); } else { DBUF_STAT_BUMP( cache_levels[db->db_level]); DBUF_STAT_BUMP(cache_count); DBUF_STAT_INCR( cache_levels_bytes[db->db_level], db->db.db_size); DBUF_STAT_MAX(cache_size_bytes_max, refcount_count( &dbuf_caches[dcs].size)); } mutex_exit(&db->db_mtx); if (db->db_caching_status == DB_DBUF_CACHE && !evicting) { dbuf_evict_notify(); } } if (do_arc_evict) arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } } #pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { return (refcount_count(&db->db_holds)); } void * dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); dbuf_verify_user(db, DBVU_NOT_EVICTING); if (db->db_user == old_user) db->db_user = new_user; else old_user = db->db_user; dbuf_verify_user(db, DBVU_NOT_EVICTING); mutex_exit(&db->db_mtx); return (old_user); } void * dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, NULL, user)); } void * dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_user_immediate_evict = TRUE; return (dmu_buf_set_user(db_fake, user)); } void * dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_verify_user(db, DBVU_NOT_EVICTING); return (db->db_user); } void dmu_buf_user_evict_wait() { taskq_wait(dbu_evict_taskq); } blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_blkptr); } objset_t * dmu_buf_get_objset(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_objset); } dnode_t * dmu_buf_dnode_enter(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_ENTER(dbi); return (DB_DNODE(dbi)); } void dmu_buf_dnode_exit(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_EXIT(dbi); } static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { /* ASSERT(dmu_tx_is_syncing(tx) */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_blkptr != NULL) return; if (db->db_blkid == DMU_SPILL_BLKID) { db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); BP_ZERO(db->db_blkptr); return; } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was * no available blkptrs from the dnode, or it was * inappropriate to hook it in (i.e., nlevels mis-match). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); } else { dmu_buf_impl_t *parent = db->db_parent; int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT(dn->dn_phys->dn_nlevels > 1); if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; } db->db_blkptr = (blkptr_t *)parent->db.db_data + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } } /* * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it * is critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. */ noinline static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); ASSERT(db->db_level > 0); DBUF_VERIFY(db); /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); DB_DNODE_EXIT(db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); } /* * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is * critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. */ noinline static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { /* * In the previous transaction group, the bonus buffer * was entirely used to store the attributes for the * dnode which overrode the dn_spill field. However, * when adding more attributes to the file a spill * block was required to hold the extra attributes. * * Make sure to clear the garbage left in the dn_spill * field from the previous attributes in the bonus * buffer. Otherwise, after writing out the spill * block to the new allocated dva, it will free * the old block pointed to by the invalid dn_spill. */ db->db_blkptr = NULL; } dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_dirty_record_t **drp; ASSERT(*datap != NULL); ASSERT0(db->db_level); ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); bcopy(*datap, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys)); DB_DNODE_EXIT(db); if (*datap != db->db.db_data) { int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); zio_buf_free(*datap, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; ASSERT(dr->dr_next == NULL); ASSERT(dr->dr_dbuf == db); *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); return; } os = dn->dn_objset; /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we * don't check the dr_override_state until we have returned from * dbuf_check_blkptr. */ dbuf_check_blkptr(dn, db); /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), * then make a copy before we start the write so that * any modifications from the open txg will not leak * into this write. * * NOTE: this copy does not need to be made for * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). */ int psize = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); enum zio_compress compress_type = arc_get_compression(*datap); if (compress_type == ZIO_COMPRESS_OFF) { *datap = arc_alloc_buf(os->os_spa, db, type, psize); } else { ASSERT3U(type, ==, ARC_BUFC_DATA); int lsize = arc_buf_lsize(*datap); *datap = arc_alloc_compressed_buf(os->os_spa, db, psize, lsize, compress_type); } bcopy(db->db.db_data, (*datap)->b_data, psize); } db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); DB_DNODE_EXIT(db); } else { /* * Although zio_nowait() does not "wait for an IO", it does * initiate the IO. If this is an empty write it seems plausible * that the IO could actually be completed before the nowait * returns. We need to DB_DNODE_EXIT() first in case * zio_nowait() invalidates the dbuf. */ DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; while (dr = list_head(list)) { if (dr->dr_zio != NULL) { /* * If we find an already initialized zio then we * are processing the meta-dnode, and we have finished. * The dbufs for all dnodes are put back on the list * during processing, so that we can zio_wait() * these IOs after initiating all child IOs. */ ASSERT3U(dr->dr_dbuf->db.db_object, ==, DMU_META_DNODE_OBJECT); break; } if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { VERIFY3U(dr->dr_dbuf->db_level, ==, level); } list_remove(list, dr); if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); } } /* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; ASSERT3P(db->db_blkptr, !=, NULL); ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (bp->blk_birth != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_bonustype) || BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(bp)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); } #endif if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && db->db_blkid != DMU_SPILL_BLKID) dn->dn_phys->dn_maxblkid = db->db_blkid; mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { i = 0; while (i < db->db.db_size) { - dnode_phys_t *dnp = - (void *)(((char *)db->db.db_data) + i); + dnode_phys_t *dnp = db->db.db_data + i; i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { fill++; i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } } } else { if (BP_IS_HOLE(bp)) { fill = 0; } else { fill = 1; } } } else { blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) bp->blk_fill = fill; mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); *db->db_blkptr = *bp; rw_exit(&dn->dn_struct_rwlock); } /* ARGSUSED */ /* * This function gets called just prior to running through the compression * stage of the zio pipeline. If we're an indirect block comprised of only * holes, then we want this indirect to be compressed away to a hole. In * order to do that we must zero out any information about the holes that * this indirect points to prior to before we try to compress it. */ static void dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp; unsigned int epbs, i; ASSERT3U(db->db_level, >, 0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(epbs, <, 31); /* Determine if all our children are holes */ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } /* * If all the children are holes, then zero them all out so that * we may get compressed away. */ if (i == 1 << epbs) { /* * We only found holes. Grab the rwlock to prevent * anybody from reading the blocks we're about to * zero out. */ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); } /* * The SPA will call this callback several times for each zio - once * for every physical child i/o (zio->io_phys_children times). This * allows the DMU to monitor the progress of each logical i/o. For example, * there may be 2 copies of an indirect block, or many fragments of a RAID-Z * block. There may be a long delay before all copies/fragments are completed, * so this callback allows us to retire dirty space gradually, as the physical * i/os complete. */ /* ARGSUSED */ static void dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) { dmu_buf_impl_t *db = arg; objset_t *os = db->db_objset; dsl_pool_t *dp = dmu_objset_pool(os); dbuf_dirty_record_t *dr; int delta = 0; dr = db->db_data_pending; ASSERT3U(dr->dr_txg, ==, zio->io_txg); /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding * error will be cleaned up by dsl_pool_sync()'s call to * dsl_pool_undirty_space(). */ delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); } /* ARGSUSED */ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; dbuf_dirty_record_t **drp, *dr; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting. */ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } mutex_enter(&db->db_mtx); DBUF_VERIFY(db); drp = &db->db_last_dirty; while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); DB_DNODE_EXIT(db); } #endif if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(db->db_blkid, <=, dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); } static void dbuf_write_nofill_ready(zio_t *zio) { dbuf_write_ready(zio, NULL, zio->io_private); } static void dbuf_write_nofill_done(zio_t *zio) { dbuf_write_done(zio, NULL, zio->io_private); } static void dbuf_write_override_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; dbuf_write_ready(zio, NULL, db); } static void dbuf_write_override_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *obp = &dr->dt.dl.dr_overridden_by; mutex_enter(&db->db_mtx); if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); if (zio->io_abd != NULL) abd_put(zio->io_abd); } typedef struct dbuf_remap_impl_callback_arg { objset_t *drica_os; uint64_t drica_blk_birth; dmu_tx_t *drica_tx; } dbuf_remap_impl_callback_arg_t; static void dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, void *arg) { dbuf_remap_impl_callback_arg_t *drica = arg; objset_t *os = drica->drica_os; spa_t *spa = dmu_objset_spa(os); dmu_tx_t *tx = drica->drica_tx; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (os == spa_meta_objset(spa)) { spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); } else { dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, size, drica->drica_blk_birth, tx); } } static void dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) { blkptr_t bp_copy = *bp; spa_t *spa = dmu_objset_spa(dn->dn_objset); dbuf_remap_impl_callback_arg_t drica; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; drica.drica_blk_birth = bp->blk_birth; drica.drica_tx = tx; if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* * The struct_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. To * avoid lock contention, only grab it when we are actually * changing the BP. */ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); *bp = bp_copy; rw_exit(&dn->dn_struct_rwlock); } } /* * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting * to remap a copy of every bp in the dbuf. */ boolean_t dbuf_can_remap(const dmu_buf_impl_t *db) { spa_t *spa = dmu_objset_spa(db->db_objset); blkptr_t *bp = db->db.db_data; boolean_t ret = B_FALSE; ASSERT3U(db->db_level, >, 0); ASSERT3S(db->db_state, ==, DB_CACHED); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { blkptr_t bp_copy = bp[i]; if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { ret = B_TRUE; break; } } spa_config_exit(spa, SCL_VDEV, FTAG); return (ret); } boolean_t dnode_needs_remap(const dnode_t *dn) { spa_t *spa = dmu_objset_spa(dn->dn_objset); boolean_t ret = B_FALSE; if (dn->dn_phys->dn_nlevels == 0) { return (B_FALSE); } ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) { blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j]; if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { ret = B_TRUE; break; } } spa_config_exit(spa, SCL_VDEV, FTAG); return (ret); } /* * Remap any existing BP's to concrete vdevs, if possible. */ static void dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(db->db_objset); ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) return; if (db->db_level > 0) { blkptr_t *bp = db->db.db_data; for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { dbuf_remap_impl(dn, &bp[i], tx); } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, DMU_OT_DNODE); for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i += dnp[i].dn_extra_slots + 1) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); } } } } /* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; int wp_flag = 0; ASSERT(dmu_tx_is_syncing(tx)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* * Private object buffers are released here rather * than in dbuf_dirty() since they are only modified * in the syncing context and we don't want the * overhead of making multiple copies of the data. */ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { dbuf_release_bp(db); } dbuf_remap(dn, db, tx); } } if (parent != dn->dn_dbuf) { /* Our parent is an indirect block. */ /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); /* * We're about to modify our parent's db_data by modifying * our block pointer, so the parent must be released. */ ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { /* Our parent is the dnode itself. */ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); zio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ASSERT(zio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); DB_DNODE_EXIT(db); /* * We copy the blkptr now (rather than when we instantiate the dirty * record), because its value can change between open context and * syncing context. We do not need to hold dn_struct_rwlock to read * db_blkptr because we are in syncing context. */ dr->dr_bp_copy = *db->db_blkptr; if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ abd_t *contents = (data != NULL) ? abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, dbuf_write_override_ready, NULL, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); /* * For indirect blocks, we want to setup the children * ready callback so that we can properly handle an indirect * block that only contains holes. */ arc_write_done_func_t *children_ready_cb = NULL; if (db->db_level != 0) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(zio, os->os_spa, txg, &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c (revision 351077) @@ -1,440 +1,349 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. */ #include #include #include #include #include #include #include -/* - * Each of the concurrent object allocators will grab - * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to - * grab 128 slots, which is 4 blocks worth. This was experimentally - * determined to be the lowest value that eliminates the measurable effect - * of lock contention from this code path. - */ -int dmu_object_alloc_chunk_shift = 7; static uint64_t dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { uint64_t object; uint64_t L1_dnode_count = DNODES_PER_BLOCK << (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; int dn_slots = dnodesize >> DNODE_SHIFT; boolean_t restarted = B_FALSE; - uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID % - os->os_obj_next_percpu_len]; - int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; - int error; if (dn_slots == 0) { dn_slots = DNODE_MIN_SLOTS; } else { ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); } - - /* - * The "chunk" of dnodes that is assigned to a CPU-specific - * allocator needs to be at least one block's worth, to avoid - * lock contention on the dbuf. It can be at most one L1 block's - * worth, so that the "rescan after polishing off a L1's worth" - * logic below will be sure to kick in. - */ - if (dnodes_per_chunk < DNODES_PER_BLOCK) - dnodes_per_chunk = DNODES_PER_BLOCK; - if (dnodes_per_chunk > L1_dnode_count) - dnodes_per_chunk = L1_dnode_count; - - object = *cpuobj; - + + mutex_enter(&os->os_obj_lock); for (;;) { + object = os->os_obj_next; /* - * If we finished a chunk of dnodes, get a new one from - * the global allocator. + * Each time we polish off a L1 bp worth of dnodes (2^12 + * objects), move to another L1 bp that's still + * reasonably sparse (at most 1/4 full). Look from the + * beginning at most once per txg. If we still can't + * allocate from that L1 block, search for an empty L0 + * block, which will quickly skip to the end of the + * metadnode if the no nearby L0 blocks are empty. This + * fallback avoids a pathology where full dnode blocks + * containing large dnodes appear sparse because they + * have a low blk_fill, leading to many failed + * allocation attempts. In the long term a better + * mechanism to search for sparse metadnode regions, + * such as spacemaps, could be implemented. + * + * os_scan_dnodes is set during txg sync if enough objects + * have been freed since the previous rescan to justify + * backfilling again. + * + * Note that dmu_traverse depends on the behavior that we use + * multiple blocks of the dnode object before going back to + * reuse objects. Any change to this algorithm should preserve + * that property or find another solution to the issues + * described in traverse_visitbp. */ - if ((P2PHASE(object, dnodes_per_chunk) == 0) || - (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) < - dn_slots)) { - DNODE_STAT_BUMP(dnode_alloc_next_chunk); - mutex_enter(&os->os_obj_lock); - ASSERT0(P2PHASE(os->os_obj_next_chunk, - dnodes_per_chunk)); - object = os->os_obj_next_chunk; - - /* - * Each time we polish off a L1 bp worth of dnodes - * (2^12 objects), move to another L1 bp that's - * still reasonably sparse (at most 1/4 full). Look - * from the beginning at most once per txg. If we - * still can't allocate from that L1 block, search - * for an empty L0 block, which will quickly skip - * to the end of the metadnode if the no nearby L0 - * blocks are empty. This fallback avoids a - * pathology where full dnode blocks containing - * large dnodes appear sparse because they have a - * low blk_fill, leading to many failed allocation - * attempts. In the long term a better mechanism to - * search for sparse metadnode regions, such as - * spacemaps, could be implemented. - * - * os_scan_dnodes is set during txg sync if enough - * objects have been freed since the previous - * rescan to justify backfilling again. - * - * Note that dmu_traverse depends on the behavior - * that we use multiple blocks of the dnode object - * before going back to reuse objects. Any change - * to this algorithm should preserve that property - * or find another solution to the issues described - * in traverse_visitbp. - */ - if (P2PHASE(object, L1_dnode_count) == 0) { - uint64_t offset; - uint64_t blkfill; - int minlvl; - if (os->os_rescan_dnodes) { - offset = 0; - os->os_rescan_dnodes = B_FALSE; - } else { - offset = object << DNODE_SHIFT; - } - blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; - minlvl = restarted ? 1 : 2; - restarted = B_TRUE; - error = dnode_next_offset(DMU_META_DNODE(os), - DNODE_FIND_HOLE, &offset, minlvl, - blkfill, 0); - if (error == 0) { - object = offset >> DNODE_SHIFT; - } + if (P2PHASE(object, L1_dnode_count) == 0) { + uint64_t offset; + uint64_t blkfill; + int minlvl; + int error; + if (os->os_rescan_dnodes) { + offset = 0; + os->os_rescan_dnodes = B_FALSE; + } else { + offset = object << DNODE_SHIFT; } - /* - * Note: if "restarted", we may find a L0 that - * is not suitably aligned. - */ - os->os_obj_next_chunk = - P2ALIGN(object, dnodes_per_chunk) + - dnodes_per_chunk; - (void) atomic_swap_64(cpuobj, object); - mutex_exit(&os->os_obj_lock); + blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; + minlvl = restarted ? 1 : 2; + restarted = B_TRUE; + error = dnode_next_offset(DMU_META_DNODE(os), + DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0); + if (error == 0) + object = offset >> DNODE_SHIFT; } + os->os_obj_next = object + dn_slots; /* - * The value of (*cpuobj) before adding dn_slots is the object - * ID assigned to us. The value afterwards is the object ID - * assigned to whoever wants to do an allocation next. - */ - object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots; - - /* * XXX We should check for an i/o error here and return * up to our caller. Actually we should pre-read it in * dmu_tx_assign(), but there is currently no mechanism * to do so. */ - error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, - dn_slots, FTAG, &dn); - if (error == 0) { - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, + FTAG, &dn); + if (dn) + break; + + if (dmu_object_next(os, &object, B_TRUE, 0) == 0) + os->os_obj_next = object; + else /* - * Another thread could have allocated it; check - * again now that we have the struct lock. + * Skip to next known valid starting point for a dnode. */ - if (dn->dn_type == DMU_OT_NONE) { - dnode_allocate(dn, ot, blocksize, 0, - bonustype, bonuslen, dn_slots, tx); - rw_exit(&dn->dn_struct_rwlock); - dmu_tx_add_new_object(tx, dn); - dnode_rele(dn, FTAG); - return (object); - } - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); - DNODE_STAT_BUMP(dnode_alloc_race); - } - - /* - * Skip to next known valid starting point on error. This - * is the start of the next block of dnodes. - */ - if (dmu_object_next(os, &object, B_TRUE, 0) != 0) { - object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); - DNODE_STAT_BUMP(dnode_alloc_next_block); - } - (void) atomic_swap_64(cpuobj, object); + os->os_obj_next = P2ROUNDUP(object + 1, + DNODES_PER_BLOCK); } + + dnode_allocate(dn, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, dn_slots, tx); + mutex_exit(&os->os_obj_lock); + + dmu_tx_add_new_object(tx, dn); + dnode_rele(dn, FTAG); + + return (object); } uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, - bonuslen, 0, tx)); + return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, + bonuslen, 0, tx); } uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, - bonustype, bonuslen, 0, tx)); + return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, 0, tx); } uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, bonuslen, dnodesize, tx)); } int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, bonuslen, 0, tx)); } int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { dnode_t *dn; int dn_slots = dnodesize >> DNODE_SHIFT; int err; if (dn_slots == 0) dn_slots = DNODE_MIN_SLOTS; ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); - + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (SET_ERROR(EBADF)); err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, FTAG, &dn); if (err) return (err); dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); dmu_tx_add_new_object(tx, dn); dnode_rele(dn, FTAG); return (0); } int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, bonuslen, 0, tx)); } int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { dnode_t *dn; int dn_slots = dnodesize >> DNODE_SHIFT; int err; - if (dn_slots == 0) - dn_slots = DNODE_MIN_SLOTS; - if (object == DMU_META_DNODE_OBJECT) return (SET_ERROR(EBADF)); err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, FTAG, &dn); if (err) return (err); dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int err; ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, FTAG, &dn); if (err) return (err); ASSERT(dn->dn_type != DMU_OT_NONE); /* * If we don't create this free range, we'll leak indirect blocks when * we get to freeing the dnode in syncing context. */ dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); return (0); } /* * Return (in *objectp) the next object which is allocated (or a hole) * after *object, taking into account only objects that may have been modified * after the specified txg. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) { uint64_t offset; - uint64_t start_obj; + dmu_object_info_t doi; struct dsl_dataset *ds = os->os_dsl_dataset; + int dnodesize; int error; - if (*objectp == 0) { - start_obj = 1; - } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { - uint64_t i = *objectp + 1; - uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1); - dmu_object_info_t doi; - - /* - * Scan through the remaining meta dnode block. The contents - * of each slot in the block are known so it can be quickly - * checked. If the block is exhausted without a match then - * hand off to dnode_next_offset() for further scanning. - */ - while (i <= last_obj) { - error = dmu_object_info(os, i, &doi); - if (error == ENOENT) { - if (hole) { - *objectp = i; - return (0); - } else { - i++; - } - } else if (error == EEXIST) { - i++; - } else if (error == 0) { - if (hole) { - i += doi.doi_dnodesize >> DNODE_SHIFT; - } else { - *objectp = i; - return (0); - } - } else { - return (error); - } - } - - start_obj = i; + /* + * Avoid expensive dnode hold if this dataset doesn't use large dnodes. + */ + if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { + error = dmu_object_info(os, *objectp, &doi); + if (error && !(error == EINVAL && *objectp == 0)) + return (SET_ERROR(error)); + else + dnodesize = doi.doi_dnodesize; } else { - start_obj = *objectp + 1; + dnodesize = DNODE_MIN_SIZE; } - offset = start_obj << DNODE_SHIFT; + if (*objectp == 0) + offset = 1 << DNODE_SHIFT; + else + offset = (*objectp << DNODE_SHIFT) + dnodesize; error = dnode_next_offset(DMU_META_DNODE(os), (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; return (error); } /* * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the * refcount on SPA_FEATURE_EXTENSIBLE_DATASET. * * Only for use from syncing context, on MOS objects. */ void dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type, dmu_tx_t *tx) { dnode_t *dn; ASSERT(dmu_tx_is_syncing(tx)); VERIFY0(dnode_hold(mos, object, FTAG, &dn)); if (dn->dn_type == DMU_OTN_ZAP_METADATA) { dnode_rele(dn, FTAG); return; } ASSERT3U(dn->dn_type, ==, old_type); ASSERT0(dn->dn_maxblkid); /* * We must initialize the ZAP data before changing the type, * so that concurrent calls to *_is_zapified() can determine if * the object has been completely zapified by checking the type. */ mzap_create_impl(mos, object, 0, 0, tx); dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type = DMU_OTN_ZAP_METADATA; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); spa_feature_incr(dmu_objset_spa(mos), SPA_FEATURE_EXTENSIBLE_DATASET, tx); } void dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; dmu_object_type_t t; ASSERT(dmu_tx_is_syncing(tx)); VERIFY0(dnode_hold(mos, object, FTAG, &dn)); t = dn->dn_type; dnode_rele(dn, FTAG); if (t == DMU_OTN_ZAP_METADATA) { spa_feature_decr(dmu_objset_spa(mos), SPA_FEATURE_EXTENSIBLE_DATASET, tx); } VERIFY0(dmu_object_free(mos, object, tx)); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 351077) @@ -1,2416 +1,2410 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2015, STRATO AG, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" /* * Needed to close a window in dnode_move() that allows the objset to be freed * before it can be safely accessed. */ krwlock_t os_lock; /* * Tunable to overwrite the maximum number of threads for the parallization * of dmu_objset_find_dp, needed to speed up the import of pools with many * datasets. * Default is 4 times the number of leaf vdevs. */ int dmu_find_threads = 0; /* * Backfill lower metadnode objects after this many have been freed. * Backfilling negatively impacts object creation rates, so only do it * if there are enough holes to fill. */ int dmu_rescan_dnode_threshold = 131072; static void dmu_objset_find_dp_cb(void *arg); void dmu_objset_init(void) { rw_init(&os_lock, NULL, RW_DEFAULT, NULL); } void dmu_objset_fini(void) { rw_destroy(&os_lock); } spa_t * dmu_objset_spa(objset_t *os) { return (os->os_spa); } zilog_t * dmu_objset_zil(objset_t *os) { return (os->os_zil); } dsl_pool_t * dmu_objset_pool(objset_t *os) { dsl_dataset_t *ds; if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) return (ds->ds_dir->dd_pool); else return (spa_get_dsl(os->os_spa)); } dsl_dataset_t * dmu_objset_ds(objset_t *os) { return (os->os_dsl_dataset); } dmu_objset_type_t dmu_objset_type(objset_t *os) { return (os->os_phys->os_type); } void dmu_objset_name(objset_t *os, char *buf) { dsl_dataset_name(os->os_dsl_dataset, buf); } uint64_t dmu_objset_id(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; return (ds ? ds->ds_object : 0); } uint64_t dmu_objset_dnodesize(objset_t *os) { return (os->os_dnodesize); } zfs_sync_type_t dmu_objset_syncprop(objset_t *os) { return (os->os_sync); } zfs_logbias_op_t dmu_objset_logbias(objset_t *os) { return (os->os_logbias); } static void checksum_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); } static void compression_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval != ZIO_COMPRESS_INHERIT); os->os_compress = zio_compress_select(os->os_spa, newval, ZIO_COMPRESS_ON); } static void copies_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval > 0); ASSERT(newval <= spa_max_replication(os->os_spa)); os->os_copies = newval; } static void dedup_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; spa_t *spa = os->os_spa; enum zio_checksum checksum; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); } static void primary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_primary_cache = newval; } static void secondary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_secondary_cache = newval; } static void sync_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || newval == ZFS_SYNC_DISABLED); os->os_sync = newval; if (os->os_zil) zil_set_sync(os->os_zil, newval); } static void redundant_metadata_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || newval == ZFS_REDUNDANT_METADATA_MOST); os->os_redundant_metadata = newval; } static void dnodesize_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; switch (newval) { case ZFS_DNSIZE_LEGACY: os->os_dnodesize = DNODE_MIN_SIZE; break; case ZFS_DNSIZE_AUTO: /* * Choose a dnode size that will work well for most * workloads if the user specified "auto". Future code * improvements could dynamically select a dnode size * based on observed workload patterns. */ os->os_dnodesize = DNODE_MIN_SIZE * 2; break; case ZFS_DNSIZE_1K: case ZFS_DNSIZE_2K: case ZFS_DNSIZE_4K: case ZFS_DNSIZE_8K: case ZFS_DNSIZE_16K: os->os_dnodesize = newval; break; } } static void logbias_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; ASSERT(newval == ZFS_LOGBIAS_LATENCY || newval == ZFS_LOGBIAS_THROUGHPUT); os->os_logbias = newval; if (os->os_zil) zil_set_logbias(os->os_zil, newval); } static void recordsize_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; os->os_recordsize = newval; } void dmu_objset_byteswap(void *buf, size_t size) { objset_phys_t *osp = buf; ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); dnode_byteswap(&osp->os_meta_dnode); byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); osp->os_type = BSWAP_64(osp->os_type); osp->os_flags = BSWAP_64(osp->os_flags); if (size == sizeof (objset_phys_t)) { dnode_byteswap(&osp->os_userused_dnode); dnode_byteswap(&osp->os_groupused_dnode); } } /* * The hash is a CRC-based hash of the objset_t pointer and the object number. */ static uint64_t dnode_hash(const objset_t *os, uint64_t obj) { uintptr_t osv = (uintptr_t)os; uint64_t crc = -1ULL; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); /* * The low 6 bits of the pointer don't have much entropy, because * the objset_t is larger than 2^6 bytes long. */ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; crc ^= (osv>>14) ^ (obj>>24); return (crc); } unsigned int dnode_multilist_index_func(multilist_t *ml, void *obj) { dnode_t *dn = obj; return (dnode_hash(dn->dn_objset, dn->dn_object) % multilist_get_num_sublists(ml)); } /* * Instantiates the objset_t in-memory structure corresponding to the * objset_phys_t that's pointed to by the specified blkptr_t. */ int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, objset_t **osp) { objset_t *os; int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); #if 0 /* * The $ORIGIN dataset (if it exists) doesn't have an associated * objset, so there's no reason to open it. The $ORIGIN dataset * will not exist on pools older than SPA_VERSION_ORIGIN. */ if (ds != NULL && spa_get_dsl(spa) != NULL && spa_get_dsl(spa)->dp_origin_snap != NULL) { ASSERT3P(ds->ds_dir, !=, spa_get_dsl(spa)->dp_origin_snap->ds_dir); } #endif os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; os->os_spa = spa; os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_FLAG_L2CACHE; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err != 0) { kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf, ARC_BUFC_METADATA, sizeof (objset_phys_t)); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } os->os_phys = os->os_phys_buf->b_data; os->os_flags = os->os_phys->os_flags; } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf, ARC_BUFC_METADATA, size); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); } /* * Note: the changed_cb will be called once before the register * func returns, thus changing the checksum/compression from the * default (fletcher2/off). Snapshots don't need to know about * checksum/compression/copies. */ if (ds != NULL) { boolean_t needlock = B_FALSE; /* * Note: it's valid to open the objset if the dataset is * long-held, in which case the pool_config lock will not * be held. */ if (!dsl_pool_config_held(dmu_objset_pool(os))) { needlock = B_TRUE; dsl_pool_config_enter(dmu_objset_pool(os), FTAG); } err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name( ZFS_PROP_REDUNDANT_METADATA), redundant_metadata_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), recordsize_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DNODESIZE), dnodesize_changed_cb, os); } } if (needlock) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); return (err); } } else { /* It's the meta-objset. */ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_ON; os->os_copies = spa_max_replication(spa); os->os_dedup_checksum = ZIO_CHECKSUM_OFF; os->os_dedup_verify = B_FALSE; os->os_logbias = ZFS_LOGBIAS_LATENCY; os->os_sync = ZFS_SYNC_STANDARD; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; os->os_dnodesize = DNODE_MIN_SIZE; } /* * These properties will be filled in by the logic in zfs_get_zplprop() * when they are queried for the first time. */ os->os_version = OBJSET_PROP_UNINITIALIZED; os->os_normalization = OBJSET_PROP_UNINITIALIZED; os->os_utf8only = OBJSET_PROP_UNINITIALIZED; os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED; if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i]), dnode_multilist_index_func); } list_create(&os->os_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_link)); list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - os->os_obj_next_percpu_len = boot_ncpus; - os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len * - sizeof (os->os_obj_next_percpu[0]), KM_SLEEP); dnode_special_open(os, &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { dnode_special_open(os, &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, &os->os_userused_dnode); dnode_special_open(os, &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); } *osp = os; return (0); } int dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { int err = 0; /* * We shouldn't be doing anything with dsl_dataset_t's unless the * pool_config lock is held, or the dataset is long-held. */ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) || dsl_dataset_long_held(ds)); mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { objset_t *os; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, dsl_dataset_get_blkptr(ds), &os); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (err == 0) { mutex_enter(&ds->ds_lock); ASSERT(ds->ds_objset == NULL); ds->ds_objset = os; mutex_exit(&ds->ds_lock); } } *osp = ds->ds_objset; mutex_exit(&ds->ds_opening_lock); return (err); } /* * Holds the pool while the objset is held. Therefore only one objset * can be held at a time. */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; err = dsl_pool_hold(name, tag, &dp); if (err != 0) return (err); err = dsl_dataset_hold(dp, name, tag, &ds); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } err = dmu_objset_from_ds(ds, osp); if (err != 0) { dsl_dataset_rele(ds, tag); dsl_pool_rele(dp, tag); } return (err); } static int dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp) { int err; err = dmu_objset_from_ds(ds, osp); if (err != 0) { dsl_dataset_disown(ds, tag); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EINVAL)); } else if (!readonly && dsl_dataset_is_snapshot(ds)) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EROFS)); } return (err); } /* * dsl_pool must not be held when this is called. * Upon successful return, there will be a longhold on the dataset, * and the dsl_pool will not be held. */ int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_own(dp, name, tag, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } err = dmu_objset_own_impl(ds, type, readonly, tag, osp); dsl_pool_rele(dp, FTAG); return (err); } int dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp) { dsl_dataset_t *ds; int err; err = dsl_dataset_own_obj(dp, obj, tag, &ds); if (err != 0) return (err); return (dmu_objset_own_impl(ds, type, readonly, tag, osp)); } void dmu_objset_rele(objset_t *os, void *tag) { dsl_pool_t *dp = dmu_objset_pool(os); dsl_dataset_rele(os->os_dsl_dataset, tag); dsl_pool_rele(dp, tag); } /* * When we are called, os MUST refer to an objset associated with a dataset * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner * == tag. We will then release and reacquire ownership of the dataset while * holding the pool config_rwlock to avoid intervening namespace or ownership * changes may occur. * * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to * release the hold on its dataset and acquire a new one on the dataset of the * same name so that it can be partially torn down and reconstructed. */ void dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, void *tag) { dsl_pool_t *dp; char name[ZFS_MAX_DATASET_NAME_LEN]; VERIFY3P(ds, !=, NULL); VERIFY3P(ds->ds_owner, ==, tag); VERIFY(dsl_dataset_long_held(ds)); dsl_dataset_name(ds, name); dp = ds->ds_dir->dd_pool; dsl_pool_config_enter(dp, FTAG); dsl_dataset_disown(ds, tag); VERIFY0(dsl_dataset_own(dp, name, tag, newds)); dsl_pool_config_exit(dp, FTAG); } void dmu_objset_disown(objset_t *os, void *tag) { dsl_dataset_disown(os->os_dsl_dataset, tag); } void dmu_objset_evict_dbufs(objset_t *os) { dnode_t dn_marker; dnode_t *dn; mutex_enter(&os->os_lock); dn = list_head(&os->os_dnodes); while (dn != NULL) { /* * Skip dnodes without holds. We have to do this dance * because dnode_add_ref() only works if there is already a * hold. If the dnode has no holds, then it has no dbufs. */ if (dnode_add_ref(dn, FTAG)) { list_insert_after(&os->os_dnodes, dn, &dn_marker); mutex_exit(&os->os_lock); dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); mutex_enter(&os->os_lock); dn = list_next(&os->os_dnodes, &dn_marker); list_remove(&os->os_dnodes, &dn_marker); } else { dn = list_next(&os->os_dnodes, dn); } } mutex_exit(&os->os_lock); if (DMU_USERUSED_DNODE(os) != NULL) { dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); } dnode_evict_dbufs(DMU_META_DNODE(os)); } /* * Objset eviction processing is split into into two pieces. * The first marks the objset as evicting, evicts any dbufs that * have a refcount of zero, and then queues up the objset for the * second phase of eviction. Once os->os_dnodes has been cleared by * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. * The second phase closes the special dnodes, dequeues the objset from * the list of those undergoing eviction, and finally frees the objset. * * NOTE: Due to asynchronous eviction processing (invocation of * dnode_buf_pageout()), it is possible for the meta dnode for the * objset to have no holds even though os->os_dnodes is not empty. */ void dmu_objset_evict(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; for (int t = 0; t < TXG_SIZE; t++) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) dsl_prop_unregister_all(ds, os); if (os->os_sa) sa_tear_down(os); dmu_objset_evict_dbufs(os); mutex_enter(&os->os_lock); spa_evicting_os_register(os->os_spa, os); if (list_is_empty(&os->os_dnodes)) { mutex_exit(&os->os_lock); dmu_objset_evict_done(os); } else { mutex_exit(&os->os_lock); } } void dmu_objset_evict_done(objset_t *os) { ASSERT3P(list_head(&os->os_dnodes), ==, NULL); dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); /* * This is a barrier to prevent the objset from going away in * dnode_move() until we can safely ensure that the objset is still in * use. We consider the objset valid before the barrier and invalid * after the barrier. */ rw_enter(&os_lock, RW_READER); rw_exit(&os_lock); - - kmem_free(os->os_obj_next_percpu, - os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0])); mutex_destroy(&os->os_lock); mutex_destroy(&os->os_userused_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); for (int i = 0; i < TXG_SIZE; i++) { multilist_destroy(os->os_dirty_dnodes[i]); } spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } timestruc_t dmu_objset_snap_cmtime(objset_t *os) { return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } /* called from dsl for meta-objset */ objset_t * dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx) { objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); if (ds != NULL) VERIFY0(dmu_objset_from_ds(ds, &os)); else VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx); /* * We don't want to have to increase the meta-dnode's nlevels * later, because then we could do it in quescing context while * we are also accessing it in open context. * * This precaution is not necessary for the MOS (ds == NULL), * because the MOS is only updated in syncing context. * This is most fortunate: the MOS is the only objset that * needs to be synced multiple times as spa_sync() iterates * to convergence, so minimizing its dn_nlevels matters. */ if (ds != NULL) { int levels = 1; /* * Determine the number of levels necessary for the meta-dnode * to contain DN_MAX_OBJECT dnodes. Note that in order to * ensure that we do not overflow 64 bits, there has to be * a nlevels that gives us a number of blocks > DN_MAX_OBJECT * but < 2^64. Therefore, * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be * less than (64 - log2(DN_MAX_OBJECT)) (16). */ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < DN_MAX_OBJECT) levels++; mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = mdn->dn_nlevels = levels; } ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); os->os_phys->os_type = type; if (dmu_objset_userused_enabled(os)) { os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; os->os_flags = os->os_phys->os_flags; } dsl_dataset_dirty(ds, tx); return (os); } typedef struct dmu_objset_create_arg { const char *doca_name; cred_t *doca_cred; void (*doca_userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *doca_userarg; dmu_objset_type_t doca_type; uint64_t doca_flags; } dmu_objset_create_arg_t; /*ARGSUSED*/ static int dmu_objset_create_check(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; int error; if (strchr(doca->doca_name, '@') != NULL) return (SET_ERROR(EINVAL)); if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); if (dataset_nestcheck(doca->doca_name) != 0) return (SET_ERROR(ENAMETOOLONG)); error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); if (error != 0) return (error); if (tail == NULL) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred); dsl_dir_rele(pdd, FTAG); return (error); } static void dmu_objset_create_sync(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; dsl_dataset_t *ds; uint64_t obj; blkptr_t *bp; objset_t *os; VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, doca->doca_cred, tx); VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); bp = dsl_dataset_get_blkptr(ds); os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, ds, bp, doca->doca_type, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (doca->doca_userfunc != NULL) { doca->doca_userfunc(os, doca->doca_userarg, doca->doca_cred, tx); } spa_history_log_internal_ds(ds, "create", tx, ""); dsl_dataset_rele(ds, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dmu_objset_create_arg_t doca; doca.doca_name = name; doca.doca_cred = CRED(); doca.doca_flags = flags; doca.doca_userfunc = func; doca.doca_userarg = arg; doca.doca_type = type; return (dsl_sync_task(name, dmu_objset_create_check, dmu_objset_create_sync, &doca, 5, ZFS_SPACE_CHECK_NORMAL)); } typedef struct dmu_objset_clone_arg { const char *doca_clone; const char *doca_origin; cred_t *doca_cred; } dmu_objset_clone_arg_t; /*ARGSUSED*/ static int dmu_objset_clone_check(void *arg, dmu_tx_t *tx) { dmu_objset_clone_arg_t *doca = arg; dsl_dir_t *pdd; const char *tail; int error; dsl_dataset_t *origin; dsl_pool_t *dp = dmu_tx_pool(tx); if (strchr(doca->doca_clone, '@') != NULL) return (SET_ERROR(EINVAL)); if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); if (error != 0) return (error); if (tail == NULL) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EDQUOT)); } dsl_dir_rele(pdd, FTAG); error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); if (error != 0) return (error); /* You can only clone snapshots, not the head datasets. */ if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EINVAL)); } dsl_dataset_rele(origin, FTAG); return (0); } static void dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) { dmu_objset_clone_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; dsl_dataset_t *origin, *ds; uint64_t obj; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); obj = dsl_dataset_create_sync(pdd, tail, origin, 0, doca->doca_cred, tx); VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); dsl_dataset_name(origin, namebuf); spa_history_log_internal_ds(ds, "clone", tx, "origin=%s (%llu)", namebuf, origin->ds_object); dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(origin, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_clone(const char *clone, const char *origin) { dmu_objset_clone_arg_t doca; doca.doca_clone = clone; doca.doca_origin = origin; doca.doca_cred = CRED(); return (dsl_sync_task(clone, dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5, ZFS_SPACE_CHECK_NORMAL)); } static int dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg) { int error = 0; uint64_t object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { error = dmu_object_remap_indirects(os, object, last_removed_txg); /* * If the ZPL removed the object before we managed to dnode_hold * it, we would get an ENOENT. If the ZPL declares its intent * to remove the object (dnode_free) before we manage to * dnode_hold it, we would get an EEXIST. In either case, we * want to continue remapping the other objects in the objset; * in all other cases, we want to break early. */ if (error != 0 && error != ENOENT && error != EEXIST) { break; } } if (error == ESRCH) { error = 0; } return (error); } int dmu_objset_remap_indirects(const char *fsname) { int error = 0; objset_t *os = NULL; uint64_t last_removed_txg; uint64_t remap_start_txg; dsl_dir_t *dd; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) { return (error); } dd = dmu_objset_ds(os)->ds_dir; if (!spa_feature_is_enabled(dmu_objset_spa(os), SPA_FEATURE_OBSOLETE_COUNTS)) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ENOTSUP)); } if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) { dmu_objset_rele(os, FTAG); return (SET_ERROR(EINVAL)); } /* * If there has not been a removal, we're done. */ last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os)); if (last_removed_txg == -1ULL) { dmu_objset_rele(os, FTAG); return (0); } /* * If we have remapped since the last removal, we're done. */ if (dsl_dir_is_zapified(dd)) { uint64_t last_remap_txg; if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)), dd->dd_object, DD_FIELD_LAST_REMAP_TXG, sizeof (last_remap_txg), 1, &last_remap_txg) == 0 && last_remap_txg > last_removed_txg) { dmu_objset_rele(os, FTAG); return (0); } } dsl_dataset_long_hold(dmu_objset_ds(os), FTAG); dsl_pool_rele(dmu_objset_pool(os), FTAG); remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os)); error = dmu_objset_remap_indirects_impl(os, last_removed_txg); if (error == 0) { /* * We update the last_remap_txg to be the start txg so that * we can guarantee that every block older than last_remap_txg * that can be remapped has been remapped. */ error = dsl_dir_update_last_remap_txg(dd, remap_start_txg); } dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); dsl_dataset_rele(dmu_objset_ds(os), FTAG); return (error); } int dmu_objset_snapshot_one(const char *fsname, const char *snapname) { int err; char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); nvlist_t *snaps = fnvlist_alloc(); fnvlist_add_boolean(snaps, longsnap); strfree(longsnap); err = dsl_dataset_snapshot(snaps, NULL, NULL); fnvlist_free(snaps); return (err); } static void dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) { dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ASSERT(dn->dn_dbuf->db_data_pending); /* * Initialize dn_zio outside dnode_sync() because the * meta-dnode needs to set it ouside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); multilist_sublist_remove(list, dn); multilist_t *newlist = dn->dn_objset->os_synced_dnodes; if (newlist != NULL) { (void) dnode_add_ref(dn, newlist); multilist_insert(newlist, dn); } dnode_sync(dn, tx); } } /* ARGSUSED */ static void dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); ASSERT0(BP_GET_LEVEL(bp)); /* * Update rootbp fill count: it should be the number of objects * allocated in the object set (not counting the "special" * objects that are stored in the objset_phys_t -- the meta * dnode and user/group accounting objects). */ bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); if (os->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG); *os->os_rootbp = *bp; if (os->os_dsl_dataset != NULL) rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); } /* ARGSUSED */ static void dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; objset_t *os = arg; if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } kmem_free(bp, sizeof (*bp)); } typedef struct sync_dnodes_arg { multilist_t *sda_list; int sda_sublist_idx; multilist_t *sda_newlist; dmu_tx_t *sda_tx; } sync_dnodes_arg_t; static void sync_dnodes_task(void *arg) { sync_dnodes_arg_t *sda = arg; multilist_sublist_t *ms = multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); dmu_objset_sync_dnodes(ms, sda->sda_tx); multilist_sublist_unlock(ms); kmem_free(sda, sizeof (*sda)); } /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; list_t *list; dbuf_dirty_record_t *dr; int num_sublists; multilist_t *ml; blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); *blkptr_copy = *os->os_rootbp; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); ASSERT(dmu_tx_is_syncing(tx)); /* XXX the write_done callback should really give us the tx... */ os->os_synctx = tx; if (os->os_dsl_dataset == NULL) { /* * This is the MOS. If we have upgraded, * spa_max_replication() could change, so reset * os_copies here. */ os->os_copies = spa_max_replication(os->os_spa); } /* * Create the root block IO */ SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); arc_release(os->os_phys_buf, &os->os_phys_buf); dmu_write_policy(os, NULL, 0, 0, &zp); zio = arc_write(pio, os->os_spa, tx->tx_txg, blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* * Sync special dnodes - the parent IO for the sync is the root block */ DMU_META_DNODE(os)->dn_zio = zio; dnode_sync(DMU_META_DNODE(os), tx); os->os_phys->os_flags = os->os_flags; if (DMU_USERUSED_DNODE(os) && DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { DMU_USERUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_USERUSED_DNODE(os), tx); DMU_GROUPUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_GROUPUSED_DNODE(os), tx); } txgoff = tx->tx_txg & TXG_MASK; if (dmu_objset_userused_enabled(os)) { /* * We must create the list here because it uses the * dn_dirty_link[] of this txg. But it may already * exist because we call dsl_dataset_sync() twice per txg. */ if (os->os_synced_dnodes == NULL) { os->os_synced_dnodes = multilist_create(sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[txgoff]), dnode_multilist_index_func); } else { ASSERT3U(os->os_synced_dnodes->ml_offset, ==, offsetof(dnode_t, dn_dirty_link[txgoff])); } } ml = os->os_dirty_dnodes[txgoff]; num_sublists = multilist_get_num_sublists(ml); for (int i = 0; i < num_sublists; i++) { if (multilist_sublist_is_empty_idx(ml, i)) continue; sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); sda->sda_list = ml; sda->sda_sublist_idx = i; sda->sda_tx = tx; (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, sync_dnodes_task, sda, 0); /* callback frees sda */ } taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while ((dr = list_head(list)) != NULL) { ASSERT0(dr->dr_dbuf->db_level); list_remove(list, dr); if (dr->dr_zio) zio_nowait(dr->dr_zio); } /* Enable dnode backfill if enough objects have been freed. */ if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { os->os_rescan_dnodes = B_TRUE; os->os_freed_dnodes = 0; } /* * Free intent log blocks up to this tx. */ zil_sync(os->os_zil, tx); os->os_phys->os_zil_header = os->os_zil_header; zio_nowait(zio); } boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg) { return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK])); } static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) { used_cbs[ost] = cb; } boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && used_cbs[os->os_phys->os_type] != NULL && DMU_USERUSED_DNODE(os) != NULL); } typedef struct userquota_node { uint64_t uqn_id; int64_t uqn_delta; avl_node_t uqn_node; } userquota_node_t; typedef struct userquota_cache { avl_tree_t uqc_user_deltas; avl_tree_t uqc_group_deltas; } userquota_cache_t; static int userquota_compare(const void *l, const void *r) { const userquota_node_t *luqn = l; const userquota_node_t *ruqn = r; if (luqn->uqn_id < ruqn->uqn_id) return (-1); if (luqn->uqn_id > ruqn->uqn_id) return (1); return (0); } static void do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) { void *cookie; userquota_node_t *uqn; ASSERT(dmu_tx_is_syncing(tx)); cookie = NULL; while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas, &cookie)) != NULL) { /* * os_userused_lock protects against concurrent calls to * zap_increment_int(). It's needed because zap_increment_int() * is not thread-safe (i.e. not atomic). */ mutex_enter(&os->os_userused_lock); VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT, uqn->uqn_id, uqn->uqn_delta, tx)); mutex_exit(&os->os_userused_lock); kmem_free(uqn, sizeof (*uqn)); } avl_destroy(&cache->uqc_user_deltas); cookie = NULL; while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas, &cookie)) != NULL) { mutex_enter(&os->os_userused_lock); VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT, uqn->uqn_id, uqn->uqn_delta, tx)); mutex_exit(&os->os_userused_lock); kmem_free(uqn, sizeof (*uqn)); } avl_destroy(&cache->uqc_group_deltas); } static void userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta) { userquota_node_t search = { .uqn_id = id }; avl_index_t idx; userquota_node_t *uqn = avl_find(avl, &search, &idx); if (uqn == NULL) { uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP); uqn->uqn_id = id; avl_insert(avl, uqn, idx); } uqn->uqn_delta += delta; } static void do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags, uint64_t user, uint64_t group, boolean_t subtract) { if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { int64_t delta = DNODE_MIN_SIZE + used; if (subtract) delta = -delta; userquota_update_cache(&cache->uqc_user_deltas, user, delta); userquota_update_cache(&cache->uqc_group_deltas, group, delta); } } typedef struct userquota_updates_arg { objset_t *uua_os; int uua_sublist_idx; dmu_tx_t *uua_tx; } userquota_updates_arg_t; static void userquota_updates_task(void *arg) { userquota_updates_arg_t *uua = arg; objset_t *os = uua->uua_os; dmu_tx_t *tx = uua->uua_tx; dnode_t *dn; userquota_cache_t cache = { 0 }; multilist_sublist_t *list = multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx); ASSERT(multilist_sublist_head(list) == NULL || dmu_objset_userused_enabled(os)); avl_create(&cache.uqc_user_deltas, userquota_compare, sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); avl_create(&cache.uqc_group_deltas, userquota_compare, sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); while ((dn = multilist_sublist_head(list)) != NULL) { int flags; ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED); flags = dn->dn_id_flags; ASSERT(flags); if (flags & DN_ID_OLD_EXIST) { do_userquota_update(&cache, dn->dn_oldused, dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid, B_TRUE); } if (flags & DN_ID_NEW_EXIST) { do_userquota_update(&cache, DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid, B_FALSE); } mutex_enter(&dn->dn_mtx); dn->dn_oldused = 0; dn->dn_oldflags = 0; if (dn->dn_id_flags & DN_ID_NEW_EXIST) { dn->dn_olduid = dn->dn_newuid; dn->dn_oldgid = dn->dn_newgid; dn->dn_id_flags |= DN_ID_OLD_EXIST; if (dn->dn_bonuslen == 0) dn->dn_id_flags |= DN_ID_CHKED_SPILL; else dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); dnode_rele(dn, os->os_synced_dnodes); } do_userquota_cacheflush(os, &cache, tx); multilist_sublist_unlock(list); kmem_free(uua, sizeof (*uua)); } void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) { int num_sublists; if (!dmu_objset_userused_enabled(os)) return; /* Allocate the user/groupused objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { VERIFY0(zap_create_claim(os, DMU_USERUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); VERIFY0(zap_create_claim(os, DMU_GROUPUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } num_sublists = multilist_get_num_sublists(os->os_synced_dnodes); for (int i = 0; i < num_sublists; i++) { if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i)) continue; userquota_updates_arg_t *uua = kmem_alloc(sizeof (*uua), KM_SLEEP); uua->uua_os = os; uua->uua_sublist_idx = i; uua->uua_tx = tx; /* note: caller does taskq_wait() */ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, userquota_updates_task, uua, 0); /* callback frees uua */ } } /* * Returns a pointer to data to find uid/gid from * * If a dirty record for transaction group that is syncing can't * be found then NULL is returned. In the NULL case it is assumed * the uid/gid aren't changing. */ static void * dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) { dbuf_dirty_record_t *dr, **drp; void *data; if (db->db_dirtycnt == 0) return (db->db.db_data); /* Nothing is changing */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg == tx->tx_txg) break; if (dr == NULL) { data = NULL; } else { dnode_t *dn; DB_DNODE_ENTER(dr->dr_dbuf); dn = DB_DNODE(dr->dr_dbuf); if (dn->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) data = dr->dt.dl.dr_data->b_data; else data = dr->dt.dl.dr_data; DB_DNODE_EXIT(dr->dr_dbuf); } return (data); } void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; void *data = NULL; dmu_buf_impl_t *db = NULL; uint64_t *user = NULL; uint64_t *group = NULL; int flags = dn->dn_id_flags; int error; boolean_t have_spill = B_FALSE; if (!dmu_objset_userused_enabled(dn->dn_objset)) return; if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| DN_ID_CHKED_SPILL))) return; if (before && dn->dn_bonuslen != 0) data = DN_BONUS(dn->dn_phys); else if (!before && dn->dn_bonuslen != 0) { if (dn->dn_bonus) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); data = dmu_objset_userquota_find_data(db, tx); } else { data = DN_BONUS(dn->dn_phys); } } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { int rf = 0; if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; error = dmu_spill_hold_by_dnode(dn, rf | DB_RF_MUST_SUCCEED, FTAG, (dmu_buf_t **)&db); ASSERT(error == 0); mutex_enter(&db->db_mtx); data = (before) ? db->db.db_data : dmu_objset_userquota_find_data(db, tx); have_spill = B_TRUE; } else { mutex_enter(&dn->dn_mtx); dn->dn_id_flags |= DN_ID_CHKED_BONUS; mutex_exit(&dn->dn_mtx); return; } if (before) { ASSERT(data); user = &dn->dn_olduid; group = &dn->dn_oldgid; } else if (data) { user = &dn->dn_newuid; group = &dn->dn_newgid; } /* * Must always call the callback in case the object * type has changed and that type isn't an object type to track */ error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, user, group); /* * Preserve existing uid/gid when the callback can't determine * what the new uid/gid are and the callback returned EEXIST. * The EEXIST error tells us to just use the existing uid/gid. * If we don't know what the old values are then just assign * them to 0, since that is a new file being created. */ if (!before && data == NULL && error == EEXIST) { if (flags & DN_ID_OLD_EXIST) { dn->dn_newuid = dn->dn_olduid; dn->dn_newgid = dn->dn_oldgid; } else { dn->dn_newuid = 0; dn->dn_newgid = 0; } error = 0; } if (db) mutex_exit(&db->db_mtx); mutex_enter(&dn->dn_mtx); if (error == 0 && before) dn->dn_id_flags |= DN_ID_OLD_EXIST; if (error == 0 && !before) dn->dn_id_flags |= DN_ID_NEW_EXIST; if (have_spill) { dn->dn_id_flags |= DN_ID_CHKED_SPILL; } else { dn->dn_id_flags |= DN_ID_CHKED_BONUS; } mutex_exit(&dn->dn_mtx); if (have_spill) dmu_buf_rele((dmu_buf_t *)db, FTAG); } boolean_t dmu_objset_userspace_present(objset_t *os) { return (os->os_phys->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); } int dmu_objset_userspace_upgrade(objset_t *os) { uint64_t obj; int err = 0; if (dmu_objset_userspace_present(os)) return (0); if (!dmu_objset_userused_enabled(os)) return (SET_ERROR(ENOTSUP)); if (dmu_objset_is_snapshot(os)) return (SET_ERROR(EINVAL)); /* * We simply need to mark every object dirty, so that it will be * synced out and now accounted. If this is called * concurrently, or if we already did some work before crashing, * that's fine, since we track each object's accounted state * independently. */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { dmu_tx_t *tx; dmu_buf_t *db; int objerr; if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); objerr = dmu_bonus_hold(os, obj, FTAG, &db); if (objerr != 0) continue; tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); if (objerr != 0) { dmu_tx_abort(tx); continue; } dmu_buf_will_dirty(db, tx); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; txg_wait_synced(dmu_objset_pool(os), 0); return (0); } void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, usedobjsp, availobjsp); } uint64_t dmu_objset_fsid_guid(objset_t *os) { return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); } void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) { stat->dds_type = os->os_phys->os_type; if (os->os_dsl_dataset) dsl_dataset_fast_stat(os->os_dsl_dataset, stat); } void dmu_objset_stats(objset_t *os, nvlist_t *nv) { ASSERT(os->os_dsl_dataset || os->os_phys->os_type == DMU_OST_META); if (os->os_dsl_dataset != NULL) dsl_dataset_stats(os->os_dsl_dataset, nv); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, os->os_phys->os_type); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, dmu_objset_userspace_present(os)); } int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) return (os->os_dsl_dataset->ds_is_snapshot); else return (B_FALSE); } int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_NORMALIZE, real, maxlen, conflict)); } int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; zap_cursor_t cursor; zap_attribute_t attr; ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (SET_ERROR(ENAMETOOLONG)); } (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; if (case_conflict) *case_conflict = attr.za_normalization_conflict; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp) { dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; zap_cursor_t cursor; zap_attribute_t attr; /* there is no next dir on a snapshot! */ if (os->os_dsl_dataset->ds_object != dsl_dir_phys(dd)->dd_head_dataset_obj) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (SET_ERROR(ENAMETOOLONG)); } (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } typedef struct dmu_objset_find_ctx { taskq_t *dc_tq; dsl_pool_t *dc_dp; uint64_t dc_ddobj; char *dc_ddname; /* last component of ddobj's name */ int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *); void *dc_arg; int dc_flags; kmutex_t *dc_error_lock; int *dc_error; } dmu_objset_find_ctx_t; static void dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp) { dsl_pool_t *dp = dcp->dc_dp; dsl_dir_t *dd; dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; uint64_t thisobj; int err = 0; /* don't process if there already was an error */ if (*dcp->dc_error != 0) goto out; /* * Note: passing the name (dc_ddname) here is optional, but it * improves performance because we don't need to call * zap_value_search() to determine the name. */ err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd); if (err != 0) goto out; /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); goto out; } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ if (dcp->dc_flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); dmu_objset_find_ctx_t *child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP); *child_dcp = *dcp; child_dcp->dc_ddobj = attr->za_first_integer; child_dcp->dc_ddname = spa_strdup(attr->za_name); if (dcp->dc_tq != NULL) (void) taskq_dispatch(dcp->dc_tq, dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP); else dmu_objset_find_dp_impl(child_dcp); } zap_cursor_fini(&zc); } /* * Iterate over all snapshots. */ if (dcp->dc_flags & DS_FIND_SNAPSHOTS) { dsl_dataset_t *ds; err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); err = dsl_dataset_hold_obj(dp, attr->za_first_integer, FTAG, &ds); if (err != 0) break; err = dcp->dc_func(dp, ds, dcp->dc_arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; } zap_cursor_fini(&zc); } } kmem_free(attr, sizeof (zap_attribute_t)); if (err != 0) { dsl_dir_rele(dd, FTAG); goto out; } /* * Apply to self. */ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); /* * Note: we hold the dir while calling dsl_dataset_hold_obj() so * that the dir will remain cached, and we won't have to re-instantiate * it (which could be expensive due to finding its name via * zap_value_search()). */ dsl_dir_rele(dd, FTAG); if (err != 0) goto out; err = dcp->dc_func(dp, ds, dcp->dc_arg); dsl_dataset_rele(ds, FTAG); out: if (err != 0) { mutex_enter(dcp->dc_error_lock); /* only keep first error */ if (*dcp->dc_error == 0) *dcp->dc_error = err; mutex_exit(dcp->dc_error_lock); } if (dcp->dc_ddname != NULL) spa_strfree(dcp->dc_ddname); kmem_free(dcp, sizeof (*dcp)); } static void dmu_objset_find_dp_cb(void *arg) { dmu_objset_find_ctx_t *dcp = arg; dsl_pool_t *dp = dcp->dc_dp; /* * We need to get a pool_config_lock here, as there are several * asssert(pool_config_held) down the stack. Getting a lock via * dsl_pool_config_enter is risky, as it might be stalled by a * pending writer. This would deadlock, as the write lock can * only be granted when our parent thread gives up the lock. * The _prio interface gives us priority over a pending writer. */ dsl_pool_config_enter_prio(dp, FTAG); dmu_objset_find_dp_impl(dcp); dsl_pool_config_exit(dp, FTAG); } /* * Find objsets under and including ddobj, call func(ds) on each. * The order for the enumeration is completely undefined. * func is called with dsl_pool_config held. */ int dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) { int error = 0; taskq_t *tq = NULL; int ntasks; dmu_objset_find_ctx_t *dcp; kmutex_t err_lock; mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL); dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP); dcp->dc_tq = NULL; dcp->dc_dp = dp; dcp->dc_ddobj = ddobj; dcp->dc_ddname = NULL; dcp->dc_func = func; dcp->dc_arg = arg; dcp->dc_flags = flags; dcp->dc_error_lock = &err_lock; dcp->dc_error = &error; if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) { /* * In case a write lock is held we can't make use of * parallelism, as down the stack of the worker threads * the lock is asserted via dsl_pool_config_held. * In case of a read lock this is solved by getting a read * lock in each worker thread, which isn't possible in case * of a writer lock. So we fall back to the synchronous path * here. * In the future it might be possible to get some magic into * dsl_pool_config_held in a way that it returns true for * the worker threads so that a single lock held from this * thread suffices. For now, stay single threaded. */ dmu_objset_find_dp_impl(dcp); mutex_destroy(&err_lock); return (error); } ntasks = dmu_find_threads; if (ntasks == 0) ntasks = vdev_count_leaves(dp->dp_spa) * 4; tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks, INT_MAX, 0); if (tq == NULL) { kmem_free(dcp, sizeof (*dcp)); mutex_destroy(&err_lock); return (SET_ERROR(ENOMEM)); } dcp->dc_tq = tq; /* dcp will be freed by task */ (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP); /* * PORTING: this code relies on the property of taskq_wait to wait * until no more tasks are queued and no more tasks are active. As * we always queue new tasks from within other tasks, task_wait * reliably waits for the full recursion to finish, even though we * enqueue new tasks after taskq_wait has been called. * On platforms other than illumos, taskq_wait may not have this * property. */ taskq_wait(tq); taskq_destroy(tq); mutex_destroy(&err_lock); return (error); } /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. * The dp_config_rwlock must not be held when this is called, and it * will not be held when the callback is called. * Therefore this function should only be used when the pool is not changing * (e.g. in syncing context), or the callback can deal with the possible races. */ static int dmu_objset_find_impl(spa_t *spa, const char *name, int func(const char *, void *), void *arg, int flags) { dsl_dir_t *dd; dsl_pool_t *dp = spa_get_dsl(spa); dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; char *child; uint64_t thisobj; int err; dsl_pool_config_enter(dp, FTAG); err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); if (err != 0) { dsl_pool_config_exit(dp, FTAG); return (err); } /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); dsl_pool_config_exit(dp, FTAG); return (0); } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s/%s", name, attr->za_name); dsl_pool_config_exit(dp, FTAG); err = dmu_objset_find_impl(spa, child, func, arg, flags); dsl_pool_config_enter(dp, FTAG); strfree(child); if (err != 0) break; } zap_cursor_fini(&zc); if (err != 0) { dsl_dir_rele(dd, FTAG); dsl_pool_config_exit(dp, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } } /* * Iterate over all snapshots. */ if (flags & DS_FIND_SNAPSHOTS) { err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s@%s", name, attr->za_name); dsl_pool_config_exit(dp, FTAG); err = func(child, arg); dsl_pool_config_enter(dp, FTAG); strfree(child); if (err != 0) break; } zap_cursor_fini(&zc); } } dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); dsl_pool_config_exit(dp, FTAG); if (err != 0) return (err); /* Apply to self. */ return (func(name, arg)); } /* * See comment above dmu_objset_find_impl(). */ int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags) { spa_t *spa; int error; error = spa_open(name, &spa, FTAG); if (error != 0) return (error); error = dmu_objset_find_impl(spa, name, func, arg, flags); spa_close(spa, FTAG); return (error); } void dmu_objset_set_user(objset_t *os, void *user_ptr) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); os->os_user_ptr = user_ptr; } void * dmu_objset_get_user(objset_t *os) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); return (os->os_user_ptr); } /* * Determine name of filesystem, given name of snapshot. * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ int dmu_fsname(const char *snapname, char *buf) { char *atp = strchr(snapname, '@'); if (atp == NULL) return (SET_ERROR(EINVAL)); if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(buf, snapname, atp - snapname + 1); return (0); } /* * Call when we think we're going to write/free space in open context to track * the amount of dirty data in the open txg, which is also the amount * of memory that can not be evicted until this txg syncs. */ void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) { dsl_dataset_t *ds = os->os_dsl_dataset; int64_t aspace = spa_get_worst_case_asize(os->os_spa, space); if (ds != NULL) { dsl_dir_willuse_space(ds->ds_dir, aspace, tx); dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c (revision 351077) @@ -1,3514 +1,3455 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska . All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __FreeBSD__ #undef dump_write #define dump_write dmu_dump_write #endif /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; int zfs_send_queue_length = 16 * 1024 * 1024; int zfs_recv_queue_length = 16 * 1024 * 1024; /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ int zfs_send_set_freerecords_bit = B_TRUE; #ifdef _KERNEL TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit); #endif static char *dmu_recv_tag = "dmu_recv_tag"; const char *recv_clone_name = "%recv"; /* * Use this to override the recordsize calculation for fast zfs send estimates. */ uint64_t zfs_override_estimate_recordsize = 0; #define BP_SPAN(datablkszsec, indblkshift, level) \ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (indblkshift - SPA_BLKPTRSHIFT))) static void byteswap_record(dmu_replay_record_t *drr); struct send_thread_arg { bqueue_t q; dsl_dataset_t *ds; /* Dataset to traverse */ uint64_t fromtxg; /* Traverse from this txg */ int flags; /* flags to pass to traverse_dataset */ int error_code; boolean_t cancel; zbookmark_phys_t resume; }; struct send_block_record { boolean_t eos_marker; /* Marks the end of the stream */ blkptr_t bp; zbookmark_phys_t zb; uint8_t indblkshift; uint16_t datablkszsec; bqueue_node_t ln; }; static int dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) { dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); struct uio auio; struct iovec aiov; /* * The code does not rely on this (len being a multiple of 8). We keep * this assertion because of the corresponding assertion in * receive_read(). Keeping this assertion ensures that we do not * inadvertently break backwards compatibility (causing the assertion * in receive_read() to trigger on old software). * * Removing the assertions could be rolled into a new feature that uses * data that isn't 8-byte aligned; if the assertions were removed, a * feature flag would have to be added. */ ASSERT0(len % 8); aiov.iov_base = buf; aiov.iov_len = len; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_offset = (off_t)-1; auio.uio_td = dsp->dsa_td; #ifdef _KERNEL if (dsp->dsa_fp->f_type == DTYPE_VNODE) bwillwrite(); dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, dsp->dsa_td); #else fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); dsp->dsa_err = EOPNOTSUPP; #endif mutex_enter(&ds->ds_sendstream_lock); *dsp->dsa_off += len; mutex_exit(&ds->ds_sendstream_lock); return (dsp->dsa_err); } /* * For all record types except BEGIN, fill in the checksum (overlaid in * drr_u.drr_checksum.drr_checksum). The checksum verifies everything * up to the start of the checksum itself. */ static int dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); (void) fletcher_4_incremental_native(dsp->dsa_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &dsp->dsa_zc); if (dsp->dsa_drr->drr_type == DRR_BEGIN) { dsp->dsa_sent_begin = B_TRUE; } else { ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. drr_checksum.drr_checksum)); dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; } if (dsp->dsa_drr->drr_type == DRR_END) { dsp->dsa_sent_end = B_TRUE; } (void) fletcher_4_incremental_native(&dsp->dsa_drr-> drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), &dsp->dsa_zc); if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); if (payload_len != 0) { (void) fletcher_4_incremental_native(payload, payload_len, &dsp->dsa_zc); if (dump_bytes(dsp, payload, payload_len) != 0) return (SET_ERROR(EINTR)); } return (0); } /* * Fill in the drr_free struct, or perform aggregation if the previous record is * also a free record, and the two are adjacent. * * Note that we send free records even for a full send, because we want to be * able to receive a full send as a clone, which requires a list of all the free * and freeobject records that were generated on the source. */ static int dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, uint64_t length) { struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); /* * When we receive a free record, dbuf_free_range() assumes * that the receiving system doesn't have any dbufs in the range * being freed. This is always true because there is a one-record * constraint: we only send one WRITE record for any given * object,offset. We know that the one-record constraint is * true because we always send data in increasing order by * object,offset. * * If the increasing-order constraint ever changes, we should find * another way to assert that the one-record constraint is still * satisfied. */ ASSERT(object > dsp->dsa_last_data_object || (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); if (length != -1ULL && offset + length < offset) length = -1ULL; /* * If there is a pending op, but it's not PENDING_FREE, push it out, * since free block aggregation can only be done for blocks of the * same type (i.e., DRR_FREE records can only be aggregated with * other DRR_FREE records. DRR_FREEOBJECTS records can only be * aggregated with other DRR_FREEOBJECTS records. */ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } if (dsp->dsa_pending_op == PENDING_FREE) { /* * There should never be a PENDING_FREE if length is -1 * (because dump_dnode is the only place where this * function is called with a -1, and only after flushing * any pending record). */ ASSERT(length != -1ULL); /* * Check to see whether this free block can be aggregated * with pending one. */ if (drrf->drr_object == object && drrf->drr_offset + drrf->drr_length == offset) { drrf->drr_length += length; return (0); } else { /* not a continuation. Push out pending record */ if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } } /* create a FREE record and make it pending */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; drrf->drr_length = length; drrf->drr_toguid = dsp->dsa_toguid; if (length == -1ULL) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); } else { dsp->dsa_pending_op = PENDING_FREE; } return (0); } static int dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data) { uint64_t payload_size; struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); /* * We send data in increasing object, offset order. * See comment in dump_free() for details. */ ASSERT(object > dsp->dsa_last_data_object || (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); dsp->dsa_last_data_object = object; dsp->dsa_last_data_offset = offset + lsize - 1; /* * If there is any kind of pending aggregation (currently either * a grouping of free objects or free blocks), push it out to * the stream, since aggregation can't be done across operations * of different types. */ if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write a WRITE record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; drrw->drr_toguid = dsp->dsa_toguid; drrw->drr_logical_size = lsize; /* only set the compression fields if the buf is compressed */ if (lsize != psize) { ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(!BP_SHOULD_BYTESWAP(bp)); ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); ASSERT3S(psize, >, 0); ASSERT3S(lsize, >=, psize); drrw->drr_compressiontype = BP_GET_COMPRESS(bp); drrw->drr_compressed_size = psize; payload_size = drrw->drr_compressed_size; } else { payload_size = drrw->drr_logical_size; } if (bp == NULL || BP_IS_EMBEDDED(bp)) { /* * There's no pre-computed checksum for partial-block * writes or embedded BP's, so (like * fletcher4-checkummed blocks) userland will have to * compute a dedup-capable checksum itself. */ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; } else { drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & ZCHECKSUM_FLAG_DEDUP) drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); drrw->drr_key.ddk_cksum = bp->blk_cksum; } if (dump_record(dsp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp) { char buf[BPE_PAYLOAD_SIZE]; struct drr_write_embedded *drrw = &(dsp->dsa_drr->drr_u.drr_write_embedded); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (EINTR); dsp->dsa_pending_op = PENDING_NONE; } ASSERT(BP_IS_EMBEDDED(bp)); bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; drrw->drr_object = object; drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; drrw->drr_compression = BP_GET_COMPRESS(bp); drrw->drr_etype = BPE_GET_ETYPE(bp); drrw->drr_lsize = BPE_GET_LSIZE(bp); drrw->drr_psize = BPE_GET_PSIZE(bp); decode_embedded_bp_compressed(bp, buf); if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) return (EINTR); return (0); } static int dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) { struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write a SPILL record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; drrs->drr_toguid = dsp->dsa_toguid; if (dump_record(dsp, data, blksz) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for * blocks of the same type (i.e., DRR_FREE records can only be * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records * can only be aggregated with other DRR_FREEOBJECTS records. */ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREEOBJECTS) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { /* * See whether this free object array can be aggregated * with pending one */ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { drrfo->drr_numobjs += numobjs; return (0); } else { /* can't be aggregated. Push out pending record */ if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } } /* write a FREEOBJECTS record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; drrfo->drr_toguid = dsp->dsa_toguid; dsp->dsa_pending_op = PENDING_FREEOBJECTS; return (0); } static int dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) { struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); if (object < dsp->dsa_resume_object) { /* * Note: when resuming, we will visit all the dnodes in * the block of dnodes that we are resuming from. In * this case it's unnecessary to send the dnodes prior to * the one we are resuming from. We should be at most one * block's worth of dnodes behind the resume point. */ ASSERT3U(dsp->dsa_resume_object - object, <, 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); return (0); } if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(dsp, object, 1)); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write an OBJECT record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; drro->drr_bonustype = dnp->dn_bonustype; drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; drro->drr_bonuslen = dnp->dn_bonuslen; drro->drr_dn_slots = dnp->dn_extra_slots + 1; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; drro->drr_toguid = dsp->dsa_toguid; if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; if (dump_record(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { return (SET_ERROR(EINTR)); } /* Free anything past the end of the file. */ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) return (SET_ERROR(EINTR)); if (dsp->dsa_err != 0) return (SET_ERROR(EINTR)); return (0); } static boolean_t backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) { if (!BP_IS_EMBEDDED(bp)) return (B_FALSE); /* * Compression function must be legacy, or explicitly enabled. */ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) return (B_FALSE); /* * Embed type must be explicitly enabled. */ switch (BPE_GET_ETYPE(bp)) { case BP_EMBEDDED_TYPE_DATA: if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) return (B_TRUE); break; default: return (B_FALSE); } return (B_FALSE); } /* * This is the callback function to traverse_dataset that acts as the worker * thread for dmu_send_impl. */ /*ARGSUSED*/ static int send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { struct send_thread_arg *sta = arg; struct send_block_record *record; uint64_t record_size; int err = 0; ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= sta->resume.zb_object); if (sta->cancel) return (SET_ERROR(EINTR)); if (bp == NULL) { ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); return (0); } else if (zb->zb_level < 0) { return (0); } record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); record->eos_marker = B_FALSE; record->bp = *bp; record->zb = *zb; record->indblkshift = dnp->dn_indblkshift; record->datablkszsec = dnp->dn_datablkszsec; record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; bqueue_enqueue(&sta->q, record, record_size); return (err); } /* * This function kicks off the traverse_dataset. It also handles setting the * error code of the thread in case something goes wrong, and pushes the End of * Stream record when the traverse_dataset call has finished. If there is no * dataset to traverse, the thread immediately pushes End of Stream marker. */ static void send_traverse_thread(void *arg) { struct send_thread_arg *st_arg = arg; int err; struct send_block_record *data; if (st_arg->ds != NULL) { err = traverse_dataset_resume(st_arg->ds, st_arg->fromtxg, &st_arg->resume, st_arg->flags, send_cb, st_arg); if (err != EINTR) st_arg->error_code = err; } data = kmem_zalloc(sizeof (*data), KM_SLEEP); data->eos_marker = B_TRUE; bqueue_enqueue(&st_arg->q, data, 1); thread_exit(); } /* * This function actually handles figuring out what kind of record needs to be * dumped, reading the data (which has hopefully been prefetched), and calling * the appropriate helper function. */ static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) { dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); const blkptr_t *bp = &data->bp; const zbookmark_phys_t *zb = &data->zb; uint8_t indblkshift = data->indblkshift; uint16_t dblkszsec = data->datablkszsec; spa_t *spa = ds->ds_dir->dd_pool->dp_spa; dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; ASSERT3U(zb->zb_level, >=, 0); ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= dsa->dsa_resume_object); if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); } else if (BP_IS_HOLE(bp) && zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); } else if (BP_IS_HOLE(bp)) { uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); uint64_t offset = zb->zb_blkid * span; err = dump_free(dsa, zb->zb_object, offset, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (SET_ERROR(EIO)); dnode_phys_t *blk = abuf->b_data; uint64_t dnobj = zb->zb_blkid * epb; for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) { err = dump_dnode(dsa, dnobj + i, blk + i); if (err != 0) break; } arc_buf_destroy(abuf, &abuf); } else if (type == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (SET_ERROR(EIO)); err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); arc_buf_destroy(abuf, &abuf); } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ int blksz = dblkszsec << SPA_MINBLOCKSHIFT; ASSERT0(zb->zb_level); err = dump_write_embedded(dsa, zb->zb_object, zb->zb_blkid * blksz, blksz, bp); } else { /* it's a level-0 block of a regular object */ arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = dblkszsec << SPA_MINBLOCKSHIFT; uint64_t offset; /* * If we have large blocks stored on disk but the send flags * don't allow us to send large blocks, we split the data from * the arc buf into chunks. */ boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE && !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); /* * We should only request compressed data from the ARC if all * the following are true: * - stream compression was requested * - we aren't splitting large blocks into smaller chunks * - the data won't need to be byteswapped before sending * - this isn't an embedded block * - this isn't metadata (if receiving on a different endian * system it can be byteswapped more easily) */ boolean_t request_compressed = (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); ASSERT0(zb->zb_level); ASSERT(zb->zb_object > dsa->dsa_resume_object || (zb->zb_object == dsa->dsa_resume_object && zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); ASSERT0(zb->zb_level); ASSERT(zb->zb_object > dsa->dsa_resume_object || (zb->zb_object == dsa->dsa_resume_object && zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); ASSERT3U(blksz, ==, BP_GET_LSIZE(bp)); enum zio_flag zioflags = ZIO_FLAG_CANFAIL; if (request_compressed) zioflags |= ZIO_FLAG_RAW; if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) { if (zfs_send_corrupt_data) { /* Send a block filled with 0x"zfs badd bloc" */ abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA, blksz); uint64_t *ptr; for (ptr = abuf->b_data; (char *)ptr < (char *)abuf->b_data + blksz; ptr++) *ptr = 0x2f5baddb10cULL; } else { return (SET_ERROR(EIO)); } } offset = zb->zb_blkid * blksz; if (split_large_blocks) { ASSERT3U(arc_get_compression(abuf), ==, ZIO_COMPRESS_OFF); char *buf = abuf->b_data; while (blksz > 0 && err == 0) { int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); err = dump_write(dsa, type, zb->zb_object, offset, n, n, NULL, buf); offset += n; buf += n; blksz -= n; } } else { err = dump_write(dsa, type, zb->zb_object, offset, blksz, arc_buf_size(abuf), bp, abuf->b_data); } arc_buf_destroy(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); return (err); } /* * Pop the new data off the queue, and free the old data. */ static struct send_block_record * get_next_record(bqueue_t *bq, struct send_block_record *data) { struct send_block_record *tmp = bqueue_dequeue(bq); kmem_free(data, sizeof (*data)); return (tmp); } /* * Actually do the bulk of the work in a zfs send. * * Note: Releases dp using the specified tag. */ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, int outfd, uint64_t resumeobj, uint64_t resumeoff, #ifdef illumos vnode_t *vp, offset_t *off) #else struct file *fp, offset_t *off) #endif { objset_t *os; dmu_replay_record_t *drr; dmu_sendarg_t *dsp; int err; uint64_t fromtxg = 0; uint64_t featureflags = 0; struct send_thread_arg to_arg = { 0 }; err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, DMU_SUBSTREAM); #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); dsl_pool_rele(dp, tag); return (SET_ERROR(EINVAL)); } if (version >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } } #endif if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; if (embedok && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) featureflags |= DMU_BACKUP_FEATURE_LZ4; } if (compressok) { featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; } if ((featureflags & (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) != 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { featureflags |= DMU_BACKUP_FEATURE_LZ4; } if (resumeobj != 0 || resumeoff != 0) { featureflags |= DMU_BACKUP_FEATURE_RESUMING; } DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, featureflags); drr->drr_u.drr_begin.drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time; drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); if (is_clone) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; if (zfs_send_set_freerecords_bit) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; if (ancestor_zb != NULL) { drr->drr_u.drr_begin.drr_fromguid = ancestor_zb->zbm_guid; fromtxg = ancestor_zb->zbm_creation_txg; } dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); if (!to_ds->ds_is_snapshot) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); dsp->dsa_drr = drr; dsp->dsa_outfd = outfd; dsp->dsa_proc = curproc; dsp->dsa_td = curthread; dsp->dsa_fp = fp; dsp->dsa_os = os; dsp->dsa_off = off; dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_featureflags = featureflags; dsp->dsa_resume_object = resumeobj; dsp->dsa_resume_offset = resumeoff; mutex_enter(&to_ds->ds_sendstream_lock); list_insert_head(&to_ds->ds_sendstreams, dsp); mutex_exit(&to_ds->ds_sendstream_lock); dsl_dataset_long_hold(to_ds, FTAG); dsl_pool_rele(dp, tag); void *payload = NULL; size_t payload_len = 0; if (resumeobj != 0 || resumeoff != 0) { dmu_object_info_t to_doi; err = dmu_object_info(os, resumeobj, &to_doi); if (err != 0) goto out; SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, resumeoff / to_doi.doi_data_block_size); nvlist_t *nvl = fnvlist_alloc(); fnvlist_add_uint64(nvl, "resume_object", resumeobj); fnvlist_add_uint64(nvl, "resume_offset", resumeoff); payload = fnvlist_pack(nvl, &payload_len); drr->drr_payloadlen = payload_len; fnvlist_free(nvl); } err = dump_record(dsp, payload, payload_len); fnvlist_pack_free(payload, payload_len); if (err != 0) { err = dsp->dsa_err; goto out; } err = bqueue_init(&to_arg.q, zfs_send_queue_length, offsetof(struct send_block_record, ln)); to_arg.error_code = 0; to_arg.cancel = B_FALSE; to_arg.ds = to_ds; to_arg.fromtxg = fromtxg; to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0, TS_RUN, minclsyspri); struct send_block_record *to_data; to_data = bqueue_dequeue(&to_arg.q); while (!to_data->eos_marker && err == 0) { err = do_dump(dsp, to_data); to_data = get_next_record(&to_arg.q, to_data); if (issig(JUSTLOOKING) && issig(FORREAL)) err = EINTR; } if (err != 0) { to_arg.cancel = B_TRUE; while (!to_data->eos_marker) { to_data = get_next_record(&to_arg.q, to_data); } } kmem_free(to_data, sizeof (*to_data)); bqueue_destroy(&to_arg.q); if (err == 0 && to_arg.error_code != 0) err = to_arg.error_code; if (err != 0) goto out; if (dsp->dsa_pending_op != PENDING_NONE) if (dump_record(dsp, NULL, 0) != 0) err = SET_ERROR(EINTR); if (err != 0) { if (err == EINTR && dsp->dsa_err != 0) err = dsp->dsa_err; goto out; } bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; if (dump_record(dsp, NULL, 0) != 0) err = dsp->dsa_err; out: mutex_enter(&to_ds->ds_sendstream_lock); list_remove(&to_ds->ds_sendstreams, dsp); mutex_exit(&to_ds->ds_sendstream_lock); VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); dsl_dataset_long_rele(to_ds, FTAG); return (err); } int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, #ifdef illumos int outfd, vnode_t *vp, offset_t *off) #else int outfd, struct file *fp, offset_t *off) #endif { dsl_pool_t *dp; dsl_dataset_t *ds; dsl_dataset_t *fromds = NULL; int err; err = dsl_pool_hold(pool, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (fromsnap != 0) { zfs_bookmark_phys_t zb; boolean_t is_clone; err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); if (err != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (err); } if (!dsl_dataset_is_before(ds, fromds, 0)) err = SET_ERROR(EXDEV); zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); } dsl_dataset_rele(ds, FTAG); return (err); } int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, int outfd, uint64_t resumeobj, uint64_t resumeoff, #ifdef illumos vnode_t *vp, offset_t *off) #else struct file *fp, offset_t *off) #endif { dsl_pool_t *dp; dsl_dataset_t *ds; int err; boolean_t owned = B_FALSE; if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) return (SET_ERROR(EINVAL)); err = dsl_pool_hold(tosnap, FTAG, &dp); if (err != 0) return (err); if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { /* * We are sending a filesystem or volume. Ensure * that it doesn't change by owning the dataset. */ err = dsl_dataset_own(dp, tosnap, FTAG, &ds); owned = B_TRUE; } else { err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); } if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (fromsnap != NULL) { zfs_bookmark_phys_t zb; boolean_t is_clone = B_FALSE; int fsnamelen = strchr(tosnap, '@') - tosnap; /* * If the fromsnap is in a different filesystem, then * mark the send stream as a clone. */ if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || (fromsnap[fsnamelen] != '@' && fromsnap[fsnamelen] != '#')) { is_clone = B_TRUE; } if (strchr(fromsnap, '@')) { dsl_dataset_t *fromds; err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); if (err == 0) { if (!dsl_dataset_is_before(ds, fromds, 0)) err = SET_ERROR(EXDEV); zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (ds->ds_dir != fromds->ds_dir); dsl_dataset_rele(fromds, FTAG); } } else { err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); } if (err != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, large_block_ok, compressok, outfd, resumeobj, resumeoff, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, large_block_ok, compressok, outfd, resumeobj, resumeoff, fp, off); } if (owned) dsl_dataset_disown(ds, FTAG); else dsl_dataset_rele(ds, FTAG); return (err); } static int dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) { int err = 0; uint64_t size; /* * Assume that space (both on-disk and in-stream) is dominated by * data. We will adjust for indirect blocks and the copies property, * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). */ uint64_t recordsize; uint64_t record_count; objset_t *os; VERIFY0(dmu_objset_from_ds(ds, &os)); /* Assume all (uncompressed) blocks are recordsize. */ if (zfs_override_estimate_recordsize != 0) { recordsize = zfs_override_estimate_recordsize; } else if (os->os_phys->os_type == DMU_OST_ZVOL) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize); } else { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize); } if (err != 0) return (err); record_count = uncompressed / recordsize; /* * If we're estimating a send size for a compressed stream, use the * compressed data size to estimate the stream size. Otherwise, use the * uncompressed data size. */ size = stream_compressed ? compressed : uncompressed; /* * Subtract out approximate space used by indirect blocks. * Assume most space is used by data blocks (non-indirect, non-dnode). * Assume no ditto blocks or internal fragmentation. * * Therefore, space used by indirect blocks is sizeof(blkptr_t) per * block. */ size -= record_count * sizeof (blkptr_t); /* Add in the space for the record associated with each block. */ size += record_count * sizeof (dmu_replay_record_t); *sizep = size; return (0); } int dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, boolean_t stream_compressed, uint64_t *sizep) { dsl_pool_t *dp = ds->ds_dir->dd_pool; int err; uint64_t uncomp, comp; ASSERT(dsl_pool_config_held(dp)); /* tosnap must be a snapshot */ if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* fromsnap, if provided, must be a snapshot */ if (fromds != NULL && !fromds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* * fromsnap must be an earlier snapshot from the same fs as tosnap, * or the origin's fs. */ if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) return (SET_ERROR(EXDEV)); /* Get compressed and uncompressed size estimates of changed data. */ if (fromds == NULL) { uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; comp = dsl_dataset_phys(ds)->ds_compressed_bytes; } else { uint64_t used; err = dsl_dataset_space_written(fromds, ds, &used, &comp, &uncomp); if (err != 0) return (err); } err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, stream_compressed, sizep); /* * Add the size of the BEGIN and END records to the estimate. */ *sizep += 2 * sizeof (dmu_replay_record_t); return (err); } struct calculate_send_arg { uint64_t uncompressed; uint64_t compressed; }; /* * Simple callback used to traverse the blocks of a snapshot and sum their * uncompressed and compressed sizes. */ /* ARGSUSED */ static int dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { struct calculate_send_arg *space = arg; if (bp != NULL && !BP_IS_HOLE(bp)) { space->uncompressed += BP_GET_UCSIZE(bp); space->compressed += BP_GET_PSIZE(bp); } return (0); } /* * Given a desination snapshot and a TXG, calculate the approximate size of a * send stream sent from that TXG. from_txg may be zero, indicating that the * whole snapshot will be sent. */ int dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, boolean_t stream_compressed, uint64_t *sizep) { dsl_pool_t *dp = ds->ds_dir->dd_pool; int err; struct calculate_send_arg size = { 0 }; ASSERT(dsl_pool_config_held(dp)); /* tosnap must be a snapshot */ if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* verify that from_txg is before the provided snapshot was taken */ if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { return (SET_ERROR(EXDEV)); } /* * traverse the blocks of the snapshot with birth times after * from_txg, summing their uncompressed size */ err = traverse_dataset(ds, from_txg, TRAVERSE_POST, dmu_calculate_send_traversal, &size); if (err) return (err); err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, size.compressed, stream_compressed, sizep); return (err); } typedef struct dmu_recv_begin_arg { const char *drba_origin; dmu_recv_cookie_t *drba_cookie; cred_t *drba_cred; uint64_t drba_snapobj; } dmu_recv_begin_arg_t; static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid) { uint64_t val; int error; dsl_pool_t *dp = ds->ds_dir->dd_pool; /* temporary clone name must not exist */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EBUSY : error); /* new snapshot name must not exist */ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EEXIST : error); /* * Check snapshot limit before receiving. We'll recheck again at the * end, but might as well abort before receiving if we're already over * the limit. * * Note that we do not check the file system limit with * dsl_dir_fscount_check because the temporary %clones don't count * against that limit. */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); if (error != 0) return (error); if (fromguid != 0) { dsl_dataset_t *snap; uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Find snapshot in this dir that matches fromguid. */ while (obj != 0) { error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) return (SET_ERROR(ENODEV)); if (snap->ds_dir != ds->ds_dir) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ENODEV)); } if (dsl_dataset_phys(snap)->ds_guid == fromguid) break; obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); } if (obj == 0) return (SET_ERROR(ENODEV)); if (drba->drba_cookie->drc_force) { drba->drba_snapobj = obj; } else { /* * If we are not forcing, there must be no * changes since fromsnap. */ if (dsl_dataset_modified_since_snap(ds, snap)) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ETXTBSY)); } drba->drba_snapobj = ds->ds_prev->ds_object; } dsl_dataset_rele(snap, FTAG); } else { /* if full, then must be forced */ if (!drba->drba_cookie->drc_force) return (SET_ERROR(EEXIST)); /* start from $ORIGIN@$ORIGIN, if supported */ drba->drba_snapobj = dp->dp_origin_snap != NULL ? dp->dp_origin_snap->ds_object : 0; } return (0); } static int dmu_recv_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; int error; uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES || ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) return (SET_ERROR(EINVAL)); /* Verify pool version supports SA if SA_SPILL feature set */ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && spa_version(dp->dp_spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); if (drba->drba_cookie->drc_resumable && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) return (SET_ERROR(ENOTSUP)); /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED * record to a plain WRITE record, so the pool must have the * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED * records. Same with WRITE_EMBEDDED records that use LZ4 compression. */ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); /* * The receiving code doesn't know how to translate large blocks * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. Same with - * large dnodes. + * feature enabled if the stream has LARGE_BLOCKS. */ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate large dnodes + * to smaller ones, so the pool must have the LARGE_DNODE + * feature enabled if the stream has LARGE_DNODE. + */ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) return (SET_ERROR(ENOTSUP)); error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { /* target fs already exists; recv into temp clone */ /* Can't recv a clone into an existing fs */ if (flags & DRR_FLAG_CLONE || drba->drba_origin) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } error = recv_begin_check_existing_impl(drba, ds, fromguid); dsl_dataset_rele(ds, FTAG); } else if (error == ENOENT) { /* target fs does not exist; must be a full backup or clone */ char buf[ZFS_MAX_DATASET_NAME_LEN]; /* * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. */ if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || drba->drba_origin)) return (SET_ERROR(ENOENT)); /* * If we're receiving a full send as a clone, and it doesn't * contain all the necessary free records and freeobject * records, reject it. */ if (fromguid == 0 && drba->drba_origin && !(flags & DRR_FLAG_FREERECORDS)) return (SET_ERROR(EINVAL)); /* Open the parent of tofs */ ASSERT3U(strlen(tofs), <, sizeof (buf)); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); error = dsl_dataset_hold(dp, buf, FTAG, &ds); if (error != 0) return (error); /* * Check filesystem and snapshot limits before receiving. We'll * recheck snapshot limits again at the end (we create the * filesystems and increment those counts during begin_sync). */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (drba->drba_origin != NULL) { dsl_dataset_t *origin; error = dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } if (dsl_dataset_phys(origin)->ds_guid != fromguid && fromguid != 0) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENODEV)); } dsl_dataset_rele(origin, FTAG); } dsl_dataset_rele(ds, FTAG); error = 0; } return (error); } static void dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; struct drr_begin *drrb = drba->drba_cookie->drc_drrb; const char *tofs = drba->drba_cookie->drc_tofs; dsl_dataset_t *ds, *newds; uint64_t dsobj; int error; uint64_t crflags = 0; if (drrb->drr_flags & DRR_FLAG_CI_DATA) crflags |= DS_FLAG_CI_DATASET; error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { /* create temporary clone */ dsl_dataset_t *snap = NULL; if (drba->drba_snapobj != 0) { VERIFY0(dsl_dataset_hold_obj(dp, drba->drba_snapobj, FTAG, &snap)); } dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, snap, crflags, drba->drba_cred, tx); if (drba->drba_snapobj != 0) dsl_dataset_rele(snap, FTAG); dsl_dataset_rele(ds, FTAG); } else { dsl_dir_t *dd; const char *tail; dsl_dataset_t *origin = NULL; VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); if (drba->drba_origin != NULL) { VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin)); } /* Create new dataset. */ dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1, origin, crflags, drba->drba_cred, tx); if (origin != NULL) dsl_dataset_rele(origin, FTAG); dsl_dir_rele(dd, FTAG); drba->drba_cookie->drc_newfs = B_TRUE; } VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); if (drba->drba_cookie->drc_resumable) { dsl_dataset_zapify(newds, tx); if (drrb->drr_fromguid != 0) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, 8, 1, &drrb->drr_fromguid, tx)); } VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, 8, 1, &drrb->drr_toguid, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); uint64_t one = 1; uint64_t zero = 0; VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, 8, 1, &one, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, 8, 1, &zero, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, 8, 1, &zero, tx)); if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_LARGE_BLOCKS) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, 8, 1, &one, tx)); } if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_EMBED_DATA) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, 8, 1, &one, tx)); } if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_COMPRESSED) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, 8, 1, &one, tx)); } } dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; /* * If we actually created a non-clone, we need to create the * objset in our new dataset. */ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } rrw_exit(&newds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = newds; spa_history_log_internal_ds(newds, "receive", tx, ""); } static int dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; int error; uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES) return (SET_ERROR(EINVAL)); /* Verify pool version supports SA if SA_SPILL feature set */ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && spa_version(dp->dp_spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED * record to a plain WRITE record, so the pool must have the * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED * records. Same with WRITE_EMBEDDED records that use LZ4 compression. */ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); - /* - * The receiving code doesn't know how to translate large blocks - * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. Same with - * large dnodes. - */ - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) - return (SET_ERROR(ENOTSUP)); + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { /* %recv does not exist; continue in tofs */ error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error != 0) return (error); } /* check that ds is marked inconsistent */ if (!DS_IS_INCONSISTENT(ds)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* check that there is resuming data, and that the toguid matches */ if (!dsl_dataset_is_zapified(ds)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } uint64_t val; error = zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); if (error != 0 || drrb->drr_toguid != val) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* * Check if the receive is still running. If so, it will be owned. * Note that nothing else can own the dataset (e.g. after the receive * fails) because it will be marked inconsistent. */ if (dsl_dataset_has_owner(ds)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EBUSY)); } /* There should not be any snapshots of this fs yet. */ if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* * Note: resume point will be checked when we process the first WRITE * record. */ /* check that the origin matches */ val = 0; (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); if (drrb->drr_fromguid != val) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); const char *tofs = drba->drba_cookie->drc_tofs; dsl_dataset_t *ds; uint64_t dsobj; /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { /* %recv does not exist; continue in tofs */ VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); drba->drba_cookie->drc_newfs = B_TRUE; } /* clear the inconsistent flag so that we can own it */ ASSERT(DS_IS_INCONSISTENT(ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; dsobj = ds->ds_object; dsl_dataset_rele(ds, FTAG); VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); rrw_exit(&ds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = ds; spa_history_log_internal_ds(ds, "resume receive", tx, ""); } /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc) { dmu_recv_begin_arg_t drba = { 0 }; bzero(drc, sizeof (dmu_recv_cookie_t)); drc->drc_drr_begin = drr_begin; drc->drc_drrb = &drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; drc->drc_resumable = resumable; drc->drc_cred = CRED(); drc->drc_clone = (origin != NULL); if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; (void) fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); byteswap_record(drr_begin); } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { (void) fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); } else { return (SET_ERROR(EINVAL)); } drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RESUMING) { return (dsl_sync_task(tofs, dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL)); } else { return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL)); } } struct receive_record_arg { dmu_replay_record_t header; void *payload; /* Pointer to a buffer containing the payload */ /* * If the record is a write, pointer to the arc_buf_t containing the * payload. */ arc_buf_t *write_buf; int payload_size; uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ bqueue_node_t node; }; struct receive_writer_arg { objset_t *os; boolean_t byteswap; bqueue_t q; /* * These three args are used to signal to the main thread that we're * done. */ kmutex_t mutex; kcondvar_t cv; boolean_t done; int err; /* A map from guid to dataset to help handle dedup'd streams. */ avl_tree_t *guid_to_ds_map; boolean_t resumable; uint64_t last_object; uint64_t last_offset; uint64_t max_object; /* highest object ID referenced in stream */ uint64_t bytes_read; /* bytes read when current record created */ }; struct objlist { list_t list; /* List of struct receive_objnode. */ /* * Last object looked up. Used to assert that objects are being looked * up in ascending order. */ uint64_t last_lookup; }; struct receive_objnode { list_node_t node; uint64_t object; }; struct receive_arg { objset_t *os; kthread_t *td; struct file *fp; uint64_t voff; /* The current offset in the stream */ uint64_t bytes_read; /* * A record that has had its payload read in, but hasn't yet been handed * off to the worker thread. */ struct receive_record_arg *rrd; /* A record that has had its header read in, but not its payload. */ struct receive_record_arg *next_rrd; zio_cksum_t cksum; zio_cksum_t prev_cksum; int err; boolean_t byteswap; /* Sorted list of objects not to issue prefetches for. */ struct objlist ignore_objlist; }; typedef struct guid_map_entry { uint64_t guid; dsl_dataset_t *gme_ds; avl_node_t avlnode; } guid_map_entry_t; static int guid_compare(const void *arg1, const void *arg2) { const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; return (AVL_CMP(gmep1->guid, gmep2->guid)); } static void free_guid_map_onexit(void *arg) { avl_tree_t *ca = arg; void *cookie = NULL; guid_map_entry_t *gmep; while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { dsl_dataset_long_rele(gmep->gme_ds, gmep); dsl_dataset_rele(gmep->gme_ds, gmep); kmem_free(gmep, sizeof (guid_map_entry_t)); } avl_destroy(ca); kmem_free(ca, sizeof (avl_tree_t)); } static int restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid) { struct uio auio; struct iovec aiov; int error; aiov.iov_base = buf; aiov.iov_len = len; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_offset = off; auio.uio_td = ra->td; #ifdef _KERNEL error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); #else fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); error = EOPNOTSUPP; #endif *resid = auio.uio_resid; return (error); } static int receive_read(struct receive_arg *ra, int len, void *buf) { int done = 0; /* * The code doesn't rely on this (lengths being multiples of 8). See * comment in dump_bytes. */ ASSERT0(len % 8); while (done < len) { ssize_t resid; ra->err = restore_bytes(ra, buf + done, len - done, ra->voff, &resid); if (resid == len - done) { /* * Note: ECKSUM indicates that the receive * was interrupted and can potentially be resumed. */ ra->err = SET_ERROR(ECKSUM); } ra->voff += len - done - resid; done = len - resid; if (ra->err != 0) return (ra->err); } ra->bytes_read += len; ASSERT3U(done, ==, len); return (0); } noinline static void byteswap_record(dmu_replay_record_t *drr) { #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); DO64(drr_begin.drr_versioninfo); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); DO32(drr_begin.drr_flags); DO64(drr_begin.drr_toguid); DO64(drr_begin.drr_fromguid); break; case DRR_OBJECT: DO64(drr_object.drr_object); DO32(drr_object.drr_type); DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); DO64(drr_object.drr_toguid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); DO64(drr_freeobjects.drr_numobjs); DO64(drr_freeobjects.drr_toguid); break; case DRR_WRITE: DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); DO64(drr_write.drr_logical_size); DO64(drr_write.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); DO64(drr_write.drr_key.ddk_prop); DO64(drr_write.drr_compressed_size); break; case DRR_WRITE_BYREF: DO64(drr_write_byref.drr_object); DO64(drr_write_byref.drr_offset); DO64(drr_write_byref.drr_length); DO64(drr_write_byref.drr_toguid); DO64(drr_write_byref.drr_refguid); DO64(drr_write_byref.drr_refobject); DO64(drr_write_byref.drr_refoffset); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. drr_key.ddk_cksum); DO64(drr_write_byref.drr_key.ddk_prop); break; case DRR_WRITE_EMBEDDED: DO64(drr_write_embedded.drr_object); DO64(drr_write_embedded.drr_offset); DO64(drr_write_embedded.drr_length); DO64(drr_write_embedded.drr_toguid); DO32(drr_write_embedded.drr_lsize); DO32(drr_write_embedded.drr_psize); break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); DO64(drr_free.drr_toguid); break; case DRR_SPILL: DO64(drr_spill.drr_object); DO64(drr_spill.drr_length); DO64(drr_spill.drr_toguid); break; case DRR_END: DO64(drr_end.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); break; } if (drr->drr_type != DRR_BEGIN) { ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); } #undef DO64 #undef DO32 } static inline uint8_t deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) { if (bonus_type == DMU_OT_SA) { return (1); } else { return (1 + ((DN_OLD_MAX_BONUSLEN - MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); } } static void save_resume_state(struct receive_writer_arg *rwa, uint64_t object, uint64_t offset, dmu_tx_t *tx) { int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; if (!rwa->resumable) return; /* * We use ds_resume_bytes[] != 0 to indicate that we need to * update this on disk, so it must not be 0. */ ASSERT(rwa->bytes_read != 0); /* * We only resume from write records, which have a valid * (non-meta-dnode) object number. */ ASSERT(object != 0); /* * For resuming to work correctly, we must receive records in order, * sorted by object,offset. This is checked by the callers, but * assert it here for good measure. */ ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); ASSERT3U(rwa->bytes_read, >=, rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; } noinline static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) { dmu_object_info_t doi; dmu_tx_t *tx; uint64_t object; int err; - uint8_t dn_slots = drro->drr_dn_slots != 0 ? - drro->drr_dn_slots : DNODE_MIN_SLOTS; if (drro->drr_type == DMU_OT_NONE || !DMU_OT_IS_VALID(drro->drr_type) || !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || drro->drr_bonuslen > - DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || - dn_slots > - (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os)))) { return (SET_ERROR(EINVAL)); } err = dmu_object_info(rwa->os, drro->drr_object, &doi); - if (err != 0 && err != ENOENT && err != EEXIST) + if (err != 0 && err != ENOENT) return (SET_ERROR(EINVAL)); + object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; if (drro->drr_object > rwa->max_object) rwa->max_object = drro->drr_object; /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file * contents before we can change this type of metadata in the dnode. */ if (err == 0) { int nblkptr; - object = drro->drr_object; - nblkptr = deduce_nblkptr(drro->drr_bonustype, drro->drr_bonuslen); if (drro->drr_blksz != doi.doi_data_block_size || - nblkptr < doi.doi_nblkptr || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { + nblkptr < doi.doi_nblkptr) { err = dmu_free_long_range(rwa->os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } - } else if (err == EEXIST) { - /* - * The object requested is currently an interior slot of a - * multi-slot dnode. This will be resolved when the next txg - * is synced out, since the send stream will have told us - * to free this slot when we freed the associated dnode - * earlier in the stream. - */ - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - object = drro->drr_object; - } else { - /* object is free and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; } - /* - * If this is a multi-slot dnode there is a chance that this - * object will expand into a slot that is already used by - * another object from the previous snapshot. We must free - * these objects before we attempt to allocate the new dnode. - */ - if (dn_slots > 1) { - boolean_t need_sync = B_FALSE; - - for (uint64_t slot = drro->drr_object + 1; - slot < drro->drr_object + dn_slots; - slot++) { - dmu_object_info_t slot_doi; - - err = dmu_object_info(rwa->os, slot, &slot_doi); - if (err == ENOENT || err == EEXIST) - continue; - else if (err != 0) - return (err); - - err = dmu_free_long_object(rwa->os, slot); - - if (err != 0) - return (err); - - need_sync = B_TRUE; - } - - if (need_sync) - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - } - tx = dmu_tx_create(rwa->os); dmu_tx_hold_bonus(tx, object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } if (object == DMU_NEW_OBJECT) { /* currently free, want to be allocated */ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, - dn_slots << DNODE_SHIFT, tx); + drro->drr_dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || drro->drr_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* currently allocated, but with different properties */ err = dmu_object_reclaim(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } if (err != 0) { dmu_tx_commit(tx); return (SET_ERROR(EINVAL)); } dmu_object_set_checksum(rwa->os, drro->drr_object, drro->drr_checksumtype, tx); dmu_object_set_compress(rwa->os, drro->drr_object, drro->drr_compress, tx); if (data != NULL) { dmu_buf_t *db; VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); dmu_ot_byteswap[byteswap].ob_func(db->db_data, drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); return (0); } /* ARGSUSED */ noinline static int receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) { uint64_t obj; int next_err = 0; if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) return (SET_ERROR(EINVAL)); for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { dmu_object_info_t doi; int err; - err = dmu_object_info(rwa->os, obj, NULL); + err = dmu_object_info(rwa->os, obj, &doi); if (err == ENOENT) { obj++; - continue; + continue; } else if (err != 0) { return (err); } err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); if (obj > rwa->max_object) rwa->max_object = obj; } if (next_err != ESRCH) return (next_err); return (0); } noinline static int receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, arc_buf_t *abuf) { dmu_tx_t *tx; int err; if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); /* * For resuming to work, records must be in increasing order * by (object, offset). */ if (drrw->drr_object < rwa->last_object || (drrw->drr_object == rwa->last_object && drrw->drr_offset < rwa->last_offset)) { return (SET_ERROR(EINVAL)); } rwa->last_object = drrw->drr_object; rwa->last_offset = drrw->drr_offset; if (rwa->last_object > rwa->max_object) rwa->max_object = rwa->last_object; if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, DRR_WRITE_PAYLOAD_SIZE(drrw)); } /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */ dmu_buf_t *bonus; if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); /* * Note: If the receive fails, we want the resume stream to start * with the same record that we last successfully received (as opposed * to the next record), so that we can verify that we are * resuming from the correct location. */ save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); return (0); } /* * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed * streams to refer to a copy of the data that is already on the * system because it came in earlier in the stream. This function * finds the earlier copy of the data, and uses that copy instead of * data from the stream to fulfill this write. */ static int receive_write_byref(struct receive_writer_arg *rwa, struct drr_write_byref *drrwbr) { dmu_tx_t *tx; int err; guid_map_entry_t gmesrch; guid_map_entry_t *gmep; avl_index_t where; objset_t *ref_os = NULL; dmu_buf_t *dbp; if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) return (SET_ERROR(EINVAL)); /* * If the GUID of the referenced dataset is different from the * GUID of the target dataset, find the referenced dataset. */ if (drrwbr->drr_toguid != drrwbr->drr_refguid) { gmesrch.guid = drrwbr->drr_refguid; if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, &where)) == NULL) { return (SET_ERROR(EINVAL)); } if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) return (SET_ERROR(EINVAL)); } else { ref_os = rwa->os; } if (drrwbr->drr_object > rwa->max_object) rwa->max_object = drrwbr->drr_object; err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); if (err != 0) return (err); tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } dmu_write(rwa->os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); /* See comment in restore_write. */ save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); dmu_tx_commit(tx); return (0); } static int receive_write_embedded(struct receive_writer_arg *rwa, struct drr_write_embedded *drrwe, void *data) { dmu_tx_t *tx; int err; if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) return (EINVAL); if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) return (EINVAL); if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) return (EINVAL); if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); if (drrwe->drr_object > rwa->max_object) rwa->max_object = drrwe->drr_object; tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } dmu_write_embedded(rwa->os, drrwe->drr_object, drrwe->drr_offset, data, drrwe->drr_etype, drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); /* See comment in restore_write. */ save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); dmu_tx_commit(tx); return (0); } static int receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, void *data) { dmu_tx_t *tx; dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) return (SET_ERROR(EINVAL)); if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); if (drrs->drr_object > rwa->max_object) rwa->max_object = drrs->drr_object; VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_abort(tx); return (err); } dmu_buf_will_dirty(db_spill, tx); if (db_spill->db_size < drrs->drr_length) VERIFY(0 == dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); bcopy(data, db_spill->db_data, drrs->drr_length); dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_commit(tx); return (0); } /* ARGSUSED */ noinline static int receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { int err; if (drrf->drr_length != -1ULL && drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); if (drrf->drr_object > rwa->max_object) rwa->max_object = drrf->drr_object; err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); return (err); } /* used to destroy the drc_ds on error */ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { if (drc->drc_resumable) { /* wait for our resume state to be written to disk */ txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); } else { char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(drc->drc_ds, name); dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); (void) dsl_destroy_head(name); } } static void receive_cksum(struct receive_arg *ra, int len, void *buf) { if (ra->byteswap) { (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); } else { (void) fletcher_4_incremental_native(buf, len, &ra->cksum); } } /* * Read the payload into a buffer of size len, and update the current record's * payload field. * Allocate ra->next_rrd and read the next record's header into * ra->next_rrd->header. * Verify checksum of payload and next record. */ static int receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) { int err; if (len != 0) { ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); err = receive_read(ra, len, buf); if (err != 0) return (err); receive_cksum(ra, len, buf); /* note: rrd is NULL when reading the begin record's payload */ if (ra->rrd != NULL) { ra->rrd->payload = buf; ra->rrd->payload_size = len; ra->rrd->bytes_read = ra->bytes_read; } } ra->prev_cksum = ra->cksum; ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); err = receive_read(ra, sizeof (ra->next_rrd->header), &ra->next_rrd->header); ra->next_rrd->bytes_read = ra->bytes_read; if (err != 0) { kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); ra->next_rrd = NULL; return (err); } if (ra->next_rrd->header.drr_type == DRR_BEGIN) { kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); ra->next_rrd = NULL; return (SET_ERROR(EINVAL)); } /* * Note: checksum is of everything up to but not including the * checksum itself. */ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); receive_cksum(ra, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &ra->next_rrd->header); zio_cksum_t cksum_orig = ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; zio_cksum_t *cksump = &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; if (ra->byteswap) byteswap_record(&ra->next_rrd->header); if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); ra->next_rrd = NULL; return (SET_ERROR(ECKSUM)); } receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); return (0); } static void objlist_create(struct objlist *list) { list_create(&list->list, sizeof (struct receive_objnode), offsetof(struct receive_objnode, node)); list->last_lookup = 0; } static void objlist_destroy(struct objlist *list) { for (struct receive_objnode *n = list_remove_head(&list->list); n != NULL; n = list_remove_head(&list->list)) { kmem_free(n, sizeof (*n)); } list_destroy(&list->list); } /* * This function looks through the objlist to see if the specified object number * is contained in the objlist. In the process, it will remove all object * numbers in the list that are smaller than the specified object number. Thus, * any lookup of an object number smaller than a previously looked up object * number will always return false; therefore, all lookups should be done in * ascending order. */ static boolean_t objlist_exists(struct objlist *list, uint64_t object) { struct receive_objnode *node = list_head(&list->list); ASSERT3U(object, >=, list->last_lookup); list->last_lookup = object; while (node != NULL && node->object < object) { VERIFY3P(node, ==, list_remove_head(&list->list)); kmem_free(node, sizeof (*node)); node = list_head(&list->list); } return (node != NULL && node->object == object); } /* * The objlist is a list of object numbers stored in ascending order. However, * the insertion of new object numbers does not seek out the correct location to * store a new object number; instead, it appends it to the list for simplicity. * Thus, any users must take care to only insert new object numbers in ascending * order. */ static void objlist_insert(struct objlist *list, uint64_t object) { struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP); node->object = object; #ifdef ZFS_DEBUG struct receive_objnode *last_object = list_tail(&list->list); uint64_t last_objnum = (last_object != NULL ? last_object->object : 0); ASSERT3U(node->object, >, last_objnum); #endif list_insert_tail(&list->list, node); } /* * Issue the prefetch reads for any necessary indirect blocks. * * We use the object ignore list to tell us whether or not to issue prefetches * for a given object. We do this for both correctness (in case the blocksize * of an object has changed) and performance (if the object doesn't exist, don't * needlessly try to issue prefetches). We also trim the list as we go through * the stream to prevent it from growing to an unbounded size. * * The object numbers within will always be in sorted order, and any write * records we see will also be in sorted order, but they're not sorted with * respect to each other (i.e. we can get several object records before * receiving each object's write records). As a result, once we've reached a * given object number, we can safely remove any reference to lower object * numbers in the ignore list. In practice, we receive up to 32 object records * before receiving write records, so the list can have up to 32 nodes in it. */ /* ARGSUSED */ static void receive_read_prefetch(struct receive_arg *ra, uint64_t object, uint64_t offset, uint64_t length) { if (!objlist_exists(&ra->ignore_objlist, object)) { dmu_prefetch(ra->os, object, 1, offset, length, ZIO_PRIORITY_SYNC_READ); } } /* * Read records off the stream, issuing any necessary prefetches. */ static int receive_read_record(struct receive_arg *ra) { int err; switch (ra->rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); void *buf = kmem_zalloc(size, KM_SLEEP); dmu_object_info_t doi; err = receive_read_payload_and_next_header(ra, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } err = dmu_object_info(ra->os, drro->drr_object, &doi); /* * See receive_read_prefetch for an explanation why we're * storing this object in the ignore_obj_list. */ if (err == ENOENT || (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { objlist_insert(&ra->ignore_objlist, drro->drr_object); err = 0; } return (err); } case DRR_FREEOBJECTS: { err = receive_read_payload_and_next_header(ra, 0, NULL); return (err); } case DRR_WRITE: { struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; arc_buf_t *abuf; boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); if (DRR_WRITE_COMPRESSED(drrw)) { ASSERT3U(drrw->drr_compressed_size, >, 0); ASSERT3U(drrw->drr_logical_size, >=, drrw->drr_compressed_size); ASSERT(!is_meta); abuf = arc_loan_compressed_buf( dmu_objset_spa(ra->os), drrw->drr_compressed_size, drrw->drr_logical_size, drrw->drr_compressiontype); } else { abuf = arc_loan_buf(dmu_objset_spa(ra->os), is_meta, drrw->drr_logical_size); } err = receive_read_payload_and_next_header(ra, DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); if (err != 0) { dmu_return_arcbuf(abuf); return (err); } ra->rrd->write_buf = abuf; receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); return (err); } case DRR_WRITE_BYREF: { struct drr_write_byref *drrwb = &ra->rrd->header.drr_u.drr_write_byref; err = receive_read_payload_and_next_header(ra, 0, NULL); receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, drrwb->drr_length); return (err); } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &ra->rrd->header.drr_u.drr_write_embedded; uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); void *buf = kmem_zalloc(size, KM_SLEEP); err = receive_read_payload_and_next_header(ra, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length); return (err); } case DRR_FREE: { /* * It might be beneficial to prefetch indirect blocks here, but * we don't really have the data to decide for sure. */ err = receive_read_payload_and_next_header(ra, 0, NULL); return (err); } case DRR_END: { struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) return (SET_ERROR(ECKSUM)); return (0); } case DRR_SPILL: { struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); err = receive_read_payload_and_next_header(ra, drrs->drr_length, buf); if (err != 0) kmem_free(buf, drrs->drr_length); return (err); } default: return (SET_ERROR(EINVAL)); } } /* * Commit the records to the pool. */ static int receive_process_record(struct receive_writer_arg *rwa, struct receive_record_arg *rrd) { int err; /* Processing in order, therefore bytes_read should be increasing. */ ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); rwa->bytes_read = rrd->bytes_read; switch (rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &rrd->header.drr_u.drr_object; err = receive_object(rwa, drro, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; return (err); } case DRR_FREEOBJECTS: { struct drr_freeobjects *drrfo = &rrd->header.drr_u.drr_freeobjects; return (receive_freeobjects(rwa, drrfo)); } case DRR_WRITE: { struct drr_write *drrw = &rrd->header.drr_u.drr_write; err = receive_write(rwa, drrw, rrd->write_buf); /* if receive_write() is successful, it consumes the arc_buf */ if (err != 0) dmu_return_arcbuf(rrd->write_buf); rrd->write_buf = NULL; rrd->payload = NULL; return (err); } case DRR_WRITE_BYREF: { struct drr_write_byref *drrwbr = &rrd->header.drr_u.drr_write_byref; return (receive_write_byref(rwa, drrwbr)); } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &rrd->header.drr_u.drr_write_embedded; err = receive_write_embedded(rwa, drrwe, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; return (err); } case DRR_FREE: { struct drr_free *drrf = &rrd->header.drr_u.drr_free; return (receive_free(rwa, drrf)); } case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; err = receive_spill(rwa, drrs, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; return (err); } default: return (SET_ERROR(EINVAL)); } } /* * dmu_recv_stream's worker thread; pull records off the queue, and then call * receive_process_record When we're done, signal the main thread and exit. */ static void receive_writer_thread(void *arg) { struct receive_writer_arg *rwa = arg; struct receive_record_arg *rrd; for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; rrd = bqueue_dequeue(&rwa->q)) { /* * If there's an error, the main thread will stop putting things * on the queue, but we need to clear everything in it before we * can exit. */ if (rwa->err == 0) { rwa->err = receive_process_record(rwa, rrd); } else if (rrd->write_buf != NULL) { dmu_return_arcbuf(rrd->write_buf); rrd->write_buf = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } kmem_free(rrd, sizeof (*rrd)); } kmem_free(rrd, sizeof (*rrd)); mutex_enter(&rwa->mutex); rwa->done = B_TRUE; cv_signal(&rwa->cv); mutex_exit(&rwa->mutex); thread_exit(); } static int resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) { uint64_t val; objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; uint64_t dsobj = dmu_objset_id(ra->os); uint64_t resume_obj, resume_off; if (nvlist_lookup_uint64(begin_nvl, "resume_object", &resume_obj) != 0 || nvlist_lookup_uint64(begin_nvl, "resume_offset", &resume_off) != 0) { return (SET_ERROR(EINVAL)); } VERIFY0(zap_lookup(mos, dsobj, DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); if (resume_obj != val) return (SET_ERROR(EINVAL)); VERIFY0(zap_lookup(mos, dsobj, DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); if (resume_off != val) return (SET_ERROR(EINVAL)); return (0); } /* * Read in the stream's records, one by one, and apply them to the pool. There * are two threads involved; the thread that calls this function will spin up a * worker thread, read the records off the stream one by one, and issue * prefetches for any necessary indirect blocks. It will then push the records * onto an internal blocking queue. The worker thread will pull the records off * the queue, and actually write the data into the DMU. This way, the worker * thread doesn't have to wait for reads to complete, since everything it needs * (the indirect blocks) will be prefetched. * * NB: callers *must* call dmu_recv_end() if this succeeds. */ int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, int cleanup_fd, uint64_t *action_handlep) { int err = 0; struct receive_arg ra = { 0 }; struct receive_writer_arg rwa = { 0 }; int featureflags; nvlist_t *begin_nvl = NULL; ra.byteswap = drc->drc_byteswap; ra.cksum = drc->drc_cksum; ra.td = curthread; ra.fp = fp; ra.voff = *voffp; if (dsl_dataset_is_zapified(drc->drc_ds)) { (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, sizeof (ra.bytes_read), 1, &ra.bytes_read); } objlist_create(&ra.ignore_objlist); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, DMU_SUBSTREAM); ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os)); ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); /* if this stream is dedup'ed, set up the avl tree for guid mapping */ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { minor_t minor; if (cleanup_fd == -1) { ra.err = SET_ERROR(EBADF); goto out; } ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); if (ra.err != 0) { cleanup_fd = -1; goto out; } if (*action_handlep == 0) { rwa.guid_to_ds_map = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); avl_create(rwa.guid_to_ds_map, guid_compare, sizeof (guid_map_entry_t), offsetof(guid_map_entry_t, avlnode)); err = zfs_onexit_add_cb(minor, free_guid_map_onexit, rwa.guid_to_ds_map, action_handlep); if (ra.err != 0) goto out; } else { err = zfs_onexit_cb_data(minor, *action_handlep, (void **)&rwa.guid_to_ds_map); if (ra.err != 0) goto out; } drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; } uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; void *payload = NULL; if (payloadlen != 0) payload = kmem_alloc(payloadlen, KM_SLEEP); err = receive_read_payload_and_next_header(&ra, payloadlen, payload); if (err != 0) { if (payloadlen != 0) kmem_free(payload, payloadlen); goto out; } if (payloadlen != 0) { err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); kmem_free(payload, payloadlen); if (err != 0) goto out; } if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { err = resume_check(&ra, begin_nvl); if (err != 0) goto out; } (void) bqueue_init(&rwa.q, zfs_recv_queue_length, offsetof(struct receive_record_arg, node)); cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL); mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); rwa.os = ra.os; rwa.byteswap = drc->drc_byteswap; rwa.resumable = drc->drc_resumable; (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0, TS_RUN, minclsyspri); /* * We're reading rwa.err without locks, which is safe since we are the * only reader, and the worker thread is the only writer. It's ok if we * miss a write for an iteration or two of the loop, since the writer * thread will keep freeing records we send it until we send it an eos * marker. * * We can leave this loop in 3 ways: First, if rwa.err is * non-zero. In that case, the writer thread will free the rrd we just * pushed. Second, if we're interrupted; in that case, either it's the * first loop and ra.rrd was never allocated, or it's later, and ra.rrd * has been handed off to the writer thread who will free it. Finally, * if receive_read_record fails or we're at the end of the stream, then * we free ra.rrd and exit. */ while (rwa.err == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { err = SET_ERROR(EINTR); break; } ASSERT3P(ra.rrd, ==, NULL); ra.rrd = ra.next_rrd; ra.next_rrd = NULL; /* Allocates and loads header into ra.next_rrd */ err = receive_read_record(&ra); if (ra.rrd->header.drr_type == DRR_END || err != 0) { kmem_free(ra.rrd, sizeof (*ra.rrd)); ra.rrd = NULL; break; } bqueue_enqueue(&rwa.q, ra.rrd, sizeof (struct receive_record_arg) + ra.rrd->payload_size); ra.rrd = NULL; } if (ra.next_rrd == NULL) ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP); ra.next_rrd->eos_marker = B_TRUE; bqueue_enqueue(&rwa.q, ra.next_rrd, 1); mutex_enter(&rwa.mutex); while (!rwa.done) { cv_wait(&rwa.cv, &rwa.mutex); } mutex_exit(&rwa.mutex); /* * If we are receiving a full stream as a clone, all object IDs which * are greater than the maximum ID referenced in the stream are * by definition unused and must be freed. Note that it's possible that * we've resumed this send and the first record we received was the END * record. In that case, max_object would be 0, but we shouldn't start * freeing all objects from there; instead we should start from the * resumeobj. */ if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { uint64_t obj; if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0) obj = 0; if (rwa.max_object > obj) obj = rwa.max_object; obj++; int free_err = 0; int next_err = 0; while (next_err == 0) { free_err = dmu_free_long_object(rwa.os, obj); if (free_err != 0 && free_err != ENOENT) break; next_err = dmu_object_next(rwa.os, &obj, FALSE, 0); } if (err == 0) { if (free_err != 0 && free_err != ENOENT) err = free_err; else if (next_err != ESRCH) err = next_err; } } cv_destroy(&rwa.cv); mutex_destroy(&rwa.mutex); bqueue_destroy(&rwa.q); if (err == 0) err = rwa.err; out: nvlist_free(begin_nvl); if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) zfs_onexit_fd_rele(cleanup_fd); if (err != 0) { /* * Clean up references. If receive is not resumable, * destroy what we created, so we don't leave it in * the inconsistent state. */ dmu_recv_cleanup_ds(drc); } *voffp = ra.voff; objlist_destroy(&ra.ignore_objlist); return (err); } static int dmu_recv_end_check(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); int error; ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); if (!drc->drc_newfs) { dsl_dataset_t *origin_head; error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); if (error != 0) return (error); if (drc->drc_force) { /* * We will destroy any snapshots in tofs (i.e. before * origin_head) that are after the origin (which is * the snap before drc_ds, because drc_ds can not * have any snaps of its own). */ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) break; if (snap->ds_dir != origin_head->ds_dir) error = SET_ERROR(EINVAL); if (error == 0) { error = dsl_destroy_snapshot_check_impl( snap, B_FALSE); } obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); if (error != 0) break; } if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } } error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, origin_head, drc->drc_force, drc->drc_owner, tx); if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } error = dsl_dataset_snapshot_check_impl(origin_head, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); dsl_dataset_rele(origin_head, FTAG); if (error != 0) return (error); error = dsl_destroy_head_check_impl(drc->drc_ds, 1); } else { error = dsl_dataset_snapshot_check_impl(drc->drc_ds, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); } return (error); } static void dmu_recv_end_sync(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_history_log_internal_ds(drc->drc_ds, "finish receiving", tx, "snap=%s", drc->drc_tosnap); if (!drc->drc_newfs) { dsl_dataset_t *origin_head; VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head)); if (drc->drc_force) { /* * Destroy any snapshots of drc_tofs (origin_head) * after the origin (the snap before drc_ds). */ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &snap)); ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_destroy_snapshot_sync_impl(snap, B_FALSE, tx); dsl_dataset_rele(snap, FTAG); } } VERIFY3P(drc->drc_ds->ds_prev, ==, origin_head->ds_prev); dsl_dataset_clone_swap_sync_impl(drc->drc_ds, origin_head, tx); dsl_dataset_snapshot_sync_impl(origin_head, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(origin_head->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(origin_head->ds_dbuf, tx); dsl_dataset_phys(origin_head)->ds_flags &= ~DS_FLAG_INCONSISTENT; drc->drc_newsnapobj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; dsl_dataset_rele(origin_head, FTAG); dsl_destroy_head_sync_impl(drc->drc_ds, tx); if (drc->drc_owner != NULL) VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); } else { dsl_dataset_t *ds = drc->drc_ds; dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); dsl_dataset_phys(ds->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(ds->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(ds->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; if (dsl_dataset_has_resume_receive_state(ds)) { (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_FROMGUID, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OBJECT, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OFFSET, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_BYTES, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TONAME, tx); } drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; } /* * Release the hold from dmu_recv_begin. This must be done before * we return to open context, so that when we free the dataset's dnode, * we can evict its bonus buffer. */ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); drc->drc_ds = NULL; } static int add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) { dsl_pool_t *dp; dsl_dataset_t *snapds; guid_map_entry_t *gmep; int err; ASSERT(guid_map != NULL); err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); if (err == 0) { gmep->guid = dsl_dataset_phys(snapds)->ds_guid; gmep->gme_ds = snapds; avl_add(guid_map, gmep); dsl_dataset_long_hold(snapds, gmep); } else kmem_free(gmep, sizeof (*gmep)); dsl_pool_rele(dp, FTAG); return (err); } static int dmu_recv_end_modified_blocks = 3; static int dmu_recv_existing_end(dmu_recv_cookie_t *drc) { #ifdef _KERNEL /* * We will be destroying the ds; make sure its origin is unmounted if * necessary. */ char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(drc->drc_ds, name); zfs_destroy_unmount_origin(name); #endif return (dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); } static int dmu_recv_new_end(dmu_recv_cookie_t *drc) { return (dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); } int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) { int error; drc->drc_owner = owner; if (drc->drc_newfs) error = dmu_recv_new_end(drc); else error = dmu_recv_existing_end(drc); if (error != 0) { dmu_recv_cleanup_ds(drc); } else if (drc->drc_guid_to_ds_map != NULL) { (void) add_ds_to_guidmap(drc->drc_tofs, drc->drc_guid_to_ds_map, drc->drc_newsnapobj); } return (error); } /* * Return TRUE if this objset is currently being received into. */ boolean_t dmu_objset_is_receiving(objset_t *os) { return (os->os_dsl_dataset != NULL && os->os_dsl_dataset->ds_owner == dmu_recv_tag); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 351077) @@ -1,1344 +1,1342 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); tx->tx_dir = dd; if (dd != NULL) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); tx->tx_start = gethrtime(); return (tx); } dmu_tx_t * dmu_tx_create(objset_t *os) { dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; return (tx); } dmu_tx_t * dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) { dmu_tx_t *tx = dmu_tx_create_dd(NULL); txg_verify(dp->dp_spa, txg); tx->tx_pool = dp; tx->tx_txg = txg; tx->tx_anyobj = TRUE; return (tx); } int dmu_tx_is_syncing(dmu_tx_t *tx) { return (tx->tx_anyobj); } int dmu_tx_private_ok(dmu_tx_t *tx) { return (tx->tx_anyobj); } static dmu_tx_hold_t * dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) { dmu_tx_hold_t *txh; if (dn != NULL) { (void) refcount_add(&dn->dn_holds, tx); if (tx->tx_txg != 0) { mutex_enter(&dn->dn_mtx); /* * dn->dn_assigned_txg == tx->tx_txg doesn't pose a * problem, but there's no way for it to happen (for * now, at least). */ ASSERT(dn->dn_assigned_txg == 0); dn->dn_assigned_txg = tx->tx_txg; (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } } txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); txh->txh_tx = tx; txh->txh_dnode = dn; refcount_create(&txh->txh_space_towrite); refcount_create(&txh->txh_memory_tohold); txh->txh_type = type; txh->txh_arg1 = arg1; txh->txh_arg2 = arg2; list_insert_tail(&tx->tx_holds, txh); return (txh); } static dmu_tx_hold_t * dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) { dnode_t *dn = NULL; dmu_tx_hold_t *txh; int err; if (object != DMU_NEW_OBJECT) { err = dnode_hold(os, object, FTAG, &dn); if (err != 0) { tx->tx_err = err; return (NULL); } } txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); if (dn != NULL) dnode_rele(dn, FTAG); return (txh); } void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) { /* * If we're syncing, they can manipulate any object anyhow, and * the hold on the dnode_t can cause problems. */ if (!dmu_tx_is_syncing(tx)) (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); } /* * This function reads specified data from disk. The specified data will * be needed to perform the transaction -- i.e, it will be read after * we do dmu_tx_assign(). There are two reasons that we read the data now * (before dmu_tx_assign()): * * 1. Reading it now has potentially better performance. The transaction * has not yet been assigned, so the TXG is not held open, and also the * caller typically has less locks held when calling dmu_tx_hold_*() than * after the transaction has been assigned. This reduces the lock (and txg) * hold times, thus reducing lock contention. * * 2. It is easier for callers (primarily the ZPL) to handle i/o errors * that are detected before they start making changes to the DMU state * (i.e. now). Once the transaction has been assigned, and some DMU * state has been changed, it can be difficult to recover from an i/o * error (e.g. to undo the changes already made in memory at the DMU * layer). Typically code to do so does not exist in the caller -- it * assumes that the data has already been cached and thus i/o errors are * not possible. * * It has been observed that the i/o initiated here can be a performance * problem, and it appears to be optional, because we don't look at the * data which is read. However, removing this read would only serve to * move the work elsewhere (after the dmu_tx_assign()), where it may * have a greater impact on performance (in addition to the impact on * fault tolerance noted above). */ static int dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) { int err; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold_level(dn, level, blkid, FTAG); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); dbuf_rele(db, FTAG); return (err); } /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; int err = 0; if (len == 0) return; (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) err = SET_ERROR(EFBIG); if (dn == NULL) return; /* * For i/o error checking, read the blocks that will be needed * to perform the write: the first and last level-0 blocks (if * they are not aligned, i.e. if they are partial-block writes), * and all the level-1 blocks. */ if (dn->dn_maxblkid == 0) { if (off < dn->dn_datablksz && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { txh->txh_tx->tx_err = err; } } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ uint64_t start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err != 0) { txh->txh_tx->tx_err = err; } } /* last level-0 block */ uint64_t end = (off + len - 1) >> dn->dn_datablkshift; if (end != start && end <= dn->dn_maxblkid && P2PHASE(off + len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err != 0) { txh->txh_tx->tx_err = err; } } /* level-1 blocks */ if (dn->dn_nlevels > 1) { int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = (start >> shft) + 1; i < end >> shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { txh->txh_tx->tx_err = err; } } } err = zio_wait(zio); if (err != 0) { txh->txh_tx->tx_err = err; } } } static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { (void) refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE, FTAG); } void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, off, len); if (txh != NULL) { dmu_tx_count_write(txh, off, len); dmu_tx_count_dnode(txh); } } void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, 0, 0); if (txh == NULL) return; dnode_t *dn = txh->txh_dnode; (void) refcount_add_many(&txh->txh_space_towrite, 1ULL << dn->dn_indblkshift, FTAG); dmu_tx_count_dnode(txh); } void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); if (txh != NULL) { dmu_tx_count_write(txh, off, len); dmu_tx_count_dnode(txh); } } /* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and * this transaction will be able to use half of the pool space overhead * (see dsl_pool_adjustedsize()). Therefore this function should only * be called for transactions that we expect will not cause a net increase * in the amount of space used (but it's OK if that is occasionally not true). */ void dmu_tx_mark_netfree(dmu_tx_t *tx) { tx->tx_netfree = B_TRUE; } static void dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dmu_tx_t *tx; dnode_t *dn; int err; tx = txh->txh_tx; ASSERT(tx->tx_txg == 0); dn = txh->txh_dnode; dmu_tx_count_dnode(txh); if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; /* * For i/o error checking, we read the first and last level-0 * blocks if they are not aligned, and all the level-1 blocks. * * Note: dbuf_free_range() assumes that we have not instantiated * any level-0 dbufs that will be completely freed. Therefore we must * exercise care to not read or count the first and last blocks * if they are blocksize-aligned. */ if (dn->dn_datablkshift == 0) { if (off != 0 || len < dn->dn_datablksz) dmu_tx_count_write(txh, 0, dn->dn_datablksz); } else { /* first block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off, 1); /* last block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off + len, 1); } /* * Check level-1 blocks. */ if (dn->dn_nlevels > 1) { int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t start = off >> shift; uint64_t end = (off + len) >> shift; ASSERT(dn->dn_indblkshift != 0); /* * dnode_reallocate() can result in an object with indirect * blocks having an odd data block size. In this case, * just check the single block. */ if (dn->dn_datablkshift == 0) start = end = 0; zio_t *zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (uint64_t i = start; i <= end; i++) { uint64_t ibyte = i << shift; err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; if (err == ESRCH || i > end) break; if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } (void) refcount_add_many(&txh->txh_memory_tohold, 1 << dn->dn_indblkshift, FTAG); err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } } err = zio_wait(zio); if (err != 0) { tx->tx_err = err; return; } } } void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE, off, len); if (txh != NULL) (void) dmu_tx_hold_free_impl(txh, off, len); } void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); if (txh != NULL) (void) dmu_tx_hold_free_impl(txh, off, len); } static void dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) { dmu_tx_t *tx = txh->txh_tx; dnode_t *dn; int err; ASSERT(tx->tx_txg == 0); dn = txh->txh_dnode; dmu_tx_count_dnode(txh); /* * Modifying a almost-full microzap is around the worst case (128KB) * * If it is a fat zap, the worst case would be 7*16KB=112KB: * - 3 blocks overwritten: target leaf, ptrtbl block, header block * - 4 new blocks written if adding: * - 2 blocks for possibly split leaves, * - 2 grown ptrtbl blocks */ (void) refcount_add_many(&txh->txh_space_towrite, MZAP_MAX_BLKSZ, FTAG); if (dn == NULL) return; ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 || name == NULL) { /* * This is a microzap (only one block), or we don't know * the name. Check the first block for i/o errors. */ err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { tx->tx_err = err; } } else { /* * Access the name so that we'll check for i/o errors to * the leaf blocks, etc. We ignore ENOENT, as this name * may not yet exist. */ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); if (err == EIO || err == ECKSUM || err == ENXIO) { tx->tx_err = err; } } } void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP, add, (uintptr_t)name); if (txh != NULL) dmu_tx_hold_zap_impl(txh, name); } void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT(dn != NULL); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); if (txh != NULL) dmu_tx_hold_zap_impl(txh, name); } void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, space, 0); (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); } #ifdef ZFS_DEBUG void dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { boolean_t match_object = B_FALSE; boolean_t match_offset = B_FALSE; DB_DNODE_ENTER(db); dnode_t *dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); if (tx->tx_anyobj) { DB_DNODE_EXIT(db); return; } /* XXX No checking on the meta dnode for now */ if (db->db.db_object == DMU_META_DNODE_OBJECT) { DB_DNODE_EXIT(db); return; } for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) match_object = TRUE; if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { int datablkshift = dn->dn_datablkshift ? dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; int shift = datablkshift + epbs * db->db_level; uint64_t beginblk = shift >= 64 ? 0 : (txh->txh_arg1 >> shift); uint64_t endblk = shift >= 64 ? 0 : ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); uint64_t blkid = db->db_blkid; /* XXX txh_arg2 better not be zero... */ dprintf("found txh type %x beginblk=%llx endblk=%llx\n", txh->txh_type, beginblk, endblk); switch (txh->txh_type) { case THT_WRITE: if (blkid >= beginblk && blkid <= endblk) match_offset = TRUE; /* * We will let this hold work for the bonus * or spill buffer so that we don't need to * hold it when creating a new object. */ if (blkid == DMU_BONUS_BLKID || blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, * thus dirtying the new TLIBs. Or the * might have to change the block size, * thus dirying the new lvl=0 blk=0. */ if (blkid == 0) match_offset = TRUE; break; case THT_FREE: /* * We will dirty all the level 1 blocks in * the free range and perhaps the first and * last level 0 block. */ if (blkid >= beginblk && (blkid <= endblk || txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; case THT_SPILL: if (blkid == DMU_SPILL_BLKID) match_offset = TRUE; break; case THT_BONUS: if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: match_offset = TRUE; break; case THT_NEWOBJECT: match_object = TRUE; break; default: ASSERT(!"bad txh_type"); } } if (match_object && match_offset) { DB_DNODE_EXIT(db); return; } } DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); } #endif /* * If we can't do 10 iops, something is wrong. Let us go ahead * and hit zfs_dirty_data_max. */ hrtime_t zfs_delay_max_ns = MSEC2NSEC(100); int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ /* * We delay transactions when we've determined that the backend storage * isn't able to accommodate the rate of incoming writes. * * If there is already a transaction waiting, we delay relative to when * that transaction finishes waiting. This way the calculated min_time * is independent of the number of threads concurrently executing * transactions. * * If we are the only waiter, wait relative to when the transaction * started, rather than the current time. This credits the transaction for * "time already served", e.g. reading indirect blocks. * * The minimum time for a transaction to take is calculated as: * min_time = scale * (dirty - min) / (max - dirty) * min_time is then capped at zfs_delay_max_ns. * * The delay has two degrees of freedom that can be adjusted via tunables. * The percentage of dirty data at which we start to delay is defined by * zfs_delay_min_dirty_percent. This should typically be at or above * zfs_vdev_async_write_active_max_dirty_percent so that we only start to * delay after writing at full speed has failed to keep up with the incoming * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly * speaking, this variable determines the amount of delay at the midpoint of * the curve. * * delay * 10ms +-------------------------------------------------------------*+ * | *| * 9ms + *+ * | *| * 8ms + *+ * | * | * 7ms + * + * | * | * 6ms + * + * | * | * 5ms + * + * | * | * 4ms + * + * | * | * 3ms + * + * | * | * 2ms + (midpoint) * + * | | ** | * 1ms + v *** + * | zfs_delay_scale ----------> ******** | * 0 +-------------------------------------*********----------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note that since the delay is added to the outstanding time remaining on the * most recent transaction, the delay is effectively the inverse of IOPS. * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve * was chosen such that small changes in the amount of accumulated dirty data * in the first 3/4 of the curve yield relatively small differences in the * amount of delay. * * The effects can be easier to understand when the amount of delay is * represented on a log scale: * * delay * 100ms +-------------------------------------------------------------++ * + + * | | * + *+ * 10ms + *+ * + ** + * | (midpoint) ** | * + | ** + * 1ms + v **** + * + zfs_delay_scale ----------> ***** + * | **** | * + **** + * 100us + ** + * + * + * | * | * + * + * 10us + * + * + + * | | * + + * +--------------------------------------------------------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note here that only as the amount of dirty data approaches its limit does * the delay start to increase rapidly. The goal of a properly tuned system * should be to keep the amount of dirty data out of that range by first * ensuring that the appropriate limits are set for the I/O scheduler to reach * optimal throughput on the backend storage, and then by changing the value * of zfs_delay_scale to increase the steepness of the curve. */ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; hrtime_t wakeup, min_tx_time, now; if (dirty <= delay_min_bytes) return; /* * The caller has already waited until we are under the max. * We make them pass us the amount of dirty data so we don't * have to handle the case of it being >= the max, which could * cause a divide-by-zero if it's == the max. */ ASSERT3U(dirty, <, zfs_dirty_data_max); now = gethrtime(); min_tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); if (now > tx->tx_start + min_tx_time) return; min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, uint64_t, min_tx_time); mutex_enter(&dp->dp_lock); wakeup = MAX(tx->tx_start + min_tx_time, dp->dp_last_wakeup + min_tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); #ifdef _KERNEL #ifdef illumos mutex_enter(&curthread->t_delay_lock); while (cv_timedwait_hires(&curthread->t_delay_cv, &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns, CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0) continue; mutex_exit(&curthread->t_delay_lock); #else pause_sbt("dmu_tx_delay", nstosbt(wakeup), nstosbt(zfs_delay_resolution_ns), C_ABSOLUTE); #endif #else hrtime_t delta = wakeup - gethrtime(); struct timespec ts; ts.tv_sec = delta / NANOSEC; ts.tv_nsec = delta % NANOSEC; (void) nanosleep(&ts, NULL); #endif } /* * This routine attempts to assign the transaction to a transaction group. * To do so, we must determine if there is sufficient free space on disk. * * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() * on it), then it is assumed that there is sufficient free space, * unless there's insufficient slop space in the pool (see the comment * above spa_slop_shift in spa_misc.c). * * If it is not a "netfree" transaction, then if the data already on disk * is over the allowed usage (e.g. quota), this will fail with EDQUOT or * ENOSPC. Otherwise, if the current rough estimate of pending changes, * plus the rough estimate of this transaction's changes, may exceed the * allowed usage, then this will fail with ERESTART, which will cause the * caller to wait for the pending changes to be written to disk (by waiting * for the next TXG to open), and then check the space usage again. * * The rough estimate of pending changes is comprised of the sum of: * * - this transaction's holds' txh_space_towrite * * - dd_tempreserved[], which is the sum of in-flight transactions' * holds' txh_space_towrite (i.e. those transactions that have called * dmu_tx_assign() but not yet called dmu_tx_commit()). * * - dd_space_towrite[], which is the amount of dirtied dbufs. * * Note that all of these values are inflated by spa_get_worst_case_asize(), * which means that we may get ERESTART well before we are actually in danger * of running out of space, but this also mitigates any small inaccuracies * in the rough estimate (e.g. txh_space_towrite doesn't take into account * indirect blocks, and dd_space_towrite[] doesn't take into account changes * to the MOS). * * Note that due to this algorithm, it is possible to exceed the allowed * usage by one transaction. Also, as we approach the allowed usage, * we will allow a very limited amount of changes into each TXG, thus * decreasing performance. */ static int dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) { spa_t *spa = tx->tx_pool->dp_spa; ASSERT0(tx->tx_txg); if (tx->tx_err) return (tx->tx_err); if (spa_suspended(spa)) { /* * If the user has indicated a blocking failure mode * then return ERESTART which will block in dmu_tx_wait(). * Otherwise, return EIO so that an error can get * propagated back to the VOP calls. * * Note that we always honor the txg_how flag regardless * of the failuremode setting. */ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && !(txg_how & TXG_WAIT)) return (SET_ERROR(EIO)); return (SET_ERROR(ERESTART)); } if (!tx->tx_dirty_delayed && dsl_pool_need_dirty_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; return (SET_ERROR(ERESTART)); } tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); tx->tx_needassign_txh = NULL; /* * NB: No error returns are allowed after txg_hold_open, but * before processing the dnode holds, due to the * dmu_tx_unassign() logic. */ uint64_t towrite = 0; uint64_t tohold = 0; for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn != NULL) { mutex_enter(&dn->dn_mtx); if (dn->dn_assigned_txg == tx->tx_txg - 1) { mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = txh; return (SET_ERROR(ERESTART)); } if (dn->dn_assigned_txg == 0) dn->dn_assigned_txg = tx->tx_txg; ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } towrite += refcount_count(&txh->txh_space_towrite); tohold += refcount_count(&txh->txh_memory_tohold); } /* needed allocation: worst-case estimate of write space */ uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); /* calculate memory footprint estimate */ uint64_t memory = towrite + tohold; if (tx->tx_dir != NULL && asize != 0) { int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); if (err != 0) return (err); } return (0); } static void dmu_tx_unassign(dmu_tx_t *tx) { if (tx->tx_txg == 0) return; txg_rele_to_quiesce(&tx->tx_txgh); /* * Walk the transaction's hold list, removing the hold on the * associated dnode, and notifying waiters if the refcount drops to 0. */ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); } txg_rele_to_sync(&tx->tx_txgh); tx->tx_lasttried_txg = tx->tx_txg; tx->tx_txg = 0; } /* * Assign tx to a transaction group; txg_how is a bitmask: * * If TXG_WAIT is set and the currently open txg is full, this function * will wait until there's a new txg. This should be used when no locks * are being held. With this bit set, this function will only fail if * we're truly out of space (or over quota). * * If TXG_WAIT is *not* set and we can't assign into the currently open * txg without blocking, this function will return immediately with * ERESTART. This should be used whenever locks are being held. On an * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), * and try again. * * If TXG_NOTHROTTLE is set, this indicates that this tx should not be * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for * details on the throttle). This is used by the VFS operations, after * they have already called dmu_tx_wait() (though most likely on a * different tx). */ int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) { int err; ASSERT(tx->tx_txg == 0); ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); /* If we might wait, we must not hold the config lock. */ IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); if ((txg_how & TXG_NOTHROTTLE)) tx->tx_dirty_delayed = B_TRUE; while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { dmu_tx_unassign(tx); if (err != ERESTART || !(txg_how & TXG_WAIT)) return (err); dmu_tx_wait(tx); } txg_rele_to_quiesce(&tx->tx_txgh); return (0); } void dmu_tx_wait(dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; dsl_pool_t *dp = tx->tx_pool; ASSERT(tx->tx_txg == 0); ASSERT(!dsl_pool_config_held(tx->tx_pool)); if (tx->tx_wait_dirty) { /* * dmu_tx_try_assign() has determined that we need to wait * because we've consumed much or all of the dirty buffer * space. */ mutex_enter(&dp->dp_lock); while (dp->dp_dirty_total >= zfs_dirty_data_max) cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); uint64_t dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); dmu_tx_delay(tx, dirty); tx->tx_wait_dirty = B_FALSE; /* * Note: setting tx_dirty_delayed only has effect if the * caller used TX_WAIT. Otherwise they are going to * destroy this tx and try again. The common case, * zfs_write(), uses TX_WAIT. */ tx->tx_dirty_delayed = B_TRUE; } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { /* * If the pool is suspended we need to wait until it * is resumed. Note that it's possible that the pool * has become active after this thread has tried to * obtain a tx. If that's the case then tx_lasttried_txg * would not have been set. */ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } else if (tx->tx_needassign_txh) { /* * A dnode is assigned to the quiescing txg. Wait for its * transaction to complete. */ dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) cv_wait(&dn->dn_notxholds, &dn->dn_mtx); mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = NULL; } else { /* * If we have a lot of dirty data just wait until we sync * out a TXG at which point we'll hopefully have synced * a portion of the changes. */ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } } static void dmu_tx_destroy(dmu_tx_t *tx) { dmu_tx_hold_t *txh; while ((txh = list_head(&tx->tx_holds)) != NULL) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); refcount_destroy_many(&txh->txh_space_towrite, refcount_count(&txh->txh_space_towrite)); refcount_destroy_many(&txh->txh_memory_tohold, refcount_count(&txh->txh_memory_tohold)); kmem_free(txh, sizeof (dmu_tx_hold_t)); if (dn != NULL) dnode_rele(dn, tx); } list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); kmem_free(tx, sizeof (dmu_tx_t)); } void dmu_tx_commit(dmu_tx_t *tx) { ASSERT(tx->tx_txg != 0); /* * Go through the transaction's hold list and remove holds on * associated dnodes, notifying waiters if no holds remain. */ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); } if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); if (!list_is_empty(&tx->tx_callbacks)) txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); dmu_tx_destroy(tx); } void dmu_tx_abort(dmu_tx_t *tx) { ASSERT(tx->tx_txg == 0); /* * Call any registered callbacks with an error code. */ if (!list_is_empty(&tx->tx_callbacks)) dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); dmu_tx_destroy(tx); } uint64_t dmu_tx_get_txg(dmu_tx_t *tx) { ASSERT(tx->tx_txg != 0); return (tx->tx_txg); } dsl_pool_t * dmu_tx_pool(dmu_tx_t *tx) { ASSERT(tx->tx_pool != NULL); return (tx->tx_pool); } void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { dmu_tx_callback_t *dcb; dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); dcb->dcb_func = func; dcb->dcb_data = data; list_insert_tail(&tx->tx_callbacks, dcb); } /* * Call all the commit callbacks on a list, with a given error code. */ void dmu_tx_do_callbacks(list_t *cb_list, int error) { dmu_tx_callback_t *dcb; while ((dcb = list_head(cb_list)) != NULL) { list_remove(cb_list, dcb); dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } } /* * Interface to hold a bunch of attributes. * used for creating new files. * attrsize is the total size of all attributes * to be added during object creation * * For updating/adding a single attribute dmu_tx_hold_sa() should be used. */ /* * hold necessary attribute name for attribute registration. * should be a very rare case where this is needed. If it does * happen it would only happen on the first write to the file system. */ static void dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) { if (!sa->sa_need_attr_registration) return; for (int i = 0; i != sa->sa_num_attrs; i++) { if (!sa->sa_attr_table[i].sa_registered) { if (sa->sa_reg_attr_obj) dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, B_TRUE, sa->sa_attr_table[i].sa_name); else dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, sa->sa_attr_table[i].sa_name); } } } void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { - dmu_tx_hold_t *txh; + dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, + tx->tx_objset, object, THT_SPILL, 0, 0); - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, - THT_SPILL, 0, 0); - if (txh != NULL) - (void) refcount_add_many(&txh->txh_space_towrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); + (void) refcount_add_many(&txh->txh_space_towrite, + SPA_OLD_MAXBLOCKSIZE, FTAG); } void dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) { sa_os_t *sa = tx->tx_objset->os_sa; dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_layout_attr_obj) { dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); } else { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) return; (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPILL, 0, 0); } /* * Hold SA attribute * * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) * * variable_size is the total size of all variable sized attributes * passed to this function. It is not the total size of all * variable size attributes that *may* exist on this object. */ void dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) { uint64_t object; sa_os_t *sa = tx->tx_objset->os_sa; ASSERT(hdl != NULL); object = sa_handle_object(hdl); dmu_tx_hold_bonus(tx, object); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); if (sa->sa_force_spill || may_grow || hdl->sa_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } else { dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_have_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } DB_DNODE_EXIT(db); } } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 351077) @@ -1,2409 +1,2225 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 RackTop Systems. */ #include #include #include #include #include #include #include #include #include #include #include #include #include -dnode_stats_t dnode_stats = { - { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 }, - { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_hits", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_txg", KSTAT_DATA_UINT64 }, - { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 }, - { "dnode_allocate", KSTAT_DATA_UINT64 }, - { "dnode_reallocate", KSTAT_DATA_UINT64 }, - { "dnode_buf_evict", KSTAT_DATA_UINT64 }, - { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 }, - { "dnode_alloc_race", KSTAT_DATA_UINT64 }, - { "dnode_alloc_next_block", KSTAT_DATA_UINT64 }, - { "dnode_move_invalid", KSTAT_DATA_UINT64 }, - { "dnode_move_recheck1", KSTAT_DATA_UINT64 }, - { "dnode_move_recheck2", KSTAT_DATA_UINT64 }, - { "dnode_move_special", KSTAT_DATA_UINT64 }, - { "dnode_move_handle", KSTAT_DATA_UINT64 }, - { "dnode_move_rwlock", KSTAT_DATA_UINT64 }, - { "dnode_move_active", KSTAT_DATA_UINT64 }, -}; - -static kstat_t *dnode_ksp; static kmem_cache_t *dnode_cache; +/* + * Define DNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define DNODE_STATS +#endif /* DEBUG */ +#ifdef DNODE_STATS +#define DNODE_STAT_ADD(stat) ((stat)++) +#else +#define DNODE_STAT_ADD(stat) /* nothing */ +#endif /* DNODE_STATS */ + static dnode_phys_t dnode_phys_zero; int zfs_default_bs = SPA_MINBLOCKSHIFT; int zfs_default_ibs = DN_MAX_INDBLKSHIFT; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN, &zfs_default_bs, 0, "Default dnode block shift"); SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, &zfs_default_ibs, 0, "Default dnode indirect block shift"); #ifdef illumos #ifdef _KERNEL static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif /* _KERNEL */ #endif static int dbuf_compare(const void *x1, const void *x2) { const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; int cmp = AVL_CMP(d1->db_level, d2->db_level); if (likely(cmp)) return (cmp); cmp = AVL_CMP(d1->db_blkid, d2->db_blkid); if (likely(cmp)) return (cmp); if (d1->db_state == DB_SEARCH) { ASSERT3S(d2->db_state, !=, DB_SEARCH); return (-1); } else if (d2->db_state == DB_SEARCH) { ASSERT3S(d1->db_state, !=, DB_SEARCH); return (1); } return (AVL_PCMP(d1, d2)); } /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) { dnode_t *dn = arg; int i; rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); /* * Every dbuf has a reference, and dropping a tracked reference is * O(number of references), so don't track dn_holds. */ refcount_create_untracked(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); list_link_init(&dn->dn_link); bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { list_link_init(&dn->dn_dirty_link[i]); dn->dn_free_ranges[i] = NULL; list_create(&dn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirtyctx = 0; dn->dn_dirtyctx_firstset = NULL; dn->dn_bonus = NULL; dn->dn_have_spill = B_FALSE; dn->dn_zio = NULL; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_id_flags = 0; dn->dn_dbufs_count = 0; avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); dn->dn_moved = 0; POINTER_INVALIDATE(&dn->dn_objset); return (0); } /* ARGSUSED */ static void dnode_dest(void *arg, void *unused) { int i; dnode_t *dn = arg; rw_destroy(&dn->dn_struct_rwlock); mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds); refcount_destroy(&dn->dn_tx_holds); ASSERT(!list_link_active(&dn->dn_link)); for (i = 0; i < TXG_SIZE; i++) { ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); list_destroy(&dn->dn_dirty_records[i]); ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_blksz[i]); } ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); ASSERT0(dn->dn_dirtyctx); ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); ASSERT3P(dn->dn_bonus, ==, NULL); ASSERT(!dn->dn_have_spill); ASSERT3P(dn->dn_zio, ==, NULL); ASSERT0(dn->dn_oldused); ASSERT0(dn->dn_oldflags); ASSERT0(dn->dn_olduid); ASSERT0(dn->dn_oldgid); ASSERT0(dn->dn_newuid); ASSERT0(dn->dn_newgid); ASSERT0(dn->dn_id_flags); ASSERT0(dn->dn_dbufs_count); avl_destroy(&dn->dn_dbufs); } void dnode_init(void) { ASSERT(dnode_cache == NULL); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); #ifdef _KERNEL kmem_cache_set_move(dnode_cache, dnode_move); - - dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc", - KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (dnode_ksp != NULL) { - dnode_ksp->ks_data = &dnode_stats; - kstat_install(dnode_ksp); - } #endif /* _KERNEL */ } void dnode_fini(void) { - if (dnode_ksp != NULL) { - kstat_delete(dnode_ksp); - dnode_ksp = NULL; - } - kmem_cache_destroy(dnode_cache); dnode_cache = NULL; } #ifdef ZFS_DEBUG void dnode_verify(dnode_t *dn) { int drop_struct_lock = FALSE; ASSERT(dn->dn_phys); ASSERT(dn->dn_objset); ASSERT(dn->dn_handle->dnh_dnode == dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) return; if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { int i; int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(dn->dn_indblkshift, >=, 0); ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); if (dn->dn_datablkshift) { ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); ASSERT3U(1<dn_datablkshift, ==, dn->dn_datablksz); } ASSERT3U(dn->dn_nlevels, <=, 30); ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen); ASSERT3U(dn->dn_datablksz, ==, dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + dn->dn_bonuslen, <=, max_bonuslen); for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); } } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } #endif void dnode_byteswap(dnode_phys_t *dnp) { uint64_t *buf64 = (void*)&dnp->dn_blkptr; int i; if (dnp->dn_type == DMU_OT_NONE) { bzero(dnp, sizeof (dnode_phys_t)); return; } dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots); dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); dnp->dn_used = BSWAP_64(dnp->dn_used); /* * dn_nblkptr is only one byte, so it's OK to read it in either * byte order. We can't read dn_bouslen. */ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) buf64[i] = BSWAP_64(buf64[i]); /* * OK to check dn_bonuslen for zero, because it won't matter if * we have the wrong byte order. This is necessary because the * dnode dnode is smaller than a regular dnode. */ if (dnp->dn_bonuslen != 0) { /* * Note that the bonus length calculated here may be * longer than the actual bonus buffer. This is because * we always put the bonus buffer after the last block * pointer (instead of packing it against the end of the * dnode buffer). */ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); int slots = dnp->dn_extra_slots + 1; size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off; ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); } /* Swap SPILL block if we have one */ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); - } void dnode_buf_byteswap(void *vbuf, size_t size) { int i = 0; ASSERT3U(sizeof (dnode_phys_t), ==, (1<dn_type != DMU_OT_NONE) i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } } void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); dn->dn_bonuslen = newsize; if (newsize == 0) dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; else dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; rw_exit(&dn->dn_struct_rwlock); } void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dn->dn_bonustype = newtype; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; rw_exit(&dn->dn_struct_rwlock); } void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); dnode_setdirty(dn, tx); dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK; dn->dn_have_spill = B_FALSE; } static void dnode_setdblksz(dnode_t *dn, int size) { ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE)); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, >=, SPA_MINBLOCKSIZE); ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); dn->dn_datablksz = size; dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0; } static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); #ifdef _KERNEL ASSERT(!POINTER_IS_VALID(dn->dn_objset)); #endif /* _KERNEL */ dn->dn_moved = 0; /* * Defer setting dn_objset until the dnode is ready to be a candidate * for the dnode_move() callback. */ dn->dn_object = object; dn->dn_dbuf = db; dn->dn_handle = dnh; dn->dn_phys = dnp; if (dnp->dn_datablkszsec) { dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } else { dn->dn_datablksz = 0; dn->dn_datablkszsec = 0; dn->dn_datablkshift = 0; } dn->dn_indblkshift = dnp->dn_indblkshift; dn->dn_nlevels = dnp->dn_nlevels; dn->dn_type = dnp->dn_type; dn->dn_nblkptr = dnp->dn_nblkptr; dn->dn_checksum = dnp->dn_checksum; dn->dn_compress = dnp->dn_compress; dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; dn->dn_num_slots = dnp->dn_extra_slots + 1; dn->dn_maxblkid = dnp->dn_maxblkid; dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dn->dn_id_flags = 0; dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); - ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); - ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode)); mutex_enter(&os->os_lock); + if (dnh->dnh_dnode != NULL) { + /* Lost the allocation race. */ + mutex_exit(&os->os_lock); + kmem_cache_free(dnode_cache, dn); + return (dnh->dnh_dnode); + } /* * Exclude special dnodes from os_dnodes so an empty os_dnodes * signifies that the special dnodes have no references from * their children (the entries in os_dnodes). This allows * dnode_destroy() to easily determine if the last child has * been removed and then complete eviction of the objset. */ if (!DMU_OBJECT_IS_SPECIAL(object)) list_insert_head(&os->os_dnodes, dn); membar_producer(); /* * Everything else must be valid before assigning dn_objset * makes the dnode eligible for dnode_move(). */ dn->dn_objset = os; dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE); - return (dn); } /* * Caller must be holding the dnode handle, which is released upon return. */ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; boolean_t complete_os_eviction = B_FALSE; ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { list_remove(&os->os_dnodes, dn); complete_os_eviction = list_is_empty(&os->os_dnodes) && list_link_active(&os->os_evicting_node); } mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ - if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock)) - zrl_remove(&dn->dn_handle->dnh_zrlock); + zrl_remove(&dn->dn_handle->dnh_zrlock); dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirtyctx = 0; if (dn->dn_dirtyctx_firstset != NULL) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } dn->dn_zio = NULL; dn->dn_have_spill = B_FALSE; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_id_flags = 0; dmu_zfetch_fini(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE); if (complete_os_eviction) dmu_objset_evict_done(os); } void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) { int i; ASSERT3U(dn_slots, >, 0); ASSERT3U(dn_slots << DNODE_SHIFT, <=, spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))); ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (blocksize == 0) blocksize = 1 << zfs_default_bs; else blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); if (ibs == 0) ibs = zfs_default_ibs; ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); - dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64 - " blocksize=%d ibs=%d dn_slots=%d\n", + dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n", dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots); - DNODE_STAT_BUMP(dnode_allocate); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_assigned_txg); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); ASSERT(avl_is_empty(&dn->dn_dbufs)); for (i = 0; i < TXG_SIZE; i++) { ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_blksz[i]); ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); } dn->dn_type = ot; dnode_setdblksz(dn, blocksize); dn->dn_indblkshift = ibs; dn->dn_nlevels = 1; dn->dn_num_slots = dn_slots; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ dn->dn_nblkptr = 1; else { dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR, 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> SPA_BLKPTRSHIFT)); } dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; if (dn->dn_dirtyctx_firstset) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) { int nblkptr; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); ASSERT0(blocksize % SPA_MINBLOCKSIZE); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ASSERT(tx->tx_txg != 0); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, - DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS; - dnode_free_interior_slots(dn); - DNODE_STAT_BUMP(dnode_reallocate); - /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); dn->dn_id_flags = 0; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_setdirty(dn, tx); if (dn->dn_datablksz != blocksize) { /* change blocksize */ ASSERT(dn->dn_maxblkid == 0 && (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || dnode_block_freed(dn, 0))); dnode_setdblksz(dn, blocksize); dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; } if (dn->dn_bonuslen != bonuslen) dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ nblkptr = 1; else nblkptr = MIN(DN_MAX_NBLKPTR, 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> SPA_BLKPTRSHIFT)); if (dn->dn_bonustype != bonustype) dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { dbuf_rm_spill(dn, tx); dnode_rm_spill(dn, tx); } rw_exit(&dn->dn_struct_rwlock); /* change type */ dn->dn_type = ot; /* change bonus size and type */ mutex_enter(&dn->dn_mtx); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_num_slots = dn_slots; dn->dn_nblkptr = nblkptr; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); /* fix up the bonus db_size */ if (dn->dn_bonus) { dn->dn_bonus->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - - (dn->dn_nblkptr - 1) * sizeof (blkptr_t); + (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); } dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); } +#ifdef DNODE_STATS +static struct { + uint64_t dms_dnode_invalid; + uint64_t dms_dnode_recheck1; + uint64_t dms_dnode_recheck2; + uint64_t dms_dnode_special; + uint64_t dms_dnode_handle; + uint64_t dms_dnode_rwlock; + uint64_t dms_dnode_active; +} dnode_move_stats; +#endif /* DNODE_STATS */ + #ifdef _KERNEL static void dnode_move_impl(dnode_t *odn, dnode_t *ndn) { int i; ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); /* Copy fields. */ ndn->dn_objset = odn->dn_objset; ndn->dn_object = odn->dn_object; ndn->dn_dbuf = odn->dn_dbuf; ndn->dn_handle = odn->dn_handle; ndn->dn_phys = odn->dn_phys; ndn->dn_type = odn->dn_type; ndn->dn_bonuslen = odn->dn_bonuslen; ndn->dn_bonustype = odn->dn_bonustype; ndn->dn_nblkptr = odn->dn_nblkptr; ndn->dn_checksum = odn->dn_checksum; ndn->dn_compress = odn->dn_compress; ndn->dn_nlevels = odn->dn_nlevels; ndn->dn_indblkshift = odn->dn_indblkshift; ndn->dn_datablkshift = odn->dn_datablkshift; ndn->dn_datablkszsec = odn->dn_datablkszsec; ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; - ndn->dn_num_slots = odn->dn_num_slots; bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], sizeof (odn->dn_next_type)); bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], sizeof (odn->dn_next_nblkptr)); bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], sizeof (odn->dn_next_nlevels)); bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], sizeof (odn->dn_next_indblkshift)); bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], sizeof (odn->dn_next_bonustype)); bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], sizeof (odn->dn_rm_spillblk)); bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], sizeof (odn->dn_next_bonuslen)); bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], sizeof (odn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { list_move_tail(&ndn->dn_dirty_records[i], &odn->dn_dirty_records[i]); } bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], sizeof (odn->dn_free_ranges)); ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; ndn->dn_dirtyctx = odn->dn_dirtyctx; ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(refcount_count(&odn->dn_tx_holds) == 0); refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ASSERT(avl_is_empty(&ndn->dn_dbufs)); avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); ndn->dn_dbufs_count = odn->dn_dbufs_count; ndn->dn_bonus = odn->dn_bonus; ndn->dn_have_spill = odn->dn_have_spill; ndn->dn_zio = odn->dn_zio; ndn->dn_oldused = odn->dn_oldused; ndn->dn_oldflags = odn->dn_oldflags; ndn->dn_olduid = odn->dn_olduid; ndn->dn_oldgid = odn->dn_oldgid; ndn->dn_newuid = odn->dn_newuid; ndn->dn_newgid = odn->dn_newgid; ndn->dn_id_flags = odn->dn_id_flags; dmu_zfetch_init(&ndn->dn_zfetch, NULL); list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; /* * Update back pointers. Updating the handle fixes the back pointer of * every descendant dbuf as well as the bonus dbuf. */ ASSERT(ndn->dn_handle->dnh_dnode == odn); ndn->dn_handle->dnh_dnode = ndn; if (ndn->dn_zfetch.zf_dnode == odn) { ndn->dn_zfetch.zf_dnode = ndn; } /* * Invalidate the original dnode by clearing all of its back pointers. */ odn->dn_dbuf = NULL; odn->dn_handle = NULL; avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); odn->dn_dbufs_count = 0; odn->dn_bonus = NULL; odn->dn_zfetch.zf_dnode = NULL; /* * Set the low bit of the objset pointer to ensure that dnode_move() * recognizes the dnode as invalid in any subsequent callback. */ POINTER_INVALIDATE(&odn->dn_objset); /* * Satisfy the destructor. */ for (i = 0; i < TXG_SIZE; i++) { list_create(&odn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); odn->dn_free_ranges[i] = NULL; odn->dn_next_nlevels[i] = 0; odn->dn_next_indblkshift[i] = 0; odn->dn_next_bonustype[i] = 0; odn->dn_rm_spillblk[i] = 0; odn->dn_next_bonuslen[i] = 0; odn->dn_next_blksz[i] = 0; } odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; odn->dn_dirtyctx = 0; odn->dn_dirtyctx_firstset = NULL; odn->dn_have_spill = B_FALSE; odn->dn_zio = NULL; odn->dn_oldused = 0; odn->dn_oldflags = 0; odn->dn_olduid = 0; odn->dn_oldgid = 0; odn->dn_newuid = 0; odn->dn_newgid = 0; odn->dn_id_flags = 0; /* * Mark the dnode. */ ndn->dn_moved = 1; odn->dn_moved = (uint8_t)-1; } #ifdef illumos /*ARGSUSED*/ static kmem_cbrc_t dnode_move(void *buf, void *newbuf, size_t size, void *arg) { dnode_t *odn = buf, *ndn = newbuf; objset_t *os; int64_t refcount; uint32_t dbufs; /* * The dnode is on the objset's list of known dnodes if the objset * pointer is valid. We set the low bit of the objset pointer when * freeing the dnode to invalidate it, and the memory patterns written * by kmem (baddcafe and deadbeef) set at least one of the two low bits. * A newly created dnode sets the objset pointer last of all to indicate * that the dnode is known and in a valid state to be moved by this * function. */ os = odn->dn_objset; if (!POINTER_IS_VALID(os)) { - DNODE_STAT_BUMP(dnode_move_invalid); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); return (KMEM_CBRC_DONT_KNOW); } /* * Ensure that the objset does not go away during the move. */ rw_enter(&os_lock, RW_WRITER); if (os != odn->dn_objset) { rw_exit(&os_lock); - DNODE_STAT_BUMP(dnode_move_recheck1); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); return (KMEM_CBRC_DONT_KNOW); } /* * If the dnode is still valid, then so is the objset. We know that no * valid objset can be freed while we hold os_lock, so we can safely * ensure that the objset remains in use. */ mutex_enter(&os->os_lock); /* * Recheck the objset pointer in case the dnode was removed just before * acquiring the lock. */ if (os != odn->dn_objset) { mutex_exit(&os->os_lock); rw_exit(&os_lock); - DNODE_STAT_BUMP(dnode_move_recheck2); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); return (KMEM_CBRC_DONT_KNOW); } /* * At this point we know that as long as we hold os->os_lock, the dnode * cannot be freed and fields within the dnode can be safely accessed. * The objset listing this dnode cannot go away as long as this dnode is * on its list. */ rw_exit(&os_lock); if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_special); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); return (KMEM_CBRC_NO); } ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ /* * Lock the dnode handle to prevent the dnode from obtaining any new * holds. This also prevents the descendant dbufs and the bonus dbuf * from accessing the dnode, so that we can discount their holds. The * handle is safe to access because we know that while the dnode cannot * go away, neither can its handle. Once we hold dnh_zrlock, we can * safely move any dnode referenced only by dbufs. */ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_handle); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); return (KMEM_CBRC_LATER); } /* * Ensure a consistent view of the dnode's holds and the dnode's dbufs. * We need to guarantee that there is a hold for every dbuf in order to * determine whether the dnode is actively referenced. Falsely matching * a dbuf to an active hold would lead to an unsafe move. It's possible * that a thread already having an active dnode hold is about to add a * dbuf, and we can't compare hold and dbuf counts while the add is in * progress. */ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_rwlock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); return (KMEM_CBRC_LATER); } /* * A dbuf may be removed (evicted) without an active dnode hold. In that * case, the dbuf count is decremented under the handle lock before the * dbuf's hold is released. This order ensures that if we count the hold * after the dbuf is removed but before its hold is released, we will * treat the unmatched hold as active and exit safely. If we count the * hold before the dbuf is removed, the hold is discounted, and the * removal is blocked until the move completes. */ refcount = refcount_count(&odn->dn_holds); ASSERT(refcount >= 0); dbufs = odn->dn_dbufs_count; /* We can't have more dbufs than dnode holds. */ ASSERT3U(dbufs, <=, refcount); DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, uint32_t, dbufs); if (refcount > dbufs) { rw_exit(&odn->dn_struct_rwlock); zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_active); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); return (KMEM_CBRC_LATER); } rw_exit(&odn->dn_struct_rwlock); /* * At this point we know that anyone with a hold on the dnode is not * actively referencing it. The dnode is known and in a valid state to * move. We're holding the locks needed to execute the critical section. */ dnode_move_impl(odn, ndn); list_link_replace(&odn->dn_link, &ndn->dn_link); /* If the dnode was safe to move, the refcount cannot have changed. */ ASSERT(refcount == refcount_count(&ndn->dn_holds)); ASSERT(dbufs == ndn->dn_dbufs_count); zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ mutex_exit(&os->os_lock); return (KMEM_CBRC_YES); } #endif /* illumos */ #endif /* _KERNEL */ -static void -dnode_slots_hold(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - zrl_add(&dnh->dnh_zrlock); - } -} - -static void -dnode_slots_rele(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - - if (zrl_is_locked(&dnh->dnh_zrlock)) - zrl_exit(&dnh->dnh_zrlock); - else - zrl_remove(&dnh->dnh_zrlock); - } -} - -static int -dnode_slots_tryenter(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - - if (!zrl_tryenter(&dnh->dnh_zrlock)) { - for (int j = idx; j < i; j++) { - dnh = &children->dnc_children[j]; - zrl_exit(&dnh->dnh_zrlock); - } - - return (0); - } - } - - return (1); -} - -static void -dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - dnh->dnh_dnode = ptr; - } -} - -static boolean_t -dnode_check_slots_free(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - dnode_t *dn = dnh->dnh_dnode; - - if (dn == DN_SLOT_FREE) { - continue; - } else if (DN_SLOT_IS_PTR(dn)) { - mutex_enter(&dn->dn_mtx); - dmu_object_type_t type = dn->dn_type; - mutex_exit(&dn->dn_mtx); - - if (type != DMU_OT_NONE) - return (B_FALSE); - - continue; - } else { - return (B_FALSE); - } - - return (B_FALSE); - } - - return (B_TRUE); -} - -static void -dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - - ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); - - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); - dnode_destroy(dnh->dnh_dnode); - dnh->dnh_dnode = DN_SLOT_FREE; - } - } -} - void -dnode_free_interior_slots(dnode_t *dn) -{ - dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db); - int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT; - int idx = (dn->dn_object & (epb - 1)) + 1; - int slots = dn->dn_num_slots - 1; - - if (slots == 0) - return; - - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - while (!dnode_slots_tryenter(children, idx, slots)) - DNODE_STAT_BUMP(dnode_free_interior_lock_retry); - - dnode_set_slots(children, idx, slots, DN_SLOT_FREE); - dnode_slots_rele(children, idx, slots); -} - -void dnode_special_close(dnode_handle_t *dnh) { dnode_t *dn = dnh->dnh_dnode; /* * Wait for final references to the dnode to clear. This can - * only happen if the arc is asynchronously evicting state that + * only happen if the arc is asyncronously evicting state that * has a hold on this dnode while we are trying to evict this * dnode. */ while (refcount_count(&dn->dn_holds) > 0) delay(1); ASSERT(dn->dn_dbuf == NULL || dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } void dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; - zrl_init(&dnh->dnh_zrlock); - zrl_tryenter(&dnh->dnh_zrlock); - dn = dnode_create(os, dnp, NULL, object, dnh); + zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); - - zrl_exit(&dnh->dnh_zrlock); } static void dnode_buf_evict_async(void *dbu) { - dnode_children_t *dnc = dbu; + dnode_children_t *children_dnodes = dbu; + int i; - DNODE_STAT_BUMP(dnode_buf_evict); - - for (int i = 0; i < dnc->dnc_count; i++) { - dnode_handle_t *dnh = &dnc->dnc_children[i]; + for (i = 0; i < children_dnodes->dnc_count; i++) { + dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; /* * The dnode handle lock guards against the dnode moving to * another valid address, so there is no need here to guard * against changes to or from NULL. */ - if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + if (dnh->dnh_dnode == NULL) { zrl_destroy(&dnh->dnh_zrlock); - dnh->dnh_dnode = DN_SLOT_UNINIT; continue; } zrl_add(&dnh->dnh_zrlock); dn = dnh->dnh_dnode; /* * If there are holds on this dnode, then there should * be holds on the dnode's containing dbuf as well; thus * it wouldn't be eligible for eviction and this function * would not have been called. */ ASSERT(refcount_is_zero(&dn->dn_holds)); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); - dnode_destroy(dn); /* implicit zrl_remove() for first slot */ + dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); - dnh->dnh_dnode = DN_SLOT_UNINIT; + dnh->dnh_dnode = NULL; } - kmem_free(dnc, sizeof (dnode_children_t) + - dnc->dnc_count * sizeof (dnode_handle_t)); + kmem_free(children_dnodes, sizeof (dnode_children_t) + + children_dnodes->dnc_count * sizeof (dnode_handle_t)); } /* - * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used - * to ensure the hole at the specified object offset is large enough to - * hold the dnode being created. The slots parameter is also used to ensure - * a dnode does not span multiple dnode blocks. In both of these cases, if - * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases - * are only possible when using DNODE_MUST_BE_FREE. + * Return true if the given index is interior to a dnode already + * allocated in the block. That is, the index is neither free nor + * allocated, but is consumed by a large dnode. * - * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0. - * dnode_hold_impl() will check if the requested dnode is already consumed - * as an extra dnode slot by an large dnode, in which case it returns - * ENOENT. + * The dnode_phys_t buffer may not be in sync with the in-core dnode + * structure, so we try to check the dnode structure first and fall back + * to the dnode_phys_t buffer it doesn't exist. + */ +static boolean_t +dnode_is_consumed(dmu_buf_impl_t *db, int idx) +{ + dnode_handle_t *dnh; + dmu_object_type_t ot; + dnode_children_t *children_dnodes; + dnode_phys_t *dn_block; + int skip; + int i; + + children_dnodes = dmu_buf_get_user(&db->db); + dn_block = (dnode_phys_t *)db->db.db_data; + + for (i = 0; i < idx; i += skip) { + dnh = &children_dnodes->dnc_children[i]; + + zrl_add(&dnh->dnh_zrlock); + if (dnh->dnh_dnode != NULL) { + ot = dnh->dnh_dnode->dn_type; + skip = dnh->dnh_dnode->dn_num_slots; + } else { + ot = dn_block[i].dn_type; + skip = dn_block[i].dn_extra_slots + 1; + } + zrl_remove(&dnh->dnh_zrlock); + + if (ot == DMU_OT_NONE) + skip = 1; + } + + return (i > idx); +} + +/* + * Return true if the given index in the dnode block is a valid + * allocated dnode. That is, the index is not consumed by a large + * dnode and is not free. * + * The dnode_phys_t buffer may not be in sync with the in-core dnode + * structure, so we try to check the dnode structure first and fall back + * to the dnode_phys_t buffer it doesn't exist. + */ +static boolean_t +dnode_is_allocated(dmu_buf_impl_t *db, int idx) +{ + dnode_handle_t *dnh; + dmu_object_type_t ot; + dnode_children_t *children_dnodes; + dnode_phys_t *dn_block; + + if (dnode_is_consumed(db, idx)) + return (B_FALSE); + + children_dnodes = dmu_buf_get_user(&db->db); + dn_block = (dnode_phys_t *)db->db.db_data; + + dnh = &children_dnodes->dnc_children[idx]; + + zrl_add(&dnh->dnh_zrlock); + if (dnh->dnh_dnode != NULL) + ot = dnh->dnh_dnode->dn_type; + else + ot = dn_block[idx].dn_type; + zrl_remove(&dnh->dnh_zrlock); + + return (ot != DMU_OT_NONE); +} + +/* + * Return true if the given range of indices in the dnode block are + * free. That is, the starting index is not consumed by a large dnode + * and none of the indices are allocated. + * + * The dnode_phys_t buffer may not be in sync with the in-core dnode + * structure, so we try to check the dnode structure first and fall back + * to the dnode_phys_t buffer it doesn't exist. + */ +static boolean_t +dnode_is_free(dmu_buf_impl_t *db, int idx, int slots) +{ + dnode_handle_t *dnh; + dmu_object_type_t ot; + dnode_children_t *children_dnodes; + dnode_phys_t *dn_block; + int i; + + if (idx + slots > DNODES_PER_BLOCK) + return (B_FALSE); + + children_dnodes = dmu_buf_get_user(&db->db); + dn_block = (dnode_phys_t *)db->db.db_data; + + if (dnode_is_consumed(db, idx)) + return (B_FALSE); + + for (i = idx; i < idx + slots; i++) { + dnh = &children_dnodes->dnc_children[i]; + + zrl_add(&dnh->dnh_zrlock); + if (dnh->dnh_dnode != NULL) + ot = dnh->dnh_dnode->dn_type; + else + ot = dn_block[i].dn_type; + zrl_remove(&dnh->dnh_zrlock); + + if (ot != DMU_OT_NONE) + return (B_FALSE); + } + + return (B_TRUE); +} + +/* * errors: - * EINVAL - invalid object number or flags. - * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE) - * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE) - * - Refers to a freeing dnode (DNODE_MUST_BE_FREE) - * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED) - * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED) - * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED) - * EIO - i/o error error when reading the meta dnode dbuf. + * EINVAL - invalid object number. + * ENOSPC - hole too small to fulfill "slots" request + * EIO - i/o error. * succeeds even for free dnodes. */ int dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, void *tag, dnode_t **dnp) { int epb, idx, err, i; int drop_struct_lock = FALSE; int type; uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; - dnode_children_t *dnc; - dnode_phys_t *dn_block; + dnode_children_t *children_dnodes; dnode_phys_t *dn_block_begin; dnode_handle_t *dnh; ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0)); /* * If you are holding the spa config lock as writer, you shouldn't * be asking the DMU to do *anything* unless it's the root pool * which may require us to read from the root filesystem while * holding some (not all) of the locks as writer. */ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || (spa_is_root(os->os_spa) && spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE)); if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { dn = (object == DMU_USERUSED_OBJECT) ? DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); if (dn == NULL) return (SET_ERROR(ENOENT)); type = dn->dn_type; if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) return (SET_ERROR(ENOENT)); if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) return (SET_ERROR(EEXIST)); DNODE_VERIFY(dn); (void) refcount_add(&dn->dn_holds, tag); *dnp = dn; return (0); } if (object == 0 || object >= DN_MAX_OBJECT) return (SET_ERROR(EINVAL)); mdn = DMU_META_DNODE(os); ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); DNODE_VERIFY(mdn); if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { rw_enter(&mdn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); - if (db == NULL) { - DNODE_STAT_BUMP(dnode_hold_dbuf_hold); + if (db == NULL) return (SET_ERROR(EIO)); - } err = dbuf_read(db, NULL, DB_RF_CANFAIL); if (err) { - DNODE_STAT_BUMP(dnode_hold_dbuf_read); dbuf_rele(db, FTAG); return (err); } ASSERT3U(db->db.db_size, >=, 1<db.db_size >> DNODE_SHIFT; - idx = object & (epb - 1); - dn_block = (dnode_phys_t *)db->db.db_data; - ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); - dnc = dmu_buf_get_user(&db->db); - dnh = NULL; - if (dnc == NULL) { + children_dnodes = dmu_buf_get_user(&db->db); + if (children_dnodes == NULL) { dnode_children_t *winner; - int skip = 0; - - dnc = kmem_zalloc(sizeof (dnode_children_t) + + children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); - dnc->dnc_count = epb; - dnh = &dnc->dnc_children[0]; - - /* Initialize dnode slot status from dnode_phys_t */ - for (int i = 0; i < epb; i++) { + children_dnodes->dnc_count = epb; + dnh = &children_dnodes->dnc_children[0]; + for (i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); - - if (skip) { - skip--; - continue; - } - - if (dn_block[i].dn_type != DMU_OT_NONE) { - int interior = dn_block[i].dn_extra_slots; - - dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED); - dnode_set_slots(dnc, i + 1, interior, - DN_SLOT_INTERIOR); - skip = interior; - } else { - dnh[i].dnh_dnode = DN_SLOT_FREE; - skip = 0; - } } - - dmu_buf_init_user(&dnc->dnc_dbu, NULL, + dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL, dnode_buf_evict_async, NULL); - winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu); + winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); if (winner != NULL) { - for (int i = 0; i < epb; i++) + for (i = 0; i < epb; i++) { zrl_destroy(&dnh[i].dnh_zrlock); + } - kmem_free(dnc, sizeof (dnode_children_t) + + kmem_free(children_dnodes, sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t)); - dnc = winner; + children_dnodes = winner; } } + ASSERT(children_dnodes->dnc_count == epb); - ASSERT(dnc->dnc_count == epb); - dn = DN_SLOT_UNINIT; + idx = object & (epb - 1); + dn_block_begin = (dnode_phys_t *)db->db.db_data; - if (flag & DNODE_MUST_BE_ALLOCATED) { - slots = 1; - - while (dn == DN_SLOT_UNINIT) { - dnode_slots_hold(dnc, idx, slots); - dnh = &dnc->dnc_children[idx]; - - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - dn = dnh->dnh_dnode; - break; - } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) { - DNODE_STAT_BUMP(dnode_hold_alloc_interior); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(EEXIST)); - } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) { - DNODE_STAT_BUMP(dnode_hold_alloc_misses); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOENT)); - } - - dnode_slots_rele(dnc, idx, slots); - if (!dnode_slots_tryenter(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); - continue; - } - - /* - * Someone else won the race and called dnode_create() - * after we checked DN_SLOT_IS_PTR() above but before - * we acquired the lock. - */ - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses); - dn = dnh->dnh_dnode; - } else { - dn = dnode_create(os, dn_block + idx, db, - object, dnh); - } - } - - mutex_enter(&dn->dn_mtx); - if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) { - DNODE_STAT_BUMP(dnode_hold_alloc_type_none); - mutex_exit(&dn->dn_mtx); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOENT)); - } - - DNODE_STAT_BUMP(dnode_hold_alloc_hits); - } else if (flag & DNODE_MUST_BE_FREE) { - - if (idx + slots - 1 >= DNODES_PER_BLOCK) { - DNODE_STAT_BUMP(dnode_hold_free_overflow); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOSPC)); - } - - while (dn == DN_SLOT_UNINIT) { - dnode_slots_hold(dnc, idx, slots); - - if (!dnode_check_slots_free(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_free_misses); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOSPC)); - } - - dnode_slots_rele(dnc, idx, slots); - if (!dnode_slots_tryenter(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_free_lock_retry); - continue; - } - - if (!dnode_check_slots_free(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_free_lock_misses); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOSPC)); - } - - /* - * Allocated but otherwise free dnodes which would - * be in the interior of a multi-slot dnodes need - * to be freed. Single slot dnodes can be safely - * re-purposed as a performance optimization. - */ - if (slots > 1) - dnode_reclaim_slots(dnc, idx + 1, slots - 1); - - dnh = &dnc->dnc_children[idx]; - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - dn = dnh->dnh_dnode; - } else { - dn = dnode_create(os, dn_block + idx, db, - object, dnh); - } - } - - mutex_enter(&dn->dn_mtx); - if (!refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) { - DNODE_STAT_BUMP(dnode_hold_free_refcount); - mutex_exit(&dn->dn_mtx); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(EEXIST)); - } - - dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR); - DNODE_STAT_BUMP(dnode_hold_free_hits); - } else { + if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) { dbuf_rele(db, FTAG); - return (SET_ERROR(EINVAL)); + return (ENOSPC); + } else if ((flag & DNODE_MUST_BE_ALLOCATED) && + !dnode_is_allocated(db, idx)) { + dbuf_rele(db, FTAG); + return (ENOENT); } - if (dn->dn_free_txg) { - DNODE_STAT_BUMP(dnode_hold_free_txg); - type = dn->dn_type; + dnh = &children_dnodes->dnc_children[idx]; + zrl_add(&dnh->dnh_zrlock); + dn = dnh->dnh_dnode; + if (dn == NULL) + dn = dnode_create(os, dn_block_begin + idx, db, object, dnh); + + mutex_enter(&dn->dn_mtx); + type = dn->dn_type; + if (dn->dn_free_txg || + ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || + ((flag & DNODE_MUST_BE_FREE) && + (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { mutex_exit(&dn->dn_mtx); - dnode_slots_rele(dnc, idx, slots); + zrl_remove(&dnh->dnh_zrlock); dbuf_rele(db, FTAG); - return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ? - ENOENT : EEXIST)); + return ((flag & DNODE_MUST_BE_ALLOCATED) ? ENOENT : EEXIST); } - if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); - mutex_exit(&dn->dn_mtx); /* Now we can rely on the hold to prevent the dnode from moving. */ - dnode_slots_rele(dnc, idx, slots); + zrl_remove(&dnh->dnh_zrlock); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); ASSERT3U(dn->dn_object, ==, object); dbuf_rele(db, FTAG); *dnp = dn; return (0); } /* * Return held dnode if the object is allocated, NULL if not. */ int dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, dnp)); } /* * Can only add a reference if there is already at least one * reference on the dnode. Returns FALSE if unable to add a * new reference. */ boolean_t dnode_add_ref(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); if (refcount_is_zero(&dn->dn_holds)) { mutex_exit(&dn->dn_mtx); return (FALSE); } VERIFY(1 < refcount_add(&dn->dn_holds, tag)); mutex_exit(&dn->dn_mtx); return (TRUE); } void dnode_rele(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, tag, B_FALSE); } void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ dmu_buf_impl_t *db = dn->dn_dbuf; dnode_handle_t *dnh = dn->dn_handle; refs = refcount_remove(&dn->dn_holds, tag); mutex_exit(&dn->dn_mtx); /* * It's unsafe to release the last hold on a dnode by dnode_rele() or * indirectly by dbuf_rele() while relying on the dnode handle to * prevent the dnode from moving, since releasing the last hold could * result in the dnode's parent dbuf evicting its dnode handles. For * that reason anyone calling dnode_rele() or dbuf_rele() without some * other direct or indirect hold on the dnode must first drop the dnode * handle. */ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && db != NULL) { /* * Another thread could add a hold to the dnode handle in * dnode_hold_impl() while holding the parent dbuf. Since the * hold on the parent dbuf prevents the handle from being * destroyed, the hold on the handle is OK. We can't yet assert * that the handle has zero references, but that will be * asserted anyway when the handle gets destroyed. */ mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, dnh, evicting); } } void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { dsl_dataset_dirty(os->os_dsl_dataset, tx); return; } DNODE_VERIFY(dn); #ifdef ZFS_DEBUG mutex_enter(&dn->dn_mtx); ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); mutex_exit(&dn->dn_mtx); #endif /* * Determine old uid/gid when necessary */ dmu_objset_userquota_get_ids(dn, B_TRUE, tx); multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK]; multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); /* * If we are already marked dirty, we're done. */ if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { multilist_sublist_unlock(mls); return; } ASSERT(!refcount_is_zero(&dn->dn_holds) || !avl_is_empty(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", dn->dn_object, txg); multilist_sublist_insert_head(mls, dn); multilist_sublist_unlock(mls); /* * The dnode maintains a hold on its containing dbuf as * long as there are holds on it. Each instantiated child * dbuf maintains a hold on the dnode. When the last child * drops its hold, the dnode will drop its hold on the * containing dbuf. We add a "dirty hold" here so that the * dnode will hang around after we finish processing its * children. */ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); (void) dbuf_dirty(dn->dn_dbuf, tx); dsl_dataset_dirty(os->os_dsl_dataset, tx); } void dnode_free(dnode_t *dn, dmu_tx_t *tx) { mutex_enter(&dn->dn_mtx); if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { mutex_exit(&dn->dn_mtx); return; } dn->dn_free_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); } /* * Try to change the block size for the indicated dnode. This can only * succeed if there are no blocks allocated or dirty beyond first block */ int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { dmu_buf_impl_t *db; int err; ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (size == 0) size = SPA_MINBLOCKSIZE; else size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); if (ibs == dn->dn_indblkshift) ibs = 0; if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) return (0); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* Check for any allocated blocks beyond the first */ if (dn->dn_maxblkid != 0) goto fail; mutex_enter(&dn->dn_dbufs_mtx); for (db = avl_first(&dn->dn_dbufs); db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } } mutex_exit(&dn->dn_dbufs_mtx); if (ibs && dn->dn_nlevels != 1) goto fail; /* resize the old block */ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); if (err == 0) dbuf_new_size(db, size, tx); else if (err != ENOENT) goto fail; dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; if (ibs) { dn->dn_indblkshift = ibs; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; } /* rele after we have fixed the blocksize in the dnode */ if (db) dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); return (0); fail: rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(ENOTSUP)); } /* read-holding callers must not rely on the lock being continuously held */ void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) { uint64_t txgoff = tx->tx_txg & TXG_MASK; int epbs, new_nlevels; uint64_t sz; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(have_read ? RW_READ_HELD(&dn->dn_struct_rwlock) : RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * if we have a read-lock, check to see if we need to do any work * before upgrading to a write-lock. */ if (have_read) { if (blkid <= dn->dn_maxblkid) return; if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); } } if (blkid <= dn->dn_maxblkid) goto out; dn->dn_maxblkid = blkid; /* * Compute the number of levels necessary to support the new maxblkid. */ new_nlevels = 1; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) new_nlevels++; if (new_nlevels > dn->dn_nlevels) { int old_nlevels = dn->dn_nlevels; dmu_buf_impl_t *db; list_t *list; dbuf_dirty_record_t *new, *dr, *dr_next; dn->dn_nlevels = new_nlevels; ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); dn->dn_next_nlevels[txgoff] = new_nlevels; /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ASSERT(db != NULL); new = dbuf_dirty(db, tx); dbuf_rele(db, FTAG); /* transfer the dirty records to the new indirect */ mutex_enter(&dn->dn_mtx); mutex_enter(&new->dt.di.dr_mtx); list = &dn->dn_dirty_records[txgoff]; for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); if (dr->dr_dbuf->db_level != new_nlevels-1 && dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); dr->dr_parent = new; } } mutex_exit(&new->dt.di.dr_mtx); mutex_exit(&dn->dn_mtx); } out: if (have_read) rw_downgrade(&dn->dn_struct_rwlock); } static void dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); if (db != NULL) { dmu_buf_will_dirty(&db->db, tx); dbuf_rele(db, FTAG); } } /* * Dirty all the in-core level-1 dbufs in the range specified by start_blkid * and end_blkid. */ static void dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { dmu_buf_impl_t db_search; dmu_buf_impl_t *db; avl_index_t where; mutex_enter(&dn->dn_dbufs_mtx); db_search.db_level = 1; db_search.db_blkid = start_blkid + 1; db_search.db_state = DB_SEARCH; for (;;) { db = avl_find(&dn->dn_dbufs, &db_search, &where); if (db == NULL) db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); if (db == NULL || db->db_level != 1 || db->db_blkid >= end_blkid) { break; } /* * Setup the next blkid we want to search for. */ db_search.db_blkid = db->db_blkid + 1; ASSERT3U(db->db_blkid, >=, start_blkid); /* * If the dbuf transitions to DB_EVICTING while we're trying * to dirty it, then we will be unable to discover it in * the dbuf hash table. This will result in a call to * dbuf_create() which needs to acquire the dn_dbufs_mtx * lock. To avoid a deadlock, we drop the lock before * dirtying the level-1 dbuf. */ mutex_exit(&dn->dn_dbufs_mtx); dnode_dirty_l1(dn, db->db_blkid, tx); mutex_enter(&dn->dn_dbufs_mtx); } #ifdef ZFS_DEBUG /* * Walk all the in-core level-1 dbufs and verify they have been dirtied. */ db_search.db_level = 1; db_search.db_blkid = start_blkid + 1; db_search.db_state = DB_SEARCH; db = avl_find(&dn->dn_dbufs, &db_search, &where); if (db == NULL) db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_level != 1 || db->db_blkid >= end_blkid) break; ASSERT(db->db_dirtycnt > 0); } #endif mutex_exit(&dn->dn_dbufs_mtx); } void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { dmu_buf_impl_t *db; uint64_t blkoff, blkid, nblks; int blksz, blkshift, head, tail; int trunc = FALSE; int epbs; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); blksz = dn->dn_datablksz; blkshift = dn->dn_datablkshift; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; if (len == DMU_OBJECT_END) { len = UINT64_MAX - off; trunc = TRUE; } /* * First, block align the region to free: */ if (ISP2(blksz)) { head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); if ((off >> blkshift) > dn->dn_maxblkid) goto out; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { /* * Freeing the whole block; fast-track this request. */ blkid = 0; nblks = 1; if (dn->dn_nlevels > 1) dnode_dirty_l1(dn, 0, tx); goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ goto out; } else { /* Freeing part of the block. */ head = blksz - off; ASSERT3U(head, >, 0); } blkoff = off; } /* zero out any partial block data at the start of the range */ if (head) { ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE, FTAG, &db) == 0) { caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); data = db->db.db_data; bzero(data + blkoff, head); } dbuf_rele(db, FTAG); } off += head; len -= head; } /* If the range was less than one block, we're done */ if (len == 0) goto out; /* If the remaining range is past end of file, we're done */ if ((off >> blkshift) > dn->dn_maxblkid) goto out; ASSERT(ISP2(blksz)); if (trunc) tail = 0; else tail = P2PHASE(len, blksz); ASSERT0(P2PHASE(off, blksz)); /* zero out any partial block data at the end of the range */ if (tail) { if (len < tail) tail = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), TRUE, FALSE, FTAG, &db) == 0) { /* don't dirty if not on disk and not dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, tail); } dbuf_rele(db, FTAG); } len -= tail; } /* If the range did not include a full block, we are done */ if (len == 0) goto out; ASSERT(IS_P2ALIGNED(off, blksz)); ASSERT(trunc || IS_P2ALIGNED(len, blksz)); blkid = off >> blkshift; nblks = len >> blkshift; if (trunc) nblks += 1; /* * Dirty all the indirect blocks in this range. Note that only * the first and last indirect blocks can actually be written * (if they were partially freed) -- they must be dirtied, even if * they do not exist on disk yet. The interior blocks will * be freed by free_children(), so they will not actually be written. * Even though these interior blocks will not be written, we * dirty them for two reasons: * * - It ensures that the indirect blocks remain in memory until * syncing context. (They have already been prefetched by * dmu_tx_hold_free(), so we don't have to worry about reading * them serially here.) * * - The dirty space accounting will put pressure on the txg sync * mechanism to begin syncing, and to delay transactions if there * is a large amount of freeing. Even though these indirect * blocks will not be written, we could need to write the same * amount of space if we copy the freed BPs into deadlists. */ if (dn->dn_nlevels > 1) { uint64_t first, last; first = blkid >> epbs; dnode_dirty_l1(dn, first, tx); if (trunc) last = dn->dn_maxblkid >> epbs; else last = (blkid + nblks - 1) >> epbs; if (last != first) dnode_dirty_l1(dn, last, tx); dnode_dirty_l1range(dn, first, last, tx); int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = first + 1; i < last; i++) { /* * Set i to the blockid of the next non-hole * level-1 indirect block at or after i. Note * that dnode_next_offset() operates in terms of * level-0-equivalent bytes. */ uint64_t ibyte = i << shift; int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); i = ibyte >> shift; if (i >= last) break; /* * Normally we should not see an error, either * from dnode_next_offset() or dbuf_hold_level() * (except for ESRCH from dnode_next_offset). * If there is an i/o error, then when we read * this block in syncing context, it will use * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according * to the "failmode" property. dnode_next_offset() * doesn't have a flag to indicate MUSTSUCCEED. */ if (err != 0) break; dnode_dirty_l1(dn, i, tx); } } done: /* * Add this range to the dnode range list. * We will finish up this free operation in the syncing phase. */ mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", blkid, nblks, tx->tx_txg); mutex_exit(&dn->dn_mtx); dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); out: rw_exit(&dn->dn_struct_rwlock); } static boolean_t dnode_spill_freed(dnode_t *dn) { int i; mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) break; } mutex_exit(&dn->dn_mtx); return (i < TXG_SIZE); } /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid) { void *dp = spa_get_dsl(dn->dn_objset->os_spa); int i; if (blkid == DMU_BONUS_BLKID) return (FALSE); /* * If we're in the process of opening the pool, dp will not be * set yet, but there shouldn't be anything dirty. */ if (dp == NULL) return (FALSE); if (dn->dn_free_txg) return (TRUE); if (blkid == DMU_SPILL_BLKID) return (dnode_spill_freed(dn)); mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { if (dn->dn_free_ranges[i] != NULL && range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) break; } mutex_exit(&dn->dn_mtx); return (i < TXG_SIZE); } /* call from syncing context when we actually write/free space for this dnode */ void dnode_diduse_space(dnode_t *dn, int64_t delta) { uint64_t space; dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", dn, dn->dn_phys, (u_longlong_t)dn->dn_phys->dn_used, (longlong_t)delta); mutex_enter(&dn->dn_mtx); space = DN_USED_BYTES(dn->dn_phys); if (delta > 0) { ASSERT3U(space + delta, >=, space); /* no overflow */ } else { ASSERT3U(space, >=, -delta); /* no underflow */ } space += delta; if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); ASSERT0(P2PHASE(space, 1<dn_phys->dn_used = space >> DEV_BSHIFT; } else { dn->dn_phys->dn_used = space; dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; } mutex_exit(&dn->dn_mtx); } /* * Scans a block at the indicated "level" looking for a hole or data, * depending on 'flags'. * * If level > 0, then we are scanning an indirect block looking at its * pointers. If level == 0, then we are looking at a block of dnodes. * * If we don't find what we are looking for in the block, we return ESRCH. * Otherwise, return with *offset pointing to the beginning (if searching * forwards) or end (if searching backwards) of the range covered by the * block pointer we matched on (or dnode). * * The basic search algorithm used below by dnode_next_offset() is to * use this function to search up the block tree (widen the search) until * we find something (i.e., we don't return ESRCH) and then search back * down the tree (narrow the search) until we reach our original search * level. */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; void *data = NULL; uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t epb = 1ULL << epbs; uint64_t minfill, maxfill; boolean_t hole; int i, inc, error, span; dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; ASSERT(txg == 0 || !hole); if (lvl == dn->dn_phys->dn_nlevels) { error = 0; epb = dn->dn_phys->dn_nblkptr; data = dn->dn_phys->dn_blkptr; } else { uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); if (error) { if (error != ENOENT) return (error); if (hole) return (0); /* * This can only happen when we are searching up * the block tree for data. We don't really need to * adjust the offset, as we will just end up looking * at the pointer to this block in its parent, and its * going to be unallocated, so we will skip over it. */ return (SET_ERROR(ESRCH)); } error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); if (error) { dbuf_rele(db, FTAG); return (error); } data = db->db.db_data; } if (db != NULL && txg != 0 && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg || BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree * and these conditions mean that we need to keep climbing. */ error = SET_ERROR(ESRCH); } else if (lvl == 0) { dnode_phys_t *dnp = data; ASSERT(dn->dn_type == DMU_OT_DNODE); ASSERT(!(flags & DNODE_FIND_BACKWARDS)); for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); i < blkfill; i += dnp[i].dn_extra_slots + 1) { if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; } if (i == blkfill) error = SET_ERROR(ESRCH); *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + (i << DNODE_SHIFT); } else { blkptr_t *bp = data; uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); if (hole) maxfill--; else minfill++; *offset = *offset >> span; for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && (hole || bp[i].blk_birth > txg)) break; if (inc > 0 || *offset > 0) *offset += inc; } *offset = *offset << span; if (inc < 0) { /* traversing backwards; position offset at the end */ ASSERT3U(*offset, <=, start); *offset = MIN(*offset + (1ULL << span) - 1, start); } else if (*offset < start) { *offset = start; } if (i < 0 || i >= epb) error = SET_ERROR(ESRCH); } if (db) dbuf_rele(db, FTAG); return (error); } /* * Find the next hole, data, or sparse region at or after *offset. * The value 'blkfill' tells us how many items we expect to find * in an L0 data block; this value is 1 for normal objects, * DNODES_PER_BLOCK for the meta dnode, and some fraction of * DNODES_PER_BLOCK when searching for sparse regions thereof. * * Examples: * * dnode_next_offset(dn, flags, offset, 1, 1, 0); * Finds the next/previous hole/data in a file. * Used in dmu_offset_next(). * * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); * Finds the next free/allocated dnode an objset's meta-dnode. * Only finds objects that have new contents since txg (ie. * bonus buffer changes and content removal are ignored). * Used in dmu_object_next(). * * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); * Finds the next L2 meta-dnode bp that's at most 1/4 full. * Used in dmu_object_alloc(). */ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { uint64_t initial_offset = *offset; int lvl, maxlvl; int error = 0; if (!(flags & DNODE_FIND_HAVELOCK)) rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { error = SET_ERROR(ESRCH); goto out; } if (dn->dn_datablkshift == 0) { if (*offset < dn->dn_datablksz) { if (flags & DNODE_FIND_HOLE) *offset = dn->dn_datablksz; } else { error = SET_ERROR(ESRCH); } goto out; } maxlvl = dn->dn_phys->dn_nlevels; for (lvl = minlvl; lvl <= maxlvl; lvl++) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); if (error != ESRCH) break; } while (error == 0 && --lvl >= minlvl) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); } /* * There's always a "virtual hole" at the end of the object, even * if all BP's which physically exist are non-holes. */ if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { error = 0; } if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? initial_offset < *offset : initial_offset > *offset)) error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); return (error); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c (revision 351077) @@ -1,779 +1,777 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include static void dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) { dmu_buf_impl_t *db; int txgoff = tx->tx_txg & TXG_MASK; int nblkptr = dn->dn_phys->dn_nblkptr; int old_toplvl = dn->dn_phys->dn_nlevels - 1; int new_level = dn->dn_next_nlevels[txgoff]; int i; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* this dnode can't be paged out because it's dirty */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); ASSERT(db != NULL); dn->dn_phys->dn_nlevels = new_level; dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, dn->dn_object, dn->dn_phys->dn_nlevels); /* transfer dnode's block pointers to new indirect block */ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, sizeof (blkptr_t) * nblkptr); arc_buf_freeze(db->db_buf); /* set dbuf's parent pointers to new indirect buf */ for (i = 0; i < nblkptr; i++) { dmu_buf_impl_t *child = dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); if (child == NULL) continue; #ifdef DEBUG DB_DNODE_ENTER(child); ASSERT3P(DB_DNODE(child), ==, dn); DB_DNODE_EXIT(child); #endif /* DEBUG */ if (child->db_parent && child->db_parent != dn->dn_dbuf) { ASSERT(child->db_parent->db_level == db->db_level); ASSERT(child->db_blkptr != &dn->dn_phys->dn_blkptr[child->db_blkid]); mutex_exit(&child->db_mtx); continue; } ASSERT(child->db_parent == NULL || child->db_parent == dn->dn_dbuf); child->db_parent = db; dbuf_add_ref(db, child); if (db->db.db_data) child->db_blkptr = (blkptr_t *)db->db.db_data + i; else child->db_blkptr = NULL; dprintf_dbuf_bp(child, child->db_blkptr, "changed db_blkptr to new indirect %s", ""); mutex_exit(&child->db_mtx); } bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); } static void free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint64_t bytesfreed = 0; dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); for (int i = 0; i < num; i++, bp++) { if (BP_IS_HOLE(bp)) continue; bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); /* * Save some useful information on the holes being * punched, including logical size, type, and indirection * level. Retaining birth time enables detection of when * holes are punched for reducing the number of free * records transmitted during a zfs send. */ uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t type = BP_GET_TYPE(bp); uint64_t lvl = BP_GET_LEVEL(bp); bzero(bp, sizeof (blkptr_t)); if (spa_feature_is_active(dn->dn_objset->os_spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, type); BP_SET_LEVEL(bp, lvl); BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0); } } dnode_diduse_space(dn, -bytesfreed); } #ifdef ZFS_DEBUG static void free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) { int off, num; int i, err, epbs; uint64_t txg = tx->tx_txg; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; off = start - (db->db_blkid * 1<=, 0); ASSERT3U(num, >=, 0); ASSERT3U(db->db_level, >, 0); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); ASSERT(db->db_blkptr != NULL); for (i = off; i < off+num; i++) { uint64_t *buf; dmu_buf_impl_t *child; dbuf_dirty_record_t *dr; int j; ASSERT(db->db_level == 1); rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, db->db_level-1, (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; ASSERT(err == 0); ASSERT(child->db_level == 0); dr = child->db_last_dirty; while (dr && dr->dr_txg > txg) dr = dr->dr_next; ASSERT(dr == NULL || dr->dr_txg == txg); /* data_old better be zeroed */ if (dr) { buf = dr->dt.dl.dr_data->b_data; for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", (void *)child, i, off, num); } } } /* * db_data better be zeroed unless it's dirty in a * future txg. */ mutex_enter(&child->db_mtx); buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && child->db_last_dirty == NULL) { for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", (void *)child, i, off, num); } } } mutex_exit(&child->db_mtx); dbuf_rele(child, FTAG); } DB_DNODE_EXIT(db); } #endif /* * We don't usually free the indirect blocks here. If in one txg we have a * free_range and a write to the same indirect block, it's important that we * preserve the hole's birth times. Therefore, we don't free any any indirect * blocks in free_children(). If an indirect block happens to turn into all * holes, it will be freed by dbuf_write_children_ready, which happens at a * point in the syncing process where we know for certain the contents of the * indirect block. * * However, if we're freeing a dnode, its space accounting must go to zero * before we actually try to free the dnode, or we will trip an assertion. In * addition, we know the case described above cannot occur, because the dnode is * being freed. Therefore, we free the indirect blocks immediately in that * case. */ static void free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, boolean_t free_indirects, dmu_tx_t *tx) { dnode_t *dn; blkptr_t *bp; dmu_buf_impl_t *subdb; uint64_t start, end, dbstart, dbend; unsigned int epbs, shift, i; /* * There is a small possibility that this block will not be cached: * 1 - if level > 1 and there are no children with level <= 1 * 2 - if this block was evicted since we read it from * dmu_tx_hold_free(). */ if (db->db_state != DB_CACHED) (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); /* * If we modify this indirect block, and we are not freeing the * dnode (!free_indirects), then this indirect block needs to get * written to disk by dbuf_write(). If it is dirty, we know it will * be written (otherwise, we would have incorrect on-disk state * because the space would be freed but still referenced by the BP * in this indirect block). Therefore we VERIFY that it is * dirty. * * Our VERIFY covers some cases that do not actually have to be * dirty, but the open-context code happens to dirty. E.g. if the * blocks we are freeing are all holes, because in that case, we * are only freeing part of this indirect block, so it is an * ancestor of the first or last block to be freed. The first and * last L1 indirect blocks are always dirtied by dnode_free_range(). */ VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0); dbuf_release_bp(db); bp = db->db.db_data; DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(epbs, <, 31); shift = (db->db_level - 1) * epbs; dbstart = db->db_blkid << epbs; start = blkid >> shift; if (dbstart < start) { bp += start - dbstart; } else { start = dbstart; } dbend = ((db->db_blkid + 1) << epbs) - 1; end = (blkid + nblks - 1) >> shift; if (dbend <= end) end = dbend; ASSERT3U(start, <=, end); if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); free_blocks(dn, bp, end-start+1, tx); } else { for (uint64_t id = start; id <= end; id++, bp++) { if (BP_IS_HOLE(bp)) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, id, TRUE, FALSE, FTAG, &subdb)); rw_exit(&dn->dn_struct_rwlock); ASSERT3P(bp, ==, subdb->db_blkptr); free_children(subdb, blkid, nblks, free_indirects, tx); dbuf_rele(subdb, FTAG); } } if (free_indirects) { for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) ASSERT(BP_IS_HOLE(bp)); bzero(db->db.db_data, db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); } DB_DNODE_EXIT(db); arc_buf_freeze(db->db_buf); } /* * Traverse the indicated range of the provided file * and "free" all the blocks contained there. */ static void dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, boolean_t free_indirects, dmu_tx_t *tx) { blkptr_t *bp = dn->dn_phys->dn_blkptr; int dnlevel = dn->dn_phys->dn_nlevels; boolean_t trunc = B_FALSE; if (blkid > dn->dn_phys->dn_maxblkid) return; ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); if (blkid + nblks > dn->dn_phys->dn_maxblkid) { nblks = dn->dn_phys->dn_maxblkid - blkid + 1; trunc = B_TRUE; } /* There are no indirect blocks in the object */ if (dnlevel == 1) { if (blkid >= dn->dn_phys->dn_nblkptr) { /* this range was never made persistent */ return; } ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); free_blocks(dn, bp + blkid, nblks, tx); } else { int shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); int start = blkid >> shift; int end = (blkid + nblks - 1) >> shift; dmu_buf_impl_t *db; ASSERT(start < dn->dn_phys->dn_nblkptr); bp += start; for (int i = start; i <= end; i++, bp++) { if (BP_IS_HOLE(bp)) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); free_children(db, blkid, nblks, free_indirects, tx); dbuf_rele(db, FTAG); } } if (trunc) { dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1; uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } } typedef struct dnode_sync_free_range_arg { dnode_t *dsfra_dnode; dmu_tx_t *dsfra_tx; boolean_t dsfra_free_indirects; } dnode_sync_free_range_arg_t; static void dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) { dnode_sync_free_range_arg_t *dsfra = arg; dnode_t *dn = dsfra->dsfra_dnode; mutex_exit(&dn->dn_mtx); dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_free_indirects, dsfra->dsfra_tx); mutex_enter(&dn->dn_mtx); } /* * Try to kick all the dnode's dbufs out of the cache... */ void dnode_evict_dbufs(dnode_t *dn) { dmu_buf_impl_t db_marker; dmu_buf_impl_t *db, *db_next; mutex_enter(&dn->dn_dbufs_mtx); for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { #ifdef DEBUG DB_DNODE_ENTER(db); ASSERT3P(DB_DNODE(db), ==, dn); DB_DNODE_EXIT(db); #endif /* DEBUG */ mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING && refcount_is_zero(&db->db_holds)) { db_marker.db_level = db->db_level; db_marker.db_blkid = db->db_blkid; db_marker.db_state = DB_SEARCH; avl_insert_here(&dn->dn_dbufs, &db_marker, db, AVL_BEFORE); /* * We need to use the "marker" dbuf rather than * simply getting the next dbuf, because * dbuf_destroy() may actually remove multiple dbufs. * It can call itself recursively on the parent dbuf, * which may also be removed from dn_dbufs. The code * flow would look like: * * dbuf_destroy(): * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE): * if (!cacheable || pending_evict) * dbuf_destroy() */ dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); avl_remove(&dn->dn_dbufs, &db_marker); } else { db->db_pending_evict = TRUE; mutex_exit(&db->db_mtx); db_next = AVL_NEXT(&dn->dn_dbufs, db); } } mutex_exit(&dn->dn_dbufs_mtx); dnode_evict_bonus(dn); } void dnode_evict_bonus(dnode_t *dn) { rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus != NULL) { if (refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } else { dn->dn_bonus->db_pending_evict = TRUE; } } rw_exit(&dn->dn_struct_rwlock); } static void dnode_undirty_dbufs(list_t *list) { dbuf_dirty_record_t *dr; while (dr = list_head(list)) { dmu_buf_impl_t *db = dr->dr_dbuf; uint64_t txg = dr->dr_txg; if (db->db_level != 0) dnode_undirty_dbufs(&dr->dt.di.dr_children); mutex_enter(&db->db_mtx); /* XXX - use dbuf_undirty()? */ list_remove(list, dr); ASSERT(db->db_last_dirty == dr); db->db_last_dirty = NULL; db->db_dirtycnt -= 1; if (db->db_level == 0) { ASSERT(db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_data == db->db_buf); dbuf_unoverride(dr); } else { mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); } } static void dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) { int txgoff = tx->tx_txg & TXG_MASK; ASSERT(dmu_tx_is_syncing(tx)); /* * Our contents should have been freed in dnode_sync() by the * free range record inserted by the caller of dnode_free(). */ ASSERT0(DN_USED_BYTES(dn->dn_phys)); ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); /* * XXX - It would be nice to assert this, but we may still * have residual holds from async evictions from the arc... * * zfs_obj_to_path() also depends on this being * commented out. * * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */ /* Undirty next bits */ dn->dn_next_nlevels[txgoff] = 0; dn->dn_next_indblkshift[txgoff] = 0; dn->dn_next_blksz[txgoff] = 0; /* ASSERT(blkptrs are zero); */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(dn->dn_type != DMU_OT_NONE); ASSERT(dn->dn_free_txg > 0); if (dn->dn_allocated_txg != dn->dn_free_txg) dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots); - dnode_free_interior_slots(dn); mutex_enter(&dn->dn_mtx); dn->dn_type = DMU_OT_NONE; dn->dn_maxblkid = 0; dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_have_spill = B_FALSE; - dn->dn_num_slots = 1; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); /* * Now that we've released our hold, the dnode may * be evicted, so we musn't access it. */ } /* * Write out the dnode's dirty buffers. */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) { dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; static const dnode_phys_t zerodn = { 0 }; boolean_t kill_spill = B_FALSE; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); ASSERT(dnp->dn_type != DMU_OT_NONE || bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); if (dmu_objset_userused_enabled(dn->dn_objset) && !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { mutex_enter(&dn->dn_mtx); dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); dn->dn_oldflags = dn->dn_phys->dn_flags; dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; mutex_exit(&dn->dn_mtx); dmu_objset_userquota_get_ids(dn, B_FALSE, tx); } else { /* Once we account for it, we should always account for it. */ ASSERT(!(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED)); } mutex_enter(&dn->dn_mtx); if (dn->dn_allocated_txg == tx->tx_txg) { /* The dnode is newly allocated or reallocated */ if (dnp->dn_type == DMU_OT_NONE) { /* this is a first alloc, not a realloc */ dnp->dn_nlevels = 1; dnp->dn_nblkptr = dn->dn_nblkptr; } dnp->dn_type = dn->dn_type; dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; } dnp->dn_extra_slots = dn->dn_num_slots - 1; ASSERT(dnp->dn_nlevels > 1 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || BP_GET_LSIZE(&dnp->dn_blkptr[0]) == dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT(dnp->dn_nlevels < 2 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift); if (dn->dn_next_type[txgoff] != 0) { dnp->dn_type = dn->dn_type; dn->dn_next_type[txgoff] = 0; } if (dn->dn_next_blksz[txgoff] != 0) { ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], SPA_MINBLOCKSIZE) == 0); ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || dn->dn_maxblkid == 0 || list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec || !range_tree_is_empty(dn->dn_free_ranges[txgoff])); dnp->dn_datablkszsec = dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; dn->dn_next_blksz[txgoff] = 0; } if (dn->dn_next_bonuslen[txgoff] != 0) { if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) dnp->dn_bonuslen = 0; else dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; ASSERT(dnp->dn_bonuslen <= DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1)); dn->dn_next_bonuslen[txgoff] = 0; } if (dn->dn_next_bonustype[txgoff] != 0) { ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; dn->dn_next_bonustype[txgoff] = 0; } boolean_t freeing_dnode = dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg; /* * Remove the spill block if we have been explicitly asked to * remove it, or if the object is being removed. */ if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) { if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) kill_spill = B_TRUE; dn->dn_rm_spillblk[txgoff] = 0; } if (dn->dn_next_indblkshift[txgoff] != 0) { ASSERT(dnp->dn_nlevels == 1); dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; dn->dn_next_indblkshift[txgoff] = 0; } /* * Just take the live (open-context) values for checksum and compress. * Strictly speaking it's a future leak, but nothing bad happens if we * start using the new checksum or compress algorithm a little early. */ dnp->dn_checksum = dn->dn_checksum; dnp->dn_compress = dn->dn_compress; mutex_exit(&dn->dn_mtx); if (kill_spill) { free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx); mutex_enter(&dn->dn_mtx); dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* process all the "freed" ranges in the file */ if (dn->dn_free_ranges[txgoff] != NULL) { dnode_sync_free_range_arg_t dsfra; dsfra.dsfra_dnode = dn; dsfra.dsfra_tx = tx; dsfra.dsfra_free_indirects = freeing_dnode; if (freeing_dnode) { ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], 0, dn->dn_maxblkid + 1)); } mutex_enter(&dn->dn_mtx); range_tree_vacate(dn->dn_free_ranges[txgoff], dnode_sync_free_range, &dsfra); range_tree_destroy(dn->dn_free_ranges[txgoff]); dn->dn_free_ranges[txgoff] = NULL; mutex_exit(&dn->dn_mtx); } if (freeing_dnode) { dn->dn_objset->os_freed_dnodes++; dnode_sync_free(dn, tx); return; } if (dn->dn_num_slots > DNODE_MIN_SLOTS) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; mutex_enter(&ds->ds_lock); ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] = B_TRUE; mutex_exit(&ds->ds_lock); } if (dn->dn_next_nlevels[txgoff]) { dnode_increase_indirection(dn, tx); dn->dn_next_nlevels[txgoff] = 0; } if (dn->dn_next_nblkptr[txgoff]) { /* this should only happen on a realloc */ ASSERT(dn->dn_allocated_txg == tx->tx_txg); if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { /* zero the new blkptrs we are gaining */ bzero(dnp->dn_blkptr + dnp->dn_nblkptr, sizeof (blkptr_t) * (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); #ifdef ZFS_DEBUG } else { int i; ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); /* the blkptrs we are losing better be unallocated */ for (i = dn->dn_next_nblkptr[txgoff]; i < dnp->dn_nblkptr; i++) ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); #endif } mutex_enter(&dn->dn_mtx); dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff]; dn->dn_next_nblkptr[txgoff] = 0; mutex_exit(&dn->dn_mtx); } dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx); if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ASSERT3P(list_head(list), ==, NULL); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } /* * Although we have dropped our reference to the dnode, it * can't be evicted until its written, and we haven't yet * initiated the IO for the dnode's dbuf. */ } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c (revision 351077) @@ -1,2012 +1,2009 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 iXsystems, Inc * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ZFS System attributes: * * A generic mechanism to allow for arbitrary attributes * to be stored in a dnode. The data will be stored in the bonus buffer of * the dnode and if necessary a special "spill" block will be used to handle * overflow situations. The spill block will be sized to fit the data * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the * spill block is stored at the end of the current bonus buffer. Any * attributes that would be in the way of the blkptr_t will be relocated * into the spill block. * * Attribute registration: * * Stored persistently on a per dataset basis * a mapping between attribute "string" names and their actual attribute * numeric values, length, and byteswap function. The names are only used * during registration. All attributes are known by their unique attribute * id value. If an attribute can have a variable size then the value * 0 will be used to indicate this. * * Attribute Layout: * * Attribute layouts are a way to compactly store multiple attributes, but * without taking the overhead associated with managing each attribute * individually. Since you will typically have the same set of attributes * stored in the same order a single table will be used to represent that * layout. The ZPL for example will usually have only about 10 different * layouts (regular files, device files, symlinks, * regular files + scanstamp, files/dir with extended attributes, and then * you have the possibility of all of those minus ACL, because it would * be kicked out into the spill block) * * Layouts are simply an array of the attributes and their * ordering i.e. [0, 1, 4, 5, 2] * * Each distinct layout is given a unique layout number and that is whats * stored in the header at the beginning of the SA data buffer. * * A layout only covers a single dbuf (bonus or spill). If a set of * attributes is split up between the bonus buffer and a spill buffer then * two different layouts will be used. This allows us to byteswap the * spill without looking at the bonus buffer and keeps the on disk format of * the bonus and spill buffer the same. * * Adding a single attribute will cause the entire set of attributes to * be rewritten and could result in a new layout number being constructed * as part of the rewrite if no such layout exists for the new set of * attribues. The new attribute will be appended to the end of the already * existing attributes. * * Both the attribute registration and attribute layout information are * stored in normal ZAP attributes. Their should be a small number of * known layouts and the set of attributes is assumed to typically be quite * small. * * The registered attributes and layout "table" information is maintained * in core and a special "sa_os_t" is attached to the objset_t. * * A special interface is provided to allow for quickly applying * a large set of attributes at once. sa_replace_all_by_template() is * used to set an array of attributes. This is used by the ZPL when * creating a brand new file. The template that is passed into the function * specifies the attribute, size for variable length attributes, location of * data and special "data locator" function if the data isn't in a contiguous * location. * * Byteswap implications: * * Since the SA attributes are not entirely self describing we can't do * the normal byteswap processing. The special ZAP layout attribute and * attribute registration attributes define the byteswap function and the * size of the attributes, unless it is variable sized. * The normal ZFS byteswapping infrastructure assumes you don't need * to read any objects in order to do the necessary byteswapping. Whereas * SA attributes can only be properly byteswapped if the dataset is opened * and the layout/attribute ZAP attributes are available. Because of this * the SA attributes will be byteswapped when they are first accessed by * the SA code that will read the SA data. */ typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, uint16_t length, int length_idx, boolean_t, void *userp); static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr); static void sa_idx_tab_rele(objset_t *os, void *arg); static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, int buflen); static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx); arc_byteswap_func_t *sa_bswap_table[] = { byteswap_uint64_array, byteswap_uint32_array, byteswap_uint16_array, byteswap_uint8_array, zfs_acl_byteswap, }; #define SA_COPY_DATA(f, s, t, l) \ { \ if (f == NULL) { \ if (l == 8) { \ *(uint64_t *)t = *(uint64_t *)s; \ } else if (l == 16) { \ *(uint64_t *)t = *(uint64_t *)s; \ *(uint64_t *)((uintptr_t)t + 8) = \ *(uint64_t *)((uintptr_t)s + 8); \ } else { \ bcopy(s, t, l); \ } \ } else \ sa_copy_data(f, s, t, l); \ } /* * This table is fixed and cannot be changed. Its purpose is to * allow the SA code to work with both old/new ZPL file systems. * It contains the list of legacy attributes. These attributes aren't * stored in the "attribute" registry zap objects, since older ZPL file systems * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will * use this static table. */ sa_attr_reg_t sa_legacy_attrs[] = { {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, }; /* * This is only used for objects of type DMU_OT_ZNODE */ sa_attr_type_t sa_legacy_zpl_layout[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; /* * Special dummy layout used for buffers with no attributes. */ sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; static int sa_legacy_attr_count = 16; static kmem_cache_t *sa_cache = NULL; /*ARGSUSED*/ static int sa_cache_constructor(void *buf, void *unused, int kmflag) { sa_handle_t *hdl = buf; mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } /*ARGSUSED*/ static void sa_cache_destructor(void *buf, void *unused) { sa_handle_t *hdl = buf; mutex_destroy(&hdl->sa_lock); } void sa_cache_init(void) { sa_cache = kmem_cache_create("sa_cache", sizeof (sa_handle_t), 0, sa_cache_constructor, sa_cache_destructor, NULL, NULL, NULL, 0); } void sa_cache_fini(void) { if (sa_cache) kmem_cache_destroy(sa_cache); } static int layout_num_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; return (AVL_CMP(node1->lot_num, node2->lot_num)); } static int layout_hash_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash); if (likely(cmp)) return (cmp); return (AVL_CMP(node1->lot_instance, node2->lot_instance)); } boolean_t sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) { int i; if (count != tbf->lot_attr_count) return (1); for (i = 0; i != count; i++) { if (attrs[i] != tbf->lot_attrs[i]) return (1); } return (0); } #define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) static uint64_t sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) { int i; uint64_t crc = -1ULL; for (i = 0; i != attr_count; i++) crc ^= SA_ATTR_HASH(attrs[i]); return (crc); } static int sa_get_spill(sa_handle_t *hdl) { int rc; if (hdl->sa_spill == NULL) { if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, &hdl->sa_spill)) == 0) VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } else { rc = 0; } return (rc); } /* * Main attribute lookup/update function * returns 0 for success or non zero for failures * * Operates on bulk array, first failure will abort further processing */ int sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, sa_data_op_t data_op, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; int i; int error = 0; sa_buf_type_t buftypes; buftypes = 0; ASSERT(count > 0); for (i = 0; i != count; i++) { ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); bulk[i].sa_addr = NULL; /* First check the bonus buffer */ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_bonus_tab, SA_GET_HDR(hdl, SA_BONUS), bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); if (tx && !(buftypes & SA_BONUS)) { dmu_buf_will_dirty(hdl->sa_bonus, tx); buftypes |= SA_BONUS; } } if (bulk[i].sa_addr == NULL && ((error = sa_get_spill(hdl)) == 0)) { if (TOC_ATTR_PRESENT( hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_spill_tab, SA_GET_HDR(hdl, SA_SPILL), bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); if (tx && !(buftypes & SA_SPILL) && bulk[i].sa_size == bulk[i].sa_length) { dmu_buf_will_dirty(hdl->sa_spill, tx); buftypes |= SA_SPILL; } } } if (error && error != ENOENT) { return ((error == ECKSUM) ? EIO : error); } switch (data_op) { case SA_LOOKUP: if (bulk[i].sa_addr == NULL) return (SET_ERROR(ENOENT)); if (bulk[i].sa_data) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_addr, bulk[i].sa_data, bulk[i].sa_size); } continue; case SA_UPDATE: /* existing rewrite of attr */ if (bulk[i].sa_addr && bulk[i].sa_size == bulk[i].sa_length) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_addr, bulk[i].sa_length); continue; } else if (bulk[i].sa_addr) { /* attr size change */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_REPLACE, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } else { /* adding new attribute */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_ADD, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } if (error) return (error); break; } } return (error); } static sa_lot_t * sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) { sa_os_t *sa = os->os_sa; sa_lot_t *tb, *findtb; int i; avl_index_t loc; ASSERT(MUTEX_HELD(&sa->sa_lock)); tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); tb->lot_attr_count = attr_count; tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); tb->lot_num = lot_num; tb->lot_hash = hash; tb->lot_instance = 0; if (zapadd) { char attr_name[8]; if (sa->sa_layout_attr_obj == 0) { sa->sa_layout_attr_obj = zap_create_link(os, DMU_OT_SA_ATTR_LAYOUTS, sa->sa_master_obj, SA_LAYOUTS, tx); } (void) snprintf(attr_name, sizeof (attr_name), "%d", (int)lot_num); VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, attr_name, 2, attr_count, attrs, tx)); } list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), offsetof(sa_idx_tab_t, sa_next)); for (i = 0; i != attr_count; i++) { if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) tb->lot_var_sizes++; } avl_add(&sa->sa_layout_num_tree, tb); /* verify we don't have a hash collision */ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { for (; findtb && findtb->lot_hash == hash; findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { if (findtb->lot_instance != tb->lot_instance) break; tb->lot_instance++; } } avl_add(&sa->sa_layout_hash_tree, tb); return (tb); } static void sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, int count, dmu_tx_t *tx, sa_lot_t **lot) { sa_lot_t *tb, tbsearch; avl_index_t loc; sa_os_t *sa = os->os_sa; boolean_t found = B_FALSE; mutex_enter(&sa->sa_lock); tbsearch.lot_hash = hash; tbsearch.lot_instance = 0; tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); if (tb) { for (; tb && tb->lot_hash == hash; tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { if (sa_layout_equal(tb, attrs, count) == 0) { found = B_TRUE; break; } } } if (!found) { tb = sa_add_layout_entry(os, attrs, count, avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); } mutex_exit(&sa->sa_lock); *lot = tb; } static int sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) { int error; uint32_t blocksize; if (size == 0) { blocksize = SPA_MINBLOCKSIZE; } else if (size > SPA_OLD_MAXBLOCKSIZE) { ASSERT(0); return (SET_ERROR(EFBIG)); } else { blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); } error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); ASSERT(error == 0); return (error); } static void sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) { if (func == NULL) { bcopy(datastart, target, buflen); } else { boolean_t start; int bytes; void *dataptr; void *saptr = target; uint32_t length; start = B_TRUE; bytes = 0; while (bytes < buflen) { func(&dataptr, &length, buflen, start, datastart); bcopy(dataptr, saptr, length); saptr = (void *)((caddr_t)saptr + length); bytes += length; start = B_FALSE; } } } /* * Determine several different sizes * first the sa header size * the number of bytes to be stored * if spill would occur the index in the attribute array is returned * * the boolean will_spill will be set when spilling is necessary. It * is only set when the buftype is SA_BONUS */ static int sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index, int *total, boolean_t *will_spill) { int var_size = 0; int i; int hdrsize; int extra_hdrsize; if (buftype == SA_BONUS && sa->sa_force_spill) { *total = 0; *index = 0; *will_spill = B_TRUE; return (0); } *index = -1; *total = 0; *will_spill = B_FALSE; extra_hdrsize = 0; hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : sizeof (sa_hdr_phys_t); ASSERT(IS_P2ALIGNED(full_space, 8)); for (i = 0; i != attr_count; i++) { boolean_t is_var_sz; *total = P2ROUNDUP(*total, 8); *total += attr_desc[i].sa_length; if (*will_spill) continue; is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); if (is_var_sz) { var_size++; } if (is_var_sz && var_size > 1) { /* * Don't worry that the spill block might overflow. * It will be resized if needed in sa_build_layouts(). */ if (buftype == SA_SPILL || P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + *total < full_space) { /* * Account for header space used by array of * optional sizes of variable-length attributes. * Record the extra header size in case this * increase needs to be reversed due to * spill-over. */ hdrsize += sizeof (uint16_t); if (*index != -1) extra_hdrsize += sizeof (uint16_t); } else { ASSERT(buftype == SA_BONUS); if (*index == -1) *index = i; *will_spill = B_TRUE; continue; } } /* * find index of where spill *could* occur. * Then continue to count of remainder attribute * space. The sum is used later for sizing bonus * and spill buffer. */ if (buftype == SA_BONUS && *index == -1 && (*total + P2ROUNDUP(hdrsize, 8)) > (full_space - sizeof (blkptr_t))) { *index = i; } if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && buftype == SA_BONUS) *will_spill = B_TRUE; } if (*will_spill) hdrsize -= extra_hdrsize; hdrsize = P2ROUNDUP(hdrsize, 8); return (hdrsize); } #define BUF_SPACE_NEEDED(total, header) (total + header) /* * Find layout that corresponds to ordering of attributes * If not found a new layout number is created and added to * persistent layout tables. */ static int sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; uint64_t hash; sa_buf_type_t buftype; sa_hdr_phys_t *sahdr; void *data_start; int buf_space; sa_attr_type_t *attrs, *attrs_start; int i, lot_count; int dnodesize; int hdrsize; int spillhdrsize = 0; int used; dmu_object_type_t bonustype; sa_lot_t *lot; int len_idx; int spill_used; int bonuslen; boolean_t spilling; dmu_buf_will_dirty(hdl->sa_bonus, tx); bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); bonuslen = DN_BONUS_SIZE(dnodesize); - dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); - bonuslen = DN_BONUS_SIZE(dnodesize); - /* first determine bonus header size and sum of all attributes */ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, SA_BONUS, bonuslen, &i, &used, &spilling); if (used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) : used + hdrsize, tx)); ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || bonustype == DMU_OT_SA); /* setup and size spill buffer when needed */ if (spilling) { boolean_t dummy; if (hdl->sa_spill == NULL) { VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, &hdl->sa_spill) == 0); } dmu_buf_will_dirty(hdl->sa_spill, tx); spillhdrsize = sa_find_sizes(sa, &attr_desc[i], attr_count - i, hdl->sa_spill, SA_SPILL, hdl->sa_spill->db_size, &i, &spill_used, &dummy); if (spill_used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); buf_space = hdl->sa_spill->db_size - spillhdrsize; if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > hdl->sa_spill->db_size) VERIFY(0 == sa_resize_spill(hdl, BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); } /* setup starting pointers to lay down data */ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; buftype = SA_BONUS; if (spilling) buf_space = (sa->sa_force_spill) ? 0 : SA_BLKPTR_SPACE - hdrsize; else buf_space = hdl->sa_bonus->db_size - hdrsize; attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); lot_count = 0; for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { uint16_t length; ASSERT(IS_P2ALIGNED(data_start, 8)); ASSERT(IS_P2ALIGNED(buf_space, 8)); attrs[i] = attr_desc[i].sa_attr; length = SA_REGISTERED_LEN(sa, attrs[i]); if (length == 0) length = attr_desc[i].sa_length; else VERIFY(length == attr_desc[i].sa_length); if (buf_space < length) { /* switch to spill buffer */ VERIFY(spilling); VERIFY(bonustype == DMU_OT_SA); if (buftype == SA_BONUS && !sa->sa_force_spill) { sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); SA_SET_HDR(sahdr, lot->lot_num, hdrsize); } buftype = SA_SPILL; hash = -1ULL; len_idx = 0; sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; sahdr->sa_magic = SA_MAGIC; data_start = (void *)((uintptr_t)sahdr + spillhdrsize); attrs_start = &attrs[i]; buf_space = hdl->sa_spill->db_size - spillhdrsize; lot_count = 0; } hash ^= SA_ATTR_HASH(attrs[i]); attr_desc[i].sa_addr = data_start; attr_desc[i].sa_size = length; SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, data_start, length); if (sa->sa_attr_table[attrs[i]].sa_length == 0) { sahdr->sa_lengths[len_idx++] = length; } VERIFY((uintptr_t)data_start % 8 == 0); data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + length), 8); buf_space -= P2ROUNDUP(length, 8); lot_count++; } sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); /* * Verify that old znodes always have layout number 0. * Must be DMU_OT_SA for arbitrary layouts */ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || (bonustype == DMU_OT_SA && lot->lot_num > 1)); if (bonustype == DMU_OT_SA) { SA_SET_HDR(sahdr, lot->lot_num, buftype == SA_BONUS ? hdrsize : spillhdrsize); } kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); if (hdl->sa_bonus_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); hdl->sa_bonus_tab = NULL; } if (!sa->sa_force_spill) VERIFY(0 == sa_build_index(hdl, SA_BONUS)); if (hdl->sa_spill) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); if (!spilling) { /* * remove spill block that is no longer needed. */ dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; hdl->sa_spill_tab = NULL; VERIFY(0 == dmu_rm_spill(hdl->sa_os, sa_handle_object(hdl), tx)); } else { VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } } return (0); } static void sa_free_attr_table(sa_os_t *sa) { int i; if (sa->sa_attr_table == NULL) return; for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_name) kmem_free(sa->sa_attr_table[i].sa_name, strlen(sa->sa_attr_table[i].sa_name) + 1); } kmem_free(sa->sa_attr_table, sizeof (sa_attr_table_t) * sa->sa_num_attrs); sa->sa_attr_table = NULL; } static int sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) { sa_os_t *sa = os->os_sa; uint64_t sa_attr_count = 0; uint64_t sa_reg_count = 0; int error = 0; uint64_t attr_value; sa_attr_table_t *tb; zap_cursor_t zc; zap_attribute_t za; int registered_count = 0; int i; dmu_objset_type_t ostype = dmu_objset_type(os); sa->sa_user_table = kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); if (sa->sa_reg_attr_obj != 0) { error = zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count); /* * Make sure we retrieved a count and that it isn't zero */ if (error || (error == 0 && sa_attr_count == 0)) { if (error == 0) error = SET_ERROR(EINVAL); goto bail; } sa_reg_count = sa_attr_count; } if (ostype == DMU_OST_ZFS && sa_attr_count == 0) sa_attr_count += sa_legacy_attr_count; /* Allocate attribute numbers for attributes that aren't registered */ for (i = 0; i != count; i++) { boolean_t found = B_FALSE; int j; if (ostype == DMU_OST_ZFS) { for (j = 0; j != sa_legacy_attr_count; j++) { if (strcmp(reg_attrs[i].sa_name, sa_legacy_attrs[j].sa_name) == 0) { sa->sa_user_table[i] = sa_legacy_attrs[j].sa_attr; found = B_TRUE; } } } if (found) continue; if (sa->sa_reg_attr_obj) error = zap_lookup(os, sa->sa_reg_attr_obj, reg_attrs[i].sa_name, 8, 1, &attr_value); else error = SET_ERROR(ENOENT); switch (error) { case ENOENT: sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; sa_attr_count++; break; case 0: sa->sa_user_table[i] = ATTR_NUM(attr_value); break; default: goto bail; } } sa->sa_num_attrs = sa_attr_count; tb = sa->sa_attr_table = kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); /* * Attribute table is constructed from requested attribute list, * previously foreign registered attributes, and also the legacy * ZPL set of attributes. */ if (sa->sa_reg_attr_obj) { for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t value; value = za.za_first_integer; registered_count++; tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); tb[ATTR_NUM(value)].sa_registered = B_TRUE; if (tb[ATTR_NUM(value)].sa_name) { continue; } tb[ATTR_NUM(value)].sa_name = kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, strlen(za.za_name) +1); } zap_cursor_fini(&zc); /* * Make sure we processed the correct number of registered * attributes */ if (registered_count != sa_reg_count) { ASSERT(error != 0); goto bail; } } if (ostype == DMU_OST_ZFS) { for (i = 0; i != sa_legacy_attr_count; i++) { if (tb[i].sa_name) continue; tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; tb[i].sa_length = sa_legacy_attrs[i].sa_length; tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; tb[i].sa_registered = B_FALSE; tb[i].sa_name = kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, KM_SLEEP); (void) strlcpy(tb[i].sa_name, sa_legacy_attrs[i].sa_name, strlen(sa_legacy_attrs[i].sa_name) + 1); } } for (i = 0; i != count; i++) { sa_attr_type_t attr_id; attr_id = sa->sa_user_table[i]; if (tb[attr_id].sa_name) continue; tb[attr_id].sa_length = reg_attrs[i].sa_length; tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; tb[attr_id].sa_attr = attr_id; tb[attr_id].sa_name = kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, strlen(reg_attrs[i].sa_name) + 1); } sa->sa_need_attr_registration = (sa_attr_count != registered_count); return (0); bail: kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); sa->sa_user_table = NULL; sa_free_attr_table(sa); return ((error != 0) ? error : EINVAL); } int sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, sa_attr_type_t **user_table) { zap_cursor_t zc; zap_attribute_t za; sa_os_t *sa; dmu_objset_type_t ostype = dmu_objset_type(os); sa_attr_type_t *tb; int error; mutex_enter(&os->os_user_ptr_lock); if (os->os_sa) { mutex_enter(&os->os_sa->sa_lock); mutex_exit(&os->os_user_ptr_lock); tb = os->os_sa->sa_user_table; mutex_exit(&os->os_sa->sa_lock); *user_table = tb; return (0); } sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); sa->sa_master_obj = sa_obj; os->os_sa = sa; mutex_enter(&sa->sa_lock); mutex_exit(&os->os_user_ptr_lock); avl_create(&sa->sa_layout_num_tree, layout_num_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); if (sa_obj) { error = zap_lookup(os, sa_obj, SA_LAYOUTS, 8, 1, &sa->sa_layout_attr_obj); if (error != 0 && error != ENOENT) goto fail; error = zap_lookup(os, sa_obj, SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj); if (error != 0 && error != ENOENT) goto fail; } if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) goto fail; if (sa->sa_layout_attr_obj != 0) { uint64_t layout_count; error = zap_count(os, sa->sa_layout_attr_obj, &layout_count); /* * Layout number count should be > 0 */ if (error || (error == 0 && layout_count == 0)) { if (error == 0) error = SET_ERROR(EINVAL); goto fail; } for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { sa_attr_type_t *lot_attrs; uint64_t lot_num; lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * za.za_num_integers, KM_SLEEP); if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, za.za_name, 2, za.za_num_integers, lot_attrs))) != 0) { kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); break; } VERIFY(ddi_strtoull(za.za_name, NULL, 10, (unsigned long long *)&lot_num) == 0); (void) sa_add_layout_entry(os, lot_attrs, za.za_num_integers, lot_num, sa_layout_info_hash(lot_attrs, za.za_num_integers), B_FALSE, NULL); kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); } zap_cursor_fini(&zc); /* * Make sure layout count matches number of entries added * to AVL tree */ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { ASSERT(error != 0); goto fail; } } /* Add special layout number for old ZNODES */ if (ostype == DMU_OST_ZFS) { (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, sa_legacy_attr_count, 0, sa_layout_info_hash(sa_legacy_zpl_layout, sa_legacy_attr_count), B_FALSE, NULL); (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, 0, B_FALSE, NULL); } *user_table = os->os_sa->sa_user_table; mutex_exit(&sa->sa_lock); return (0); fail: os->os_sa = NULL; sa_free_attr_table(sa); if (sa->sa_user_table) kmem_free(sa->sa_user_table, sa->sa_user_table_sz); mutex_exit(&sa->sa_lock); avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); return ((error == ECKSUM) ? EIO : error); } void sa_tear_down(objset_t *os) { sa_os_t *sa = os->os_sa; sa_lot_t *layout; void *cookie; kmem_free(sa->sa_user_table, sa->sa_user_table_sz); /* Free up attr table */ sa_free_attr_table(sa); cookie = NULL; while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) { sa_idx_tab_t *tab; while (tab = list_head(&layout->lot_idx_tab)) { ASSERT(refcount_count(&tab->sa_refcount)); sa_idx_tab_rele(os, tab); } } cookie = NULL; while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) { kmem_free(layout->lot_attrs, sizeof (sa_attr_type_t) * layout->lot_attr_count); kmem_free(layout, sizeof (sa_lot_t)); } avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); os->os_sa = NULL; } void sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t var_length, void *userp) { sa_idx_tab_t *idx_tab = userp; if (var_length) { ASSERT(idx_tab->sa_variable_lengths); idx_tab->sa_variable_lengths[length_idx] = length; } TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); } static void sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, sa_iterfunc_t func, sa_lot_t *tab, void *userp) { void *data_start; sa_lot_t *tb = tab; sa_lot_t search; avl_index_t loc; sa_os_t *sa = os->os_sa; int i; uint16_t *length_start = NULL; uint8_t length_idx = 0; if (tab == NULL) { search.lot_num = SA_LAYOUT_NUM(hdr, type); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); ASSERT(tb); } if (IS_SA_BONUSTYPE(type)) { data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + offsetof(sa_hdr_phys_t, sa_lengths) + (sizeof (uint16_t) * tb->lot_var_sizes)), 8); length_start = hdr->sa_lengths; } else { data_start = hdr; } for (i = 0; i != tb->lot_attr_count; i++) { int attr_length, reg_length; uint8_t idx_len; reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; if (reg_length) { attr_length = reg_length; idx_len = 0; } else { attr_length = length_start[length_idx]; idx_len = length_idx++; } func(hdr, data_start, tb->lot_attrs[i], attr_length, idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp); data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + attr_length), 8); } } /*ARGSUSED*/ void sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t variable_length, void *userp) { sa_handle_t *hdl = userp; sa_os_t *sa = hdl->sa_os->os_sa; sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); } void sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); dmu_buf_impl_t *db; sa_os_t *sa = hdl->sa_os->os_sa; int num_lengths = 1; int i; ASSERT(MUTEX_HELD(&sa->sa_lock)); if (sa_hdr_phys->sa_magic == SA_MAGIC) return; db = SA_GET_DB(hdl, buftype); if (buftype == SA_SPILL) { arc_release(db->db_buf, NULL); arc_buf_thaw(db->db_buf); } sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); /* * Determine number of variable lenghts in header * The standard 8 byte header has one for free and a * 16 byte header would have 4 + 1; */ if (SA_HDR_SIZE(sa_hdr_phys) > 8) num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; for (i = 0; i != num_lengths; i++) sa_hdr_phys->sa_lengths[i] = BSWAP_16(sa_hdr_phys->sa_lengths[i]); sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, sa_byteswap_cb, NULL, hdl); if (buftype == SA_SPILL) arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); } static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys; dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); sa_os_t *sa = hdl->sa_os->os_sa; sa_idx_tab_t *idx_tab; sa_hdr_phys = SA_GET_HDR(hdl, buftype); mutex_enter(&sa->sa_lock); /* Do we need to byteswap? */ /* only check if not old znode */ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && sa_hdr_phys->sa_magic != 0) { VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC); sa_byteswap(hdl, buftype); } idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); if (buftype == SA_BONUS) hdl->sa_bonus_tab = idx_tab; else hdl->sa_spill_tab = idx_tab; mutex_exit(&sa->sa_lock); return (0); } /*ARGSUSED*/ static void sa_evict_sync(void *dbu) { panic("evicting sa dbuf\n"); } static void sa_idx_tab_rele(objset_t *os, void *arg) { sa_os_t *sa = os->os_sa; sa_idx_tab_t *idx_tab = arg; if (idx_tab == NULL) return; mutex_enter(&sa->sa_lock); if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); if (idx_tab->sa_variable_lengths) kmem_free(idx_tab->sa_variable_lengths, sizeof (uint16_t) * idx_tab->sa_layout->lot_var_sizes); refcount_destroy(&idx_tab->sa_refcount); kmem_free(idx_tab->sa_idx_tab, sizeof (uint32_t) * sa->sa_num_attrs); kmem_free(idx_tab, sizeof (sa_idx_tab_t)); } mutex_exit(&sa->sa_lock); } static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) { sa_os_t *sa = os->os_sa; ASSERT(MUTEX_HELD(&sa->sa_lock)); (void) refcount_add(&idx_tab->sa_refcount, NULL); } void sa_handle_destroy(sa_handle_t *hdl) { dmu_buf_t *db = hdl->sa_bonus; mutex_enter(&hdl->sa_lock); (void) dmu_buf_remove_user(db, &hdl->sa_dbu); if (hdl->sa_bonus_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); if (hdl->sa_spill_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); dmu_buf_rele(hdl->sa_bonus, NULL); if (hdl->sa_spill) dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); mutex_exit(&hdl->sa_lock); kmem_cache_free(sa_cache, hdl); } int sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { int error = 0; dmu_object_info_t doi; sa_handle_t *handle = NULL; #ifdef ZFS_DEBUG dmu_object_info_from_db(db, &doi); ASSERT(doi.doi_bonus_type == DMU_OT_SA || doi.doi_bonus_type == DMU_OT_ZNODE); #endif /* find handle, if it exists */ /* if one doesn't exist then create a new one, and initialize it */ if (hdl_type == SA_HDL_SHARED) handle = dmu_buf_get_user(db); if (handle == NULL) { sa_handle_t *winner = NULL; handle = kmem_cache_alloc(sa_cache, KM_SLEEP); handle->sa_dbu.dbu_evict_func_sync = NULL; handle->sa_dbu.dbu_evict_func_async = NULL; handle->sa_userp = userp; handle->sa_bonus = db; handle->sa_os = os; handle->sa_spill = NULL; handle->sa_bonus_tab = NULL; handle->sa_spill_tab = NULL; error = sa_build_index(handle, SA_BONUS); if (hdl_type == SA_HDL_SHARED) { dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL, NULL); winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); } if (winner != NULL) { kmem_cache_free(sa_cache, handle); handle = winner; } } *handlepp = handle; return (error); } int sa_handle_get(objset_t *objset, uint64_t objid, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { dmu_buf_t *db; int error; if (error = dmu_bonus_hold(objset, objid, NULL, &db)) return (error); return (sa_handle_get_from_db(objset, db, userp, hdl_type, handlepp)); } int sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) { return (dmu_bonus_hold(objset, obj_num, tag, db)); } void sa_buf_rele(dmu_buf_t *db, void *tag) { dmu_buf_rele(db, tag); } int sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); } int sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) { int error; sa_bulk_attr_t bulk; bulk.sa_attr = attr; bulk.sa_data = buf; bulk.sa_length = buflen; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_lookup_impl(hdl, &bulk, 1); mutex_exit(&hdl->sa_lock); return (error); } #ifdef _KERNEL int sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) { int error; sa_bulk_attr_t bulk; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, uio->uio_resid), UIO_READ, uio); } mutex_exit(&hdl->sa_lock); return (error); } #endif static sa_idx_tab_t * sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr) { sa_idx_tab_t *idx_tab; sa_os_t *sa = os->os_sa; sa_lot_t *tb, search; avl_index_t loc; /* * Deterimine layout number. If SA node and header == 0 then * force the index table to the dummy "1" empty layout. * * The layout number would only be zero for a newly created file * that has not added any attributes yet, or with crypto enabled which * doesn't write any attributes to the bonus buffer. */ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); /* Verify header size is consistent with layout information */ ASSERT(tb); ASSERT(IS_SA_BONUSTYPE(bonustype) && SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) || (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); /* * See if any of the already existing TOC entries can be reused? */ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { boolean_t valid_idx = B_TRUE; int i; if (tb->lot_var_sizes != 0 && idx_tab->sa_variable_lengths != NULL) { for (i = 0; i != tb->lot_var_sizes; i++) { if (hdr->sa_lengths[i] != idx_tab->sa_variable_lengths[i]) { valid_idx = B_FALSE; break; } } } if (valid_idx) { sa_idx_tab_hold(os, idx_tab); return (idx_tab); } } /* No such luck, create a new entry */ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); idx_tab->sa_idx_tab = kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); idx_tab->sa_layout = tb; refcount_create(&idx_tab->sa_refcount); if (tb->lot_var_sizes) idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * tb->lot_var_sizes, KM_SLEEP); sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, tb, idx_tab); sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ sa_idx_tab_hold(os, idx_tab); /* one for layout */ list_insert_tail(&tb->lot_idx_tab, idx_tab); return (idx_tab); } void sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, boolean_t start, void *userdata) { ASSERT(start); *dataptr = userdata; *len = total_len; } static void sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) { uint64_t attr_value = 0; sa_os_t *sa = hdl->sa_os->os_sa; sa_attr_table_t *tb = sa->sa_attr_table; int i; mutex_enter(&sa->sa_lock); if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { mutex_exit(&sa->sa_lock); return; } if (sa->sa_reg_attr_obj == 0) { sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, DMU_OT_SA_ATTR_REGISTRATION, sa->sa_master_obj, SA_REGISTRY, tx); } for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_registered) continue; ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, tb[i].sa_byteswap); VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, tb[i].sa_name, 8, 1, &attr_value, tx)); tb[i].sa_registered = B_TRUE; } sa->sa_need_attr_registration = B_FALSE; mutex_exit(&sa->sa_lock); } /* * Replace all attributes with attributes specified in template. * If dnode had a spill buffer then those attributes will be * also be replaced, possibly with just an empty spill block * * This interface is intended to only be used for bulk adding of * attributes for a new file. It will also be used by the ZPL * when converting and old formatted znode to native SA support. */ int sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); } int sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_replace_all_by_template_locked(hdl, attr_desc, attr_count, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Add/remove a single attribute or replace a variable-sized attribute value * with a value of a different size, and then rewrite the entire set * of attributes. * Same-length attribute value replacement (including fixed-length attributes) * is handled more efficiently by the upper layers. */ static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; int bonus_data_size = 0; int spill_data_size = 0; int spill_attr_count = 0; int error; uint16_t length, reg_length; int i, j, k, length_idx; sa_hdr_phys_t *hdr; sa_idx_tab_t *idx_tab; int attr_count; int count; ASSERT(MUTEX_HELD(&hdl->sa_lock)); /* First make of copy of the old data */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); bcopy(hdl->sa_bonus->db_data, old_data[0], hdl->sa_bonus->db_size); bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; } else { old_data[0] = NULL; } DB_DNODE_EXIT(db); /* Bring spill buffer online if it isn't currently */ if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP); bcopy(hdl->sa_spill->db_data, old_data[1], hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; } else if (error && error != ENOENT) { if (old_data[0]) kmem_free(old_data[0], bonus_data_size); return (error); } else { old_data[1] = NULL; } /* build descriptor of all attributes */ attr_count = bonus_attr_count + spill_attr_count; if (action == SA_ADD) attr_count++; else if (action == SA_REMOVE) attr_count--; attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); /* * loop through bonus and spill buffer if it exists, and * build up new attr_descriptor to reset the attributes */ k = j = 0; count = bonus_attr_count; hdr = SA_GET_HDR(hdl, SA_BONUS); idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); for (; k != 2; k++) { /* * Iterate over each attribute in layout. Fetch the * size of variable-length attributes needing rewrite * from sa_lengths[]. */ for (i = 0, length_idx = 0; i != count; i++) { sa_attr_type_t attr; attr = idx_tab->sa_layout->lot_attrs[i]; reg_length = SA_REGISTERED_LEN(sa, attr); if (reg_length == 0) { length = hdr->sa_lengths[length_idx]; length_idx++; } else { length = reg_length; } if (attr == newattr) { /* * There is nothing to do for SA_REMOVE, * so it is just skipped. */ if (action == SA_REMOVE) continue; /* * Duplicate attributes are not allowed, so the * action can not be SA_ADD here. */ ASSERT3S(action, ==, SA_REPLACE); /* * Only a variable-sized attribute can be * replaced here, and its size must be changing. */ ASSERT3U(reg_length, ==, 0); ASSERT3U(length, !=, buflen); SA_ADD_BULK_ATTR(attr_desc, j, attr, locator, datastart, buflen); } else { SA_ADD_BULK_ATTR(attr_desc, j, attr, NULL, (void *) (TOC_OFF(idx_tab->sa_idx_tab[attr]) + (uintptr_t)old_data[k]), length); } } if (k == 0 && hdl->sa_spill) { hdr = SA_GET_HDR(hdl, SA_SPILL); idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); count = spill_attr_count; } else { break; } } if (action == SA_ADD) { reg_length = SA_REGISTERED_LEN(sa, newattr); IMPLY(reg_length != 0, reg_length == buflen); SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, datastart, buflen); } ASSERT3U(j, ==, attr_count); error = sa_build_layouts(hdl, attr_desc, attr_count, tx); if (old_data[0]) kmem_free(old_data[0], bonus_data_size); if (old_data[1]) kmem_free(old_data[1], spill_data_size); kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); return (error); } static int sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, dmu_tx_t *tx) { int error; sa_os_t *sa = hdl->sa_os->os_sa; dmu_object_type_t bonustype; bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); /* sync out registration table if necessary */ if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) sa->sa_update_cb(hdl, tx); return (error); } /* * update or add new attribute */ int sa_update(sa_handle_t *hdl, sa_attr_type_t type, void *buf, uint32_t buflen, dmu_tx_t *tx) { int error; sa_bulk_attr_t bulk; bulk.sa_attr = type; bulk.sa_data_func = NULL; bulk.sa_length = buflen; bulk.sa_data = buf; mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, &bulk, 1, tx); mutex_exit(&hdl->sa_lock); return (error); } int sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr, uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx) { int error; sa_bulk_attr_t bulk; bulk.sa_attr = attr; bulk.sa_data = userdata; bulk.sa_data_func = locator; bulk.sa_length = buflen; mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, &bulk, 1, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Return size of an attribute */ int sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) { sa_bulk_attr_t bulk; int error; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { mutex_exit(&hdl->sa_lock); return (error); } *size = bulk.sa_size; mutex_exit(&hdl->sa_lock); return (0); } int sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_lookup_impl(hdl, attrs, count)); } int sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_lookup_locked(hdl, attrs, count); mutex_exit(&hdl->sa_lock); return (error); } int sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, attrs, count, tx); mutex_exit(&hdl->sa_lock); return (error); } int sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, NULL, 0, tx); mutex_exit(&hdl->sa_lock); return (error); } void sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) { dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); } void sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) { dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, blksize, nblocks); } void sa_set_userp(sa_handle_t *hdl, void *ptr) { hdl->sa_userp = ptr; } dmu_buf_t * sa_get_db(sa_handle_t *hdl) { return ((dmu_buf_t *)hdl->sa_bonus); } void * sa_get_userdata(sa_handle_t *hdl) { return (hdl->sa_userp); } void sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) { ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); os->os_sa->sa_update_cb = func; } void sa_register_update_callback(objset_t *os, sa_update_cb_t *func) { mutex_enter(&os->os_sa->sa_lock); sa_register_update_callback_locked(os, func); mutex_exit(&os->os_sa->sa_lock); } uint64_t sa_handle_object(sa_handle_t *hdl) { return (hdl->sa_bonus->db_object); } boolean_t sa_enabled(objset_t *os) { return (os->os_sa == NULL); } int sa_set_sa_object(objset_t *os, uint64_t sa_object) { sa_os_t *sa = os->os_sa; if (sa->sa_master_obj) return (1); sa->sa_master_obj = sa_object; return (0); } int sa_hdrsize(void *arg) { sa_hdr_phys_t *hdr = arg; return (SA_HDR_SIZE(hdr)); } void sa_handle_lock(sa_handle_t *hdl) { ASSERT(hdl); mutex_enter(&hdl->sa_lock); } void sa_handle_unlock(sa_handle_t *hdl) { ASSERT(hdl); mutex_exit(&hdl->sa_lock); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c (revision 351077) @@ -1,2373 +1,2374 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017 Datto Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include #if defined(__FreeBSD__) && defined(_KERNEL) #include #include #endif /* * SPA locking * * There are four basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * * This lock must be acquired to do any of the following: * * - Lookup a spa_t by name * - Add or remove a spa_t from the namespace * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices * - Held for the duration of create/destroy/import/export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by * definition they must have an existing reference, and will never need * to lookup a spa_t by name. * * spa_refcount (per-spa refcount_t protected by mutex) * * This reference count keep track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. * * spa_config_lock[] (per-spa array of rwlocks) * * This protects the spa_t from config changes, and must be held in * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount * * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock can be acquired directly and is globally visible. * * The namespace is manipulated using the following functions, all of which * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * * spa_add() Create a new spa_t in the namespace. * * spa_remove() Remove a spa_t from the namespace. This also * frees up any memory associated with the spa_t. * * spa_next() Returns the next spa_t in the system, or the * first if NULL is passed. * * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * * spa_open_ref() Adds a reference to the given spa_t. Must be * called with spa_namespace_lock held if the * refcount is currently zero. * * spa_close() Remove a reference from the spa_t. This will * not free the spa_t or remove it from the * namespace. No locking is required. * * spa_refcount_zero() Returns true if the refcount is currently * zero. Must be called with spa_namespace_lock * held. * * The spa_config_lock[] is an array of rwlocks, ordered as follows: * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). * * To read the configuration, it suffices to hold one of these locks as reader. * To modify the configuration, you must hold all locks as writer. To modify * vdev state without altering the vdev tree's topology (e.g. online/offline), * you must hold SCL_STATE and SCL_ZIO as writer. * * We use these distinct config locks to avoid recursive lock entry. * For example, spa_sync() (which holds SCL_CONFIG as reader) induces * block allocations (SCL_ALLOC), which may require reading space maps * from disk (dmu_read() -> zio_read() -> SCL_ZIO). * * The spa config locks cannot be normal rwlocks because we need the * ability to hand off ownership. For example, SCL_ZIO is acquired * by the issuing thread and later released by an interrupt thread. * They do, however, obey the usual write-wanted semantics to prevent * writer (i.e. system administrator) starvation. * * The lock acquisition rules are as follows: * * SCL_CONFIG * Protects changes to the vdev tree topology, such as vdev * add/remove/attach/detach. Protects the dirty config list * (spa_config_dirty_list) and the set of spares and l2arc devices. * * SCL_STATE * Protects changes to pool state and vdev state, such as vdev * online/offline/fault/degrade/clear. Protects the dirty state list * (spa_state_dirty_list) and global pool state (spa_state). * * SCL_ALLOC * Protects changes to metaslab groups and classes. * Held as reader by metaslab_alloc() and metaslab_claim(). * * SCL_ZIO * Held by bp-level zios (those which have no io_vd upon entry) * to prevent changes to the vdev tree. The bp-level zio implicitly * protects all of its vdev child zios, which do not hold SCL_ZIO. * * SCL_FREE * Protects changes to metaslab groups and classes. * Held as reader by metaslab_free(). SCL_FREE is distinct from * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free * blocks in zio_done() while another i/o that holds either * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. * * In addition, the following rules apply: * * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. * The lock ordering is SCL_CONFIG > spa_props_lock. * * (b) I/O operations on leaf vdevs. For any zio operation that takes * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), * or zio_write_phys() -- the caller must ensure that the config cannot * cannot change in the interim, and that the vdev cannot be reopened. * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * * spa_vdev_enter() Acquire the namespace lock and the config lock * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O * to complete, sync the updated configs to the * cache, and release the namespace lock. * * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ static avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; static kcondvar_t spa_namespace_cv; static int spa_active_count; int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; int spa_mode_global; #ifdef ZFS_DEBUG /* * Everything except dprintf, spa, and indirect_remap is on by default * in debug builds. */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_INDIRECT_REMAP); #else int zfs_flags = 0; #endif /* * zfs_recover can be set to nonzero to attempt to recover from * otherwise-fatal errors, typically caused by on-disk corruption. When * set, calls to zfs_panic_recover() will turn into warning messages. * This should only be used as a last resort, as it typically results * in leaked space, or worse. */ boolean_t zfs_recover = B_FALSE; /* * If destroy encounters an EIO while reading metadata (e.g. indirect * blocks), space referenced by the missing metadata can not be freed. * Normally this causes the background destroy to become "stalled", as * it is unable to make forward progress. While in this stalled state, * all remaining space to free from the error-encountering filesystem is * "temporarily leaked". Set this flag to cause it to ignore the EIO, * permanently leak the space from indirect blocks that can not be read, * and continue to free everything else that it can. * * The default, "stalling" behavior is useful if the storage partially * fails (i.e. some but not all i/os fail), and then later recovers. In * this case, we will be able to continue pool operations while it is * partially failed, and when it recovers, we can continue to free the * space, with no leaks. However, note that this case is actually * fairly rare. * * Typically pools either (a) fail completely (but perhaps temporarily, * e.g. a top-level vdev going offline), or (b) have localized, * permanent errors (e.g. disk returns the wrong data due to bit flip or * firmware bug). In case (a), this setting does not matter because the * pool will be suspended and the sync thread will not be able to make * forward progress regardless. In case (b), because the error is * permanent, the best we can do is leak the minimum amount of space, * which is what setting this flag will do. Therefore, it is reasonable * for this flag to normally be set, but we chose the more conservative * approach of not setting it, so that there is no possibility of * leaking space in the "partial temporary" failure case. */ boolean_t zfs_free_leak_on_eio = B_FALSE; /* * Expiration time in milliseconds. This value has two meanings. First it is * used to determine when the spa_deadman() logic should fire. By default the * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. * Secondly, the value determines if an I/O is considered "hung". Any I/O that * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in a system panic. */ uint64_t zfs_deadman_synctime_ms = 1000000ULL; /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ uint64_t zfs_deadman_checktime_ms = 5000ULL; /* * Default value of -1 for zfs_deadman_enabled is resolved in * zfs_deadman_init() */ int zfs_deadman_enabled = -1; /* * The worst case is single-sector max-parity RAID-Z blocks, in which * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) * times the size; so just assume that. Add to this the fact that * we can have up to 3 DVAs per bp, and one more factor of 2 because * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, * the worst case is: * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ int spa_asize_inflation = 24; #if defined(__FreeBSD__) && defined(_KERNEL) SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0, "Try to recover from otherwise-fatal errors."); static int sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) { int err, val; val = zfs_flags; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); /* * ZFS_DEBUG_MODIFY must be enabled prior to boot so all * arc buffers in the system have the necessary additional * checksum data. However, it is safe to disable at any * time. */ if (!(zfs_flags & ZFS_DEBUG_MODIFY)) val &= ~ZFS_DEBUG_MODIFY; zfs_flags = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RWTUN, &zfs_deadman_synctime_ms, 0, "Stalled ZFS I/O expiration time in milliseconds"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RWTUN, &zfs_deadman_checktime_ms, 0, "Period of checks for stalled ZFS I/O in milliseconds"); SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RWTUN, &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O"); SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN, &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes"); #endif #ifndef illumos #ifdef _KERNEL static void zfs_deadman_init() { /* * If we are not i386 or amd64 or in a virtual machine, * disable ZFS deadman thread by default */ if (zfs_deadman_enabled == -1) { #if defined(__amd64__) || defined(__i386__) zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0; #else zfs_deadman_enabled = 0; #endif } } #endif /* _KERNEL */ #endif /* !illumos */ /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in * the pool to be consumed. This ensures that we don't run the pool * completely out of space, due to unaccounted changes (e.g. to the MOS). * It also limits the worst-case time to allocate space. If we have * less than this amount of free space, most ZPL operations (e.g. write, * create) will return ENOSPC. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half * the slop space is free. Typically, once the pool has less than the slop * space free, the user will use these operations to free up space in the pool. * These are the operations that call dsl_pool_adjustedsize() with the netfree * argument set to TRUE. * * Operations that are almost guaranteed to free up space in the absence of * a pool checkpoint can use up to three quarters of the slop space * (e.g zfs destroy). * * A very restricted set of operations are always permitted, regardless of * the amount of free space. These are the operations that call * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net * increase in the amount of space used, it is possible to run the pool * completely out of space, causing it to be permanently read-only. * * Note that on very small pools, the slop space will be larger than * 3.2%, in an effort to have it be at least spa_min_slop (128MB), * but we never allow it to be more than half the pool size. * * See also the comments in zfs_space_check_t. */ int spa_slop_shift = 5; SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN, &spa_slop_shift, 0, "Shift value of reserved space (1/(2^spa_slop_shift))."); uint64_t spa_min_slop = 128 * 1024 * 1024; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN, &spa_min_slop, 0, "Minimal value of reserved space"); int spa_allocators = 4; SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_allocators, CTLFLAG_RWTUN, &spa_allocators, 0, "Number of allocators per metaslab group"); /*PRINTFLIKE2*/ void spa_load_failed(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); } /*PRINTFLIKE2*/ void spa_load_note(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); } /* * ========================================================================== * SPA config locking * ========================================================================== */ static void spa_config_lock_init(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); refcount_create_untracked(&scl->scl_count); scl->scl_writer = NULL; scl->scl_write_wanted = 0; } } static void spa_config_lock_destroy(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); refcount_destroy(&scl->scl_count); ASSERT(scl->scl_writer == NULL); ASSERT(scl->scl_write_wanted == 0); } } int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { if (scl->scl_writer || scl->scl_write_wanted) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } } else { ASSERT(scl->scl_writer != curthread); if (!refcount_is_zero(&scl->scl_count)) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } scl->scl_writer = curthread; } (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } return (1); } void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) { int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || scl->scl_write_wanted) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (!refcount_is_zero(&scl->scl_count)) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } ASSERT3U(wlocks_held, <=, locks); } void spa_config_exit(spa_t *spa, int locks, void *tag) { for (int i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); ASSERT(!refcount_is_zero(&scl->scl_count)); if (refcount_remove(&scl->scl_count, tag) == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ cv_broadcast(&scl->scl_cv); } mutex_exit(&scl->scl_lock); } } int spa_config_held(spa_t *spa, int locks, krw_t rw) { int locks_held = 0; for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } return (locks_held); } /* * ========================================================================== * SPA namespace functions * ========================================================================== */ /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. */ spa_t * spa_lookup(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* * If it's a full dataset name, figure out the pool name and * just use that. */ cp = strpbrk(search.spa_name, "/@#"); if (cp != NULL) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); return (spa); } /* * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. * If the zfs_deadman_enabled flag is set then it inspects all vdev queues * looking for potentially hung I/Os. */ static void spa_deadman(void *arg, int pending) { spa_t *spa = arg; /* * Disable the deadman timer if the pool is suspended. */ if (spa_suspended(spa)) { #ifdef illumos VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); #else /* Nothing. just don't schedule any future callouts. */ #endif return; } zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", (gethrtime() - spa->spa_sync_starttime) / NANOSEC, ++spa->spa_deadman_calls); if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev); #ifdef __FreeBSD__ #ifdef _KERNEL callout_schedule(&spa->spa_deadman_cycid, hz * zfs_deadman_checktime_ms / MILLISEC); #endif #endif } #if defined(__FreeBSD__) && defined(_KERNEL) static void spa_deadman_timeout(void *arg) { spa_t *spa = arg; taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task); } #endif /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. */ spa_t * spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; #ifdef illumos cyc_handler_t hdlr; cyc_time_t when; #endif ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; spa->spa_trust_config = B_TRUE; #ifdef illumos hdlr.cyh_func = spa_deadman; hdlr.cyh_arg = spa; hdlr.cyh_level = CY_LOW_LEVEL; #endif spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); #ifdef illumos /* * This determines how often we need to check for hung I/Os after * the cyclic has already fired. Since checking for hung I/Os is * an expensive operation we don't want to check too frequently. * Instead wait for 5 seconds before checking again. */ when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms); when.cyt_when = CY_INFINITY; mutex_enter(&cpu_lock); spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); mutex_exit(&cpu_lock); #else /* !illumos */ #ifdef _KERNEL /* * callout(9) does not provide a way to initialize a callout with * a function and an argument, so we use callout_reset() to schedule * the callout in the very distant future. Even if that event ever * fires, it should be okayas we won't have any active zio-s. * But normally spa_sync() will reschedule the callout with a proper * timeout. * callout(9) does not allow the callback function to sleep but * vdev_deadman() needs to acquire vq_lock and illumos mutexes are * emulated using sx(9). For this reason spa_deadman_timeout() * will schedule spa_deadman() as task on a taskqueue that allows * sleeping. */ TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa); callout_init(&spa->spa_deadman_cycid, 1); callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0, spa_deadman_timeout, spa, 0); #endif #endif refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); avl_add(&spa_namespace_avl, spa); /* * Set the alternate root, if there is one. */ if (altroot) { spa->spa_root = spa_strdup(altroot); spa_active_count++; } spa->spa_alloc_count = spa_allocators; spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * sizeof (kmutex_t), KM_SLEEP); spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * sizeof (avl_tree_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_alloc_node)); } /* * Every pool starts with the default cachefile */ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (config != NULL) { nvlist_t *features; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) == 0) { VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); } if (spa->spa_label_features == NULL) { VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, KM_SLEEP) == 0); } spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; /* * As a pool is being created, treat all features as disabled by * setting SPA_FEATURE_DISABLED for all entries in the feature * refcount cache. */ for (int i = 0; i < SPA_FEATURES; i++) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } return (spa); } /* * Removes a spa_t from the namespace, freeing up any memory used. Requires * spa_namespace_lock. This is called only after the spa_t has been closed and * deactivated. */ void spa_remove(spa_t *spa) { spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0); nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); if (spa->spa_root) { spa_strfree(spa->spa_root); spa_active_count--; } while ((dp = list_head(&spa->spa_config_list)) != NULL) { list_remove(&spa->spa_config_list, dp); if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); } for (int i = 0; i < spa->spa_alloc_count; i++) { avl_destroy(&spa->spa_alloc_trees[i]); mutex_destroy(&spa->spa_alloc_locks[i]); } kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * sizeof (kmutex_t)); kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * sizeof (avl_tree_t)); list_destroy(&spa->spa_config_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); nvlist_free(spa->spa_feat_stats); spa_config_set(spa, NULL); #ifdef illumos mutex_enter(&cpu_lock); if (spa->spa_deadman_cycid != CYCLIC_NONE) cyclic_remove(spa->spa_deadman_cycid); mutex_exit(&cpu_lock); spa->spa_deadman_cycid = CYCLIC_NONE; #else /* !illumos */ #ifdef _KERNEL callout_drain(&spa->spa_deadman_cycid); taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task); #endif #endif refcount_destroy(&spa->spa_refcount); spa_config_lock_destroy(spa); for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); zio_checksum_templates_free(spa); cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); mutex_destroy(&spa->spa_cksum_tmpls_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); kmem_free(spa, sizeof (spa_t)); } /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. */ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (prev) return (AVL_NEXT(&spa_namespace_avl, prev)); else return (avl_first(&spa_namespace_avl)); } /* * ========================================================================== * SPA refcount functions * ========================================================================== */ /* * Add a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_open_ref(spa_t *spa, void *tag) { ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_close(spa_t *spa, void *tag) { ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) refcount_remove(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t held by a dsl dir that is * being asynchronously released. Async releases occur from a taskq * performing eviction of dsl datasets and dirs. The namespace lock * isn't held and the hold by the object being evicted may contribute to * spa_minref (e.g. dataset or directory released during pool export), * so the asserts in spa_close() do not apply. */ void spa_async_close(spa_t *spa, void *tag) { (void) refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held. We really compare against spa_minref, which is the * number of references acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); return (refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== * SPA spare and l2cache tracking * ========================================================================== */ /* * Hot spares and cache devices are tracked using the same code below, * for 'auxiliary' devices. */ typedef struct spa_aux { uint64_t aux_guid; uint64_t aux_pool; avl_node_t aux_avl; int aux_count; } spa_aux_t; static inline int spa_aux_compare(const void *a, const void *b) { const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; return (AVL_CMP(sa->aux_guid, sb->aux_guid)); } void spa_aux_add(vdev_t *vd, avl_tree_t *avl) { avl_index_t where; spa_aux_t search; spa_aux_t *aux; search.aux_guid = vd->vdev_guid; if ((aux = avl_find(avl, &search, &where)) != NULL) { aux->aux_count++; } else { aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); aux->aux_guid = vd->vdev_guid; aux->aux_count = 1; avl_insert(avl, aux, where); } } void spa_aux_remove(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search; spa_aux_t *aux; avl_index_t where; search.aux_guid = vd->vdev_guid; aux = avl_find(avl, &search, &where); ASSERT(aux != NULL); if (--aux->aux_count == 0) { avl_remove(avl, aux); kmem_free(aux, sizeof (spa_aux_t)); } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { aux->aux_pool = 0ULL; } } boolean_t spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) { spa_aux_t search, *found; search.aux_guid = guid; found = avl_find(avl, &search, NULL); if (pool) { if (found) *pool = found->aux_pool; else *pool = 0ULL; } if (refcnt) { if (found) *refcnt = found->aux_count; else *refcnt = 0; } return (found != NULL); } void spa_aux_activate(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search, *found; avl_index_t where; search.aux_guid = vd->vdev_guid; found = avl_find(avl, &search, &where); ASSERT(found != NULL); ASSERT(found->aux_pool == 0ULL); found->aux_pool = spa_guid(vd->vdev_spa); } /* * Spares are tracked globally due to the following constraints: * - * - A spare may be part of multiple pools. - * - A spare may be added to a pool even if it's actively in use within + * - A spare may be part of multiple pools. + * - A spare may be added to a pool even if it's actively in use within * another pool. - * - A spare in use in any pool can only be the source of a replacement if + * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference * counted AVL tree. When a vdev is added as a spare, or used as a replacement * spare, then we bump the reference count in the AVL tree. In addition, we set * the 'vdev_isspare' member to indicate that the device is a spare (active or * inactive). When a spare is made active (used to replace a device in the * pool), we also keep track of which pool its been made a part of. * * The 'spa_spare_lock' protects the AVL tree. These functions are normally * called under the spa_namespace lock as part of vdev reconfiguration. The * separate spare lock exists for the status query path, which does not need to * be completely consistent with respect to other vdev configuration changes. */ static int spa_spare_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) { boolean_t found; mutex_enter(&spa_spare_lock); found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); mutex_exit(&spa_spare_lock); return (found); } void spa_spare_activate(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_activate(vd, &spa_spare_avl); mutex_exit(&spa_spare_lock); } /* * Level 2 ARC devices are tracked globally for the same reasons as spares. * Cache devices currently only support one pool per cache device, and so * for these devices the aux reference count is currently unused beyond 1. */ static int spa_l2cache_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_l2cache_add(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(!vd->vdev_isl2cache); spa_aux_add(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_TRUE; mutex_exit(&spa_l2cache_lock); } void spa_l2cache_remove(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_remove(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_FALSE; mutex_exit(&spa_l2cache_lock); } boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool) { boolean_t found; mutex_enter(&spa_l2cache_lock); found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); return (found); } void spa_l2cache_activate(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_activate(vd, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); } /* * ========================================================================== * SPA vdev locking * ========================================================================== */ /* * Lock the given spa_t for the purpose of adding or removing a vdev. * Grabs the global spa_namespace_lock plus the spa config lock for writing. * It returns the next transaction group for the spa_t. */ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); return (spa_vdev_config_enter(spa)); } /* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while * keeping the spa_namespace_lock held. */ uint64_t spa_vdev_config_enter(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); return (spa_last_synced_txg(spa) + 1); } /* * Used in combination with spa_vdev_config_enter() to allow the syncing * of multiple transactions without releasing the spa_namespace_lock. */ void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); int config_changed = B_FALSE; ASSERT(txg > spa_last_synced_txg(spa)); spa->spa_pending_vdev = NULL; /* * Reassess the DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; spa->spa_config_generation++; } /* * Verify the metaslab classes. */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); spa_config_exit(spa, SCL_ALL, spa); /* * Panic the system if the specified tag requires it. This * is useful for ensuring that configurations are updated * transactionally. */ if (zio_injection_enabled) zio_handle_panic_injection(spa, tag, 0); /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. * This allows us to use the txg as the generation number. */ if (error == 0) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); if (vd->vdev_ops->vdev_op_leaf) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); mutex_exit(&vd->vdev_initialize_lock); } spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_ALL, spa); } /* * If the config changed, update the config cache. */ if (config_changed) spa_write_cachefile(spa, B_FALSE, B_TRUE); } /* * Unlock the spa_t after adding or removing a vdev. Besides undoing the * locking of spa_vdev_enter(), we also want make sure the transactions have * synced to disk, and then update the global configuration cache with the new * information. */ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Lock the given spa_t for the purpose of changing vdev state. */ void spa_vdev_state_enter(spa_t *spa, int oplocks) { int locks = SCL_STATE_ALL | oplocks; /* * Root pools may need to read of the underlying devfs filesystem * when opening up a vdev. Unfortunately if we're holding the * SCL_ZIO lock it will result in a deadlock when we try to issue * the read from the root filesystem. Instead we "prefetch" * the associated vnodes that we need prior to opening the * underlying devices and cache them so that we can prevent * any I/O when we are doing the actual open. */ if (spa_is_root(spa)) { int low = locks & ~(SCL_ZIO - 1); int high = locks & ~low; spa_config_enter(spa, high, spa, RW_WRITER); vdev_hold(spa->spa_root_vdev); spa_config_enter(spa, low, spa, RW_WRITER); } else { spa_config_enter(spa, locks, spa, RW_WRITER); } spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { boolean_t config_changed = B_FALSE; if (vd != NULL || error == 0) vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, 0, 0, B_FALSE); if (vd != NULL) { vdev_state_dirty(vd->vdev_top); config_changed = B_TRUE; spa->spa_config_generation++; } if (spa_is_root(spa)) vdev_rele(spa->spa_root_vdev); ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); spa_config_exit(spa, spa->spa_vdev_locks, spa); /* * If anything changed, wait for it to sync. This ensures that, * from the system administrator's perspective, zpool(1M) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. */ if (vd != NULL) txg_wait_synced(spa->spa_dsl_pool, 0); /* * If the config changed, update the config cache. */ if (config_changed) { mutex_enter(&spa_namespace_lock); spa_write_cachefile(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); } return (error); } /* * ========================================================================== * Miscellaneous functions * ========================================================================== */ void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) { if (!nvlist_exists(spa->spa_label_features, feature)) { fnvlist_add_boolean(spa->spa_label_features, feature); /* * When we are creating the pool (tx_txg==TXG_INITIAL), we can't * dirty the vdev config because lock SCL_CONFIG is not held. * Thankfully, in this case we don't need to dirty the config * because it will be written out anyway when we finish * creating the pool. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); } } void spa_deactivate_mos_feature(spa_t *spa, const char *feature) { if (nvlist_remove_all(spa->spa_label_features, feature) == 0) vdev_config_dirty(spa->spa_root_vdev); } /* * Return the spa_t associated with given pool_guid, if it exists. If * device_guid is non-zero, determine whether the pool exists *and* contains * a device with the specified device_guid. */ spa_t * spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) continue; if (spa->spa_root_vdev == NULL) continue; if (spa_guid(spa) == pool_guid) { if (device_guid == 0) break; if (vdev_lookup_by_guid(spa->spa_root_vdev, device_guid) != NULL) break; /* * Check any devices we may be in the process of adding. */ if (spa->spa_pending_vdev) { if (vdev_lookup_by_guid(spa->spa_pending_vdev, device_guid) != NULL) break; } } } return (spa); } /* * Determine whether a pool with the given pool_guid exists. */ boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { return (spa_by_guid(pool_guid, device_guid) != NULL); } char * spa_strdup(const char *s) { size_t len; char *new; len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); bcopy(s, new, len); new[len] = '\0'; return (new); } void spa_strfree(char *s) { kmem_free(s, strlen(s) + 1); } uint64_t spa_get_random(uint64_t range) { uint64_t r; ASSERT(range != 0); (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); return (r % range); } uint64_t spa_generate_guid(spa_t *spa) { uint64_t guid = spa_get_random(-1ULL); if (spa != NULL) { while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) guid = spa_get_random(-1ULL); } else { while (guid == 0 || spa_guid_exists(guid, 0)) guid = spa_get_random(-1ULL); } return (guid); } void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; char *checksum = NULL; char *compress = NULL; if (bp != NULL) { if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); (void) snprintf(type, sizeof (type), "bswap %s %s", DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? "metadata" : "data", dmu_ot_byteswap[bswap].ob_name); } else { (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, sizeof (type)); } if (!BP_IS_EMBEDDED(bp)) { checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; } compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, compress); } void spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } spa_config_exit(spa, SCL_ALL, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } void zfs_panic_recover(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); va_end(adx); } /* * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexadecimal numbers that don't overflow. */ uint64_t zfs_strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; int digit; while ((c = *str) != '\0') { if (c >= '0' && c <= '9') digit = c - '0'; else if (c >= 'a' && c <= 'f') digit = 10 + c - 'a'; else break; val *= 16; val += digit; str++; } if (nptr) *nptr = (char *)str; return (val); } /* * ========================================================================== * Accessor functions * ========================================================================== */ boolean_t spa_shutting_down(spa_t *spa) { return (spa->spa_async_suspended); } dsl_pool_t * spa_get_dsl(spa_t *spa) { return (spa->spa_dsl_pool); } boolean_t spa_is_initializing(spa_t *spa) { return (spa->spa_is_initializing); } boolean_t spa_indirect_vdevs_loaded(spa_t *spa) { return (spa->spa_indirect_vdevs_loaded); } blkptr_t * spa_get_rootblkptr(spa_t *spa) { return (&spa->spa_ubsync.ub_rootbp); } void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) { spa->spa_uberblock.ub_rootbp = *bp; } void spa_altroot(spa_t *spa, char *buf, size_t buflen) { if (spa->spa_root == NULL) buf[0] = '\0'; else (void) strncpy(buf, spa->spa_root, buflen); } int spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); } char * spa_name(spa_t *spa) { return (spa->spa_name); } uint64_t spa_guid(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t guid; /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root * vdev. We stash the original pool guid in 'spa_config_guid' to handle * this case. */ if (spa->spa_root_vdev == NULL) return (spa->spa_config_guid); guid = spa->spa_last_synced_guid != 0 ? spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; /* * Return the most recently synced out guid unless we're * in syncing context. */ if (dp && dsl_pool_sync_context(dp)) return (spa->spa_root_vdev->vdev_guid); else return (guid); } uint64_t spa_load_guid(spa_t *spa) { /* * This is a GUID that exists solely as a reference for the * purposes of the arc. It is generated at load time, and * is never written to persistent storage. */ return (spa->spa_load_guid); } uint64_t spa_last_synced_txg(spa_t *spa) { return (spa->spa_ubsync.ub_txg); } uint64_t spa_first_txg(spa_t *spa) { return (spa->spa_first_txg); } uint64_t spa_syncing_txg(spa_t *spa) { return (spa->spa_syncing_txg); } /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. */ uint64_t spa_final_dirty_txg(spa_t *spa) { return (spa->spa_final_txg - TXG_DEFER_SIZE); } pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } spa_load_state_t spa_load_state(spa_t *spa) { return (spa->spa_load_state); } uint64_t spa_freeze_txg(spa_t *spa) { return (spa->spa_freeze_txg); } /* ARGSUSED */ uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) { return (lsize * spa_asize_inflation); } /* * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%), * or at least 128MB, unless that would cause it to be more than half the * pool size. * * See the comment above spa_slop_shift for details. */ uint64_t spa_get_slop_space(spa_t *spa) { uint64_t space = spa_get_dspace(spa); return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); } uint64_t spa_get_dspace(spa_t *spa) { return (spa->spa_dspace); } uint64_t spa_get_checkpoint_space(spa_t *spa) { return (spa->spa_checkpoint_info.sci_dspace); } void spa_update_dspace(spa_t *spa) { spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + ddt_get_dedup_dspace(spa); if (spa->spa_vdev_removal != NULL) { /* * We can't allocate from the removing device, so * subtract its size. This prevents the DMU/DSL from * filling up the (now smaller) pool while we are in the * middle of removing the device. * * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking * of how much space has been logically used). So it * doesn't matter that the data we are moving may be * allocated twice (on the old device and the new * device). */ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); spa->spa_dspace -= spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; spa_config_exit(spa, SCL_VDEV, FTAG); } } /* * Return the failure mode that has been set to this pool. The default * behavior will be to block all I/Os when a complete failure occurs. */ uint8_t spa_get_failmode(spa_t *spa) { return (spa->spa_failmode); } boolean_t spa_suspended(spa_t *spa) { return (spa->spa_suspended); } uint64_t spa_version(spa_t *spa) { return (spa->spa_ubsync.ub_version); } boolean_t spa_deflate(spa_t *spa) { return (spa->spa_deflate); } metaslab_class_t * spa_normal_class(spa_t *spa) { return (spa->spa_normal_class); } metaslab_class_t * spa_log_class(spa_t *spa) { return (spa->spa_log_class); } void spa_evicting_os_register(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_insert_head(&spa->spa_evicting_os_list, os); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_deregister(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_remove(&spa->spa_evicting_os_list, os); cv_broadcast(&spa->spa_evicting_os_cv); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_wait(spa_t *spa) { mutex_enter(&spa->spa_evicting_os_lock); while (!list_is_empty(&spa->spa_evicting_os_list)) cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); mutex_exit(&spa->spa_evicting_os_lock); dmu_buf_user_evict_wait(); } int spa_max_replication(spa_t *spa) { /* * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to * handle BPs with more than one DVA allocated. Set our max * replication level accordingly. */ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) return (1); return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } int spa_prev_software_version(spa_t *spa) { return (spa->spa_prev_software_version); } uint64_t spa_deadman_synctime(spa_t *spa) { return (spa->spa_deadman_synctime); } uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); uint64_t dsize = asize; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { uint64_t vdev = DVA_GET_VDEV(dva); vdev_t *vd = vdev_lookup_top(spa, vdev); if (vd == NULL) { panic( "dva_get_dsize_sync(): bad DVA %llu:%llu", (u_longlong_t)vdev, (u_longlong_t)asize); } dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } return (dsize); } uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); } uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); spa_config_exit(spa, SCL_VDEV, FTAG); return (dsize); } uint64_t spa_dirty_data(spa_t *spa) { return (spa->spa_dsl_pool->dp_dirty_total); } /* * ========================================================================== * Initialization and Termination * ========================================================================== */ static int spa_name_compare(const void *a1, const void *a2) { const spa_t *s1 = a1; const spa_t *s2 = a2; int s; s = strcmp(s1->spa_name, s2->spa_name); return (AVL_ISIGN(s)); } int spa_busy(void) { return (spa_active_count); } void spa_boot_init() { spa_config_load(); } #ifdef _KERNEL EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); #endif void spa_init(int mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), offsetof(spa_t, spa_avl)); avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); spa_mode_global = mode; #ifdef illumos #ifdef _KERNEL spa_arch_init(); #else if (spa_mode_global != FREAD && dprintf_find_string("watch")) { arc_procfd = open("/proc/self/ctl", O_WRONLY); if (arc_procfd == -1) { perror("could not enable watchpoints: " "opening /proc/self/ctl failed: "); } else { arc_watch = B_TRUE; } } #endif #endif /* illumos */ refcount_sysinit(); unique_init(); range_tree_init(); metaslab_alloc_trace_init(); zio_init(); lz4_init(); dmu_init(); zil_init(); vdev_cache_stat_init(); vdev_file_init(); zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); l2arc_start(); scan_init(); dsl_scan_global_init(); #ifndef illumos #ifdef _KERNEL zfs_deadman_init(); #endif #endif /* !illumos */ } void spa_fini(void) { l2arc_stop(); spa_evict_all(); vdev_file_fini(); vdev_cache_stat_fini(); zil_fini(); dmu_fini(); lz4_fini(); zio_fini(); metaslab_alloc_trace_fini(); range_tree_fini(); unique_fini(); refcount_fini(); scan_fini(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); avl_destroy(&spa_l2cache_avl); cv_destroy(&spa_namespace_cv); mutex_destroy(&spa_namespace_lock); mutex_destroy(&spa_spare_lock); mutex_destroy(&spa_l2cache_lock); } /* * Return whether this pool has slogs. No locking needed. * It's not a problem if the wrong answer is returned as it's only for * performance and not correctness */ boolean_t spa_has_slogs(spa_t *spa) { return (spa->spa_log_class->mc_rotor != NULL); } spa_log_state_t spa_get_log_state(spa_t *spa) { return (spa->spa_log_state); } void spa_set_log_state(spa_t *spa, spa_log_state_t state) { spa->spa_log_state = state; } boolean_t spa_is_root(spa_t *spa) { return (spa->spa_is_root); } boolean_t spa_writeable(spa_t *spa) { return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config); } /* * Returns true if there is a pending sync task in any of the current * syncing txg, the current quiescing txg, or the current open txg. */ boolean_t spa_has_pending_synctask(spa_t *spa) { return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); } int spa_mode(spa_t *spa) { return (spa->spa_mode); } uint64_t spa_bootfs(spa_t *spa) { return (spa->spa_bootfs); } uint64_t spa_delegation(spa_t *spa) { return (spa->spa_delegation); } objset_t * spa_meta_objset(spa_t *spa) { return (spa->spa_meta_objset); } enum zio_checksum spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } /* * Reset pool scan stat per scan pass (or reboot). */ void spa_scan_stat_init(spa_t *spa) { /* data not stored on disk */ spa->spa_scan_pass_start = gethrestime_sec(); if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; vdev_scan_stat_init(spa->spa_root_vdev); } /* * Get scan stats for zpool status reports */ int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) return (SET_ERROR(ENOENT)); bzero(ps, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; ps->pss_state = scn->scn_phys.scn_state; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_to_process = scn->scn_phys.scn_to_process; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; /* data not stored on disk */ ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_exam = spa->spa_scan_pass_exam; ps->pss_pass_issued = spa->spa_scan_pass_issued; ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; return (0); } int spa_maxblocksize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SPA_MAXBLOCKSIZE); else return (SPA_OLD_MAXBLOCKSIZE); } int spa_maxdnodesize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (DNODE_MAX_SIZE); else return (DNODE_MIN_SIZE); } + /* * Returns the txg that the last device removal completed. No indirect mappings * have been added since this txg. */ uint64_t spa_get_last_removal_txg(spa_t *spa) { uint64_t vdevid; uint64_t ret = -1ULL; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* * sr_prev_indirect_vdev is only modified while holding all the * config locks, so it is sufficient to hold SCL_VDEV as reader when * examining it. */ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; while (vdevid != -1ULL) { vdev_t *vd = vdev_lookup_top(spa, vdevid); vdev_indirect_births_t *vib = vd->vdev_indirect_births; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); /* * If the removal did not remap any data, we don't care. */ if (vdev_indirect_births_count(vib) != 0) { ret = vdev_indirect_births_last_entry_txg(vib); break; } vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; } spa_config_exit(spa, SCL_VDEV, FTAG); IMPLY(ret != -1ULL, spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); return (ret); } boolean_t spa_trust_config(spa_t *spa) { return (spa->spa_trust_config); } uint64_t spa_missing_tvds_allowed(spa_t *spa) { return (spa->spa_missing_tvds_allowed); } void spa_set_missing_tvds(spa_t *spa, uint64_t missing) { spa->spa_missing_tvds = missing; } boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t spa_has_checkpoint(spa_t *spa) { return (spa->spa_checkpoint_txg != 0); } boolean_t spa_importing_readonly_checkpoint(spa_t *spa) { return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && spa->spa_mode == FREAD); } uint64_t spa_min_claim_txg(spa_t *spa) { uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; if (checkpoint_txg != 0) return (checkpoint_txg + 1); return (spa->spa_first_txg); } /* * If there is a checkpoint, async destroys may consume more space from * the pool instead of freeing it. In an attempt to save the pool from * getting suspended when it is about to run out of space, we stop * processing async destroys. */ boolean_t spa_suspend_async_destroy(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t unreserved = dsl_pool_unreserved_space(dp, ZFS_SPACE_CHECK_EXTRA_RESERVED); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; uint64_t avail = (unreserved > used) ? (unreserved - used) : 0; if (spa_has_checkpoint(spa) && avail == 0) return (B_TRUE); return (B_FALSE); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 351077) @@ -1,1015 +1,1015 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2013 DEY Storage Systems, Inc. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_H #define _SYS_DMU_H /* * This file describes the interface that the DMU provides for its * consumers. * * The DMU also interacts with the SPA. That interface is described in * dmu_spa.h. */ #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct uio; struct xuio; struct page; struct vnode; struct spa; struct zilog; struct zio; struct blkptr; struct zap_cursor; struct dsl_dataset; struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; struct zbookmark_phys; struct spa; struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; struct file; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; typedef struct dnode dnode_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, DMU_BSWAP_UINT16, DMU_BSWAP_UINT32, DMU_BSWAP_UINT64, DMU_BSWAP_ZAP, DMU_BSWAP_DNODE, DMU_BSWAP_OBJSET, DMU_BSWAP_ZNODE, DMU_BSWAP_OLDACL, DMU_BSWAP_ACL, /* * Allocating a new byteswap type number makes the on-disk format * incompatible with any other format that uses the same number. * * Data can usually be structured to work with one of the * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. */ DMU_BSWAP_NUMFUNCS } dmu_object_byteswap_t; #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 #define DMU_OT_BYTESWAP_MASK 0x3f /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data * (dmu_object_byteswap_t). All of the types created by this method * are cached in the dbuf metadata cache. */ #define DMU_OT(byteswap, metadata) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_METADATA) : \ dmu_ot[(ot)].ot_metadata) #define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill * is repurposed for embedded BPs. */ #define DMU_OT_HAS_FILL(ot) \ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ dmu_ot[(ot)].ot_byteswap) typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ DMU_OT_BPOBJ, /* UINT64 */ DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ /* zil: */ DMU_OT_INTENT_LOG, /* UINT64 */ /* dmu: */ DMU_OT_DNODE, /* DNODE */ DMU_OT_OBJSET, /* OBJSET */ /* dsl: */ DMU_OT_DSL_DIR, /* UINT64 */ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ DMU_OT_DSL_PROPS, /* ZAP */ DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ DMU_OT_UNLINKED_SET, /* ZAP */ /* zvol: */ DMU_OT_ZVOL, /* UINT8 */ DMU_OT_ZVOL_PROP, /* ZAP */ /* other; for testing only! */ DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ /* new object types: */ DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ DMU_OT_DSL_PERMS, /* ZAP */ DMU_OT_ACL, /* ACL */ DMU_OT_SYSACL, /* SYSACL */ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_SA, /* System attr */ DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_DEADLIST, /* ZAP */ DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ /* * Do not allocate new object types here. Doing so makes the on-disk * format incompatible with any other format that uses the same object * type number. * * When creating an object which does not have one of the above types * use the DMU_OTN_* type with the correct byteswap and metadata * values. * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ DMU_OT_NUMTYPES, /* * Names for valid types declared with DMU_OT(). */ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), } dmu_object_type_t; /* * These flags are intended to be used to specify the "txg_how" * parameter when calling the dmu_tx_assign() function. See the comment * above dmu_tx_assign() for more details on the meaning of these flags. */ #define TXG_NOWAIT (0ULL) #define TXG_WAIT (1ULL<<0) #define TXG_NOTHROTTLE (1ULL<<1) void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) #define DS_FIND_SERIALIZE (1<<2) /* * The maximum number of bytes that can be accessed as part of one * operation, including metadata. */ #define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) /* * artificial blkids for bonus buffer and spill blocks */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) /* * Public routines to create, destroy, open, and close objsets. */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname, struct nvlist *snaps); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_snapshot_tmp(const char *, const char *, int); int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); int dmu_objset_remap_indirects(const char *fsname); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ void *db_data; /* data in buffer */ } dmu_buf_t; /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" #define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" #define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" #define DMU_POOL_REMOVING "com.delphix:removing" #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" /* * Allocate an object from this objset. The range of object numbers * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. * * The transaction must be assigned to a txg. The newly allocated * object will be "held" in the transaction (ie. you can modify the * newly allocated object in this transaction). * * dmu_object_alloc() chooses an object and returns it in *objectp. * * dmu_object_claim() allocates a specific object number. If that * number is already allocated, it fails and returns EEXIST. * * Return 0 on success, or ENOSPC or EEXIST as specified above. */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *txp); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); /* * Free an object from this objset. * * The object's data will be freed as well (ie. you don't need to call * dmu_free(object, 0, -1, tx)). * * The object need not be held in the transaction. * * If there are any holds on this object's buffers (via dmu_buf_hold()), * or tx holds on the object (via dmu_tx_hold_object()), you can not * free it; it fails and returns EBUSY. * * If the object is not allocated, it fails and returns ENOENT. * * Return 0 on success, or EBUSY or ENOENT as specified above. */ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Find the next allocated or free object. * * The objectp parameter is in-out. It will be updated to be the next * object which is allocated. Ignore objects which have not been * modified since txg. * * XXX Can only be called on a objset with no dirty data. * * Returns 0 on success, or ENOENT if there are no more objects. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* * Set the data blocksize for an object. * * The object cannot have any blocks allcated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. * * Returns 0 on success, or EBUSY if there are any holds on the object * contents, or ENOTSUP as described above. */ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* * Set the checksum property on a dnode. The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx); /* * Set the compress property on a dnode. The new compression algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg); void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus * data. As with any normal buffer, you must call dmu_buf_will_dirty() * before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release your hold with dmu_buf_rele(). * * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* * Special spill buffer support used by "SA" framework */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so * that it will remain in memory. You must release the hold with * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. * * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill * on the returned buffer before reading or writing the buffer's * db_data. The comments for those routines describe what particular * operations are valid after calling them. * * The object number must be a valid, allocated object number. */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **, int flags); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. */ void dmu_buf_add_ref(dmu_buf_t *db, void* tag); /* * Attempt to add a reference to a dmu buffer that is in an unknown state, * using a pointer that may have been invalidated by eviction processing. * The request will succeed if the passed in dbuf still represents the * same os/object/blkid, is ineligible for eviction, and has at least * one hold by a user other than the syncer. */ boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, uint64_t blkid, void *tag); void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a * range of an object. A pointer to an array of dmu_buf_t*'s is * returned (in *dbpp). * * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and * frees the array. The hold on the array of buffers MUST be released * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); /* * A DMU buffer user object may be associated with a dbuf for the * duration of its lifetime. This allows the user of a dbuf (client) * to attach private data to a dbuf (e.g. in-core only data such as a * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified * when that dbuf has been evicted. Clients typically respond to the * eviction notification by freeing their private data, thus ensuring * the same lifetime for both dbuf and private data. * * The mapping from a dmu_buf_user_t to any client private data is the * client's responsibility. All current consumers of the API with private * data embed a dmu_buf_user_t as the first member of the structure for * their private data. This allows conversions between the two types * with a simple cast. Since the DMU buf user API never needs access * to the private data, other strategies can be employed if necessary * or convenient for the client (e.g. using container_of() to do the * conversion for private data that cannot have the dmu_buf_user_t as * its first member). * * Eviction callbacks are executed without the dbuf mutex held or any * other type of mechanism to guarantee that the dbuf is still available. * For this reason, users must assume the dbuf has already been freed * and not reference the dbuf from the callback context. * * Users requesting "immediate eviction" are notified as soon as the dbuf * is only referenced by dirty records (dirties == holds). Otherwise the * notification occurs after eviction processing for the dbuf begins. */ typedef struct dmu_buf_user { /* * Asynchronous user eviction callback state. */ taskq_ent_t dbu_tqent; /* * This instance's eviction function pointers. * * dbu_evict_func_sync is called synchronously and then * dbu_evict_func_async is executed asynchronously on a taskq. */ dmu_buf_evict_func_t *dbu_evict_func_sync; dmu_buf_evict_func_t *dbu_evict_func_async; #ifdef ZFS_DEBUG /* * Pointer to user's dbuf pointer. NULL for clients that do * not associate a dbuf with their user data. * * The dbuf pointer is cleared upon eviction so as to catch * use-after-evict bugs in clients. */ dmu_buf_t **dbu_clear_on_evict_dbufp; #endif } dmu_buf_user_t; /* * Initialize the given dmu_buf_user_t instance with the eviction function * evict_func, to be called when the user is evicted. * * NOTE: This function should only be called once on a given dmu_buf_user_t. * To allow enforcement of this, dbu must already be zeroed on entry. */ /*ARGSUSED*/ inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp) { ASSERT(dbu->dbu_evict_func_sync == NULL); ASSERT(dbu->dbu_evict_func_async == NULL); /* must have at least one evict func */ IMPLY(evict_func_sync == NULL, evict_func_async != NULL); dbu->dbu_evict_func_sync = evict_func_sync; dbu->dbu_evict_func_async = evict_func_async; #ifdef ZFS_DEBUG dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; #endif } /* * Attach user data to a dbuf and mark it for normal (when the dbuf's * data is cleared or its reference count goes to zero) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Attach user data to a dbuf and mark it for immediate (its dirty and * reference counts are equal) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); /* * Replace the current user of a dbuf. * * If given the current user of a dbuf, replaces the dbuf's user with * "new_user" and returns the user data pointer that was replaced. * Otherwise returns the current, and unmodified, dbuf user pointer. */ void *dmu_buf_replace_user(dmu_buf_t *db, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); /* * Remove the specified user data for a DMU buffer. * * Returns the user that was removed on success, or the current user if * another user currently owns the buffer. */ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. */ void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); /* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called * dmu_tx_assign()). The buffer's object must be held in the tx * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has * been assigned, you can modify buffers which belong to held objects as * part of this transaction. You can't modify buffers before the * transaction has been assigned; you can't modify buffers which don't * belong to objects which this transaction holds; you can't hold * objects once the transaction has been assigned. You may hold an * object which you are going to free (with dmu_object_free()), but you * don't have to. * * You can abort the transaction before it has been assigned. * * Note that you may hold buffers (with dmu_buf_hold) at any time, * regardless of transaction state. */ #define DMU_NEW_OBJECT (-1ULL) #define DMU_OBJECT_END (-1ULL) dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len); void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. * * dcb_data is a pointer to caller private data that is passed on as a * callback parameter. The caller is responsible for properly allocating and * freeing it. * * When registering a callback, the transaction must be already created, but * it cannot be committed or aborted. It can be assigned to a txg or not. * * The callback will be called after the transaction has been safely written * to stable storage and will also be called if the dmu_tx is aborted. * If there is any error which prevents the transaction from being committed to * disk, the callback will be called with a value of error != 0. */ typedef void dmu_tx_callback_func_t(void *dcb_data, int error); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); /* * Free up the data blocks for a defined range of a file. If size is * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL #ifdef illumos int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); #else int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct vm_page **ppa, dmu_tx_t *tx); int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, int *rbehind, int *rahead, int last_size); #endif #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_xuio_init(struct xuio *uio, int niov); void dmu_xuio_fini(struct xuio *uio); int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, size_t n); int dmu_xuio_cnt(struct xuio *uio); struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); void dmu_xuio_clear(struct xuio *uio, int i); void xuio_stat_wbuf_copied(void); void xuio_stat_wbuf_nocopy(void); extern boolean_t zfs_prefetch_disable; extern int zfs_max_recordsize; /* * Asynchronously try to read in the data. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; - int8_t doi_pad[4]; + uint8_t doi_pad[4]; uint64_t doi_dnodesize; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void arc_byteswap_func_t(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; boolean_t ot_dbuf_metadata_cache; char *ot_name; } dmu_object_type_info_t; typedef struct dmu_object_byteswap_info { arc_byteswap_func_t *ob_func; char *ob_name; } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. * * Return 0 on success or ENOENT if object is not allocated. * * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about * the size. This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; uint64_t dds_guid; dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; } dmu_objset_stats_t; /* * Get stats on a dataset. */ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See * zfs_prop_table[] and zfs(1m) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); /* * Get the space usage statistics for statvfs(). * * refdbytes is the amount of space "referenced" by this objset. * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and * 'available' properties, described in the zfs(1m) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. */ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); /* * The fsid_guid is a 56-bit ID that can change to avoid collisions. * (Contrast with the ds_guid which is a 64-bit ID that will never * change, so there is a small probability that it will collide.) */ uint64_t dmu_objset_fsid_guid(objset_t *os); /* * Get the [cm]time for an objset's snapshot dir */ timestruc_t dmu_objset_snap_cmtime(objset_t *os); int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); extern struct zilog *dmu_objset_zil(objset_t *os); extern struct dsl_pool *dmu_objset_pool(objset_t *os); extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); extern uint64_t dmu_objset_dnodesize(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); typedef int objset_used_cb_t(dmu_object_type_t bonustype, void *bonus, uint64_t *userp, uint64_t *groupp); extern void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. */ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* * Synchronous write. * If a parent zio is provided this function initiates a write on the * provided buffer as a child of the parent zio. * In the absence of a parent zio, the write is completed synchronously. * At write completion, blk is filled with the bp of the written block. * Note that while the data covered by this function will be on stable * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ /* * {zfs,zvol,ztest}_get_done() args */ typedef struct zgd { struct lwb *zgd_lwb; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct rl *zgd_rl; void *zgd_private; } zgd_t; typedef void dmu_sync_cb_t(zgd_t *arg, int error); int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off * Return found offset in *off. Return ESRCH for end of file. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); /* * Check if a DMU object has any dirty blocks. If so, sync out * all pending transaction groups. Otherwise, this function * does not alter DMU state. This could be improved to only sync * out the necessary transaction groups for this particular * object. */ int dmu_object_wait_synced(objset_t *os, uint64_t object); /* * Initial setup and final teardown. */ extern void dmu_init(void); extern void dmu_fini(void); typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, uint64_t object, uint64_t offset, int len); void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, struct file *fp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; extern int zfs_mdcomp_disable; #ifdef __cplusplus } #endif #endif /* _SYS_DMU_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h (revision 351077) @@ -1,216 +1,212 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_OBJSET_H #define _SYS_DMU_OBJSET_H #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif extern krwlock_t os_lock; struct dsl_pool; struct dsl_dataset; struct dmu_tx; #define OBJSET_PHYS_SIZE 2048 #define OBJSET_OLD_PHYS_SIZE 1024 #define OBJSET_BUF_HAS_USERUSED(buf) \ (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE) #define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) typedef struct objset_phys { dnode_phys_t os_meta_dnode; zil_header_t os_zil_header; uint64_t os_type; uint64_t os_flags; char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - sizeof (zil_header_t) - sizeof (uint64_t)*2]; dnode_phys_t os_userused_dnode; dnode_phys_t os_groupused_dnode; } objset_phys_t; #define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1) struct objset { /* Immutable: */ struct dsl_dataset *os_dsl_dataset; spa_t *os_spa; arc_buf_t *os_phys_buf; objset_phys_t *os_phys; /* * The following "special" dnodes have no parent, are exempt * from dnode_move(), and are not recorded in os_dnodes, but they * root their descendents in this objset using handles anyway, so * that all access to dnodes from dbufs consistently uses handles. */ dnode_handle_t os_meta_dnode; dnode_handle_t os_userused_dnode; dnode_handle_t os_groupused_dnode; zilog_t *os_zil; list_node_t os_evicting_node; /* can change, under dsl_dir's locks: */ uint64_t os_dnodesize; /* default dnode size for new objects */ enum zio_checksum os_checksum; enum zio_compress os_compress; uint8_t os_copies; enum zio_checksum os_dedup_checksum; boolean_t os_dedup_verify; zfs_logbias_op_t os_logbias; zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; int os_recordsize; /* * The next four values are used as a cache of whatever's on disk, and * are initialized the first time these properties are queried. Before * being initialized with their real values, their values are * OBJSET_PROP_UNINITIALIZED. */ uint64_t os_version; uint64_t os_normalization; uint64_t os_utf8only; uint64_t os_casesensitivity; /* * Pointer is constant; the blkptr it points to is protected by * os_dsl_dataset->ds_bp_rwlock */ blkptr_t *os_rootbp; /* no lock needed: */ struct dmu_tx *os_synctx; /* XXX sketchy */ zil_header_t os_zil_header; multilist_t *os_synced_dnodes; uint64_t os_flags; uint64_t os_freed_dnodes; boolean_t os_rescan_dnodes; /* Protected by os_obj_lock */ kmutex_t os_obj_lock; - uint64_t os_obj_next_chunk; - - /* Per-CPU next object to allocate, protected by atomic ops. */ - uint64_t *os_obj_next_percpu; - int os_obj_next_percpu_len; + uint64_t os_obj_next; /* Protected by os_lock */ kmutex_t os_lock; multilist_t *os_dirty_dnodes[TXG_SIZE]; list_t os_dnodes; list_t os_downgraded_dbufs; /* Protects changes to DMU_{USER,GROUP}USED_OBJECT */ kmutex_t os_userused_lock; /* stuff we store for the user */ kmutex_t os_user_ptr_lock; void *os_user_ptr; sa_os_t *os_sa; }; #define DMU_META_OBJSET 0 #define DMU_META_DNODE_OBJECT 0 #define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) #define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode) #define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode) #define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) #define DMU_OS_IS_L2CACHEABLE(os) \ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ (os)->os_secondary_cache == ZFS_CACHE_METADATA) #define DMU_OS_IS_L2COMPRESSIBLE(os) (zfs_mdcomp_disable == B_FALSE) /* called from zpl */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); void dmu_objset_refresh_ownership(struct dsl_dataset *ds, struct dsl_dataset **newds, void *tag); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); uint64_t dmu_objset_fsid_guid(objset_t *os); int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj, int func(struct dsl_pool *, struct dsl_dataset *, void *), void *arg, int flags); int dmu_objset_prefetch(const char *name, void *arg); void dmu_objset_evict_dbufs(objset_t *os); timestruc_t dmu_objset_snap_cmtime(objset_t *os); /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, objset_t **osp); void dmu_objset_evict(objset_t *os); void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); boolean_t dmu_objset_userused_enabled(objset_t *os); int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); int dmu_fsname(const char *snapname, char *buf); void dmu_objset_evict_done(objset_t *os); void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx); void dmu_objset_init(void); void dmu_objset_fini(void); #ifdef __cplusplus } #endif #endif /* _SYS_DMU_OBJSET_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h (revision 351077) @@ -1,588 +1,411 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DNODE_H #define _SYS_DNODE_H #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * dnode_hold() flags. */ #define DNODE_MUST_BE_ALLOCATED 1 #define DNODE_MUST_BE_FREE 2 /* * dnode_next_offset() flags. */ #define DNODE_FIND_HOLE 1 #define DNODE_FIND_BACKWARDS 2 #define DNODE_FIND_HAVELOCK 4 /* * Fixed constants. */ #define DNODE_SHIFT 9 /* 512 bytes */ #define DN_MIN_INDBLKSHIFT 12 /* 4k */ /* * If we ever increase this value beyond 20, we need to revisit all logic that * does x << level * ebps to handle overflow. With a 1M indirect block size, * 4 levels of indirect blocks would not be able to guarantee addressing an * entire object, so 5 levels will be used, but 5 * (20 - 7) = 65. */ #define DN_MAX_INDBLKSHIFT 17 /* 128k */ #define DNODE_BLOCK_SHIFT 14 /* 16k */ #define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ #define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ #define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ /* * dnode id flags * * Note: a file will never ever have its * ids moved from bonus->spill * and only in a crypto environment would it be on spill */ #define DN_ID_CHKED_BONUS 0x1 #define DN_ID_CHKED_SPILL 0x2 #define DN_ID_OLD_EXIST 0x4 #define DN_ID_NEW_EXIST 0x8 /* * Derived constants. */ #define DNODE_MIN_SIZE (1 << DNODE_SHIFT) #define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT) #define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT) #define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT) #define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT) #define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \ (1 << SPA_BLKPTRSHIFT)) #define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT) #define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE)) #define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) -#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) #define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1) -#define DN_KILL_SPILLBLK (1) +#define DN_KILL_SPILLBLK (1) -#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */ -#define DN_SLOT_FREE ((void *)1UL) /* Free slot */ -#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */ -#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */ -#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR) -#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL) - #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) /* * This is inaccurate if the indblkshift of the particular object is not the * max. But it's only used by userland to calculate the zvol reservation. */ #define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) #define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) /* The +2 here is a cheesy way to round up */ #define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) #define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) #define DN_MAX_BONUS_LEN(dnp) \ ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? \ (uint8_t *)DN_SPILL_BLKPTR(dnp) - (uint8_t *)DN_BONUS(dnp) : \ (uint8_t *)(dnp + (dnp->dn_extra_slots + 1)) - (uint8_t *)DN_BONUS(dnp)) - + #define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) #define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) struct dmu_buf_impl; struct objset; struct zio; enum dnode_dirtycontext { DN_UNDIRTIED, DN_DIRTY_OPEN, DN_DIRTY_SYNC }; /* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ #define DNODE_FLAG_USED_BYTES (1<<0) #define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) /* Does dnode have a SA spill blkptr in bonus? */ #define DNODE_FLAG_SPILL_BLKPTR (1<<2) -/* - * VARIABLE-LENGTH (LARGE) DNODES - * - * The motivation for variable-length dnodes is to eliminate the overhead - * associated with using spill blocks. Spill blocks are used to store - * system attribute data (i.e. file metadata) that does not fit in the - * dnode's bonus buffer. By allowing a larger bonus buffer area the use of - * a spill block can be avoided. Spill blocks potentially incur an - * additional read I/O for every dnode in a dnode block. As a worst case - * example, reading 32 dnodes from a 16k dnode block and all of the spill - * blocks could issue 33 separate reads. Now suppose those dnodes have size - * 1024 and therefore don't need spill blocks. Then the worst case number - * of blocks read is reduced to from 33 to two--one per dnode block. - * - * ZFS-on-Linux systems that make heavy use of extended attributes benefit - * from this feature. In particular, ZFS-on-Linux supports the xattr=sa - * dataset property which allows file extended attribute data to be stored - * in the dnode bonus buffer as an alternative to the traditional - * directory-based format. Workloads such as SELinux and the Lustre - * distributed filesystem often store enough xattr data to force spill - * blocks when xattr=sa is in effect. Large dnodes may therefore provide a - * performance benefit to such systems. Other use cases that benefit from - * this feature include files with large ACLs and symbolic links with long - * target names. - * - * The size of a dnode may be a multiple of 512 bytes up to the size of a - * dnode block (currently 16384 bytes). The dn_extra_slots field of the - * on-disk dnode_phys_t structure describes the size of the physical dnode - * on disk. The field represents how many "extra" dnode_phys_t slots a - * dnode consumes in its dnode block. This convention results in a value of - * 0 for 512 byte dnodes which preserves on-disk format compatibility with - * older software which doesn't support large dnodes. - * - * Similarly, the in-memory dnode_t structure has a dn_num_slots field - * to represent the total number of dnode_phys_t slots consumed on disk. - * Thus dn->dn_num_slots is 1 greater than the corresponding - * dnp->dn_extra_slots. This difference in convention was adopted - * because, unlike on-disk structures, backward compatibility is not a - * concern for in-memory objects, so we used a more natural way to - * represent size for a dnode_t. - * - * The default size for newly created dnodes is determined by the value of - * the "dnodesize" dataset property. By default the property is set to - * "legacy" which is compatible with older software. Setting the property - * to "auto" will allow the filesystem to choose the most suitable dnode - * size. Currently this just sets the default dnode size to 1k, but future - * code improvements could dynamically choose a size based on observed - * workload patterns. Dnodes of varying sizes can coexist within the same - * dataset and even within the same dnode block. - */ - typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ uint8_t dn_indblkshift; /* ln2(indirect block size) */ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ uint8_t dn_nblkptr; /* length of dn_blkptr */ uint8_t dn_bonustype; /* type of data in bonus buffer */ uint8_t dn_checksum; /* ZIO_CHECKSUM type */ uint8_t dn_compress; /* ZIO_COMPRESS type */ uint8_t dn_flags; /* DNODE_FLAG_* */ uint16_t dn_datablkszsec; /* data block size in 512b sectors */ uint16_t dn_bonuslen; /* length of dn_bonus */ uint8_t dn_extra_slots; /* # of subsequent slots consumed */ uint8_t dn_pad2[3]; /* accounting is protected by dn_dirty_mtx */ uint64_t dn_maxblkid; /* largest allocated block ID */ uint64_t dn_used; /* bytes (or sectors) of disk space */ /* * Both dn_pad2 and dn_pad3 are protected by the block's MAC. This * allows us to protect any fields that might be added here in the * future. In either case, developers will want to check * zio_crypt_init_uios_dnode() to ensure the new field is being * protected properly. */ uint64_t dn_pad3[4]; + /* + * The tail region is 448 bytes for a 512 byte dnode, and + * correspondingly larger for larger dnode sizes. The spill + * block pointer, when present, is always at the end of the tail + * region. There are three ways this space may be used, using + * a 512 byte dnode for this diagram: + * + * 0 64 128 192 256 320 384 448 (offset) + * +---------------+---------------+---------------+-------+ + * | dn_blkptr[0] | dn_blkptr[1] | dn_blkptr[2] | / | + * +---------------+---------------+---------------+-------+ + * | dn_blkptr[0] | dn_bonus[0..319] | + * +---------------+-----------------------+---------------+ + * | dn_blkptr[0] | dn_bonus[0..191] | dn_spill | + * +---------------+-----------------------+---------------+ + */ union { blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)]; struct { blkptr_t __dn_ignore1; uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN]; }; struct { blkptr_t __dn_ignore2; uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t)]; blkptr_t dn_spill; - }; + }; }; } dnode_phys_t; #define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \ (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT)) struct dnode { /* * Protects the structure of the dnode, including the number of levels * of indirection (dn_nlevels), dn_maxblkid, and dn_next_* */ krwlock_t dn_struct_rwlock; /* Our link on dn_objset->os_dnodes list; protected by os_lock. */ list_node_t dn_link; /* immutable: */ struct objset *dn_objset; uint64_t dn_object; struct dmu_buf_impl *dn_dbuf; struct dnode_handle *dn_handle; dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ /* * Copies of stuff in dn_phys. They're valid in the open * context (eg. even before the dnode is first synced). * Where necessary, these are protected by dn_struct_rwlock. */ dmu_object_type_t dn_type; /* object type */ uint16_t dn_bonuslen; /* bonus length */ uint8_t dn_bonustype; /* bonus type */ uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ uint8_t dn_checksum; /* ZIO_CHECKSUM type */ uint8_t dn_compress; /* ZIO_COMPRESS type */ uint8_t dn_nlevels; uint8_t dn_indblkshift; uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ uint8_t dn_moved; /* Has this dnode been moved? */ uint16_t dn_datablkszsec; /* in 512b sectors */ uint32_t dn_datablksz; /* in bytes */ uint64_t dn_maxblkid; uint8_t dn_next_type[TXG_SIZE]; uint8_t dn_num_slots; /* metadnode slots consumed on disk */ uint8_t dn_next_nblkptr[TXG_SIZE]; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; uint8_t dn_next_bonustype[TXG_SIZE]; uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */ uint16_t dn_next_bonuslen[TXG_SIZE]; uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ uint32_t dn_dbufs_count; /* count of dn_dbufs */ /* protected by os_lock: */ multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ /* protected by dn_mtx: */ kmutex_t dn_mtx; list_t dn_dirty_records[TXG_SIZE]; struct range_tree *dn_free_ranges[TXG_SIZE]; uint64_t dn_allocated_txg; uint64_t dn_free_txg; uint64_t dn_assigned_txg; kcondvar_t dn_notxholds; enum dnode_dirtycontext dn_dirtyctx; uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ /* protected by own devices */ refcount_t dn_tx_holds; refcount_t dn_holds; kmutex_t dn_dbufs_mtx; /* * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs * can contain multiple dbufs of the same (level, blkid) when a * dbuf is marked DB_EVICTING without being removed from * dn_dbufs. To maintain the avl invariant that there cannot be * duplicate entries, we order the dbufs by an arbitrary value - * their address in memory. This means that dn_dbufs cannot be used to * directly look up a dbuf. Instead, callers must use avl_walk, have * a reference to the dbuf, or look up a non-existant node with * db_state = DB_SEARCH (see dbuf_free_range for an example). */ avl_tree_t dn_dbufs; /* protected by dn_struct_rwlock */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ boolean_t dn_have_spill; /* have spill or are spilling */ /* parent IO for current sync write */ zio_t *dn_zio; /* used in syncing context */ uint64_t dn_oldused; /* old phys used bytes */ uint64_t dn_oldflags; /* old phys dn_flags */ uint64_t dn_olduid, dn_oldgid; uint64_t dn_newuid, dn_newgid; int dn_id_flags; /* holds prefetch structure */ struct zfetch dn_zfetch; }; /* * Adds a level of indirection between the dbuf and the dnode to avoid * iterating descendent dbufs in dnode_move(). Handles are not allocated * individually, but as an array of child dnodes in dnode_hold_impl(). */ typedef struct dnode_handle { /* Protects dnh_dnode from modification by dnode_move(). */ zrlock_t dnh_zrlock; dnode_t *dnh_dnode; } dnode_handle_t; typedef struct dnode_children { dmu_buf_user_t dnc_dbu; /* User evict data */ size_t dnc_count; /* number of children */ dnode_handle_t dnc_children[]; /* sized dynamically */ } dnode_children_t; typedef struct free_range { avl_node_t fr_node; uint64_t fr_blkid; uint64_t fr_nblks; } free_range_t; void dnode_special_open(struct objset *dd, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh); void dnode_special_close(dnode_handle_t *dnh); void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx); int dnode_hold(struct objset *dd, uint64_t object, void *ref, dnode_t **dnp); int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots, void *ref, dnode_t **dnp); boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); void dnode_free(dnode_t *dn, dmu_tx_t *tx); void dnode_byteswap(dnode_phys_t *dnp); void dnode_buf_byteswap(void *buf, size_t size); void dnode_verify(dnode_t *dn); int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); void dnode_diduse_space(dnode_t *dn, int64_t space); void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); void dnode_init(void); void dnode_fini(void); int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, int minlvl, uint64_t blkfill, uint64_t txg); void dnode_evict_dbufs(dnode_t *dn); void dnode_evict_bonus(dnode_t *dn); -void dnode_free_interior_slots(dnode_t *dn); boolean_t dnode_needs_remap(const dnode_t *dn); #define DNODE_IS_CACHEABLE(_dn) \ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ (DMU_OT_IS_METADATA((_dn)->dn_type) && \ (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)) #define DNODE_META_IS_CACHEABLE(_dn) \ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA) - -/* - * Used for dnodestats kstat. - */ -typedef struct dnode_stats { - /* - * Number of failed attempts to hold a meta dnode dbuf. - */ - kstat_named_t dnode_hold_dbuf_hold; - /* - * Number of failed attempts to read a meta dnode dbuf. - */ - kstat_named_t dnode_hold_dbuf_read; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able - * to hold the requested object number which was allocated. This is - * the common case when looking up any allocated object number. - */ - kstat_named_t dnode_hold_alloc_hits; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not - * able to hold the request object number because it was not allocated. - */ - kstat_named_t dnode_hold_alloc_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not - * able to hold the request object number because the object number - * refers to an interior large dnode slot. - */ - kstat_named_t dnode_hold_alloc_interior; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed - * to retry acquiring slot zrl locks due to contention. - */ - kstat_named_t dnode_hold_alloc_lock_retry; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not - * need to create the dnode because another thread did so after - * dropping the read lock but before acquiring the write lock. - */ - kstat_named_t dnode_hold_alloc_lock_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found - * a free dnode instantiated by dnode_create() but not yet allocated - * by dnode_allocate(). - */ - kstat_named_t dnode_hold_alloc_type_none; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able - * to hold the requested range of free dnode slots. - */ - kstat_named_t dnode_hold_free_hits; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not - * able to hold the requested range of free dnode slots because - * at least one slot was allocated. - */ - kstat_named_t dnode_hold_free_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not - * able to hold the requested range of free dnode slots because - * after acquiring the zrl lock at least one slot was allocated. - */ - kstat_named_t dnode_hold_free_lock_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed - * to retry acquiring slot zrl locks due to contention. - */ - kstat_named_t dnode_hold_free_lock_retry; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested - * a range of dnode slots which were held by another thread. - */ - kstat_named_t dnode_hold_free_refcount; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested - * a range of dnode slots which would overflow the dnode_phys_t. - */ - kstat_named_t dnode_hold_free_overflow; - /* - * Number of times a dnode_hold(...) was attempted on a dnode - * which had already been unlinked in an earlier txg. - */ - kstat_named_t dnode_hold_free_txg; - /* - * Number of times dnode_free_interior_slots() needed to retry - * acquiring a slot zrl lock due to contention. - */ - kstat_named_t dnode_free_interior_lock_retry; - /* - * Number of new dnodes allocated by dnode_allocate(). - */ - kstat_named_t dnode_allocate; - /* - * Number of dnodes re-allocated by dnode_reallocate(). - */ - kstat_named_t dnode_reallocate; - /* - * Number of meta dnode dbufs evicted. - */ - kstat_named_t dnode_buf_evict; - /* - * Number of times dmu_object_alloc*() reached the end of the existing - * object ID chunk and advanced to a new one. - */ - kstat_named_t dnode_alloc_next_chunk; - /* - * Number of times multiple threads attempted to allocate a dnode - * from the same block of free dnodes. - */ - kstat_named_t dnode_alloc_race; - /* - * Number of times dmu_object_alloc*() was forced to advance to the - * next meta dnode dbuf due to an error from dmu_object_next(). - */ - kstat_named_t dnode_alloc_next_block; - /* - * Statistics for tracking dnodes which have been moved. - */ - kstat_named_t dnode_move_invalid; - kstat_named_t dnode_move_recheck1; - kstat_named_t dnode_move_recheck2; - kstat_named_t dnode_move_special; - kstat_named_t dnode_move_handle; - kstat_named_t dnode_move_rwlock; - kstat_named_t dnode_move_active; -} dnode_stats_t; - -extern dnode_stats_t dnode_stats; - -#define DNODE_STAT_INCR(stat, val) \ - atomic_add_64(&dnode_stats.stat.value.ui64, (val)); -#define DNODE_STAT_BUMP(stat) \ - DNODE_STAT_INCR(stat, 1); #ifdef ZFS_DEBUG /* * There should be a ## between the string literal and fmt, to make it * clear that we're joining two strings together, but that piece of shit * gcc doesn't support that preprocessor token. */ #define dprintf_dnode(dn, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char __db_buf[32]; \ uint64_t __db_obj = (dn)->dn_object; \ if (__db_obj == DMU_META_DNODE_OBJECT) \ (void) strcpy(__db_buf, "mdn"); \ else \ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ (u_longlong_t)__db_obj);\ dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ __db_buf, __VA_ARGS__); \ } \ _NOTE(CONSTCOND) } while (0) #define DNODE_VERIFY(dn) dnode_verify(dn) #define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) #else #define dprintf_dnode(db, fmt, ...) #define DNODE_VERIFY(dn) #define FREE_VERIFY(db, start, end, tx) #endif #ifdef __cplusplus } #endif #endif /* _SYS_DNODE_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h (revision 351077) @@ -1,291 +1,291 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_SA_IMPL_H #define _SYS_SA_IMPL_H #include #include #include /* * Array of known attributes and their * various characteristics. */ typedef struct sa_attr_table { sa_attr_type_t sa_attr; uint8_t sa_registered; uint16_t sa_length; sa_bswap_type_t sa_byteswap; char *sa_name; } sa_attr_table_t; /* * Zap attribute format for attribute registration * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * | unused | len | bswap | attr num | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Zap attribute format for layout information. * * layout information is stored as an array of attribute numbers * The name of the attribute is the layout number (0, 1, 2, ...) * * 16 0 * +---- ---+ * | attr # | * +--------+ * | attr # | * +--- ----+ * ...... * */ #define ATTR_BSWAP(x) BF32_GET(x, 16, 8) #define ATTR_LENGTH(x) BF32_GET(x, 24, 16) #define ATTR_NUM(x) BF32_GET(x, 0, 16) #define ATTR_ENCODE(x, attr, length, bswap) \ { \ BF64_SET(x, 24, 16, length); \ BF64_SET(x, 16, 8, bswap); \ BF64_SET(x, 0, 16, attr); \ } #define TOC_OFF(x) BF32_GET(x, 0, 23) #define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1) #define TOC_LEN_IDX(x) BF32_GET(x, 24, 4) #define TOC_ATTR_ENCODE(x, len_idx, offset) \ { \ BF32_SET(x, 31, 1, 1); \ BF32_SET(x, 24, 7, len_idx); \ BF32_SET(x, 0, 24, offset); \ } #define SA_LAYOUTS "LAYOUTS" #define SA_REGISTRY "REGISTRY" /* * Each unique layout will have their own table * sa_lot (layout_table) */ typedef struct sa_lot { avl_node_t lot_num_node; avl_node_t lot_hash_node; uint64_t lot_num; uint64_t lot_hash; sa_attr_type_t *lot_attrs; /* array of attr #'s */ uint32_t lot_var_sizes; /* how many aren't fixed size */ uint32_t lot_attr_count; /* total attr count */ - list_t lot_idx_tab; /* should be only a couple of entries */ + list_t lot_idx_tab; /* should be only a couple of entries */ int lot_instance; /* used with lot_hash to identify entry */ } sa_lot_t; /* index table of offsets */ typedef struct sa_idx_tab { list_node_t sa_next; sa_lot_t *sa_layout; uint16_t *sa_variable_lengths; refcount_t sa_refcount; uint32_t *sa_idx_tab; /* array of offsets */ } sa_idx_tab_t; /* * Since the offset/index information into the actual data * will usually be identical we can share that information with * all handles that have the exact same offsets. * * You would typically only have a large number of different table of * contents if you had a several variable sized attributes. * * Two AVL trees are used to track the attribute layout numbers. * one is keyed by number and will be consulted when a DMU_OT_SA * object is first read. The second tree is keyed by the hash signature * of the attributes and will be consulted when an attribute is added * to determine if we already have an instance of that layout. Both * of these tree's are interconnected. The only difference is that * when an entry is found in the "hash" tree the list of attributes will * need to be compared against the list of attributes you have in hand. * The assumption is that typically attributes will just be updated and * adding a completely new attribute is a very rare operation. */ struct sa_os { - kmutex_t sa_lock; + kmutex_t sa_lock; boolean_t sa_need_attr_registration; boolean_t sa_force_spill; uint64_t sa_master_obj; uint64_t sa_reg_attr_obj; uint64_t sa_layout_attr_obj; int sa_num_attrs; sa_attr_table_t *sa_attr_table; /* private attr table */ sa_update_cb_t *sa_update_cb; avl_tree_t sa_layout_num_tree; /* keyed by layout number */ avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */ int sa_user_table_sz; sa_attr_type_t *sa_user_table; /* user name->attr mapping table */ }; /* * header for all bonus and spill buffers. * * The header has a fixed portion with a variable number * of "lengths" depending on the number of variable sized * attributes which are determined by the "layout number" */ #define SA_MAGIC 0x2F505A /* ZFS SA */ typedef struct sa_hdr_phys { uint32_t sa_magic; /* BEGIN CSTYLED */ /* * Encoded with hdrsize and layout number as follows: * 16 10 0 * +--------+-------+ * | hdrsz |layout | * +--------+-------+ * * Bits 0-10 are the layout number * Bits 11-16 are the size of the header. * The hdrsize is the number * 8 * * For example. * hdrsz of 1 ==> 8 byte header * 2 ==> 16 byte header * */ /* END CSTYLED */ uint16_t sa_layout_info; uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ /* ... Data follows the lengths. */ } sa_hdr_phys_t; #define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) #define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0) #define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ { \ BF32_SET_SB(x, 10, 6, 3, 0, size); \ BF32_SET(x, 0, 10, num); \ } typedef enum sa_buf_type { SA_BONUS = 1, SA_SPILL = 2 } sa_buf_type_t; typedef enum sa_data_op { SA_LOOKUP, SA_UPDATE, SA_ADD, SA_REPLACE, SA_REMOVE } sa_data_op_t; /* * Opaque handle used for most sa functions * * This needs to be kept as small as possible. */ struct sa_handle { dmu_buf_user_t sa_dbu; kmutex_t sa_lock; dmu_buf_t *sa_bonus; dmu_buf_t *sa_spill; objset_t *sa_os; void *sa_userp; sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ }; #define SA_GET_DB(hdl, type) \ (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) #define SA_GET_HDR(hdl, type) \ ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ type))->db.db_data)) #define SA_IDX_TAB_GET(hdl, type) \ (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab) #define IS_SA_BONUSTYPE(a) \ ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) #define SA_BONUSTYPE_FROM_DB(db) \ (dmu_get_bonustype((dmu_buf_t *)db)) #define SA_BLKPTR_SPACE (DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t)) #define SA_LAYOUT_NUM(x, type) \ ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \ ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x)))) #define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length #define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\ hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \ SA_REGISTERED_LEN(sa, attr)) #define SA_SET_HDR(hdr, num, size) \ { \ hdr->sa_magic = SA_MAGIC; \ SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \ } #define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \ { \ bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \ bulk.sa_buftype = type; \ bulk.sa_addr = \ (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \ (uintptr_t)hdr); \ } #define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \ (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \ (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \ sizeof (uint16_t), 8) : 0))) int sa_add_impl(sa_handle_t *, sa_attr_type_t, uint32_t, sa_data_locator_t, void *, dmu_tx_t *); void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *); int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *); void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *); int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t, uint16_t *, sa_hdr_phys_t *); #ifdef __cplusplus extern "C" { #endif #ifdef __cplusplus } #endif #endif /* _SYS_SA_IMPL_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h (revision 351077) @@ -1,511 +1,509 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ #ifndef _SYS_ZAP_H #define _SYS_ZAP_H /* * ZAP - ZFS Attribute Processor * * The ZAP is a module which sits on top of the DMU (Data Management * Unit) and implements a higher-level storage primitive using DMU * objects. Its primary consumer is the ZPL (ZFS Posix Layer). * * A "zapobj" is a DMU object which the ZAP uses to stores attributes. * Users should use only zap routines to access a zapobj - they should * not access the DMU object directly using DMU routines. * * The attributes stored in a zapobj are name-value pairs. The name is * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including * terminating NULL). The value is an array of integers, which may be * 1, 2, 4, or 8 bytes long. The total space used by the array (number * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes. * Note that an 8-byte integer value can be used to store the location * (object number) of another dmu object (which may be itself a zapobj). * Note that you can use a zero-length attribute to store a single bit * of information - the attribute is present or not. * * The ZAP routines are thread-safe. However, you must observe the * DMU's restriction that a transaction may not be operated on * concurrently. * * Any of the routines that return an int may return an I/O error (EIO * or ECHECKSUM). * * * Implementation / Performance Notes: * * The ZAP is intended to operate most efficiently on attributes with * short (49 bytes or less) names and single 8-byte values, for which * the microzap will be used. The ZAP should be efficient enough so * that the user does not need to cache these attributes. * * The ZAP's locking scheme makes its routines thread-safe. Operations * on different zapobjs will be processed concurrently. Operations on * the same zapobj which only read data will be processed concurrently. * Operations on the same zapobj which modify data will be processed * concurrently when there are many attributes in the zapobj (because * the ZAP uses per-block locking - more than 128 * (number of cpus) * small attributes will suffice). */ /* * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C * strings) for the names of attributes, rather than a byte string * bounded by an explicit length. If some day we want to support names * in character sets which have embedded zeros (eg. UTF-16, UTF-32), * we'll have to add routines for using length-bounded strings. */ #include #include #ifdef __cplusplus extern "C" { #endif /* * Specifies matching criteria for ZAP lookups. * MT_NORMALIZE Use ZAP normalization flags, which can include both * unicode normalization and case-insensitivity. * MT_MATCH_CASE Do case-sensitive lookups even if MT_NORMALIZE is * specified and ZAP normalization flags include * U8_TEXTPREP_TOUPPER. */ typedef enum matchtype { MT_NORMALIZE = 1 << 0, MT_MATCH_CASE = 1 << 1, } matchtype_t; typedef enum zap_flags { /* Use 64-bit hash value (serialized cursors will always use 64-bits) */ ZAP_FLAG_HASH64 = 1 << 0, /* Key is binary, not string (zap_add_uint64() can be used) */ ZAP_FLAG_UINT64_KEY = 1 << 1, /* * First word of key (which must be an array of uint64) is * already randomly distributed. */ ZAP_FLAG_PRE_HASHED_KEY = 1 << 2, } zap_flags_t; /* * Create a new zapobj with no attributes and return its object number. * * dnodesize specifies the on-disk size of the dnode for the new zapobj. * Valid values are multiples of 512 up to DNODE_MAX_SIZE. */ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx); uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx); -uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, - uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx); /* * Initialize an already-allocated object. */ void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) * object number. */ int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); int zap_create_claim_norm(objset_t *ds, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); /* * The zapobj passed in must be a valid ZAP object for all of the * following routines. */ /* * Destroy this zapobj and all its attributes. * * Frees the object number using dmu_object_free. */ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); /* * Manipulate attributes. * * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. */ /* * Retrieve the contents of the attribute with the given name. * * If the requested attribute does not exist, the call will fail and * return ENOENT. * * If 'integer_size' is smaller than the attribute's integer size, the * call will fail and return EINVAL. * * If 'integer_size' is equal to or larger than the attribute's integer * size, the call will succeed and return 0. * * When converting to a larger integer size, the integers will be treated as * unsigned (ie. no sign-extension will be performed). * * 'num_integers' is the length (in integers) of 'buf'. * * If the attribute is longer than the buffer, as many integers as will * fit will be transferred to 'buf'. If the entire attribute was not * transferred, the call will return EOVERFLOW. */ int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf); /* * If rn_len is nonzero, realname will be set to the name of the found * entry (which may be different from the requested name if matchtype is * not MT_EXACT). * * If normalization_conflictp is not NULL, it will be set if there is * another name with the same case/unicode normalized form. */ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *normalization_conflictp); int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints); int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf); int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp); int zap_count_write_by_dnode(dnode_t *dn, const char *name, int add, refcount_t *towrite, refcount_t *tooverwrite); /* * Create an attribute with the given name and value. * * If an attribute with the given name already exists, the call will * fail and return EEXIST. */ int zap_add(objset_t *ds, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int zap_add_by_dnode(dnode_t *dn, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. If an * attribute with the given name does not exist, it will be created. If * an attribute with the given name already exists, the previous value * will be overwritten. The integer_size may be different from the * existing attribute's integer size, in which case the attribute's * integer size will be updated to the new value. */ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified * attribute. * * If the requested attribute does not exist, the call will fail and * return ENOENT. */ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers); int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, uint64_t *num_integers); /* * Remove the specified attribute. * * If the specified attribute does not exist, the call will fail and * return ENOENT. */ int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx); int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap * object. */ int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); /* * Returns (in name) the name of the entry whose (value & mask) * (za_first_integer) is value, or ENOENT if not found. The string * pointed to by name must be at least 256 bytes long. If mask==0, the * match must be exact (ie, same as mask=-1ULL). */ int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name); /* * Transfer all the entries from fromobj into intoobj. Only works on * int_size=8 num_integers=1 values. Fails if there are any duplicated * entries. */ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); /* Same as zap_join, but set the values to 'value'. */ int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, uint64_t value, dmu_tx_t *tx); /* Same as zap_join, but add together any duplicated entries. */ int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); /* * Manipulate entries where the name + value are the "same" (the name is * a stringified version of the value). */ int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, dmu_tx_t *tx); /* Here the key is an int and the value is a different int. */ int zap_add_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx); int zap_update_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx); int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep); int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx); struct zap; struct zap_leaf; typedef struct zap_cursor { /* This structure is opaque! */ objset_t *zc_objset; struct zap *zc_zap; struct zap_leaf *zc_leaf; uint64_t zc_zapobj; uint64_t zc_serialized; uint64_t zc_hash; uint32_t zc_cd; } zap_cursor_t; typedef struct { int za_integer_length; /* * za_normalization_conflict will be set if there are additional * entries with this normalized form (eg, "foo" and "Foo"). */ boolean_t za_normalization_conflict; uint64_t za_num_integers; uint64_t za_first_integer; /* no sign extension for <8byte ints */ char za_name[ZAP_MAXNAMELEN]; } zap_attribute_t; /* * The interface for listing all the attributes of a zapobj can be * thought of as cursor moving down a list of the attributes one by * one. The cookie returned by the zap_cursor_serialize routine is * persistent across system calls (and across reboot, even). */ /* * Initialize a zap cursor, pointing to the "first" attribute of the * zapobj. You must _fini the cursor when you are done with it. */ void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); void zap_cursor_fini(zap_cursor_t *zc); /* * Get the attribute currently pointed to by the cursor. Returns * ENOENT if at the end of the attributes. */ int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); /* * Advance the cursor to the next attribute. */ void zap_cursor_advance(zap_cursor_t *zc); /* * Get a persistent cookie pointing to the current position of the zap * cursor. The low 4 bits in the cookie are always zero, and thus can * be used as to differentiate a serialized cookie from a different type * of value. The cookie will be less than 2^32 as long as there are * fewer than 2^22 (4.2 million) entries in the zap object. */ uint64_t zap_cursor_serialize(zap_cursor_t *zc); /* * Advance the cursor to the attribute having the given key. */ int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt); /* * Initialize a zap cursor pointing to the position recorded by * zap_cursor_serialize (in the "serialized" argument). You can also * use a "serialized" argument of 0 to start at the beginning of the * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to * zap_cursor_init(...).) */ void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj, uint64_t serialized); #define ZAP_HISTOGRAM_SIZE 10 typedef struct zap_stats { /* * Size of the pointer table (in number of entries). * This is always a power of 2, or zero if it's a microzap. * In general, it should be considerably greater than zs_num_leafs. */ uint64_t zs_ptrtbl_len; uint64_t zs_blocksize; /* size of zap blocks */ /* * The number of blocks used. Note that some blocks may be * wasted because old ptrtbl's and large name/value blocks are * not reused. (Although their space is reclaimed, we don't * reuse those offsets in the object.) */ uint64_t zs_num_blocks; /* * Pointer table values from zap_ptrtbl in the zap_phys_t */ uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */ uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */ uint64_t zs_ptrtbl_zt_blk; /* starting block number */ uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */ uint64_t zs_ptrtbl_zt_shift; /* bits to index it */ /* * Values of the other members of the zap_phys_t */ uint64_t zs_block_type; /* ZBT_HEADER */ uint64_t zs_magic; /* ZAP_MAGIC */ uint64_t zs_num_leafs; /* The number of leaf blocks */ uint64_t zs_num_entries; /* The number of zap entries */ uint64_t zs_salt; /* salt to stir into hash function */ /* * Histograms. For all histograms, the last index * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater * than what can be represented. For example * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number * of leafs with more than 45 entries. */ /* * zs_leafs_with_n_pointers[n] is the number of leafs with * 2^n pointers to it. */ uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; /* * zs_leafs_with_n_entries[n] is the number of leafs with * [n*5, (n+1)*5) entries. In the current implementation, there * can be at most 55 entries in any block, but there may be * fewer if the name or value is large, or the block is not * completely full. */ uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; /* * zs_leafs_n_tenths_full[n] is the number of leafs whose * fullness is in the range [n/10, (n+1)/10). */ uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; /* * zs_entries_using_n_chunks[n] is the number of entries which * consume n 24-byte chunks. (Note, large names/values only use * one chunk, but contribute to zs_num_blocks_large.) */ uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; /* * zs_buckets_with_n_entries[n] is the number of buckets (each * leaf has 64 buckets) with n entries. * zs_buckets_with_n_entries[1] should be very close to * zs_num_entries. */ uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; } zap_stats_t; /* * Get statistics about a ZAP object. Note: you need to be aware of the * internal implementation of the ZAP to correctly interpret some of the * statistics. This interface shouldn't be relied on unless you really * know what you're doing. */ int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); #ifdef __cplusplus } #endif #endif /* _SYS_ZAP_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h (revision 351077) @@ -1,146 +1,145 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H #ifdef __cplusplus extern "C" { #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef illumos #include #endif #include #include -#include #include #include #include #include #include #include #include #include #define boot_ncpus (mp_ncpus) #define CPU_SEQID (curcpu) #define tsd_create(keyp, destructor) do { \ *(keyp) = osd_thread_register((destructor)); \ KASSERT(*(keyp) > 0, ("cannot register OSD")); \ } while (0) #define tsd_destroy(keyp) osd_thread_deregister(*(keyp)) #define tsd_get(key) osd_thread_get(curthread, (key)) #define tsd_set(key, value) osd_thread_set(curthread, (key), (value)) #ifdef __cplusplus } #endif extern int zfs_debug_level; extern struct mtx zfs_debug_mtx; #define ZFS_LOG(lvl, ...) do { \ if (((lvl) & 0xff) <= zfs_debug_level) { \ mtx_lock(&zfs_debug_mtx); \ printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \ printf(__VA_ARGS__); \ printf("\n"); \ if ((lvl) & 0x100) \ kdb_backtrace(); \ mtx_unlock(&zfs_debug_mtx); \ } \ } while (0) #define sys_shutdown rebooting #define noinline __attribute__((noinline)) #define likely(x) __builtin_expect((x), 1) #endif /* _SYS_ZFS_CONTEXT_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h (revision 351077) @@ -1,466 +1,466 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2014 Integros [integros.com] */ #ifndef _SYS_ZFS_IOCTL_H #define _SYS_ZFS_IOCTL_H #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* _KERNEL */ #ifdef __cplusplus extern "C" { #endif /* * The structures in this file are passed between userland and the * kernel. Userland may be running a 32-bit process, while the kernel * is 64-bit. Therefore, these structures need to compile the same in * 32-bit and 64-bit. This means not using type "long", and adding * explicit padding so that the 32-bit structure will not be packed more * tightly than the 64-bit structure (which requires 64-bit alignment). */ /* * Property values for snapdir */ #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 /* * Field manipulation macros for the drr_versioninfo field of the * send stream header. */ /* * Header types for zfs send streams. */ typedef enum drr_headertype { DMU_SUBSTREAM = 0x1, DMU_COMPOUNDSTREAM = 0x2 } drr_headertype_t; #define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) #define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) #define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) #define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) /* * Feature flags for zfs send streams (flags in drr_versioninfo) */ #define DMU_BACKUP_FEATURE_DEDUP (1 << 0) #define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) #define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) /* flags #3 - #15 are reserved for incompatible closed-source implementations */ #define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) #define DMU_BACKUP_FEATURE_LZ4 (1 << 17) /* flag #18 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) #define DMU_BACKUP_FEATURE_RESUMING (1 << 20) /* flag #21 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) #define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) -/* flag #24 is reserved for the raw send feature */ +/* flag #24 is reserved for the raw send (encryption) feature */ /* flag #25 is reserved for the ZSTD compression feature */ /* * Mask of all supported backup features */ #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | \ DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE | \ DMU_BACKUP_FEATURE_COMPRESSED) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) typedef enum dmu_send_resume_token_version { ZFS_SEND_RESUME_TOKEN_VERSION = 1 } dmu_send_resume_token_version_t; /* * The drr_versioninfo field of the dmu_replay_record has the * following layout: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * | reserved | feature-flags |C|S| + * | reserved | feature-flags |C|S| * +-------+-------+-------+-------+-------+-------+-------+-------+ * * The low order two bits indicate the header type: SUBSTREAM (0x1) * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: * this field used to be a version number, where the two version types * were 1 and 2. Using two bits for this allows earlier versions of * the code to be able to recognize send streams that don't use any * of the features indicated by feature flags. */ #define DMU_BACKUP_MAGIC 0x2F5bacbacULL /* * Send stream flags. Bits 24-31 are reserved for vendor-specific * implementations and should not be used. */ #define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CI_DATA (1<<1) /* * This send stream, if it is a full send, includes the FREE and FREEOBJECT * records that are created by the sending process. This means that the send * stream can be received as a clone, even though it is not an incremental. * This is not implemented as a feature flag, because the receiving side does * not need to have implemented it to receive this stream; it is fully backwards * compatible. We need a flag, though, because full send streams without it * cannot necessarily be received as a clone correctly. */ #define DRR_FLAG_FREERECORDS (1<<2) /* * flags in the drr_checksumflags field in the DRR_WRITE and * DRR_WRITE_BYREF blocks */ #define DRR_CHECKSUM_DEDUP (1<<0) #define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) /* deal with compressed drr_write replay records */ #define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0) #define DRR_WRITE_PAYLOAD_SIZE(drrw) \ (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \ (drrw)->drr_logical_size) typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { struct drr_begin { uint64_t drr_magic; uint64_t drr_versioninfo; /* was drr_version */ uint64_t drr_creation_time; dmu_objset_type_t drr_type; uint32_t drr_flags; uint64_t drr_toguid; uint64_t drr_fromguid; char drr_toname[MAXNAMELEN]; } drr_begin; struct drr_end { zio_cksum_t drr_checksum; uint64_t drr_toguid; } drr_end; struct drr_object { uint64_t drr_object; dmu_object_type_t drr_type; dmu_object_type_t drr_bonustype; uint32_t drr_blksz; uint32_t drr_bonuslen; uint8_t drr_checksumtype; uint8_t drr_compress; uint8_t drr_dn_slots; uint8_t drr_pad[5]; uint64_t drr_toguid; /* bonus content follows */ } drr_object; struct drr_freeobjects { uint64_t drr_firstobj; uint64_t drr_numobjs; uint64_t drr_toguid; } drr_freeobjects; struct drr_write { uint64_t drr_object; dmu_object_type_t drr_type; uint32_t drr_pad; uint64_t drr_offset; uint64_t drr_logical_size; uint64_t drr_toguid; uint8_t drr_checksumtype; uint8_t drr_checksumflags; uint8_t drr_compressiontype; uint8_t drr_pad2[5]; /* deduplication key */ ddt_key_t drr_key; /* only nonzero if drr_compressiontype is not 0 */ uint64_t drr_compressed_size; /* content follows */ } drr_write; struct drr_free { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; } drr_free; struct drr_write_byref { /* where to put the data */ uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; /* where to find the prior copy of the data */ uint64_t drr_refguid; uint64_t drr_refobject; uint64_t drr_refoffset; /* properties of the data */ uint8_t drr_checksumtype; uint8_t drr_checksumflags; uint8_t drr_pad2[6]; ddt_key_t drr_key; /* deduplication key */ } drr_write_byref; struct drr_spill { uint64_t drr_object; uint64_t drr_length; uint64_t drr_toguid; uint64_t drr_pad[4]; /* needed for crypto */ /* spill data follows */ } drr_spill; struct drr_write_embedded { uint64_t drr_object; uint64_t drr_offset; /* logical length, should equal blocksize */ uint64_t drr_length; uint64_t drr_toguid; uint8_t drr_compression; uint8_t drr_etype; uint8_t drr_pad[6]; uint32_t drr_lsize; /* uncompressed size of payload */ uint32_t drr_psize; /* compr. (real) size of payload */ /* (possibly compressed) content follows */ } drr_write_embedded; /* * Nore: drr_checksum is overlaid with all record types * except DRR_BEGIN. Therefore its (non-pad) members * must not overlap with members from the other structs. * We accomplish this by putting its members at the very * end of the struct. */ struct drr_checksum { uint64_t drr_pad[34]; /* * fletcher-4 checksum of everything preceding the * checksum. */ zio_cksum_t drr_checksum; } drr_checksum; } drr_u; } dmu_replay_record_t; /* diff record range types */ typedef enum diff_type { DDR_NONE = 0x1, DDR_INUSE = 0x2, DDR_FREE = 0x4 } diff_type_t; /* * The diff reports back ranges of free or in-use objects. */ typedef struct dmu_diff_record { uint64_t ddr_type; uint64_t ddr_first; uint64_t ddr_last; } dmu_diff_record_t; typedef struct zinject_record { uint64_t zi_objset; uint64_t zi_object; uint64_t zi_start; uint64_t zi_end; uint64_t zi_guid; uint32_t zi_level; uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; uint32_t zi_failfast; char zi_func[MAXNAMELEN]; uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; uint64_t zi_nlanes; uint32_t zi_cmd; uint32_t zi_pad; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 typedef enum zinject_type { ZINJECT_UNINITIALIZED, ZINJECT_DATA_FAULT, ZINJECT_DEVICE_FAULT, ZINJECT_LABEL_FAULT, ZINJECT_IGNORED_WRITES, ZINJECT_PANIC, ZINJECT_DELAY_IO, } zinject_type_t; typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; uint64_t z_sharetype; /* 0 = share, 1 = unshare */ uint64_t z_sharemax; /* max length of share string */ } zfs_share_t; /* * ZFS file systems may behave the usual, POSIX-compliant way, where * name lookups are case-sensitive. They may also be set up so that * all the name lookups are case-insensitive, or so that only some * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. */ typedef enum zfs_case { ZFS_CASE_SENSITIVE, ZFS_CASE_INSENSITIVE, ZFS_CASE_MIXED } zfs_case_t; /* * Note: this struct must have the same layout in 32-bit and 64-bit, so * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit * kernel. Therefore, we add padding to it so that no "hidden" padding * is automatically added on 64-bit (but not on 32-bit). */ typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; /* name of pool or dataset */ uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ int zc_pad2; /* * The following members are for legacy ioctls which haven't been * converted to the new method. */ uint64_t zc_history; /* really (char *) */ char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; dmu_replay_record_t zc_begin_record; zinject_record_t zc_inject_record; uint32_t zc_defer_destroy; uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; uint8_t zc_pad3[3]; boolean_t zc_resumable; uint32_t zc_pad4; uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; } zfs_cmd_t; typedef struct zfs_useracct { char zu_domain[256]; uid_t zu_rid; uint32_t zu_pad; uint64_t zu_space; } zfs_useracct_t; #define ZFSDEV_MAX_MINOR (1 << 16) #define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) #define ZPOOL_EXPORT_AFTER_SPLIT 0x1 #ifdef _KERNEL struct objset; struct zfsvfs; typedef struct zfs_creat { nvlist_t *zct_zplprops; nvlist_t *zct_props; } zfs_creat_t; extern int zfs_secpolicy_snapshot_perms(const char *, cred_t *); extern int zfs_secpolicy_rename_perms(const char *, const char *, cred_t *); extern int zfs_secpolicy_destroy_perms(const char *, cred_t *); extern int zfs_busy(void); extern void zfs_unmount_snap(const char *); extern void zfs_destroy_unmount_origin(const char *); #ifdef illumos extern int getzfsvfs_impl(struct objset *, struct zfsvfs **); #else extern int getzfsvfs_impl(struct objset *, vfs_t **); #endif extern int getzfsvfs(const char *, struct zfsvfs **); /* * ZFS minor numbers can refer to either a control device instance or * a zvol. Depending on the value of zss_type, zss_data points to either * a zvol_state_t or a zfs_onexit_t. */ enum zfs_soft_state_type { ZSST_ZVOL, ZSST_CTLDEV }; typedef struct zfs_soft_state { enum zfs_soft_state_type zss_type; void *zss_data; } zfs_soft_state_t; extern void *zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which); extern minor_t zfsdev_minor_alloc(void); extern void *zfsdev_state; #endif /* _KERNEL */ #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_IOCTL_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h (revision 351077) @@ -1,461 +1,461 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_ZIL_H #define _SYS_ZIL_H #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct dsl_pool; struct dsl_dataset; struct lwb; /* * Intent log format: * * Each objset has its own intent log. The log header (zil_header_t) * for objset N's intent log is kept in the Nth object of the SPA's * intent_log objset. The log header points to a chain of log blocks, * each of which contains log records (i.e., transactions) followed by * a log block trailer (zil_trailer_t). The format of a log record * depends on the record (or transaction) type, but all records begin * with a common structure that defines the type, length, and txg. */ /* * Intent log header - this on disk structure holds fields to manage * the log. All fields are 64 bit to easily handle cross architectures. */ typedef struct zil_header { uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ uint64_t zh_replay_seq; /* highest replayed sequence number */ blkptr_t zh_log; /* log chain */ uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */ uint64_t zh_flags; /* header flags */ uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */ uint64_t zh_pad[3]; } zil_header_t; /* * zh_flags bit settings */ #define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ #define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */ /* * Log block chaining. * * Log blocks are chained together. Originally they were chained at the * end of the block. For performance reasons the chain was moved to the * beginning of the block which allows writes for only the data being used. * The older position is supported for backwards compatability. * * The zio_eck_t contains a zec_cksum which for the intent log is * the sequence number of this log block. A seq of 0 is invalid. * The zec_cksum is checked by the SPA against the sequence * number passed in the blk_cksum field of the blkptr_t */ typedef struct zil_chain { uint64_t zc_pad; blkptr_t zc_next_blk; /* next block in chain */ uint64_t zc_nused; /* bytes in log block used */ zio_eck_t zc_eck; /* block trailer */ } zil_chain_t; #define ZIL_MIN_BLKSZ 4096ULL /* * ziltest is by and large an ugly hack, but very useful in * checking replay without tedious work. * When running ziltest we want to keep all itx's and so maintain * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG * We subtract TXG_CONCURRENT_STATES to allow for common code. */ #define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES) /* * The words of a log block checksum. */ #define ZIL_ZC_GUID_0 0 #define ZIL_ZC_GUID_1 1 #define ZIL_ZC_OBJSET 2 #define ZIL_ZC_SEQ 3 typedef enum zil_create { Z_FILE, Z_DIR, Z_XATTRDIR, } zil_create_t; /* * size of xvattr log section. * its composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps * for create time and a single 64 bit integer for all of the attributes, * and 4 64 bit integers (32 bytes) for the scanstamp. * */ #define ZIL_XVAT_SIZE(mapsize) \ sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \ (sizeof (uint64_t) * 7) /* * Size of ACL in log. The ACE data is padded out to properly align * on 8 byte boundary. */ #define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t))) /* * Intent log transaction types and record structures */ #define TX_COMMIT 0 /* Commit marker (no on-disk state) */ #define TX_CREATE 1 /* Create file */ #define TX_MKDIR 2 /* Make directory */ #define TX_MKXATTR 3 /* Make XATTR directory */ #define TX_SYMLINK 4 /* Create symbolic link to a file */ #define TX_REMOVE 5 /* Remove file */ #define TX_RMDIR 6 /* Remove directory */ #define TX_LINK 7 /* Create hard link to a file */ #define TX_RENAME 8 /* Rename a file */ #define TX_WRITE 9 /* File write */ #define TX_TRUNCATE 10 /* Truncate a file */ #define TX_SETATTR 11 /* Set file attributes */ #define TX_ACL_V0 12 /* Set old formatted ACL */ #define TX_ACL 13 /* Set ACL */ #define TX_CREATE_ACL 14 /* create with ACL */ #define TX_CREATE_ATTR 15 /* create + attrs */ -#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ +#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ #define TX_MKDIR_ACL 17 /* mkdir with ACL */ #define TX_MKDIR_ATTR 18 /* mkdir with attr */ #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ #define TX_WRITE2 20 /* dmu_sync EALREADY write */ #define TX_MAX_TYPE 21 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename * may have the following bit set, indicating the original request * specified case-insensitive handling of names. */ #define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ /* * Transactions for write, truncate, setattr, acl_v0, and acl can be logged * out of order. For convenience in the code, all such records must have * lr_foid at the same offset. */ #define TX_OOO(txtype) \ ((txtype) == TX_WRITE || \ (txtype) == TX_TRUNCATE || \ (txtype) == TX_SETATTR || \ (txtype) == TX_ACL_V0 || \ (txtype) == TX_ACL || \ (txtype) == TX_WRITE2) /* * The number of dnode slots consumed by the object is stored in the 8 * unused upper bits of the object ID. We subtract 1 from the value * stored on disk for compatibility with implementations that don't * support large dnodes. The slot count for a single-slot dnode will * contain 0 for those bits to preserve the log record format for * "small" dnodes. */ #define LR_FOID_GET_SLOTS(oid) (BF64_GET((oid), 56, 8) + 1) #define LR_FOID_SET_SLOTS(oid, x) BF64_SET((oid), 56, 8, (x) - 1) #define LR_FOID_GET_OBJ(oid) BF64_GET((oid), 0, DN_MAX_OBJECT_SHIFT) #define LR_FOID_SET_OBJ(oid, x) BF64_SET((oid), 0, DN_MAX_OBJECT_SHIFT, (x)) /* * Format of log records. * The fields are carefully defined to allow them to be aligned * and sized the same on sparc & intel architectures. * Each log record has a common structure at the beginning. * * The log record on disk (lrc_seq) holds the sequence number of all log * records which is used to ensure we don't replay the same record. */ typedef struct { /* common log record header */ uint64_t lrc_txtype; /* intent log transaction type */ uint64_t lrc_reclen; /* transaction record length */ uint64_t lrc_txg; /* dmu transaction group number */ uint64_t lrc_seq; /* see comment above */ } lr_t; /* * Common start of all out-of-order record types (TX_OOO() above). */ typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_foid; /* object id */ } lr_ooo_t; /* * Handle option extended vattr attributes. * * Whenever new attributes are added the version number * will need to be updated as will code in * zfs_log.c and zfs_replay.c */ typedef struct { uint32_t lr_attr_masksize; /* number of elements in array */ uint32_t lr_attr_bitmap; /* First entry of array */ /* remainder of array and any additional fields */ } lr_attr_t; /* * log record for creates without optional ACL. * This log record does support optional xvattr_t attributes. */ typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_doid; /* object id of directory */ uint64_t lr_foid; /* object id of created file object */ uint64_t lr_mode; /* mode of object */ uint64_t lr_uid; /* uid of object */ uint64_t lr_gid; /* gid of object */ uint64_t lr_gen; /* generation (txg of creation) */ uint64_t lr_crtime[2]; /* creation time */ uint64_t lr_rdev; /* rdev of object to create */ /* name of object to create follows this */ /* for symlinks, link content follows name */ /* for creates with xvattr data, the name follows the xvattr info */ } lr_create_t; /* * FUID ACL record will be an array of ACEs from the original ACL. * If this array includes ephemeral IDs, the record will also include * an array of log-specific FUIDs to replace the ephemeral IDs. * Only one copy of each unique domain will be present, so the log-specific * FUIDs will use an index into a compressed domain table. On replay this * information will be used to construct real FUIDs (and bypass idmap, * since it may not be available). */ /* * Log record for creates with optional ACL * This log record is also used for recording any FUID * information needed for replaying the create. If the * file doesn't have any actual ACEs then the lr_aclcnt * would be zero. * * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's. * If create is also setting xvattr's, then acl data follows xvattr. * If ACE FUIDs are needed then they will follow the xvattr_t. Following * the FUIDs will be the domain table information. The FUIDs for the owner * and group will be in lr_create. Name follows ACL data. */ typedef struct { lr_create_t lr_create; /* common create portion */ uint64_t lr_aclcnt; /* number of ACEs in ACL */ uint64_t lr_domcnt; /* number of unique domains */ uint64_t lr_fuidcnt; /* number of real fuids */ uint64_t lr_acl_bytes; /* number of bytes in ACL */ uint64_t lr_acl_flags; /* ACL flags */ } lr_acl_create_t; typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_doid; /* obj id of directory */ /* name of object to remove follows this */ } lr_remove_t; typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_doid; /* obj id of directory */ uint64_t lr_link_obj; /* obj id of link */ /* name of object to link follows this */ } lr_link_t; typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_sdoid; /* obj id of source directory */ uint64_t lr_tdoid; /* obj id of target directory */ /* 2 strings: names of source and destination follow this */ } lr_rename_t; typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_foid; /* file object to write */ uint64_t lr_offset; /* offset to write to */ uint64_t lr_length; /* user data length to write */ uint64_t lr_blkoff; /* no longer used */ blkptr_t lr_blkptr; /* spa block pointer for replay */ /* write data will follow for small writes */ } lr_write_t; typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_foid; /* object id of file to truncate */ uint64_t lr_offset; /* offset to truncate from */ uint64_t lr_length; /* length to truncate */ } lr_truncate_t; typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_foid; /* file object to change attributes */ uint64_t lr_mask; /* mask of attributes to set */ uint64_t lr_mode; /* mode to set */ uint64_t lr_uid; /* uid to set */ uint64_t lr_gid; /* gid to set */ uint64_t lr_size; /* size to set */ uint64_t lr_atime[2]; /* access time */ uint64_t lr_mtime[2]; /* modification time */ /* optional attribute lr_attr_t may be here */ } lr_setattr_t; typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_foid; /* obj id of file */ uint64_t lr_aclcnt; /* number of acl entries */ /* lr_aclcnt number of ace_t entries follow this */ } lr_acl_v0_t; typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_foid; /* obj id of file */ uint64_t lr_aclcnt; /* number of ACEs in ACL */ uint64_t lr_domcnt; /* number of unique domains */ uint64_t lr_fuidcnt; /* number of real fuids */ uint64_t lr_acl_bytes; /* number of bytes in ACL */ uint64_t lr_acl_flags; /* ACL flags */ /* lr_acl_bytes number of variable sized ace's follows */ } lr_acl_t; /* * ZIL structure definitions, interface function prototype and globals. */ /* * Writes are handled in three different ways: * * WR_INDIRECT: * In this mode, if we need to commit the write later, then the block * is immediately written into the file system (using dmu_sync), * and a pointer to the block is put into the log record. * When the txg commits the block is linked in. * This saves additionally writing the data into the log record. * There are a few requirements for this to occur: * - write is greater than zfs/zvol_immediate_write_sz * - not using slogs (as slogs are assumed to always be faster * than writing into the main pool) * - the write occupies only one block * WR_COPIED: * If we know we'll immediately be committing the * transaction (FSYNC or FDSYNC), the we allocate a larger * log record here for the data and copy the data in. * WR_NEED_COPY: * Otherwise we don't allocate a buffer, and *if* we need to * flush the write later then a buffer is allocated and * we retrieve the data using the dmu. */ typedef enum { WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ /* and put blkptr in log, rather than actual data) */ WR_COPIED, /* immediate - data is copied into lr_write_t */ WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ WR_NUM_STATES /* number of states */ } itx_wr_state_t; typedef struct itx { list_node_t itx_node; /* linkage on zl_itx_list */ void *itx_private; /* type-specific opaque data */ itx_wr_state_t itx_wr_state; /* write state */ uint8_t itx_sync; /* synchronous transaction */ uint64_t itx_oid; /* object id */ lr_t itx_lr; /* common part of log record */ /* followed by type-specific part of lr_xx_t and its immediate data */ } itx_t; typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t txg); typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg); typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, struct lwb *lwb, zio_t *zio); extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); extern void zil_init(void); extern void zil_fini(void); extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); extern void zil_free(zilog_t *zilog); extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); extern void zil_close(zilog_t *zilog); extern void zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]); extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); extern void zil_itx_destroy(itx_t *itx); extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid); extern void zil_commit(zilog_t *zilog, uint64_t oid); extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); extern int zil_reset(const char *osname, void *txarg); extern int zil_claim(struct dsl_pool *dp, struct dsl_dataset *ds, void *txarg); -extern int zil_check_log_chain(struct dsl_pool *dp, +extern int zil_check_log_chain(struct dsl_pool *dp, struct dsl_dataset *ds, void *tx); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); extern int zil_suspend(const char *osname, void **cookiep); extern void zil_resume(void *cookie); extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp); extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg); extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); extern int zil_replay_disable; #ifdef __cplusplus } #endif #endif /* _SYS_ZIL_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c (revision 351077) @@ -1,1334 +1,1334 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* * This file contains the top half of the zfs directory structure * implementation. The bottom half is in zap_leaf.c. * * The zdir is an extendable hash data structure. There is a table of * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are * each a constant size and hold a variable number of directory entries. * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. * * The pointer table holds a power of 2 number of pointers. * (1<zd_data->zd_phys->zd_prefix_len). The bucket pointed to * by the pointer at index i in the table holds entries whose hash value * has a zd_prefix_len - bit prefix */ #include #include #include #include #include #include #include #include #include int fzap_default_block_shift = 14; /* 16k blocksize */ extern inline zap_phys_t *zap_f_phys(zap_t *zap); static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); void fzap_byteswap(void *vbuf, size_t size) { uint64_t block_type = *(uint64_t *)vbuf; if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) zap_leaf_byteswap(vbuf, size); else { /* it's a ptrtbl block */ byteswap_uint64_array(vbuf, size); } } void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) { ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; zap->zap_dbu.dbu_evict_func_async = NULL; mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; zap_phys_t *zp = zap_f_phys(zap); /* * explicitly zero it since it might be coming from an * initialized microzap */ bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); zp->zap_block_type = ZBT_HEADER; zp->zap_magic = ZAP_MAGIC; zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); zp->zap_freeblk = 2; /* block 1 will be the first leaf */ zp->zap_num_leafs = 1; zp->zap_num_entries = 0; zp->zap_salt = zap->zap_salt; zp->zap_normflags = zap->zap_normflags; zp->zap_flags = flags; /* block 1 will be the first leaf */ for (int i = 0; i < (1<zap_ptrtbl.zt_shift); i++) ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; /* * set up block 1 - the first leaf */ dmu_buf_t *db; VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, 1<l_dbuf = db; zap_leaf_init(l, zp->zap_normflags != 0); kmem_free(l, sizeof (zap_leaf_t)); dmu_buf_rele(db, FTAG); } static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) { if (RW_WRITE_HELD(&zap->zap_rwlock)) return (1); if (rw_tryupgrade(&zap->zap_rwlock)) { dmu_buf_will_dirty(zap->zap_dbuf, tx); return (1); } return (0); } /* * Generic routines for dealing with the pointer & cookie tables. */ static int zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), dmu_tx_t *tx) { uint64_t newblk; int bs = FZAP_BLOCK_SHIFT(zap); int hepb = 1<<(bs-4); /* hepb = half the number of entries in a block */ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); ASSERT(tbl->zt_numblks > 0); if (tbl->zt_nextblk != 0) { newblk = tbl->zt_nextblk; } else { newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); dmu_prefetch(zap->zap_objset, zap->zap_object, 0, tbl->zt_blk << bs, tbl->zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); } /* * Copy the ptrtbl from the old to new location. */ uint64_t b = tbl->zt_blks_copied; dmu_buf_t *db_old; int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ dmu_buf_t *db_new; VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); dmu_buf_rele(db_old, FTAG); tbl->zt_blks_copied++; dprintf("copied block %llu of %llu\n", tbl->zt_blks_copied, tbl->zt_numblks); if (tbl->zt_blks_copied == tbl->zt_numblks) { (void) dmu_free_range(zap->zap_objset, zap->zap_object, tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); tbl->zt_blk = newblk; tbl->zt_numblks *= 2; tbl->zt_shift++; tbl->zt_nextblk = 0; tbl->zt_blks_copied = 0; dprintf("finished; numblocks now %llu (%lluk entries)\n", tbl->zt_numblks, 1<<(tbl->zt_shift-10)); } return (0); } static int zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, dmu_tx_t *tx) { int bs = FZAP_BLOCK_SHIFT(zap); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); dprintf("storing %llx at index %llx\n", val, idx); uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); dmu_buf_will_dirty(db, tx); if (tbl->zt_nextblk != 0) { uint64_t idx2 = idx * 2; uint64_t blk2 = idx2 >> (bs-3); uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err != 0) { dmu_buf_rele(db, FTAG); return (err); } dmu_buf_will_dirty(db2, tx); ((uint64_t *)db2->db_data)[off2] = val; ((uint64_t *)db2->db_data)[off2+1] = val; dmu_buf_rele(db2, FTAG); } ((uint64_t *)db->db_data)[off] = val; dmu_buf_rele(db, FTAG); return (0); } static int zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) { int bs = FZAP_BLOCK_SHIFT(zap); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); /* * Note: this is equivalent to dmu_buf_hold(), but we use * _dnode_enter / _by_dnode because it's faster because we don't * have to hold the dnode. */ dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); dmu_buf_t *db; int err = dmu_buf_hold_by_dnode(dn, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; dmu_buf_rele(db, FTAG); if (tbl->zt_nextblk != 0) { /* * read the nextblk for the sake of i/o error checking, * so that zap_table_load() will catch errors for * zap_table_store. */ blk = (idx*2) >> (bs-3); dn = dmu_buf_dnode_enter(zap->zap_dbuf); err = dmu_buf_hold_by_dnode(dn, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); dmu_buf_dnode_exit(zap->zap_dbuf); if (err == 0) dmu_buf_rele(db, FTAG); } return (err); } /* * Routines for growing the ptrtbl. */ static void zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) { for (int i = 0; i < n; i++) { uint64_t lb = src[i]; dst[2 * i + 0] = lb; dst[2 * i + 1] = lb; } } static int zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) { /* * The pointer table should never use more hash bits than we * have (otherwise we'd be using useless zero bits to index it). * If we are within 2 bits of running out, stop growing, since * this is already an aberrant condition. */ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) return (SET_ERROR(ENOSPC)); if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* * We are outgrowing the "embedded" ptrtbl (the one * stored in the header block). Give it its own entire * block, which will double the size of the ptrtbl. */ ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); uint64_t newblk = zap_allocate_blocks(zap, 1); dmu_buf_t *db_new; int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err != 0) return (err); dmu_buf_will_dirty(db_new, tx); zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); dmu_buf_rele(db_new, FTAG); zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; zap_f_phys(zap)->zap_ptrtbl.zt_shift++; ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << (FZAP_BLOCK_SHIFT(zap)-3)); return (0); } else { return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, zap_ptrtbl_transfer, tx)); } } static void zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) { dmu_buf_will_dirty(zap->zap_dbuf, tx); mutex_enter(&zap->zap_f.zap_num_entries_mtx); ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); zap_f_phys(zap)->zap_num_entries += delta; mutex_exit(&zap->zap_f.zap_num_entries_mtx); } static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks) { ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); uint64_t newblk = zap_f_phys(zap)->zap_freeblk; zap_f_phys(zap)->zap_freeblk += nblocks; return (newblk); } static void zap_leaf_evict_sync(void *dbu) { zap_leaf_t *l = dbu; rw_destroy(&l->l_rwlock); kmem_free(l, sizeof (zap_leaf_t)); } static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); rw_init(&l->l_rwlock, 0, 0, 0); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); zap_f_phys(zap)->zap_num_leafs++; return (l); } int fzap_count(zap_t *zap, uint64_t *count) { ASSERT(!zap->zap_ismicro); mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ *count = zap_f_phys(zap)->zap_num_entries; mutex_exit(&zap->zap_f.zap_num_entries_mtx); return (0); } /* * Routines for obtaining zap_leaf_t's */ void zap_put_leaf(zap_leaf_t *l) { rw_exit(&l->l_rwlock); dmu_buf_rele(l->l_dbuf, NULL); } static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { ASSERT(blkid != 0); zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); rw_init(&l->l_rwlock, 0, 0, 0); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = blkid; l->l_bs = highbit64(db->db_size) - 1; l->l_dbuf = db; dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); rw_exit(&l->l_rwlock); if (winner != NULL) { /* someone else set it first */ zap_leaf_evict_sync(&l->l_dbu); l = winner; } /* * lhr_pad was previously used for the next leaf in the leaf * chain. There should be no chained leafs (as we have removed * support for them). */ ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); /* * There should be more hash entries than there can be * chunks to put in the hash table */ ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); /* The chunks should begin at the end of the hash table */ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); /* The chunks should end at the end of the block */ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); return (l); } static int zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { dmu_buf_t *db; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); int bs = FZAP_BLOCK_SHIFT(zap); dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); int err = dmu_buf_hold_by_dnode(dn, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); ASSERT3U(db->db_object, ==, zap->zap_object); ASSERT3U(db->db_offset, ==, blkid << bs); ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); zap_leaf_t *l = dmu_buf_get_user(db); if (l == NULL) l = zap_open_leaf(blkid, db); rw_enter(&l->l_rwlock, lt); /* * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, * causing ASSERT below to fail. */ if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); *lp = l; return (0); } static int zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) { ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { ASSERT3U(idx, <, (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); return (0); } else { return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, valp)); } } static int zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) { ASSERT(tx != NULL); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; return (0); } else { return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, blk, tx)); } } static int zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { uint64_t blk; ASSERT(zap->zap_dbuf == NULL || zap_f_phys(zap) == zap->zap_dbuf->db_data); /* Reality check for corrupt zap objects (leaf or header). */ if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { return (SET_ERROR(EIO)); } uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); int err = zap_idx_to_blk(zap, idx, &blk); if (err != 0) return (err); err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); ASSERT(err || ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == zap_leaf_phys(*lp)->l_hdr.lh_prefix); return (err); } static int zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, void *tag, dmu_tx_t *tx, zap_leaf_t **lp) { zap_t *zap = zn->zn_zap; uint64_t hash = zn->zn_hash; int err; int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, zap_leaf_phys(l)->l_hdr.lh_prefix); if (zap_tryupgradedir(zap, tx) == 0 || old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { /* We failed to upgrade, or need to grow the pointer table */ objset_t *os = zap->zap_objset; uint64_t object = zap->zap_object; zap_put_leaf(l); zap_unlockdir(zap, tag); err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; if (err != 0) return (err); ASSERT(!zap->zap_ismicro); while (old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { err = zap_grow_ptrtbl(zap, tx); if (err != 0) return (err); } err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); if (err != 0) return (err); if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { /* it split while our locks were down */ *lp = l; return (0); } } ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, zap_leaf_phys(l)->l_hdr.lh_prefix); int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - (old_prefix_len + 1); uint64_t sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; /* check for i/o errors before doing zap_leaf_split */ for (int i = 0; i < (1ULL << prefix_diff); i++) { uint64_t blk; err = zap_idx_to_blk(zap, sibling + i, &blk); if (err != 0) return (err); ASSERT3U(blk, ==, l->l_blkid); } zap_leaf_t *nl = zap_create_leaf(zap, tx); zap_leaf_split(l, nl, zap->zap_normflags != 0); /* set sibling pointers */ for (int i = 0; i < (1ULL << prefix_diff); i++) { err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); ASSERT0(err); /* we checked for i/o errors above */ } if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { /* we want the sibling */ zap_put_leaf(l); *lp = nl; } else { zap_put_leaf(nl); *lp = l; } return (0); } static void zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, void *tag, dmu_tx_t *tx) { zap_t *zap = zn->zn_zap; int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); zap_put_leaf(l); if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { /* * We are in the middle of growing the pointer table, or * this leaf will soon make us grow it. */ if (zap_tryupgradedir(zap, tx) == 0) { objset_t *os = zap->zap_objset; uint64_t zapobj = zap->zap_object; zap_unlockdir(zap, tag); int err = zap_lockdir(os, zapobj, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; if (err != 0) return; } /* could have finished growing while our locks were down */ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) (void) zap_grow_ptrtbl(zap, tx); } } static int fzap_checkname(zap_name_t *zn) { if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int fzap_checksize(uint64_t integer_size, uint64_t num_integers) { /* Only integer sizes supported by C */ switch (integer_size) { case 1: case 2: case 4: case 8: break; default: return (SET_ERROR(EINVAL)); } if (integer_size * num_integers > ZAP_MAXVALUELEN) return (E2BIG); return (0); } static int fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) { int err = fzap_checkname(zn); if (err != 0) return (err); return (fzap_checksize(integer_size, num_integers)); } /* * Routines for manipulating attributes. */ int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *ncp) { zap_leaf_t *l; zap_entry_handle_t zeh; int err = fzap_checkname(zn); if (err != 0) return (err); err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { if ((err = fzap_checksize(integer_size, num_integers)) != 0) { zap_put_leaf(l); return (err); } err = zap_entry_read(&zeh, integer_size, num_integers, buf); (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); if (ncp) { *ncp = zap_entry_normalization_conflict(&zeh, zn, NULL, zn->zn_zap); } } zap_put_leaf(l); return (err); } int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(!zap->zap_ismicro); ASSERT(fzap_check(zn, integer_size, num_integers) == 0); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { err = SET_ERROR(EEXIST); goto out; } if (err != ENOENT) goto out; err = zap_entry_create(l, zn, cd, integer_size, num_integers, val, &zeh); if (err == 0) { zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } out: if (zap != NULL) zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); return (err); } int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, void *tag, dmu_tx_t *tx) { int err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); return (fzap_add_cd(zn, integer_size, num_integers, val, ZAP_NEED_CD, tag, tx)); } int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; boolean_t create; zap_entry_handle_t zeh; zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: err = zap_leaf_lookup(l, zn, &zeh); create = (err == ENOENT); ASSERT(err == 0 || err == ENOENT); if (create) { err = zap_entry_create(l, zn, ZAP_NEED_CD, integer_size, num_integers, val, &zeh); if (err == 0) zap_increment_num_entries(zap, 1, tx); } else { err = zap_entry_update(&zeh, integer_size, num_integers, val); } if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } if (zap != NULL) zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); return (err); } int fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err != 0) goto out; if (integer_size != 0) *integer_size = zeh.zeh_integer_size; if (num_integers != 0) *num_integers = zeh.zeh_num_integers; out: zap_put_leaf(l); return (err); } int fzap_remove(zap_name_t *zn, dmu_tx_t *tx) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { zap_entry_remove(&zeh); zap_increment_num_entries(zn->zn_zap, -1, tx); } zap_put_leaf(l); return (err); } void fzap_prefetch(zap_name_t *zn) { uint64_t blk; zap_t *zap = zn->zn_zap; uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, zap_f_phys(zap)->zap_ptrtbl.zt_shift); if (zap_idx_to_blk(zap, idx, &blk) != 0) return; int bs = FZAP_BLOCK_SHIFT(zap); dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } /* * Helper functions for consumers. */ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx) { return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx)); } uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx) { - uint64_t new_obj; - + uint64_t new_obj; + VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, dnodesize, tx)) > 0); VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, tx)); return (new_obj); } int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name) { zap_cursor_t zc; int err; if (mask == 0) mask = -1ULL; zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, zapobj); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { if ((za->za_first_integer & mask) == (value & mask)) { (void) strcpy(name, za->za_name); break; } } zap_cursor_fini(&zc); kmem_free(za, sizeof (*za)); return (err); } int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; int err = 0; zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } err = zap_add(os, intoobj, za->za_name, 8, 1, &za->za_first_integer, tx); if (err != 0) break; } zap_cursor_fini(&zc); kmem_free(za, sizeof (*za)); return (err); } int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, uint64_t value, dmu_tx_t *tx) { zap_cursor_t zc; int err = 0; zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } err = zap_add(os, intoobj, za->za_name, 8, 1, &value, tx); if (err != 0) break; } zap_cursor_fini(&zc); kmem_free(za, sizeof (*za)); return (err); } int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; int err = 0; zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { uint64_t delta = 0; if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta); if (err != 0 && err != ENOENT) break; delta += za->za_first_integer; err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx); if (err != 0) break; } zap_cursor_fini(&zc); kmem_free(za, sizeof (*za)); return (err); } int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_add(os, obj, name, 8, 1, &value, tx)); } int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_remove(os, obj, name, tx)); } int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_lookup(os, obj, name, 8, 1, &value)); } int zap_add_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_add(os, obj, name, 8, 1, &value, tx)); } int zap_update_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_update(os, obj, name, 8, 1, &value, tx)); } int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_lookup(os, obj, name, 8, 1, valuep)); } int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx) { uint64_t value = 0; if (delta == 0) return (0); int err = zap_lookup(os, obj, name, 8, 1, &value); if (err != 0 && err != ENOENT) return (err); value += delta; if (value == 0) err = zap_remove(os, obj, name, tx); else err = zap_update(os, obj, name, 8, 1, &value, tx); return (err); } int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_increment(os, obj, name, delta, tx)); } /* * Routines for iterating over the attributes. */ int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) { int err = ENOENT; zap_entry_handle_t zeh; zap_leaf_t *l; /* retrieve the next entry at or after zc_hash/zc_cd */ /* if no entry, return ENOENT */ if (zc->zc_leaf && (ZAP_HASH_IDX(zc->zc_hash, zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; } again: if (zc->zc_leaf == NULL) { err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, &zc->zc_leaf); if (err != 0) return (err); } else { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); } l = zc->zc_leaf; err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); if (err == ENOENT) { uint64_t nocare = (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; zc->zc_cd = 0; if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) { zc->zc_hash = -1ULL; } else { zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; goto again; } } if (err == 0) { zc->zc_hash = zeh.zeh_hash; zc->zc_cd = zeh.zeh_cd; za->za_integer_length = zeh.zeh_integer_size; za->za_num_integers = zeh.zeh_num_integers; if (zeh.zeh_num_integers == 0) { za->za_first_integer = 0; } else { err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); ASSERT(err == 0 || err == EOVERFLOW); } err = zap_entry_read_name(zap, &zeh, sizeof (za->za_name), za->za_name); ASSERT(err == 0); za->za_normalization_conflict = zap_entry_normalization_conflict(&zeh, NULL, za->za_name, zap); } rw_exit(&zc->zc_leaf->l_rwlock); return (err); } static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { uint64_t lastblk = 0; /* * NB: if a leaf has more pointers than an entire ptrtbl block * can hold, then it'll be accounted for more than once, since * we won't have lastblk. */ for (int i = 0; i < len; i++) { zap_leaf_t *l; if (tbl[i] == lastblk) continue; lastblk = tbl[i]; int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); if (err == 0) { zap_leaf_stats(zap, l, zs); zap_put_leaf(l); } } } int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) { int err; zap_leaf_t *l; zap_entry_handle_t zeh; if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err != 0) return (err); zc->zc_leaf = l; zc->zc_hash = zeh.zeh_hash; zc->zc_cd = zeh.zeh_cd; return (err); } void fzap_get_stats(zap_t *zap, zap_stats_t *zs) { int bs = FZAP_BLOCK_SHIFT(zap); zs->zs_blocksize = 1ULL << bs; /* * Set zap_phys_t fields */ zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; zs->zs_block_type = zap_f_phys(zap)->zap_block_type; zs->zs_magic = zap_f_phys(zap)->zap_magic; zs->zs_salt = zap_f_phys(zap)->zap_salt; /* * Set zap_ptrtbl fields */ zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; zs->zs_ptrtbl_blks_copied = zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* the ptrtbl is entirely in the header block. */ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { dmu_prefetch(zap->zap_objset, zap->zap_object, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; b++) { dmu_buf_t *db; int err; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs); dmu_buf_rele(db, FTAG); } } } } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c (revision 351077) @@ -1,1588 +1,1588 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif extern inline mzap_phys_t *zap_m_phys(zap_t *zap); static int mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags); uint64_t zap_getflags(zap_t *zap) { if (zap->zap_ismicro) return (0); return (zap_f_phys(zap)->zap_flags); } int zap_hashbits(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return (48); else return (28); } uint32_t zap_maxcd(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return ((1<<16)-1); else return (-1U); } static uint64_t zap_hash(zap_name_t *zn) { zap_t *zap = zn->zn_zap; uint64_t h = 0; if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); h = *(uint64_t *)zn->zn_key_orig; } else { h = zap->zap_salt; ASSERT(h != 0); ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { const uint64_t *wp = zn->zn_key_norm; ASSERT(zn->zn_key_intlen == 8); for (int i = 0; i < zn->zn_key_norm_numints; wp++, i++) { uint64_t word = *wp; for (int j = 0; j < zn->zn_key_intlen; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; } } } else { const uint8_t *cp = zn->zn_key_norm; /* * We previously stored the terminating null on * disk, but didn't hash it, so we need to * continue to not hash it. (The * zn_key_*_numints includes the terminating * null for non-binary keys.) */ int len = zn->zn_key_norm_numints - 1; ASSERT(zn->zn_key_intlen == 1); for (int i = 0; i < len; cp++, i++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ *cp) & 0xFF]; } } } /* * Don't use all 64 bits, since we need some in the cookie for * the collision differentiator. We MUST use the high bits, * since those are the ones that we first pay attention to when * chosing the bucket. */ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); return (h); } static int zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags) { ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); size_t inlen = strlen(name) + 1; size_t outlen = ZAP_MAXNAMELEN; int err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); return (err); } boolean_t zap_match(zap_name_t *zn, const char *matchname) { ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); if (zn->zn_matchtype & MT_NORMALIZE) { char norm[ZAP_MAXNAMELEN]; if (zap_normalize(zn->zn_zap, matchname, norm, zn->zn_normflags) != 0) return (B_FALSE); return (strcmp(zn->zn_key_norm, norm) == 0); } else { return (strcmp(zn->zn_key_orig, matchname) == 0); } } void zap_name_free(zap_name_t *zn) { kmem_free(zn, sizeof (zap_name_t)); } zap_name_t * zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = key; zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; zn->zn_matchtype = mt; zn->zn_normflags = zap->zap_normflags; /* * If we're dealing with a case sensitive lookup on a mixed or * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup * will fold case to all caps overriding the lookup request. */ if (mt & MT_MATCH_CASE) zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; if (zap->zap_normflags) { /* * We *must* use zap_normflags because this normalization is * what the hash is computed from. */ if (zap_normalize(zap, key, zn->zn_normbuf, zap->zap_normflags) != 0) { zap_name_free(zn); return (NULL); } zn->zn_key_norm = zn->zn_normbuf; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { if (mt != 0) { zap_name_free(zn); return (NULL); } zn->zn_key_norm = zn->zn_key_orig; zn->zn_key_norm_numints = zn->zn_key_orig_numints; } zn->zn_hash = zap_hash(zn); if (zap->zap_normflags != zn->zn_normflags) { /* * We *must* use zn_normflags because this normalization is * what the matching is based on. (Not the hash!) */ if (zap_normalize(zap, key, zn->zn_normbuf, zn->zn_normflags) != 0) { zap_name_free(zn); return (NULL); } zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } return (zn); } zap_name_t * zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); ASSERT(zap->zap_normflags == 0); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = zn->zn_key_norm = key; zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; zn->zn_matchtype = 0; zn->zn_hash = zap_hash(zn); return (zn); } static void mzap_byteswap(mzap_phys_t *buf, size_t size) { buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); buf->mz_normflags = BSWAP_64(buf->mz_normflags); int max = (size / MZAP_ENT_LEN) - 1; for (int i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = BSWAP_64(buf->mz_chunk[i].mze_value); buf->mz_chunk[i].mze_cd = BSWAP_32(buf->mz_chunk[i].mze_cd); } } void zap_byteswap(void *buf, size_t size) { uint64_t block_type = *(uint64_t *)buf; if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { /* ASSERT(magic == ZAP_LEAF_MAGIC); */ mzap_byteswap(buf, size); } else { fzap_byteswap(buf, size); } } static int mze_compare(const void *arg1, const void *arg2) { const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash); if (likely(cmp)) return (cmp); return (AVL_CMP(mze1->mze_cd, mze2->mze_cd)); } static int mze_insert(zap_t *zap, int chunkid, uint64_t hash) { avl_index_t idx; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; mze->mze_hash = hash; mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) { kmem_free(mze, sizeof (mzap_ent_t)); return (EEXIST); } avl_insert(&zap->zap_m.zap_avl, mze, idx); return (0); } static mzap_ent_t * mze_find(zap_name_t *zn) { mzap_ent_t mze_tofind; mzap_ent_t *mze; avl_index_t idx; avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); mze_tofind.mze_hash = zn->zn_hash; mze_tofind.mze_cd = 0; mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } return (NULL); } static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; avl_index_t idx; avl_tree_t *avl = &zap->zap_m.zap_avl; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; uint32_t cd = 0; for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { if (mze->mze_cd != cd) break; cd++; } return (cd); } static void mze_remove(zap_t *zap, mzap_ent_t *mze) { ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); avl_remove(&zap->zap_m.zap_avl, mze); kmem_free(mze, sizeof (mzap_ent_t)); } static void mze_destroy(zap_t *zap) { mzap_ent_t *mze; void *avlcookie = NULL; while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) kmem_free(mze, sizeof (mzap_ent_t)); avl_destroy(&zap->zap_m.zap_avl); } static zap_t * mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; uint64_t zap_block_type = zap_hdr[0]; uint64_t zap_magic = zap_hdr[1]; ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, 0, 0, 0); rw_enter(&zap->zap_rwlock, RW_WRITER); zap->zap_objset = os; zap->zap_object = obj; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { winner = NULL; /* No actual winner here... */ goto handle_winner; } } else { zap->zap_ismicro = TRUE; } /* * Make sure that zap_ismicro is set before we let others see * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); winner = dmu_buf_set_user(db, &zap->zap_dbu); if (winner != NULL) goto handle_winner; if (zap->zap_ismicro) { zap->zap_salt = zap_m_phys(zap)->mz_salt; zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; avl_create(&zap->zap_m.zap_avl, mze_compare, sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { zap_name_t *zn; zn = zap_name_alloc(zap, mze->mze_name, 0); if (mze_insert(zap, i, zn->zn_hash) == 0) zap->zap_m.zap_num_entries++; else { printf("ZFS WARNING: Duplicated ZAP " "entry detected (%s).\n", mze->mze_name); } zap_name_free(zn); } } } else { zap->zap_salt = zap_f_phys(zap)->zap_salt; zap->zap_normflags = zap_f_phys(zap)->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); /* * The embedded pointer table should not overlap the * other members. */ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, &zap_f_phys(zap)->zap_salt); /* * The embedded pointer table should end at the end of * the block */ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 1<zap_dbuf->db_size); } rw_exit(&zap->zap_rwlock); return (zap); handle_winner: rw_exit(&zap->zap_rwlock); rw_destroy(&zap->zap_rwlock); if (!zap->zap_ismicro) mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); return (winner); } /* * This routine "consumes" the caller's hold on the dbuf, which must * have the specified tag. */ static int zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); objset_t *os = dmu_buf_get_objset(db); uint64_t obj = db->db_object; *zapp = NULL; zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { zap = mzap_open(os, obj, db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. * Check for corruption! */ return (SET_ERROR(EIO)); } } /* * We're checking zap_ismicro without the lock held, in order to * tell what type of lock we want. Once we have some sort of * lock, see if it really is the right type. In practice this * can only be different if it was upgraded from micro to fat, * and micro wanted WRITER but fat only needs READER. */ krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; rw_enter(&zap->zap_rwlock, lt); if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { /* it was upgraded, now we only need reader */ ASSERT(lt == RW_WRITER); ASSERT(RW_READER == (!zap->zap_ismicro && fatreader) ? RW_READER : lti); rw_downgrade(&zap->zap_rwlock); lt = RW_READER; } zap->zap_objset = os; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3P(zap->zap_dbuf, ==, db); ASSERT(!zap->zap_ismicro || zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; if (newsz > MZAP_MAX_BLKSZ) { dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); *zapp = zap; int err = mzap_upgrade(zapp, tag, tx, 0); if (err != 0) rw_exit(&zap->zap_rwlock); return (err); } VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; } *zapp = zap; return (0); } static int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) { return (err); } #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) { dmu_buf_rele(db, tag); } return (err); } int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) dmu_buf_rele(db, tag); return (err); } void zap_unlockdir(zap_t *zap, void *tag) { rw_exit(&zap->zap_rwlock); dmu_buf_rele(zap->zap_dbuf, tag); } static int mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) { int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); int sz = zap->zap_dbuf->db_size; mzap_phys_t *mzp = zio_buf_alloc(sz); bcopy(zap->zap_dbuf->db_data, mzp, sz); int nchunks = zap->zap_m.zap_num_chunks; if (!flags) { err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); if (err != 0) { zio_buf_free(mzp, sz); return (err); } } dprintf("upgrading obj=%llu with %u chunks\n", zap->zap_object, nchunks); /* XXX destroy the avl later, so we can use the stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx, flags); for (int i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, mze->mze_value); zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tag, tx); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ zap_name_free(zn); if (err != 0) break; } zio_buf_free(mzp, sz); *zapp = zap; return (err); } /* * The "normflags" determine the behavior of the matchtype_t which is * passed to zap_lookup_norm(). Names which have the same normalized * version will be stored with the same hash value, and therefore we can * perform normalization-insensitive lookups. We can be Unicode form- * insensitive and/or case-insensitive. The following flags are valid for * "normflags": * * U8_TEXTPREP_NFC * U8_TEXTPREP_NFD * U8_TEXTPREP_NFKC * U8_TEXTPREP_NFKD * U8_TEXTPREP_TOUPPER * * The *_NF* (Normalization Form) flags are mutually exclusive; at most one * of them may be supplied. */ void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_tx_t *tx) { dmu_buf_t *db; VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db, tx); mzap_phys_t *zp = db->db_data; zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); } else { dmu_buf_rele(db, FTAG); } } int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, 0, tx)); } int zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_claim_norm_dnsize(os, obj, 0, ot, bonustype, bonuslen, dnodesize, tx)); } int zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, bonuslen, 0, tx)); } int zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - int err; - + { + int err; + err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, dnodesize, tx); if (err != 0) return (err); mzap_create_impl(os, obj, normflags, 0, tx); return (0); } uint64_t zap_create(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); } uint64_t zap_create_dnsize(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, dnodesize, tx)); } uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, 0, tx)); } uint64_t zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, dnodesize, tx); mzap_create_impl(os, obj, normflags, 0, tx); return (obj); } uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); return (zap_create_flags_dnsize(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); } uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, dnodesize, tx); ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT && indirect_blockshift >= SPA_MINBLOCKSHIFT && indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT); VERIFY(dmu_object_set_blocksize(os, obj, 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); mzap_create_impl(os, obj, normflags, flags, tx); return (obj); } int zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) { /* * dmu_object_free will free the object number and free the * data. Freeing the data will cause our pageout function to be * called, which will destroy our data (zap_leaf_t's and zap_t). */ return (dmu_object_free(os, zapobj, tx)); } void zap_evict_sync(void *dbu) { zap_t *zap = dbu; rw_destroy(&zap->zap_rwlock); if (zap->zap_ismicro) mze_destroy(zap); else mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); } int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); if (!zap->zap_ismicro) { err = fzap_count(zap, count); } else { *count = zap->zap_m.zap_num_entries; } zap_unlockdir(zap, FTAG); return (err); } /* * zn may be NULL; if not specified, it will be computed if needed. * See also the comment above zap_entry_normalization_conflict(). */ static boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) { int direction = AVL_BEFORE; boolean_t allocdzn = B_FALSE; if (zap->zap_normflags == 0) return (B_FALSE); again: for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); other && other->mze_hash == mze->mze_hash; other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { if (zn == NULL) { zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } if (direction == AVL_BEFORE) { direction = AVL_AFTER; goto again; } if (allocdzn) zap_name_free(zn); return (B_FALSE); } /* * Routines for manipulating attributes. */ int zap_lookup(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm(os, zapobj, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } static int zap_lookup_impl(zap_t *zap, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { int err = 0; zap_name_t *zn = zap_name_alloc(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (num_integers < 1) { err = SET_ERROR(EOVERFLOW); } else if (integer_size != 8) { err = SET_ERROR(EINVAL); } else { *(uint64_t *)buf = MZE_PHYS(zap, mze)->mze_value; (void) strlcpy(realname, MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, zn, mze); } } } } zap_name_free(zn); return (err); } int zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, num_integers, buf, mt, realname, rn_len, ncp); zap_unlockdir(zap, FTAG); return (err); } int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm_by_dnode(dn, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, num_integers, buf, mt, realname, rn_len, ncp); zap_unlockdir(zap, FTAG); return (err); } int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } fzap_prefetch(zn); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_lookup(zn, integer_size, num_integers, buf, NULL, 0, NULL); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_contains(objset_t *os, uint64_t zapobj, const char *name) { int err = zap_lookup_norm(os, zapobj, name, 0, 0, NULL, 0, NULL, 0, NULL); if (err == EOVERFLOW || err == EINVAL) err = 0; /* found, but skipped reading the value */ return (err); } int zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (integer_size) *integer_size = 8; if (num_integers) *num_integers = 1; } } zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_length(zn, integer_size, num_integers); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } static void mzap_addent(zap_name_t *zn, uint64_t value) { zap_t *zap = zn->zn_zap; int start = zap->zap_m.zap_alloc_next; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ ASSERT(cd < zap_maxcd(zap)); again: for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; (void) strcpy(mze->mze_name, zn->zn_key_orig); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; VERIFY(0 == mze_insert(zap, i, zn->zn_hash)); return; } } if (start != 0) { start = 0; goto again; } ASSERT(!"out of entries!"); } static int zap_add_impl(zap_t *zap, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, void *tag) { const uint64_t *intval = val; int err = 0; zap_name_t *zn = zap_name_alloc(zap, key, 0); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_add(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(key) >= MZAP_NAME_LEN) { err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); if (err == 0) { err = fzap_add(zn, integer_size, num_integers, val, tag, tx); } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { if (mze_find(zn) != NULL) { err = SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap, tag); return (err); } int zap_add(objset_t *os, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); /* zap_add_impl() calls zap_unlockdir() */ return (err); } int zap_add_by_dnode(dnode_t *dn, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); /* zap_add_impl() calls zap_unlockdir() */ return (err); } int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap, FTAG); return (err); } int zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; uint64_t oldval; const uint64_t *intval = val; #ifdef ZFS_DEBUG /* * If there is an old value, it shouldn't change across the * lockdir (eg, due to bprewrite's xlation). */ if (integer_size == 8 && num_integers == 1) (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); #endif int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); if (err == 0) { err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); } zap = zn->zn_zap; /* fzap_update() may change zap */ } else { mzap_ent_t *mze = mze_find(zn); if (mze != NULL) { ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); MZE_PHYS(zap, mze)->mze_value = *intval; } else { mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap, FTAG); return (err); } int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap, FTAG); return (err); } int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { return (zap_remove_norm(os, zapobj, name, 0, tx)); } static int zap_remove_impl(zap_t *zap, const char *name, matchtype_t mt, dmu_tx_t *tx) { int err = 0; zap_name_t *zn = zap_name_alloc(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { zap->zap_m.zap_num_entries--; bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], sizeof (mzap_ent_phys_t)); mze_remove(zap, mze); } } zap_name_free(zn); return (err); } int zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); err = zap_remove_impl(zap, name, mt, tx); zap_unlockdir(zap, FTAG); return (err); } int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); err = zap_remove_impl(zap, name, 0, tx); zap_unlockdir(zap, FTAG); return (err); } int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_remove(zn, tx); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } /* * Routines for iterating over the attributes. */ void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized) { zc->zc_objset = os; zc->zc_zap = NULL; zc->zc_leaf = NULL; zc->zc_zapobj = zapobj; zc->zc_serialized = serialized; zc->zc_hash = 0; zc->zc_cd = 0; } void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { zap_cursor_init_serialized(zc, os, zapobj, 0); } void zap_cursor_fini(zap_cursor_t *zc) { if (zc->zc_zap) { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); zap_unlockdir(zc->zc_zap, NULL); zc->zc_zap = NULL; } if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; } zc->zc_objset = NULL; } uint64_t zap_cursor_serialize(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return (-1ULL); if (zc->zc_zap == NULL) return (zc->zc_serialized); ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); /* * We want to keep the high 32 bits of the cursor zero if we can, so * that 32-bit programs can access this. So usually use a small * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits * of the cursor. * * [ collision differentiator | zap_hashbits()-bit hash value ] */ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); } int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) { int err; if (zc->zc_hash == -1ULL) return (SET_ERROR(ENOENT)); if (zc->zc_zap == NULL) { int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); if (err != 0) return (err); /* * To support zap_cursor_init_serialized, advance, retrieve, * we must add to the existing zc_cd, which may already * be 1 due to the zap_cursor_advance. */ ASSERT(zc->zc_hash == 0); hb = zap_hashbits(zc->zc_zap); zc->zc_hash = zc->zc_serialized << (64 - hb); zc->zc_cd += zc->zc_serialized >> hb; if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ zc->zc_cd = 0; } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { avl_index_t idx; mzap_ent_t mze_tofind; mze_tofind.mze_hash = zc->zc_hash; mze_tofind.mze_cd = zc->zc_cd; mzap_ent_t *mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); if (mze == NULL) { mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, idx, AVL_AFTER); } if (mze) { mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = mzap_normalization_conflict(zc->zc_zap, NULL, mze); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mzep->mze_value; (void) strcpy(za->za_name, mzep->mze_name); zc->zc_hash = mze->mze_hash; zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; err = SET_ERROR(ENOENT); } } rw_exit(&zc->zc_zap->zap_rwlock); return (err); } void zap_cursor_advance(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return; zc->zc_cd++; } int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) { int err = 0; mzap_ent_t *mze; zap_name_t *zn; if (zc->zc_zap == NULL) { err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap); if (err) return (err); } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } zn = zap_name_alloc(zc->zc_zap, name, mt); if (zn == NULL) { rw_exit(&zc->zc_zap->zap_rwlock); return (SET_ERROR(ENOTSUP)); } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_move_to_key(zc, zn); } else { mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); goto out; } zc->zc_hash = mze->mze_hash; zc->zc_cd = mze->mze_cd; } out: zap_name_free(zn); rw_exit(&zc->zc_zap->zap_rwlock); return (err); } int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); bzero(zs, sizeof (zap_stats_t)); if (zap->zap_ismicro) { zs->zs_blocksize = zap->zap_dbuf->db_size; zs->zs_num_entries = zap->zap_m.zap_num_entries; zs->zs_num_blocks = 1; } else { fzap_get_stats(zap, zs); } zap_unlockdir(zap, FTAG); return (0); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c (revision 351077) @@ -1,2740 +1,2740 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE #define DENY ACE_ACCESS_DENIED_ACE_TYPE #define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE #define MIN_ACE_TYPE ALLOW #define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) #define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) #define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) #define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) #define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ ACE_DELETE|ACE_DELETE_CHILD) #define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) #define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) #define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) #define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) #define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) #define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ ZFS_ACL_PROTECTED) #define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ ZFS_ACL_OBJ_ACE) #define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) static uint16_t zfs_ace_v0_get_type(void *acep) { return (((zfs_oldace_t *)acep)->z_type); } static uint16_t zfs_ace_v0_get_flags(void *acep) { return (((zfs_oldace_t *)acep)->z_flags); } static uint32_t zfs_ace_v0_get_mask(void *acep) { return (((zfs_oldace_t *)acep)->z_access_mask); } static uint64_t zfs_ace_v0_get_who(void *acep) { return (((zfs_oldace_t *)acep)->z_fuid); } static void zfs_ace_v0_set_type(void *acep, uint16_t type) { ((zfs_oldace_t *)acep)->z_type = type; } static void zfs_ace_v0_set_flags(void *acep, uint16_t flags) { ((zfs_oldace_t *)acep)->z_flags = flags; } static void zfs_ace_v0_set_mask(void *acep, uint32_t mask) { ((zfs_oldace_t *)acep)->z_access_mask = mask; } static void zfs_ace_v0_set_who(void *acep, uint64_t who) { ((zfs_oldace_t *)acep)->z_fuid = who; } /*ARGSUSED*/ static size_t zfs_ace_v0_size(void *acep) { return (sizeof (zfs_oldace_t)); } static size_t zfs_ace_v0_abstract_size(void) { return (sizeof (zfs_oldace_t)); } static int zfs_ace_v0_mask_off(void) { return (offsetof(zfs_oldace_t, z_access_mask)); } /*ARGSUSED*/ static int zfs_ace_v0_data(void *acep, void **datap) { *datap = NULL; return (0); } static acl_ops_t zfs_acl_v0_ops = { zfs_ace_v0_get_mask, zfs_ace_v0_set_mask, zfs_ace_v0_get_flags, zfs_ace_v0_set_flags, zfs_ace_v0_get_type, zfs_ace_v0_set_type, zfs_ace_v0_get_who, zfs_ace_v0_set_who, zfs_ace_v0_size, zfs_ace_v0_abstract_size, zfs_ace_v0_mask_off, zfs_ace_v0_data }; static uint16_t zfs_ace_fuid_get_type(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_type); } static uint16_t zfs_ace_fuid_get_flags(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_flags); } static uint32_t zfs_ace_fuid_get_mask(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_access_mask); } static uint64_t zfs_ace_fuid_get_who(void *args) { uint16_t entry_type; zfs_ace_t *acep = args; entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return (-1); return (((zfs_ace_t *)acep)->z_fuid); } static void zfs_ace_fuid_set_type(void *acep, uint16_t type) { ((zfs_ace_hdr_t *)acep)->z_type = type; } static void zfs_ace_fuid_set_flags(void *acep, uint16_t flags) { ((zfs_ace_hdr_t *)acep)->z_flags = flags; } static void zfs_ace_fuid_set_mask(void *acep, uint32_t mask) { ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; } static void zfs_ace_fuid_set_who(void *arg, uint64_t who) { zfs_ace_t *acep = arg; uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return; acep->z_fuid = who; } static size_t zfs_ace_fuid_size(void *acep) { zfs_ace_hdr_t *zacep = acep; uint16_t entry_type; switch (zacep->z_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: return (sizeof (zfs_object_ace_t)); case ALLOW: case DENY: entry_type = (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return (sizeof (zfs_ace_hdr_t)); /*FALLTHROUGH*/ default: return (sizeof (zfs_ace_t)); } } static size_t zfs_ace_fuid_abstract_size(void) { return (sizeof (zfs_ace_hdr_t)); } static int zfs_ace_fuid_mask_off(void) { return (offsetof(zfs_ace_hdr_t, z_access_mask)); } static int zfs_ace_fuid_data(void *acep, void **datap) { zfs_ace_t *zacep = acep; zfs_object_ace_t *zobjp; switch (zacep->z_hdr.z_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: zobjp = acep; *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); default: *datap = NULL; return (0); } } static acl_ops_t zfs_acl_fuid_ops = { zfs_ace_fuid_get_mask, zfs_ace_fuid_set_mask, zfs_ace_fuid_get_flags, zfs_ace_fuid_set_flags, zfs_ace_fuid_get_type, zfs_ace_fuid_set_type, zfs_ace_fuid_get_who, zfs_ace_fuid_set_who, zfs_ace_fuid_size, zfs_ace_fuid_abstract_size, zfs_ace_fuid_mask_off, zfs_ace_fuid_data }; /* * The following three functions are provided for compatibility with * older ZPL version in order to determine if the file use to have * an external ACL and what version of ACL previously existed on the * file. Would really be nice to not need this, sigh. */ uint64_t zfs_external_acl(znode_t *zp) { zfs_acl_phys_t acl_phys; int error; if (zp->z_is_sa) return (0); /* * Need to deal with a potential * race where zfs_sa_upgrade could cause * z_isa_sa to change. * * If the lookup fails then the state of z_is_sa should have * changed. */ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), &acl_phys, sizeof (acl_phys))) == 0) return (acl_phys.z_acl_extern_obj); else { /* * after upgrade the SA_ZPL_ZNODE_ACL should have been * removed */ VERIFY(zp->z_is_sa && error == ENOENT); return (0); } } /* * Determine size of ACL in bytes * * This is more complicated than it should be since we have to deal * with old external ACLs. */ static int zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, zfs_acl_phys_t *aclphys) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t acl_count; int size; int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if (zp->z_is_sa) { if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), &size)) != 0) return (error); *aclsize = size; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), &acl_count, sizeof (acl_count))) != 0) return (error); *aclcount = acl_count; } else { if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), aclphys, sizeof (*aclphys))) != 0) return (error); if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); *aclcount = aclphys->z_acl_size; } else { *aclsize = aclphys->z_acl_size; *aclcount = aclphys->z_acl_count; } } return (0); } int zfs_znode_acl_version(znode_t *zp) { zfs_acl_phys_t acl_phys; if (zp->z_is_sa) return (ZFS_ACL_VERSION_FUID); else { int error; /* * Need to deal with a potential * race where zfs_sa_upgrade could cause * z_isa_sa to change. * * If the lookup fails then the state of z_is_sa should have * changed. */ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), &acl_phys, sizeof (acl_phys))) == 0) return (acl_phys.z_acl_version); else { /* * After upgrade SA_ZPL_ZNODE_ACL should have * been removed. */ VERIFY(zp->z_is_sa && error == ENOENT); return (ZFS_ACL_VERSION_FUID); } } } static int zfs_acl_version(int version) { if (version < ZPL_VERSION_FUID) return (ZFS_ACL_VERSION_INITIAL); else return (ZFS_ACL_VERSION_FUID); } static int zfs_acl_version_zp(znode_t *zp) { return (zfs_acl_version(zp->z_zfsvfs->z_version)); } zfs_acl_t * zfs_acl_alloc(int vers) { zfs_acl_t *aclp; aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), offsetof(zfs_acl_node_t, z_next)); aclp->z_version = vers; if (vers == ZFS_ACL_VERSION_FUID) aclp->z_ops = zfs_acl_fuid_ops; else aclp->z_ops = zfs_acl_v0_ops; return (aclp); } zfs_acl_node_t * zfs_acl_node_alloc(size_t bytes) { zfs_acl_node_t *aclnode; aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); if (bytes) { aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); aclnode->z_allocdata = aclnode->z_acldata; aclnode->z_allocsize = bytes; aclnode->z_size = bytes; } return (aclnode); } static void zfs_acl_node_free(zfs_acl_node_t *aclnode) { if (aclnode->z_allocsize) kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); kmem_free(aclnode, sizeof (zfs_acl_node_t)); } static void zfs_acl_release_nodes(zfs_acl_t *aclp) { zfs_acl_node_t *aclnode; while (aclnode = list_head(&aclp->z_acl)) { list_remove(&aclp->z_acl, aclnode); zfs_acl_node_free(aclnode); } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } void zfs_acl_free(zfs_acl_t *aclp) { zfs_acl_release_nodes(aclp); list_destroy(&aclp->z_acl); kmem_free(aclp, sizeof (zfs_acl_t)); } static boolean_t zfs_acl_valid_ace_type(uint_t type, uint_t flags) { uint16_t entry_type; switch (type) { case ALLOW: case DENY: case ACE_SYSTEM_AUDIT_ACE_TYPE: case ACE_SYSTEM_ALARM_ACE_TYPE: entry_type = flags & ACE_TYPE_FLAGS; return (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE || entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP); default: if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) return (B_TRUE); } return (B_FALSE); } static boolean_t zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) { /* * first check type of entry */ if (!zfs_acl_valid_ace_type(type, iflags)) return (B_FALSE); switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: if (aclp->z_version < ZFS_ACL_VERSION_FUID) return (B_FALSE); aclp->z_hints |= ZFS_ACL_OBJ_ACE; } /* * next check inheritance level flags */ if (obj_type == VDIR && (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) aclp->z_hints |= ZFS_INHERIT_ACE; if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { if ((iflags & (ACE_FILE_INHERIT_ACE| ACE_DIRECTORY_INHERIT_ACE)) == 0) { return (B_FALSE); } } return (B_TRUE); } static void * zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, uint32_t *access_mask, uint16_t *iflags, uint16_t *type) { zfs_acl_node_t *aclnode; ASSERT(aclp); if (start == NULL) { aclnode = list_head(&aclp->z_acl); if (aclnode == NULL) return (NULL); aclp->z_next_ace = aclnode->z_acldata; aclp->z_curr_node = aclnode; aclnode->z_ace_idx = 0; } aclnode = aclp->z_curr_node; if (aclnode == NULL) return (NULL); if (aclnode->z_ace_idx >= aclnode->z_ace_count) { aclnode = list_next(&aclp->z_acl, aclnode); if (aclnode == NULL) return (NULL); else { aclp->z_curr_node = aclnode; aclnode->z_ace_idx = 0; aclp->z_next_ace = aclnode->z_acldata; } } if (aclnode->z_ace_idx < aclnode->z_ace_count) { void *acep = aclp->z_next_ace; size_t ace_size; /* * Make sure we don't overstep our bounds */ ace_size = aclp->z_ops.ace_size(acep); if (((caddr_t)acep + ace_size) > ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { return (NULL); } *iflags = aclp->z_ops.ace_flags_get(acep); *type = aclp->z_ops.ace_type_get(acep); *access_mask = aclp->z_ops.ace_mask_get(acep); *who = aclp->z_ops.ace_who_get(acep); aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; aclnode->z_ace_idx++; return ((void *)acep); } return (NULL); } /*ARGSUSED*/ static uint64_t zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags, uint16_t *type, uint32_t *mask) { zfs_acl_t *aclp = datap; zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; uint64_t who; acep = zfs_acl_next_ace(aclp, acep, &who, mask, flags, type); return ((uint64_t)(uintptr_t)acep); } static zfs_acl_node_t * zfs_acl_curr_node(zfs_acl_t *aclp) { ASSERT(aclp->z_curr_node); return (aclp->z_curr_node); } /* * Copy ACE to internal ZFS format. * While processing the ACL each ACE will be validated for correctness. * ACE FUIDs will be created later. */ int zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, zfs_fuid_info_t **fuidp, cred_t *cr) { int i; uint16_t entry_type; zfs_ace_t *aceptr = z_acl; ace_t *acep = datap; zfs_object_ace_t *zobjacep; ace_object_t *aceobjp; for (i = 0; i != aclcnt; i++) { aceptr->z_hdr.z_access_mask = acep->a_access_mask; aceptr->z_hdr.z_flags = acep->a_flags; aceptr->z_hdr.z_type = acep->a_type; entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE) { aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, cr, (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); } /* * Make sure ACE is valid */ if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, aceptr->z_hdr.z_flags) != B_TRUE) return (SET_ERROR(EINVAL)); switch (acep->a_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: zobjacep = (zfs_object_ace_t *)aceptr; aceobjp = (ace_object_t *)acep; bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, sizeof (aceobjp->a_obj_type)); bcopy(aceobjp->a_inherit_obj_type, zobjacep->z_inherit_type, sizeof (aceobjp->a_inherit_obj_type)); acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); break; default: acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); } aceptr = (zfs_ace_t *)((caddr_t)aceptr + aclp->z_ops.ace_size(aceptr)); } *size = (caddr_t)aceptr - (caddr_t)z_acl; return (0); } /* * Copy ZFS ACEs to fixed size ace_t layout */ static void zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, void *datap, int filter) { uint64_t who; uint32_t access_mask; uint16_t iflags, type; zfs_ace_hdr_t *zacep = NULL; ace_t *acep = datap; ace_object_t *objacep; zfs_object_ace_t *zobjacep; size_t ace_size; uint16_t entry_type; while (zacep = zfs_acl_next_ace(aclp, zacep, &who, &access_mask, &iflags, &type)) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: if (filter) { continue; } zobjacep = (zfs_object_ace_t *)zacep; objacep = (ace_object_t *)acep; bcopy(zobjacep->z_object_type, objacep->a_obj_type, sizeof (zobjacep->z_object_type)); bcopy(zobjacep->z_inherit_type, objacep->a_inherit_obj_type, sizeof (zobjacep->z_inherit_type)); ace_size = sizeof (ace_object_t); break; default: ace_size = sizeof (ace_t); break; } entry_type = (iflags & ACE_TYPE_FLAGS); if ((entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE)) { acep->a_who = zfs_fuid_map_id(zfsvfs, who, cr, (entry_type & ACE_IDENTIFIER_GROUP) ? ZFS_ACE_GROUP : ZFS_ACE_USER); } else { acep->a_who = (uid_t)(int64_t)who; } acep->a_access_mask = access_mask; acep->a_flags = iflags; acep->a_type = type; acep = (ace_t *)((caddr_t)acep + ace_size); } } static int zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, zfs_oldace_t *z_acl, int aclcnt, size_t *size) { int i; zfs_oldace_t *aceptr = z_acl; for (i = 0; i != aclcnt; i++, aceptr++) { aceptr->z_access_mask = acep[i].a_access_mask; aceptr->z_type = acep[i].a_type; aceptr->z_flags = acep[i].a_flags; aceptr->z_fuid = acep[i].a_who; /* * Make sure ACE is valid */ if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, aceptr->z_flags) != B_TRUE) return (SET_ERROR(EINVAL)); } *size = (caddr_t)aceptr - (caddr_t)z_acl; return (0); } /* * convert old ACL format to new */ void zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) { zfs_oldace_t *oldaclp; int i; uint16_t type, iflags; uint32_t access_mask; uint64_t who; void *cookie = NULL; zfs_acl_node_t *newaclnode; ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); /* * First create the ACE in a contiguous piece of memory * for zfs_copy_ace_2_fuid(). * * We only convert an ACL once, so this won't happen * everytime. */ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, KM_SLEEP); i = 0; while (cookie = zfs_acl_next_ace(aclp, cookie, &who, &access_mask, &iflags, &type)) { oldaclp[i].z_flags = iflags; oldaclp[i].z_type = type; oldaclp[i].z_fuid = who; oldaclp[i++].z_access_mask = access_mask; } newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * sizeof (zfs_object_ace_t)); aclp->z_ops = zfs_acl_fuid_ops; VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count, &newaclnode->z_size, NULL, cr) == 0); newaclnode->z_ace_count = aclp->z_acl_count; aclp->z_version = ZFS_ACL_VERSION; kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); /* * Release all previous ACL nodes */ zfs_acl_release_nodes(aclp); list_insert_head(&aclp->z_acl, newaclnode); aclp->z_acl_bytes = newaclnode->z_size; aclp->z_acl_count = newaclnode->z_ace_count; } /* * Convert unix access mask to v4 access mask */ static uint32_t zfs_unix_to_v4(uint32_t access_mask) { uint32_t new_mask = 0; if (access_mask & S_IXOTH) new_mask |= ACE_EXECUTE; if (access_mask & S_IWOTH) new_mask |= ACE_WRITE_DATA; if (access_mask & S_IROTH) new_mask |= ACE_READ_DATA; return (new_mask); } static void zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, uint16_t access_type, uint64_t fuid, uint16_t entry_type) { uint16_t type = entry_type & ACE_TYPE_FLAGS; aclp->z_ops.ace_mask_set(acep, access_mask); aclp->z_ops.ace_type_set(acep, access_type); aclp->z_ops.ace_flags_set(acep, entry_type); if ((type != ACE_OWNER && type != OWNING_GROUP && type != ACE_EVERYONE)) aclp->z_ops.ace_who_set(acep, fuid); } /* * Determine mode of file based on ACL. */ uint64_t zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, uint64_t *pflags, uint64_t fuid, uint64_t fgid) { int entry_type; mode_t mode; mode_t seen = 0; - zfs_ace_hdr_t *acep = NULL; + zfs_ace_hdr_t *acep = NULL; uint64_t who; uint16_t iflags, type; uint32_t access_mask; boolean_t an_exec_denied = B_FALSE; mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { if (!zfs_acl_valid_ace_type(type, iflags)) continue; entry_type = (iflags & ACE_TYPE_FLAGS); /* * Skip over any inherit_only ACEs */ if (iflags & ACE_INHERIT_ONLY_ACE) continue; if (entry_type == ACE_OWNER || (entry_type == 0 && who == fuid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRUSR))) { seen |= S_IRUSR; if (type == ALLOW) { mode |= S_IRUSR; } } if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWUSR))) { seen |= S_IWUSR; if (type == ALLOW) { mode |= S_IWUSR; } } if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXUSR))) { seen |= S_IXUSR; if (type == ALLOW) { mode |= S_IXUSR; } } } else if (entry_type == OWNING_GROUP || (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRGRP))) { seen |= S_IRGRP; if (type == ALLOW) { mode |= S_IRGRP; } } if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWGRP))) { seen |= S_IWGRP; if (type == ALLOW) { mode |= S_IWGRP; } } if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXGRP))) { seen |= S_IXGRP; if (type == ALLOW) { mode |= S_IXGRP; } } } else if (entry_type == ACE_EVERYONE) { if ((access_mask & ACE_READ_DATA)) { if (!(seen & S_IRUSR)) { seen |= S_IRUSR; if (type == ALLOW) { mode |= S_IRUSR; } } if (!(seen & S_IRGRP)) { seen |= S_IRGRP; if (type == ALLOW) { mode |= S_IRGRP; } } if (!(seen & S_IROTH)) { seen |= S_IROTH; if (type == ALLOW) { mode |= S_IROTH; } } } if ((access_mask & ACE_WRITE_DATA)) { if (!(seen & S_IWUSR)) { seen |= S_IWUSR; if (type == ALLOW) { mode |= S_IWUSR; } } if (!(seen & S_IWGRP)) { seen |= S_IWGRP; if (type == ALLOW) { mode |= S_IWGRP; } } if (!(seen & S_IWOTH)) { seen |= S_IWOTH; if (type == ALLOW) { mode |= S_IWOTH; } } } if ((access_mask & ACE_EXECUTE)) { if (!(seen & S_IXUSR)) { seen |= S_IXUSR; if (type == ALLOW) { mode |= S_IXUSR; } } if (!(seen & S_IXGRP)) { seen |= S_IXGRP; if (type == ALLOW) { mode |= S_IXGRP; } } if (!(seen & S_IXOTH)) { seen |= S_IXOTH; if (type == ALLOW) { mode |= S_IXOTH; } } } } else { /* * Only care if this IDENTIFIER_GROUP or * USER ACE denies execute access to someone, * mode is not affected */ if ((access_mask & ACE_EXECUTE) && type == DENY) an_exec_denied = B_TRUE; } } /* * Failure to allow is effectively a deny, so execute permission * is denied if it was never mentioned or if we explicitly * weren't allowed it. */ if (!an_exec_denied && ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) an_exec_denied = B_TRUE; if (an_exec_denied) *pflags &= ~ZFS_NO_EXECS_DENIED; else *pflags |= ZFS_NO_EXECS_DENIED; return (mode); } /* * Read an external acl object. If the intent is to modify, always * create a new acl and leave any cached acl in place. */ static int zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) { zfs_acl_t *aclp; int aclsize; int acl_count; zfs_acl_node_t *aclnode; zfs_acl_phys_t znode_acl; int version; int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); ASSERT_VOP_LOCKED(ZTOV(zp), __func__); if (zp->z_acl_cached && !will_modify) { *aclpp = zp->z_acl_cached; return (0); } version = zfs_znode_acl_version(zp); if ((error = zfs_acl_znode_info(zp, &aclsize, &acl_count, &znode_acl)) != 0) { goto done; } aclp = zfs_acl_alloc(version); aclp->z_acl_count = acl_count; aclp->z_acl_bytes = aclsize; aclnode = zfs_acl_node_alloc(aclsize); aclnode->z_ace_count = aclp->z_acl_count; aclnode->z_size = aclsize; if (!zp->z_is_sa) { if (znode_acl.z_acl_extern_obj) { error = dmu_read(zp->z_zfsvfs->z_os, znode_acl.z_acl_extern_obj, 0, aclnode->z_size, aclnode->z_acldata, DMU_READ_PREFETCH); } else { bcopy(znode_acl.z_ace_data, aclnode->z_acldata, aclnode->z_size); } } else { error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), aclnode->z_acldata, aclnode->z_size); } if (error != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); goto done; } list_insert_head(&aclp->z_acl, aclnode); *aclpp = aclp; if (!will_modify) zp->z_acl_cached = aclp; done: return (error); } /*ARGSUSED*/ void zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, boolean_t start, void *userdata) { zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; if (start) { cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); } else { cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, cb->cb_acl_node); } *dataptr = cb->cb_acl_node->z_acldata; *length = cb->cb_acl_node->z_size; } int zfs_acl_chown_setattr(znode_t *zp) { int error; zfs_acl_t *aclp; ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0) zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, &zp->z_pflags, zp->z_uid, zp->z_gid); return (error); } /* * common code for setting ACLs. * * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's * already checked the acl and knows whether to inherit. */ int zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) { int error; zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_object_type_t otype; zfs_acl_locator_cb_t locate = { 0 }; uint64_t mode; sa_bulk_attr_t bulk[5]; uint64_t ctime[2]; int count = 0; zfs_acl_phys_t acl_phys; mode = zp->z_mode; mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, zp->z_uid, zp->z_gid); zp->z_mode = mode; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } /* * Upgrade needed? */ if (!zfsvfs->z_use_fuids) { otype = DMU_OT_OLDACL; } else { if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && (zfsvfs->z_version >= ZPL_VERSION_FUID)) zfs_acl_xform(zp, aclp, cr); ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); otype = DMU_OT_ACL; } /* * Arrgh, we have to handle old on disk format * as well as newer (preferred) SA format. */ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ locate.cb_aclp = aclp; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, aclp->z_acl_bytes); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &aclp->z_acl_count, sizeof (uint64_t)); } else { /* Painful legacy way */ zfs_acl_node_t *aclnode; uint64_t off = 0; uint64_t aoid; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), &acl_phys, sizeof (acl_phys))) != 0) return (error); aoid = acl_phys.z_acl_extern_obj; if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { /* * If ACL was previously external and we are now * converting to new ACL format then release old * ACL object and create a new one. */ if (aoid && aclp->z_version != acl_phys.z_acl_version) { error = dmu_object_free(zfsvfs->z_os, aoid, tx); if (error) return (error); aoid = 0; } if (aoid == 0) { aoid = dmu_object_alloc(zfsvfs->z_os, otype, aclp->z_acl_bytes, otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, otype == DMU_OT_ACL ? DN_OLD_MAX_BONUSLEN : 0, tx); } else { (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, aclp->z_acl_bytes, 0, tx); } acl_phys.z_acl_extern_obj = aoid; for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { if (aclnode->z_ace_count == 0) continue; dmu_write(zfsvfs->z_os, aoid, off, aclnode->z_size, aclnode->z_acldata, tx); off += aclnode->z_size; } } else { void *start = acl_phys.z_ace_data; /* * Migrating back embedded? */ if (acl_phys.z_acl_extern_obj) { error = dmu_object_free(zfsvfs->z_os, acl_phys.z_acl_extern_obj, tx); if (error) return (error); acl_phys.z_acl_extern_obj = 0; } for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { if (aclnode->z_ace_count == 0) continue; bcopy(aclnode->z_acldata, start, aclnode->z_size); start = (caddr_t)start + aclnode->z_size; } } /* * If Old version then swap count/bytes to match old * layout of znode_acl_phys_t. */ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { acl_phys.z_acl_size = aclp->z_acl_count; acl_phys.z_acl_count = aclp->z_acl_bytes; } else { acl_phys.z_acl_size = aclp->z_acl_bytes; acl_phys.z_acl_count = aclp->z_acl_count; } acl_phys.z_acl_version = aclp->z_version; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (acl_phys)); } /* * Replace ACL wide bits, but first clear them. */ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; zp->z_pflags |= aclp->z_hints; if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) zp->z_pflags |= ZFS_ACL_TRIVIAL; zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE); return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); } static void zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, zfs_acl_t *aclp) { void *acep = NULL; uint64_t who; int new_count, new_bytes; int ace_size; - int entry_type; + int entry_type; uint16_t iflags, type; uint32_t access_mask; zfs_acl_node_t *newnode; - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - void *zacep; + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + void *zacep; boolean_t isdir; trivial_acl_t masks; new_count = new_bytes = 0; isdir = (vtype == VDIR); acl_trivial_access_masks((mode_t)mode, isdir, &masks); newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); zacep = newnode->z_acldata; if (masks.allow0) { zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } if (masks.deny1) { zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } if (masks.deny2) { zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { entry_type = (iflags & ACE_TYPE_FLAGS); /* * ACEs used to represent the file mode may be divided * into an equivalent pair of inherit-only and regular * ACEs, if they are inheritable. * Skip regular ACEs, which are replaced by the new mode. */ if (split && (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE)) { if (!isdir || !(iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) continue; /* * We preserve owner@, group@, or @everyone * permissions, if they are inheritable, by * copying them to inherit_only ACEs. This * prevents inheritable permissions from being * altered along with the file mode. */ iflags |= ACE_INHERIT_ONLY_ACE; } /* * If this ACL has any inheritable ACEs, mark that in * the hints (which are later masked into the pflags) * so create knows to do inheritance. */ if (isdir && (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) aclp->z_hints |= ZFS_INHERIT_ACE; if ((type != ALLOW && type != DENY) || (iflags & ACE_INHERIT_ONLY_ACE)) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: aclp->z_hints |= ZFS_ACL_OBJ_ACE; break; } } else { /* * Limit permissions granted by ACEs to be no greater * than permissions of the requested group mode. * Applies when the "aclmode" property is set to * "groupmask". */ if ((type == ALLOW) && trim) access_mask &= masks.group; } zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); ace_size = aclp->z_ops.ace_size(acep); zacep = (void *)((uintptr_t)zacep + ace_size); new_count++; new_bytes += ace_size; } zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); new_count += 3; new_bytes += abstract_size * 3; zfs_acl_release_nodes(aclp); aclp->z_acl_count = new_count; aclp->z_acl_bytes = new_bytes; newnode->z_ace_count = new_count; newnode->z_size = new_bytes; list_insert_tail(&aclp->z_acl, newnode); } int zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) { int error = 0; mutex_enter(&zp->z_acl_lock); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); else error = zfs_acl_node_read(zp, aclp, B_TRUE); if (error == 0) { (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE, (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); } mutex_exit(&zp->z_acl_lock); return (error); } /* * Should ACE be inherited? */ static int zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) { int iflags = (acep_flags & 0xf); if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) return (1); else if (iflags & ACE_FILE_INHERIT_ACE) return (!((vtype == VDIR) && (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); return (0); } /* * inherit inheritable ACEs from parent */ static zfs_acl_t * zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, uint64_t mode, boolean_t *need_chmod) { void *pacep = NULL; void *acep; zfs_acl_node_t *aclnode; zfs_acl_t *aclp = NULL; uint64_t who; uint32_t access_mask; uint16_t iflags, newflags, type; size_t ace_size; void *data1, *data2; size_t data1sz, data2sz; uint_t aclinherit; boolean_t isdir = (vtype == VDIR); boolean_t isreg = (vtype == VREG); *need_chmod = B_TRUE; aclp = zfs_acl_alloc(paclp->z_version); aclinherit = zfsvfs->z_acl_inherit; if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK) return (aclp); while (pacep = zfs_acl_next_ace(paclp, pacep, &who, &access_mask, &iflags, &type)) { /* * don't inherit bogus ACEs */ if (!zfs_acl_valid_ace_type(type, iflags)) continue; /* * Check if ACE is inheritable by this vnode */ if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || !zfs_ace_can_use(vtype, iflags)) continue; /* * If owner@, group@, or everyone@ inheritable * then zfs_acl_chmod() isn't needed. */ if ((aclinherit == ZFS_ACL_PASSTHROUGH || aclinherit == ZFS_ACL_PASSTHROUGH_X) && ((iflags & (ACE_OWNER|ACE_EVERYONE)) || ((iflags & OWNING_GROUP) == OWNING_GROUP)) && (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) *need_chmod = B_FALSE; /* * Strip inherited execute permission from file if * not in mode */ if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { access_mask &= ~ACE_EXECUTE; } /* * Strip write_acl and write_owner from permissions * when inheriting an ACE */ if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { access_mask &= ~RESTRICTED_CLEAR; } ace_size = aclp->z_ops.ace_size(pacep); aclnode = zfs_acl_node_alloc(ace_size); list_insert_tail(&aclp->z_acl, aclnode); acep = aclnode->z_acldata; zfs_set_ace(aclp, acep, access_mask, type, who, iflags|ACE_INHERITED_ACE); /* * Copy special opaque data if any */ if ((data1sz = paclp->z_ops.ace_data(pacep, &data1)) != 0) { VERIFY((data2sz = aclp->z_ops.ace_data(acep, &data2)) == data1sz); bcopy(data1, data2, data2sz); } aclp->z_acl_count++; aclnode->z_ace_count++; aclp->z_acl_bytes += aclnode->z_size; newflags = aclp->z_ops.ace_flags_get(acep); /* * If ACE is not to be inherited further, or if the vnode is * not a directory, remove all inheritance flags */ if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { newflags &= ~ALL_INHERIT; aclp->z_ops.ace_flags_set(acep, newflags|ACE_INHERITED_ACE); continue; } /* * This directory has an inheritable ACE */ aclp->z_hints |= ZFS_INHERIT_ACE; /* * If only FILE_INHERIT is set then turn on * inherit_only */ if ((iflags & (ACE_FILE_INHERIT_ACE | ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { newflags |= ACE_INHERIT_ONLY_ACE; aclp->z_ops.ace_flags_set(acep, newflags|ACE_INHERITED_ACE); } else { newflags &= ~ACE_INHERIT_ONLY_ACE; aclp->z_ops.ace_flags_set(acep, newflags|ACE_INHERITED_ACE); } } return (aclp); } /* * Create file system object initial permissions * including inheritable ACEs. * Also, create FUIDs for owner and group. */ int zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) { int error; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zfs_acl_t *paclp; gid_t gid; boolean_t need_chmod = B_TRUE; boolean_t trim = B_FALSE; boolean_t inherited = B_FALSE; if ((flag & IS_ROOT_NODE) == 0) ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); else ASSERT(dzp->z_vnode == NULL); bzero(acl_ids, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); if (vsecp) if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) return (error); /* * Determine uid and gid. */ if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || ((flag & IS_XATTR) && (vap->va_type == VDIR))) { acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &acl_ids->z_fuidp); acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; } else { acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, cr, &acl_ids->z_fuidp); acl_ids->z_fgid = 0; if (vap->va_mask & AT_GID) { acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; if (acl_ids->z_fgid != dzp->z_gid && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) acl_ids->z_fgid = 0; } if (acl_ids->z_fgid == 0) { #ifndef __FreeBSD_kernel__ if (dzp->z_mode & S_ISGID) { #endif char *domain; uint32_t rid; acl_ids->z_fgid = dzp->z_gid; gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); if (zfsvfs->z_use_fuids && IS_EPHEMERAL(acl_ids->z_fgid)) { domain = zfs_fuid_idx_domain( &zfsvfs->z_fuid_idx, FUID_INDEX(acl_ids->z_fgid)); rid = FUID_RID(acl_ids->z_fgid); zfs_fuid_node_add(&acl_ids->z_fuidp, domain, rid, FUID_INDEX(acl_ids->z_fgid), acl_ids->z_fgid, ZFS_GROUP); } #ifndef __FreeBSD_kernel__ } else { acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, ZFS_GROUP, cr, &acl_ids->z_fuidp); gid = crgetgid(cr); } #endif } } /* * If we're creating a directory, and the parent directory has the * set-GID bit set, set in on the new directory. * Otherwise, if the user is neither privileged nor a member of the * file's new group, clear the file's set-GID bit. */ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && (vap->va_type == VDIR)) { acl_ids->z_mode |= S_ISGID; } else { if ((acl_ids->z_mode & S_ISGID) && secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) acl_ids->z_mode &= ~S_ISGID; } if (acl_ids->z_aclp == NULL) { mutex_enter(&dzp->z_acl_lock); if (!(flag & IS_ROOT_NODE) && (dzp->z_pflags & ZFS_INHERIT_ACE) && !(dzp->z_pflags & ZFS_XATTR)) { VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_type, paclp, acl_ids->z_mode, &need_chmod); inherited = B_TRUE; } else { acl_ids->z_aclp = zfs_acl_alloc(zfs_acl_version_zp(dzp)); acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } mutex_exit(&dzp->z_acl_lock); if (need_chmod) { if (vap->va_type == VDIR) acl_ids->z_aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) trim = B_TRUE; zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, trim, acl_ids->z_aclp); } } if (inherited || vsecp) { acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, acl_ids->z_fuid, acl_ids->z_fgid); if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } return (0); } /* * Free ACL and fuid_infop, but not the acl_ids structure */ void zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) { if (acl_ids->z_aclp) zfs_acl_free(acl_ids->z_aclp); if (acl_ids->z_fuidp) zfs_fuid_info_free(acl_ids->z_fuidp); acl_ids->z_aclp = NULL; acl_ids->z_fuidp = NULL; } boolean_t zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) { return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); } /* * Retrieve a file's ACL */ int zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfs_acl_t *aclp; ulong_t mask; int error; - int count = 0; + int count = 0; int largeace = 0; mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); if (mask == 0) return (SET_ERROR(ENOSYS)); if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) return (error); mutex_enter(&zp->z_acl_lock); ASSERT_VOP_LOCKED(ZTOV(zp), __func__); error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } /* * Scan ACL to determine number of ACEs */ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { void *zacep = NULL; uint64_t who; uint32_t access_mask; uint16_t type, iflags; while (zacep = zfs_acl_next_ace(aclp, zacep, &who, &access_mask, &iflags, &type)) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: largeace++; continue; default: count++; } } vsecp->vsa_aclcnt = count; } else count = (int)aclp->z_acl_count; if (mask & VSA_ACECNT) { vsecp->vsa_aclcnt = count; } if (mask & VSA_ACE) { size_t aclsz; aclsz = count * sizeof (ace_t) + sizeof (ace_object_t) * largeace; vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); vsecp->vsa_aclentsz = aclsz; if (aclp->z_version == ZFS_ACL_VERSION_FUID) zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); else { zfs_acl_node_t *aclnode; void *start = vsecp->vsa_aclentp; for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { bcopy(aclnode->z_acldata, start, aclnode->z_size); start = (caddr_t)start + aclnode->z_size; } ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == aclp->z_acl_bytes); } } if (mask & VSA_ACE_ACLFLAGS) { vsecp->vsa_aclflags = 0; if (zp->z_pflags & ZFS_ACL_DEFAULTED) vsecp->vsa_aclflags |= ACL_DEFAULTED; if (zp->z_pflags & ZFS_ACL_PROTECTED) vsecp->vsa_aclflags |= ACL_PROTECTED; if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; } mutex_exit(&zp->z_acl_lock); return (0); } int zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) { zfs_acl_t *aclp; zfs_acl_node_t *aclnode; int aclcnt = vsecp->vsa_aclcnt; int error; if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) return (SET_ERROR(EINVAL)); aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); aclp->z_hints = 0; aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, &aclnode->z_size)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); } } else { if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, &aclnode->z_size, fuidp, cr)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); } } aclp->z_acl_bytes = aclnode->z_size; aclnode->z_ace_count = aclcnt; aclp->z_acl_count = aclcnt; list_insert_head(&aclp->z_acl, aclnode); /* * If flags are being set then add them to z_hints */ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { if (vsecp->vsa_aclflags & ACL_PROTECTED) aclp->z_hints |= ZFS_ACL_PROTECTED; if (vsecp->vsa_aclflags & ACL_DEFAULTED) aclp->z_hints |= ZFS_ACL_DEFAULTED; if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; } *zaclp = aclp; return (0); } /* * Set a file's ACL */ int zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); dmu_tx_t *tx; int error; zfs_acl_t *aclp; zfs_fuid_info_t *fuidp = NULL; boolean_t fuid_dirtied; uint64_t acl_obj; ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (mask == 0) return (SET_ERROR(ENOSYS)); if (zp->z_pflags & ZFS_IMMUTABLE) return (SET_ERROR(EPERM)); if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) return (error); error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, &aclp); if (error) return (error); /* * If ACL wide flags aren't being set then preserve any * existing flags. */ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { aclp->z_hints |= (zp->z_pflags & V4_ACL_WIDE_FLAGS); } top: mutex_enter(&zp->z_acl_lock); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); /* * If old version and ACL won't fit in bonus and we aren't * upgrading then take out necessary DMU holds */ if ((acl_obj = zfs_external_acl(zp)) != 0) { if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { mutex_exit(&zp->z_acl_lock); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); zfs_acl_free(aclp); return (error); } error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); ASSERT(zp->z_acl_cached == NULL); zp->z_acl_cached = aclp; if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); zfs_log_acl(zilog, tx, zp, vsecp, fuidp); if (fuidp) zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); mutex_exit(&zp->z_acl_lock); return (error); } /* * Check accesses of interest (AoI) against attributes of the dataset * such as read-only. Returns zero if no AoI conflict with dataset * attributes, otherwise an appropriate errno is returned. */ static int zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) { if ((v4_mode & WRITE_MASK) && (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && (!IS_DEVVP(ZTOV(zp)) || (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { return (SET_ERROR(EROFS)); } /* * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common(). */ if ((v4_mode & WRITE_MASK_DATA) && (zp->z_pflags & ZFS_IMMUTABLE)) { return (SET_ERROR(EPERM)); } #ifdef illumos if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && (zp->z_pflags & ZFS_NOUNLINK)) { return (SET_ERROR(EPERM)); } #else /* * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK * (sunlnk) is set. We just don't allow directory removal, which is * handled in zfs_zaccess_delete(). */ if ((v4_mode & ACE_DELETE) && (zp->z_pflags & ZFS_NOUNLINK)) { return (EPERM); } #endif if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && (zp->z_pflags & ZFS_AV_QUARANTINED))) { return (SET_ERROR(EACCES)); } return (0); } /* * The primary usage of this function is to loop through all of the * ACEs in the znode, determining what accesses of interest (AoI) to * the caller are allowed or denied. The AoI are expressed as bits in * the working_mode parameter. As each ACE is processed, bits covered * by that ACE are removed from the working_mode. This removal * facilitates two things. The first is that when the working mode is * empty (= 0), we know we've looked at all the AoI. The second is * that the ACE interpretation rules don't allow a later ACE to undo * something granted or denied by an earlier ACE. Removing the * discovered access or denial enforces this rule. At the end of * processing the ACEs, all AoI that were found to be denied are * placed into the working_mode, giving the caller a mask of denied * accesses. Returns: * 0 if all AoI granted * EACCESS if the denied mask is non-zero * other error if abnormal failure (e.g., IO error) * * A secondary usage of the function is to determine if any of the * AoI are granted. If an ACE grants any access in * the working_mode, we immediately short circuit out of the function. * This mode is chosen by setting anyaccess to B_TRUE. The * working_mode is not a denied access mask upon exit if the function * is used in this manner. */ static int zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, boolean_t anyaccess, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_acl_t *aclp; int error; uid_t uid = crgetuid(cr); - uint64_t who; + uint64_t who; uint16_t type, iflags; uint16_t entry_type; uint32_t access_mask; uint32_t deny_mask = 0; zfs_ace_hdr_t *acep = NULL; boolean_t checkit; uid_t gowner; uid_t fowner; zfs_fuid_map_ids(zp, cr, &fowner, &gowner); mutex_enter(&zp->z_acl_lock); ASSERT_VOP_LOCKED(ZTOV(zp), __func__); error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } ASSERT(zp->z_acl_cached); while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { uint32_t mask_matched; if (!zfs_acl_valid_ace_type(type, iflags)) continue; if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) continue; /* Skip ACE if it does not affect any AoI */ mask_matched = (access_mask & *working_mode); if (!mask_matched) continue; entry_type = (iflags & ACE_TYPE_FLAGS); checkit = B_FALSE; switch (entry_type) { case ACE_OWNER: if (uid == fowner) checkit = B_TRUE; break; case OWNING_GROUP: who = gowner; /*FALLTHROUGH*/ case ACE_IDENTIFIER_GROUP: checkit = zfs_groupmember(zfsvfs, who, cr); break; case ACE_EVERYONE: checkit = B_TRUE; break; /* USER Entry */ default: if (entry_type == 0) { uid_t newid; newid = zfs_fuid_map_id(zfsvfs, who, cr, ZFS_ACE_USER); if (newid != IDMAP_WK_CREATOR_OWNER_UID && uid == newid) checkit = B_TRUE; break; } else { mutex_exit(&zp->z_acl_lock); return (SET_ERROR(EIO)); } } if (checkit) { if (type == DENY) { DTRACE_PROBE3(zfs__ace__denies, znode_t *, zp, zfs_ace_hdr_t *, acep, uint32_t, mask_matched); deny_mask |= mask_matched; } else { DTRACE_PROBE3(zfs__ace__allows, znode_t *, zp, zfs_ace_hdr_t *, acep, uint32_t, mask_matched); if (anyaccess) { mutex_exit(&zp->z_acl_lock); return (0); } } *working_mode &= ~mask_matched; } /* Are we done? */ if (*working_mode == 0) break; } mutex_exit(&zp->z_acl_lock); /* Put the found 'denies' back on the working mode */ if (deny_mask) { *working_mode |= deny_mask; return (SET_ERROR(EACCES)); } else if (*working_mode) { return (-1); } return (0); } /* * Return true if any access whatsoever granted, we don't actually * care what access is granted. */ boolean_t zfs_has_access(znode_t *zp, cred_t *cr) { uint32_t have = ACE_ALL_PERMS; if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { uid_t owner; owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); } return (B_TRUE); } static int zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; int err; *working_mode = v4_mode; *check_privs = B_TRUE; /* * Short circuit empty requests */ if (v4_mode == 0 || zfsvfs->z_replay) { *working_mode = 0; return (0); } if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { *check_privs = B_FALSE; return (err); } /* * The caller requested that the ACL check be skipped. This * would only happen if the caller checked VOP_ACCESS() with a * 32 bit ACE mask and already had the appropriate permissions. */ if (skipaclchk) { *working_mode = 0; return (0); } /* * Note: ZFS_READONLY represents the "DOS R/O" attribute. * When that flag is set, we should behave as if write access * were not granted by anything in the ACL. In particular: * We _must_ allow writes after opening the file r/w, then * setting the DOS R/O attribute, and writing some more. * (Similar to how you can write after fchmod(fd, 0444).) * * Therefore ZFS_READONLY is ignored in the dataset check * above, and checked here as if part of the ACL check. * Also note: DOS R/O is ignored for directories. */ if ((v4_mode & WRITE_MASK_DATA) && (ZTOV(zp)->v_type != VDIR) && (zp->z_pflags & ZFS_READONLY)) { return (SET_ERROR(EPERM)); } return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); } static int zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, cred_t *cr) { if (*working_mode != ACE_WRITE_DATA) return (SET_ERROR(EACCES)); return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, check_privs, B_FALSE, cr)); } int zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) { boolean_t owner = B_FALSE; boolean_t groupmbr = B_FALSE; boolean_t is_attr; uid_t uid = crgetuid(cr); int error; if (zdp->z_pflags & ZFS_AV_QUARANTINED) return (SET_ERROR(EACCES)); is_attr = ((zdp->z_pflags & ZFS_XATTR) && (ZTOV(zdp)->v_type == VDIR)); if (is_attr) goto slow; mutex_enter(&zdp->z_acl_lock); if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { mutex_exit(&zdp->z_acl_lock); return (0); } if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { mutex_exit(&zdp->z_acl_lock); goto slow; } if (uid == zdp->z_uid) { owner = B_TRUE; if (zdp->z_mode & S_IXUSR) { mutex_exit(&zdp->z_acl_lock); return (0); } else { mutex_exit(&zdp->z_acl_lock); goto slow; } } if (groupmember(zdp->z_gid, cr)) { groupmbr = B_TRUE; if (zdp->z_mode & S_IXGRP) { mutex_exit(&zdp->z_acl_lock); return (0); } else { mutex_exit(&zdp->z_acl_lock); goto slow; } } if (!owner && !groupmbr) { if (zdp->z_mode & S_IXOTH) { mutex_exit(&zdp->z_acl_lock); return (0); } } mutex_exit(&zdp->z_acl_lock); slow: DTRACE_PROBE(zfs__fastpath__execute__access__miss); ZFS_ENTER(zdp->z_zfsvfs); error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); ZFS_EXIT(zdp->z_zfsvfs); return (error); } /* * Determine whether Access should be granted/denied. * * The least priv subsystem is always consulted as a basic privilege * can define any form of access. */ int zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) { uint32_t working_mode; int error; int is_attr; - boolean_t check_privs; + boolean_t check_privs; znode_t *xzp; - znode_t *check_zp = zp; + znode_t *check_zp = zp; mode_t needed_bits; uid_t owner; is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); #ifdef __FreeBSD_kernel__ /* * In FreeBSD, we don't care about permissions of individual ADS. * Note that not checking them is not just an optimization - without * this shortcut, EA operations may bogusly fail with EACCES. */ if (zp->z_pflags & ZFS_XATTR) return (0); #else /* * If attribute then validate against base file */ if (is_attr) { uint64_t parent; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), &parent, sizeof (parent))) != 0) return (error); if ((error = zfs_zget(zp->z_zfsvfs, parent, &xzp)) != 0) { return (error); } check_zp = xzp; /* * fixup mode to map to xattr perms */ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); mode |= ACE_WRITE_NAMED_ATTRS; } if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { mode &= ~(ACE_READ_DATA|ACE_EXECUTE); mode |= ACE_READ_NAMED_ATTRS; } } #endif owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); /* * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC * in needed_bits. Map the bits mapped by working_mode (currently * missing) in missing_bits. * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), * needed_bits. */ needed_bits = 0; working_mode = mode; if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && owner == crgetuid(cr)) working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) needed_bits |= VREAD; if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) needed_bits |= VWRITE; if (working_mode & ACE_EXECUTE) needed_bits |= VEXEC; if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, &check_privs, skipaclchk, cr)) == 0) { if (is_attr) VN_RELE(ZTOV(xzp)); return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, needed_bits, needed_bits)); } if (error && !check_privs) { if (is_attr) VN_RELE(ZTOV(xzp)); return (error); } if (error && (flags & V_APPEND)) { error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); } if (error && check_privs) { mode_t checkmode = 0; /* * First check for implicit owner permission on * read_acl/read_attributes */ error = 0; ASSERT(working_mode != 0); if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && owner == crgetuid(cr))) working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) checkmode |= VREAD; if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) checkmode |= VWRITE; if (working_mode & ACE_EXECUTE) checkmode |= VEXEC; error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner, needed_bits & ~checkmode, needed_bits); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); if (error == 0 && (working_mode & ACE_WRITE_ACL)) error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, owner); if (error == 0 && (working_mode & (ACE_DELETE|ACE_DELETE_CHILD))) error = secpolicy_vnode_remove(ZTOV(check_zp), cr); if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); } if (error == 0) { /* * See if any bits other than those already checked * for are still present. If so then return EACCES */ if (working_mode & ~(ZFS_CHECKED_MASKS)) { error = SET_ERROR(EACCES); } } } else if (error == 0) { error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, needed_bits, needed_bits); } if (is_attr) VN_RELE(ZTOV(xzp)); return (error); } /* * Translate traditional unix VREAD/VWRITE/VEXEC mode into * native ACL format and call zfs_zaccess() */ int zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) { return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); } /* * Access function for secpolicy_vnode_setattr */ int zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) { int v4_mode = zfs_unix_to_v4(mode >> 6); return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); } static int zfs_delete_final_check(znode_t *zp, znode_t *dzp, mode_t available_perms, cred_t *cr) { int error; uid_t downer; downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER); error = secpolicy_vnode_access2(cr, ZTOV(dzp), downer, available_perms, VWRITE|VEXEC); if (error == 0) error = zfs_sticky_remove_access(dzp, zp, cr); return (error); } /* * Determine whether Access should be granted/deny, without * consulting least priv subsystem. * * The following chart is the recommended NFSv4 enforcement for * ability to delete an object. * * ------------------------------------------------------- * | Parent Dir | Target Object Permissions | * | permissions | | * ------------------------------------------------------- * | | ACL Allows | ACL Denies| Delete | * | | Delete | Delete | unspecified| * ------------------------------------------------------- * | ACL Allows | Permit | Permit | Permit | * | DELETE_CHILD | | * ------------------------------------------------------- * | ACL Denies | Permit | Deny | Deny | * | DELETE_CHILD | | | | * ------------------------------------------------------- * | ACL specifies | | | | * | only allow | Permit | Permit | Permit | * | write and | | | | * | execute | | | | * ------------------------------------------------------- * | ACL denies | | | | * | write and | Permit | Deny | Deny | * | execute | | | | * ------------------------------------------------------- * ^ * | * No search privilege, can't even look up file? * */ int zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) { uint32_t dzp_working_mode = 0; uint32_t zp_working_mode = 0; int dzp_error, zp_error; mode_t available_perms; boolean_t dzpcheck_privs = B_TRUE; boolean_t zpcheck_privs = B_TRUE; /* * We want specific DELETE permissions to * take precedence over WRITE/EXECUTE. We don't * want an ACL such as this to mess us up. * user:joe:write_data:deny,user:joe:delete:allow * * However, deny permissions may ultimately be overridden * by secpolicy_vnode_access(). * * We will ask for all of the necessary permissions and then * look at the working modes from the directory and target object * to determine what was found. */ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) return (SET_ERROR(EPERM)); /* * First row * If the directory permissions allow the delete, we are done. */ if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) return (0); /* * If target object has delete permission then we are done */ if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, &zpcheck_privs, B_FALSE, cr)) == 0) return (0); ASSERT(dzp_error && zp_error); if (!dzpcheck_privs) return (dzp_error); if (!zpcheck_privs) return (zp_error); /* * Second row * * If directory returns EACCES then delete_child was denied * due to deny delete_child. In this case send the request through * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() * since that *could* allow the delete based on write/execute permission * and we want delete permissions to override write/execute. */ if (dzp_error == EACCES) return (secpolicy_vnode_remove(ZTOV(dzp), cr)); /* XXXPJD: s/dzp/zp/ ? */ /* * Third Row * only need to see if we have write/execute on directory. */ dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); if (dzp_error != 0 && !dzpcheck_privs) return (dzp_error); /* * Fourth row */ available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE; available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC; return (zfs_delete_final_check(zp, dzp, available_perms, cr)); } int zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, znode_t *tzp, cred_t *cr) { int add_perm; int error; if (szp->z_pflags & ZFS_AV_QUARANTINED) return (SET_ERROR(EACCES)); add_perm = (ZTOV(szp)->v_type == VDIR) ? ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; /* * Rename permissions are combination of delete permission + * add file/subdir permission. * * BSD operating systems also require write permission * on the directory being moved from one parent directory * to another. */ if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { if (error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)) return (error); } /* * first make sure we do the delete portion. * * If that succeeds then check for add_file/add_subdir permissions */ if (error = zfs_zaccess_delete(sdzp, szp, cr)) return (error); /* * If we have a tzp, see if we can delete it? */ if (tzp) { if (error = zfs_zaccess_delete(tdzp, tzp, cr)) return (error); } /* * Now check for add permissions */ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); return (error); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c (revision 351077) @@ -1,1069 +1,1070 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Functions to replay ZFS intent log (ZIL) records * The functions are called through a function vector (zfs_replay_vector) * which is indexed by the transaction type. */ static void zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) { VATTR_NULL(vap); vap->va_mask = (uint_t)mask; if (mask & AT_TYPE) vap->va_type = IFTOVT(mode); if (mask & AT_MODE) vap->va_mode = mode & MODEMASK; if (mask & AT_UID) vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; if (mask & AT_GID) vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid; vap->va_rdev = zfs_cmpldev(rdev); vap->va_nodeid = nodeid; } /* ARGSUSED */ static int zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap) { return (SET_ERROR(ENOTSUP)); } static void zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) { xoptattr_t *xoap = NULL; uint64_t *attrs; uint64_t *crtime; uint32_t *bitmap; void *scanstamp; int i; xvap->xva_vattr.va_mask |= AT_XVATTR; if ((xoap = xva_getxoptattr(xvap)) == NULL) { xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */ return; } ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); bitmap = &lrattr->lr_attr_bitmap; for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) xvap->xva_reqattrmap[i] = *bitmap; attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); crtime = attrs + 1; scanstamp = (caddr_t)(crtime + 2); if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); if (XVA_ISSET_REQ(xvap, XAT_READONLY)) xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) xoap->xoa_av_quarantined = ((*attrs & XAT0_AV_QUARANTINED) != 0); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0); if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); } static int zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) { uint64_t uid_idx; uint64_t gid_idx; int domcnt = 0; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); if (uid_idx) domcnt++; if (gid_idx > 0 && gid_idx != uid_idx) domcnt++; return (domcnt); } static void * zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, int domcnt) { int i; for (i = 0; i != domcnt; i++) { fuid_infop->z_domain_table[i] = start; start = (caddr_t)start + strlen(start) + 1; } return (start); } /* * Set the uid/gid in the fuid_info structure. */ static void zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) { /* * If owner or group are log specific FUIDs then slurp up * domain information and build zfs_fuid_info_t */ if (IS_EPHEMERAL(uid)) fuid_infop->z_fuid_owner = uid; if (IS_EPHEMERAL(gid)) fuid_infop->z_fuid_group = gid; } /* * Load fuid domains into fuid_info_t */ static zfs_fuid_info_t * zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) { int domcnt; zfs_fuid_info_t *fuid_infop; fuid_infop = zfs_fuid_info_alloc(); domcnt = zfs_replay_domain_cnt(uid, gid); if (domcnt == 0) return (fuid_infop); fuid_infop->z_domain_table = kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); zfs_replay_fuid_ugid(fuid_infop, uid, gid); fuid_infop->z_domain_cnt = domcnt; *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); return (fuid_infop); } /* * load zfs_fuid_t's and fuid_domains into fuid_info_t */ static zfs_fuid_info_t * zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, uint64_t gid) { uint64_t *log_fuid = (uint64_t *)start; zfs_fuid_info_t *fuid_infop; int i; fuid_infop = zfs_fuid_info_alloc(); fuid_infop->z_domain_cnt = domcnt; fuid_infop->z_domain_table = kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); for (i = 0; i != idcnt; i++) { zfs_fuid_t *zfuid; zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); zfuid->z_logfuid = *log_fuid; zfuid->z_id = -1; zfuid->z_domidx = 0; list_insert_tail(&fuid_infop->z_fuids, zfuid); log_fuid++; } zfs_replay_fuid_ugid(fuid_infop, uid, gid); *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); return (fuid_infop); } static void zfs_replay_swap_attrs(lr_attr_t *lrattr) { /* swap the lr_attr structure */ byteswap_uint32_array(lrattr, sizeof (*lrattr)); /* swap the bitmap */ byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * sizeof (uint32_t)); /* swap the attributes, create time + 64 bit word for attributes */ byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); } /* * Replay file create with optional ACL, xvattr information as well * as option FUID information. */ static int zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_acl_create_t *lracl = arg2; char *name = NULL; /* location determined later */ lr_create_t *lr = (lr_create_t *)lracl; znode_t *dzp; vnode_t *vp = NULL; xvattr_t xva; int vflg = 0; vsecattr_t vsec = { 0 }; lr_attr_t *lrattr; void *aclstart; void *fuidstart; size_t xvatlen = 0; uint64_t txtype; uint64_t objid; uint64_t dnodesize; int error; txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lracl, sizeof (*lracl)); if (txtype == TX_CREATE_ACL_ATTR || txtype == TX_MKDIR_ACL_ATTR) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); zfs_replay_swap_attrs(lrattr); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); } aclstart = (caddr_t)(lracl + 1) + xvatlen; zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); /* swap fuids */ if (lracl->lr_fuidcnt) { byteswap_uint64_array((caddr_t)aclstart + ZIL_ACE_LENGTH(lracl->lr_acl_bytes), lracl->lr_fuidcnt * sizeof (uint64_t)); } } if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); objid = LR_FOID_GET_OBJ(lr->lr_foid); dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; - + xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's * creation time, generation number, and dnode size. The generic * zfs_create() has no concept of these attributes, so we smuggle * the values inside the vattr's otherwise unused va_ctime, * va_nblocks, and va_fsid fields. + */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; xva.xva_vattr.va_fsid = dnodesize; error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); if (error != ENOENT) goto bail; if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; switch (txtype) { case TX_CREATE_ACL: aclstart = (caddr_t)(lracl + 1); fuidstart = (caddr_t)aclstart + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); /*FALLTHROUGH*/ case TX_CREATE_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); xva.xva_vattr.va_mask |= AT_XVATTR; zfs_replay_xvattr(lrattr, &xva); } vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; vsec.vsa_aclcnt = lracl->lr_aclcnt; vsec.vsa_aclentsz = lracl->lr_acl_bytes; vsec.vsa_aclflags = lracl->lr_acl_flags; if (zfsvfs->z_fuid_replay == NULL) { fuidstart = (caddr_t)(lracl + 1) + xvatlen + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); } #ifdef TODO error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, 0, 0, &vp, kcred, vflg, NULL, &vsec); #else panic("%s:%u: unsupported condition", __func__, __LINE__); #endif break; case TX_MKDIR_ACL: aclstart = (caddr_t)(lracl + 1); fuidstart = (caddr_t)aclstart + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); /*FALLTHROUGH*/ case TX_MKDIR_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); zfs_replay_xvattr(lrattr, &xva); } vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; vsec.vsa_aclcnt = lracl->lr_aclcnt; vsec.vsa_aclentsz = lracl->lr_acl_bytes; vsec.vsa_aclflags = lracl->lr_acl_flags; if (zfsvfs->z_fuid_replay == NULL) { fuidstart = (caddr_t)(lracl + 1) + xvatlen + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); } #ifdef TODO error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, &vp, kcred, NULL, vflg, &vsec); #else panic("%s:%u: unsupported condition", __func__, __LINE__); #endif break; default: error = SET_ERROR(ENOTSUP); } bail: if (error == 0 && vp != NULL) VN_RELE(vp); VN_RELE(ZTOV(dzp)); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; return (error); } static int zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_create_t *lr = arg2; char *name = NULL; /* location determined later */ char *link; /* symlink content follows name */ znode_t *dzp; vnode_t *vp = NULL; xvattr_t xva; int vflg = 0; size_t lrsize = sizeof (lr_create_t); lr_attr_t *lrattr; void *start; size_t xvatlen; uint64_t txtype; struct componentname cn; int error; txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); uint64_t objid = LR_FOID_GET_OBJ(lr->lr_foid); int dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's * creation time, generation number, and dnode slot count. The * generic zfs_create() has no concept of these attributes, so - * we smuggle the values inside the vattr's otherwise unused - * va_ctime, va_nblocks and va_fsid fields. + * we smuggle the values inside * the vattr's otherwise unused + * va_ctime, va_nblocks, and va_nlink fields. */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; xva.xva_vattr.va_fsid = dnodesize; error = dmu_object_info(zfsvfs->z_os, objid, NULL); if (error != ENOENT) goto out; if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; /* * Symlinks don't have fuid info, and CIFS never creates * symlinks. * * The _ATTR versions will grab the fuid info in their subcases. */ if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { start = (lr + 1); zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); } cn.cn_cred = kcred; cn.cn_thread = curthread; cn.cn_flags = SAVENAME; vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); switch (txtype) { case TX_CREATE_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); start = (caddr_t)(lr + 1) + xvatlen; zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; /*FALLTHROUGH*/ case TX_CREATE: if (name == NULL) name = (char *)start; cn.cn_nameptr = name; error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/); break; case TX_MKDIR_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); start = (caddr_t)(lr + 1) + xvatlen; zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; /*FALLTHROUGH*/ case TX_MKDIR: if (name == NULL) name = (char *)(lr + 1); cn.cn_nameptr = name; error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/); break; case TX_MKXATTR: error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred); break; case TX_SYMLINK: name = (char *)(lr + 1); link = name + strlen(name) + 1; cn.cn_nameptr = name; error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &xva.xva_vattr, link /*,vflg*/); break; default: error = SET_ERROR(ENOTSUP); } VOP_UNLOCK(ZTOV(dzp), 0); out: if (error == 0 && vp != NULL) VN_URELE(vp); VN_RELE(ZTOV(dzp)); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; return (error); } static int zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_remove_t *lr = arg2; char *name = (char *)(lr + 1); /* name follows lr_remove_t */ znode_t *dzp; struct componentname cn; vnode_t *vp; int error; int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; cn.cn_nameptr = name; cn.cn_namelen = strlen(name); cn.cn_nameiop = DELETE; cn.cn_flags = ISLASTCN | SAVENAME; cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; cn.cn_cred = kcred; cn.cn_thread = curthread; vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn); if (error != 0) { VOP_UNLOCK(ZTOV(dzp), 0); goto fail; } switch ((int)lr->lr_common.lrc_txtype) { case TX_REMOVE: error = VOP_REMOVE(ZTOV(dzp), vp, &cn /*,vflg*/); break; case TX_RMDIR: error = VOP_RMDIR(ZTOV(dzp), vp, &cn /*,vflg*/); break; default: error = SET_ERROR(ENOTSUP); } vput(vp); VOP_UNLOCK(ZTOV(dzp), 0); fail: VN_RELE(ZTOV(dzp)); return (error); } static int zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_link_t *lr = arg2; char *name = (char *)(lr + 1); /* name follows lr_link_t */ znode_t *dzp, *zp; struct componentname cn; int error; int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { VN_RELE(ZTOV(dzp)); return (error); } if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; cn.cn_nameptr = name; cn.cn_cred = kcred; cn.cn_thread = curthread; cn.cn_flags = SAVENAME; vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn /*,vflg*/); VOP_UNLOCK(ZTOV(zp), 0); VOP_UNLOCK(ZTOV(dzp), 0); VN_RELE(ZTOV(zp)); VN_RELE(ZTOV(dzp)); return (error); } static int zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ char *tname = sname + strlen(sname) + 1; znode_t *sdzp, *tdzp; struct componentname scn, tcn; vnode_t *svp, *tvp; kthread_t *td = curthread; int error; int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { VN_RELE(ZTOV(sdzp)); return (error); } if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; svp = tvp = NULL; scn.cn_nameptr = sname; scn.cn_namelen = strlen(sname); scn.cn_nameiop = DELETE; scn.cn_flags = ISLASTCN | SAVENAME; scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; scn.cn_cred = kcred; scn.cn_thread = td; vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn); VOP_UNLOCK(ZTOV(sdzp), 0); if (error != 0) goto fail; VOP_UNLOCK(svp, 0); tcn.cn_nameptr = tname; tcn.cn_namelen = strlen(tname); tcn.cn_nameiop = RENAME; tcn.cn_flags = ISLASTCN | SAVENAME; tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; tcn.cn_cred = kcred; tcn.cn_thread = td; vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn); if (error == EJUSTRETURN) tvp = NULL; else if (error != 0) { VOP_UNLOCK(ZTOV(tdzp), 0); goto fail; } error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn /*,vflg*/); return (error); fail: if (svp != NULL) vrele(svp); if (tvp != NULL) vrele(tvp); VN_RELE(ZTOV(tdzp)); VN_RELE(ZTOV(sdzp)); return (error); } static int zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_write_t *lr = arg2; char *data = (char *)(lr + 1); /* data follows lr_write_t */ znode_t *zp; int error; ssize_t resid; uint64_t eod, offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * As we can log writes out of order, it's possible the * file has been removed. In this case just drop the write * and return success. */ if (error == ENOENT) error = 0; return (error); } offset = lr->lr_offset; length = lr->lr_length; eod = offset + length; /* end of data for this write */ /* * This may be a write from a dmu_sync() for a whole block, * and may extend beyond the current end of the file. * We can't just replay what was written for this TX_WRITE as * a future TX_WRITE2 may extend the eof and the data for that * write needs to be there. So we write the whole block and * reduce the eof. This needs to be done within the single dmu * transaction created within vn_rdwr -> zfs_write. So a possible * new end of file is passed through in zfsvfs->z_replay_eof */ zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */ /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } if (zp->z_size < eod) zfsvfs->z_replay_eof = eod; } error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); VN_RELE(ZTOV(zp)); zfsvfs->z_replay_eof = 0; /* safety */ return (error); } /* * TX_WRITE2 are only generated when dmu_sync() returns EALREADY * meaning the pool block is already being synced. So now that we always write * out full blocks, all we have to do is expand the eof if * the file is grown. */ static int zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_write_t *lr = arg2; znode_t *zp; int error; uint64_t end; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); top: end = lr->lr_offset + lr->lr_length; if (end > zp->z_size) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); zp->z_size = end; dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { VN_RELE(ZTOV(zp)); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); return (error); } (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); /* Ensure the replayed seq is updated */ (void) zil_replaying(zfsvfs->z_log, tx); dmu_tx_commit(tx); } VN_RELE(ZTOV(zp)); return (error); } static int zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { #ifdef illumos zfsvfs_t *zfsvfs = arg1; lr_truncate_t *lr = arg2; znode_t *zp; flock64_t fl; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); bzero(&fl, sizeof (fl)); fl.l_type = F_WRLCK; fl.l_whence = 0; fl.l_start = lr->lr_offset; fl.l_len = lr->lr_length; error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX, lr->lr_offset, kcred, NULL); VN_RELE(ZTOV(zp)); return (error); #else ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); return (EOPNOTSUPP); #endif } static int zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_setattr_t *lr = arg2; znode_t *zp; xvattr_t xva; vattr_t *vap = &xva.xva_vattr; vnode_t *vp; int error; void *start; xva_init(&xva); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); if ((lr->lr_mask & AT_XVATTR) && zfsvfs->z_version >= ZPL_VERSION_INITIAL) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); vap->va_size = lr->lr_size; ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); /* * Fill in xvattr_t portions if necessary. */ start = (lr_setattr_t *)(lr + 1); if (vap->va_mask & AT_XVATTR) { zfs_replay_xvattr((lr_attr_t *)start, &xva); start = (caddr_t)start + ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); } else xva.xva_vattr.va_mask &= ~AT_XVATTR; zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); vp = ZTOV(zp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_SETATTR(vp, vap, kcred); VOP_UNLOCK(vp, 0); zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; VN_RELE(vp); return (error); } extern int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct); static int zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_acl_v0_t *lr = arg2; ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ vsecattr_t vsa; vnode_t *vp; znode_t *zp; int error; if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_oldace_byteswap(ace, lr->lr_aclcnt); } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; vsa.vsa_aclflags = 0; vsa.vsa_aclentp = ace; vp = ZTOV(zp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL); VOP_UNLOCK(vp, 0); VN_RELE(vp); return (error); } /* * Replaying ACLs is complicated by FUID support. * The log record may contain some optional data * to be used for replaying FUID's. These pieces * are the actual FUIDs that were created initially. * The FUID table index may no longer be valid and * during zfs_create() a new index may be assigned. * Because of this the log will contain the original * doman+rid in order to create a new FUID. * * The individual ACEs may contain an ephemeral uid/gid which is no * longer valid and will need to be replaced with an actual FUID. * */ static int zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_acl_t *lr = arg2; ace_t *ace = (ace_t *)(lr + 1); vsecattr_t vsa; znode_t *zp; vnode_t *vp; int error; if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); if (lr->lr_fuidcnt) { byteswap_uint64_array((caddr_t)ace + ZIL_ACE_LENGTH(lr->lr_acl_bytes), lr->lr_fuidcnt * sizeof (uint64_t)); } } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentp = ace; vsa.vsa_aclentsz = lr->lr_acl_bytes; vsa.vsa_aclflags = lr->lr_acl_flags; if (lr->lr_fuidcnt) { void *fuidstart = (caddr_t)ace + ZIL_ACE_LENGTH(lr->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, &fuidstart, lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); } vp = ZTOV(zp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL); VOP_UNLOCK(vp, 0); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; VN_RELE(vp); return (error); } /* * Callback vectors for replaying records */ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_error, /* 0 no such transaction type */ zfs_replay_create, /* TX_CREATE */ zfs_replay_create, /* TX_MKDIR */ zfs_replay_create, /* TX_MKXATTR */ zfs_replay_create, /* TX_SYMLINK */ zfs_replay_remove, /* TX_REMOVE */ zfs_replay_remove, /* TX_RMDIR */ zfs_replay_link, /* TX_LINK */ zfs_replay_rename, /* TX_RENAME */ zfs_replay_write, /* TX_WRITE */ zfs_replay_truncate, /* TX_TRUNCATE */ zfs_replay_setattr, /* TX_SETATTR */ zfs_replay_acl_v0, /* TX_ACL_V0 */ zfs_replay_acl, /* TX_ACL */ zfs_replay_create_acl, /* TX_CREATE_ACL */ zfs_replay_create, /* TX_CREATE_ATTR */ zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL */ zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ zfs_replay_write2, /* TX_WRITE2 */ }; Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c (revision 351077) @@ -1,2281 +1,2289 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2011 Martin Matuska */ #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" /* Used by fstat(1). */ SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)"); /* * Define ZNODE_STATS to turn on statistic gathering. By default, it is only * turned on when DEBUG is also defined. */ #ifdef DEBUG #define ZNODE_STATS #endif /* DEBUG */ #ifdef ZNODE_STATS #define ZNODE_STAT_ADD(stat) ((stat)++) #else #define ZNODE_STAT_ADD(stat) /* nothing */ #endif /* ZNODE_STATS */ /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL /* * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to * be freed before it can be safely accessed. */ krwlock_t zfsvfs_lock; static kmem_cache_t *znode_cache = NULL; /*ARGSUSED*/ static void znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) { /* * We should never drop all dbuf refs without first clearing * the eviction callback. */ panic("evicting znode %p\n", user_ptr); } extern struct vop_vector zfs_vnodeops; extern struct vop_vector zfs_fifoops; extern struct vop_vector zfs_shareops; static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { znode_t *zp = buf; POINTER_INVALIDATE(&zp->z_zfsvfs); list_link_init(&zp->z_link_node); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zp->z_range_avl, zfs_range_compare, sizeof (rl_t), offsetof(rl_t, r_node)); zp->z_acl_cached = NULL; zp->z_vnode = NULL; zp->z_moved = 0; return (0); } /*ARGSUSED*/ static void zfs_znode_cache_destructor(void *buf, void *arg) { znode_t *zp = buf; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); ASSERT3P(zp->z_vnode, ==, NULL); ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_acl_lock); avl_destroy(&zp->z_range_avl); mutex_destroy(&zp->z_range_lock); ASSERT(zp->z_acl_cached == NULL); } #ifdef ZNODE_STATS static struct { uint64_t zms_zfsvfs_invalid; uint64_t zms_zfsvfs_recheck1; uint64_t zms_zfsvfs_unmounted; uint64_t zms_zfsvfs_recheck2; uint64_t zms_obj_held; uint64_t zms_vnode_locked; uint64_t zms_not_only_dnlc; } znode_move_stats; #endif /* ZNODE_STATS */ #ifdef illumos static void zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) { vnode_t *vp; /* Copy fields. */ nzp->z_zfsvfs = ozp->z_zfsvfs; /* Swap vnodes. */ vp = nzp->z_vnode; nzp->z_vnode = ozp->z_vnode; ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ ZTOV(ozp)->v_data = ozp; ZTOV(nzp)->v_data = nzp; nzp->z_id = ozp->z_id; ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); nzp->z_unlinked = ozp->z_unlinked; nzp->z_atime_dirty = ozp->z_atime_dirty; nzp->z_zn_prefetch = ozp->z_zn_prefetch; nzp->z_blksz = ozp->z_blksz; nzp->z_seq = ozp->z_seq; nzp->z_mapcnt = ozp->z_mapcnt; nzp->z_gen = ozp->z_gen; nzp->z_sync_cnt = ozp->z_sync_cnt; nzp->z_is_sa = ozp->z_is_sa; nzp->z_sa_hdl = ozp->z_sa_hdl; bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2); nzp->z_links = ozp->z_links; nzp->z_size = ozp->z_size; nzp->z_pflags = ozp->z_pflags; nzp->z_uid = ozp->z_uid; nzp->z_gid = ozp->z_gid; nzp->z_mode = ozp->z_mode; /* * Since this is just an idle znode and kmem is already dealing with * memory pressure, release any cached ACL. */ if (ozp->z_acl_cached) { zfs_acl_free(ozp->z_acl_cached); ozp->z_acl_cached = NULL; } sa_set_userp(nzp->z_sa_hdl, nzp); /* * Invalidate the original znode by clearing fields that provide a * pointer back to the znode. Set the low bit of the vfs pointer to * ensure that zfs_znode_move() recognizes the znode as invalid in any * subsequent callback. */ ozp->z_sa_hdl = NULL; POINTER_INVALIDATE(&ozp->z_zfsvfs); /* * Mark the znode. */ nzp->z_moved = 1; ozp->z_moved = (uint8_t)-1; } /*ARGSUSED*/ static kmem_cbrc_t zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) { znode_t *ozp = buf, *nzp = newbuf; zfsvfs_t *zfsvfs; vnode_t *vp; /* * The znode is on the file system's list of known znodes if the vfs * pointer is valid. We set the low bit of the vfs pointer when freeing * the znode to invalidate it, and the memory patterns written by kmem * (baddcafe and deadbeef) set at least one of the two low bits. A newly * created znode sets the vfs pointer last of all to indicate that the * znode is known and in a valid state to be moved by this function. */ zfsvfs = ozp->z_zfsvfs; if (!POINTER_IS_VALID(zfsvfs)) { ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); return (KMEM_CBRC_DONT_KNOW); } /* * Close a small window in which it's possible that the filesystem could * be unmounted and freed, and zfsvfs, though valid in the previous * statement, could point to unrelated memory by the time we try to * prevent the filesystem from being unmounted. */ rw_enter(&zfsvfs_lock, RW_WRITER); if (zfsvfs != ozp->z_zfsvfs) { rw_exit(&zfsvfs_lock); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1); return (KMEM_CBRC_DONT_KNOW); } /* * If the znode is still valid, then so is the file system. We know that * no valid file system can be freed while we hold zfsvfs_lock, so we * can safely ensure that the filesystem is not and will not be * unmounted. The next statement is equivalent to ZFS_ENTER(). */ rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); if (zfsvfs->z_unmounted) { ZFS_EXIT(zfsvfs); rw_exit(&zfsvfs_lock); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); return (KMEM_CBRC_DONT_KNOW); } rw_exit(&zfsvfs_lock); mutex_enter(&zfsvfs->z_znodes_lock); /* * Recheck the vfs pointer in case the znode was removed just before * acquiring the lock. */ if (zfsvfs != ozp->z_zfsvfs) { mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2); return (KMEM_CBRC_DONT_KNOW); } /* * At this point we know that as long as we hold z_znodes_lock, the * znode cannot be freed and fields within the znode can be safely * accessed. Now, prevent a race with zfs_zget(). */ if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); return (KMEM_CBRC_LATER); } vp = ZTOV(ozp); if (mutex_tryenter(&vp->v_lock) == 0) { ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); return (KMEM_CBRC_LATER); } /* Only move znodes that are referenced _only_ by the DNLC. */ if (vp->v_count != 1 || !vn_in_dnlc(vp)) { mutex_exit(&vp->v_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); return (KMEM_CBRC_LATER); } /* * The znode is known and in a valid state to move. We're holding the * locks needed to execute the critical section. */ zfs_znode_move_impl(ozp, nzp); mutex_exit(&vp->v_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); list_link_replace(&ozp->z_link_node, &nzp->z_link_node); mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); return (KMEM_CBRC_YES); } #endif /* illumos */ void zfs_znode_init(void) { /* * Initialize zcache */ rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); kmem_cache_set_move(znode_cache, zfs_znode_move); } void zfs_znode_fini(void) { #ifdef illumos /* * Cleanup vfs & vnode ops */ zfs_remove_op_tables(); #endif /* * Cleanup zcache */ if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; rw_destroy(&zfsvfs_lock); } #ifdef illumos struct vnodeops *zfs_dvnodeops; struct vnodeops *zfs_fvnodeops; struct vnodeops *zfs_symvnodeops; struct vnodeops *zfs_xdvnodeops; struct vnodeops *zfs_evnodeops; struct vnodeops *zfs_sharevnodeops; void zfs_remove_op_tables() { /* * Remove vfs ops */ ASSERT(zfsfstype); (void) vfs_freevfsops_by_type(zfsfstype); zfsfstype = 0; /* * Remove vnode ops */ if (zfs_dvnodeops) vn_freevnodeops(zfs_dvnodeops); if (zfs_fvnodeops) vn_freevnodeops(zfs_fvnodeops); if (zfs_symvnodeops) vn_freevnodeops(zfs_symvnodeops); if (zfs_xdvnodeops) vn_freevnodeops(zfs_xdvnodeops); if (zfs_evnodeops) vn_freevnodeops(zfs_evnodeops); if (zfs_sharevnodeops) vn_freevnodeops(zfs_sharevnodeops); zfs_dvnodeops = NULL; zfs_fvnodeops = NULL; zfs_symvnodeops = NULL; zfs_xdvnodeops = NULL; zfs_evnodeops = NULL; zfs_sharevnodeops = NULL; } extern const fs_operation_def_t zfs_dvnodeops_template[]; extern const fs_operation_def_t zfs_fvnodeops_template[]; extern const fs_operation_def_t zfs_xdvnodeops_template[]; extern const fs_operation_def_t zfs_symvnodeops_template[]; extern const fs_operation_def_t zfs_evnodeops_template[]; extern const fs_operation_def_t zfs_sharevnodeops_template[]; int zfs_create_op_tables() { int error; /* * zfs_dvnodeops can be set if mod_remove() calls mod_installfs() * due to a failure to remove the the 2nd modlinkage (zfs_modldrv). * In this case we just return as the ops vectors are already set up. */ if (zfs_dvnodeops) return (0); error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template, &zfs_dvnodeops); if (error) return (error); error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template, &zfs_fvnodeops); if (error) return (error); error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template, &zfs_symvnodeops); if (error) return (error); error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template, &zfs_xdvnodeops); if (error) return (error); error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, &zfs_evnodeops); if (error) return (error); error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template, &zfs_sharevnodeops); return (error); } #endif /* illumos */ int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { zfs_acl_ids_t acl_ids; vattr_t vattr; znode_t *sharezp; znode_t *zp; int error; vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0555; vattr.va_uid = crgetuid(kcred); vattr.va_gid = crgetgid(kcred); sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); sharezp->z_moved = 0; sharezp->z_unlinked = 0; sharezp->z_atime_dirty = 0; sharezp->z_zfsvfs = zfsvfs; sharezp->z_is_sa = zfsvfs->z_use_sa; VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, kcred, NULL, &acl_ids)); zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, sharezp); POINTER_INVALIDATE(&sharezp->z_zfsvfs); error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); zfsvfs->z_shares_dir = sharezp->z_id; zfs_acl_ids_free(&acl_ids); sa_handle_destroy(sharezp->z_sa_hdl); kmem_cache_free(znode_cache, sharezp); return (error); } /* * define a couple of values we need available * for both 64 and 32 bit environments. */ #ifndef NBITSMINOR64 #define NBITSMINOR64 32 #endif #ifndef MAXMAJ64 #define MAXMAJ64 0xffffffffUL #endif #ifndef MAXMIN64 #define MAXMIN64 0xffffffffUL #endif /* * Create special expldev for ZFS private use. * Can't use standard expldev since it doesn't do * what we want. The standard expldev() takes a * dev32_t in LP64 and expands it to a long dev_t. * We need an interface that takes a dev32_t in ILP32 * and expands it to a long dev_t. */ static uint64_t zfs_expldev(dev_t dev) { return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); } /* * Special cmpldev for ZFS private use. * Can't use standard cmpldev since it takes * a long dev_t and compresses it to dev32_t in * LP64. We need to do a compaction of a long dev_t * to a dev32_t in ILP32. */ dev_t zfs_cmpldev(uint64_t dev) { return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); ASSERT(zp->z_sa_hdl == NULL); ASSERT(zp->z_acl_cached == NULL); if (sa_hdl == NULL) { VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; /* * Slap on VROOT if we are the root znode unless we are the root * node of a snapshot mounted under .zfs. */ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) ZTOV(zp)->v_flag |= VROOT; vn_exists(ZTOV(zp)); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || zp->z_unlinked || RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } static void zfs_vnode_forget(vnode_t *vp) { /* copied from insmntque_stddtr */ vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Construct a new znode/vnode and intialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; vnode_t *vp; uint64_t mode; uint64_t parent; sa_bulk_attr_t bulk[9]; int count = 0; int error; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); KASSERT(curthread->td_vp_reserv > 0, ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); if (error != 0) { kmem_cache_free(znode_cache, zp); return (NULL); } zp->z_vnode = vp; vp->v_data = zp; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); zp->z_moved = 0; /* * Defer setting z_zfsvfs until the znode is ready to be a candidate for * the zfs_znode_move() callback. */ zp->z_sa_hdl = NULL; zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; vp = ZTOV(zp); zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, 8); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zfs_vnode_forget(vp); zp->z_vnode = NULL; kmem_cache_free(znode_cache, zp); return (NULL); } zp->z_mode = mode; vp->v_type = IFTOVT((mode_t)mode); switch (vp->v_type) { case VDIR: zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ break; #ifdef illumos case VBLK: case VCHR: { uint64_t rdev; VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, sizeof (rdev)) == 0); vp->v_rdev = zfs_cmpldev(rdev); } break; #endif case VFIFO: #ifdef illumos case VSOCK: case VDOOR: #endif vp->v_op = &zfs_fifoops; break; case VREG: if (parent == zfsvfs->z_shares_dir) { ASSERT(zp->z_uid == 0 && zp->z_gid == 0); vp->v_op = &zfs_shareops; } break; #ifdef illumos case VLNK: vn_setops(vp, zfs_symvnodeops); break; default: vn_setops(vp, zfs_evnodeops); break; #endif } mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); membar_producer(); /* * Everything else must be valid before assigning z_zfsvfs makes the * znode eligible for zfs_znode_move(). */ zp->z_zfsvfs = zfsvfs; mutex_exit(&zfsvfs->z_znodes_lock); /* * Acquire vnode lock before making it available to the world. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VN_LOCK_AREC(vp); if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); #ifdef illumos VFS_HOLD(zfsvfs->z_vfs); #endif return (zp); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute * bonuslen - length of bonus buffer * setaclp - File/Dir initial ACL * fuidp - Tracks fuid allocation. * * OUT: zpp - allocated znode * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t dzp_pflags = 0; uint64_t rdev = 0; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; dmu_buf_t *db; timestruc_t now; uint64_t gen, obj; int err; int bonuslen; int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; vfs_timestamp(&now); gen = dmu_tx_get_txg(tx); dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } if (dnodesize == 0) dnodesize = DNODE_MIN_SIZE; obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (vap->va_type == VDIR) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx)); } else { obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx)); } else { obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx); } } ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); - VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } else { dzp_pflags = dzp->z_pflags; } /* * If parent is an xattr, so am I. */ if (dzp_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (vap->va_type == VDIR) { size = 2; /* contents ("." and "..") */ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; } else { size = links = 0; } if (vap->va_type == VBLK || vap->va_type == VCHR) { rdev = zfs_expldev(vap->va_rdev); } parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; /* * No execs denied will be deterimed when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } if (obj_type == DMU_OT_ZNODE || (vap->va_type == VBLK || vap->va_type == VCHR)) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); if (!(flag & IS_ROOT_NODE)) { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); ASSERT(*zpp != NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = mode; (*zpp)->z_dnodesize = dnodesize; if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } if (!(flag & IS_ROOT_NODE)) { vnode_t *vp; vp = ZTOV(*zpp); vp->v_vflag |= VV_FORCEINSMQ; err = insmntque(vp, zfsvfs->z_vfs); vp->v_vflag &= ~VV_FORCEINSMQ; KASSERT(err == 0, ("insmntque() failed: error %d", err)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; xoap = xva_getxoptattr(xvap); ASSERT(xoap); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; vnode_t *vp; sa_handle_t *hdl; struct thread *td; int locked; int err; td = curthread; getnewvnode_reserve(1); again: *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); #ifdef __FreeBSD__ getnewvnode_drop_reserve(); #endif return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { err = SET_ERROR(ENOENT); } else { vp = ZTOV(zp); /* * Don't let the vnode disappear after * ZFS_OBJ_HOLD_EXIT. */ VN_HOLD(vp); *zpp = zp; err = 0; } sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); if (err) { getnewvnode_drop_reserve(); return (err); } locked = VOP_ISLOCKED(vp); VI_LOCK(vp); if ((vp->v_iflag & VI_DOOMED) != 0 && locked != LK_EXCLUSIVE) { /* * The vnode is doomed and this thread doesn't * hold the exclusive lock on it, so the vnode * must be being reclaimed by another thread. * Otherwise the doomed vnode is being reclaimed * by this thread and zfs_zget is called from * ZIL internals. */ VI_UNLOCK(vp); /* * XXX vrele() locks the vnode when the last reference * is dropped. Although in this case the vnode is * doomed / dead and so no inactivation is required, * the vnode lock is still acquired. That could result * in a LOR with z_teardown_lock if another thread holds * the vnode's lock and tries to take z_teardown_lock. * But that is only possible if the other thread peforms * a ZFS vnode operation on the vnode. That either * should not happen if the vnode is dead or the thread * should also have a refrence to the vnode and thus * our reference is not last. */ VN_RELE(vp); goto again; } VI_UNLOCK(vp); getnewvnode_drop_reserve(); return (err); } /* * Not found create new znode/vnode * but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } if (err == 0) { vnode_t *vp = ZTOV(zp); err = insmntque(vp, zfsvfs->z_vfs); if (err == 0) { vp->v_hash = obj_num; VOP_UNLOCK(vp, 0); } else { zp->z_vnode = NULL; zfs_znode_dmu_fini(zp); zfs_znode_free(zp); *zpp = NULL; } } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_object_info_t doi; dmu_buf_t *db; vnode_t *vp; uint64_t obj_num = zp->z_id; uint64_t mode, size; sa_bulk_attr_t bulk[8]; int err; int count = 0; uint64_t gen; /* * Remove cached pages before reloading the znode, so that they are not * lingering after we run into any error. Ideally, we should vgone() * the vnode in case of error, but currently we cannot do that * because of the LOR between the vnode lock and z_teardown_lock. * So, instead, we have to "doom" the znode in the illumos style. */ vp = ZTOV(zp); vn_pages_remove(vp, 0, 0); ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); ASSERT(zp->z_sa_hdl == NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); size = zp->z_size; /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, sizeof (zp->z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, sizeof (zp->z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } zp->z_mode = mode; if (gen != zp->z_gen) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } /* * It is highly improbable but still quite possible that two * objects in different datasets are created with the same * object numbers and in transaction groups with the same * numbers. znodes corresponding to those objects would * have the same z_id and z_gen, but their other attributes * may be different. * zfs recv -F may replace one of such objects with the other. * As a result file properties recorded in the replaced * object's vnode may no longer match the received object's * properties. At present the only cached property is the * files type recorded in v_type. * So, handle this case by leaving the old vnode and znode * disassociated from the actual object. A new vnode and a * znode will be created if the object is accessed * (e.g. via a look-up). The old vnode and znode will be * recycled when the last vnode reference is dropped. */ if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the * stale data and to prevent automatical removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. */ zp->z_unlinked = (zp->z_links == 0); if (zp->z_unlinked) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (0); } zp->z_blksz = doi.doi_data_block_size; if (zp->z_size != size) vnode_pager_setsize(vp, zp->z_size); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY(0 == dmu_object_free(os, acl_obj, tx)); } VERIFY(0 == dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); zfs_znode_free(zp); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; ASSERT(zp->z_sa_hdl); /* * Don't allow a zfs_zget() while were trying to release this znode */ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); /* * If this was the last reference to a file with no links, remove * the file from the file system unless the file system is mounted * read-only. That can happen, for example, if the file system was * originally read-write, the file was opened, then unlinked and * the file system was made read-only before the file was finally * closed. The file will remain in the unlinked set. */ if (zp->z_unlinked) { ASSERT(!zfsvfs->z_issnap); if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) { ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_rmnode(zp); return; } } zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_znode_free(zp); } void zfs_znode_free(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp->z_sa_hdl == NULL); zp->z_vnode = NULL; mutex_enter(&zfsvfs->z_znodes_lock); POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } kmem_cache_free(znode_cache, zp); #ifdef illumos VFS_RELE(zfsvfs->z_vfs); #endif } void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2], boolean_t have_tx) { timestruc_t now; vfs_timestamp(&now); if (have_tx) { /* will sa_bulk_update happen really soon? */ zp->z_atime_dirty = 0; zp->z_seq++; } else { zp->z_atime_dirty = 1; } if (flag & AT_ATIME) { ZFS_TIME_ENCODE(&now, zp->z_atime); } if (flag & AT_MTIME) { ZFS_TIME_ENCODE(&now, mtime); if (zp->z_zfsvfs->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & AT_CTIME) { ZFS_TIME_ENCODE(&now, ctime); if (zp->z_zfsvfs->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } #ifdef illumos /* * This is a dummy interface used when pvn_vplist_dirty() should *not* * be calling back into the fs for a putpage(). E.g.: when truncating * a file, the pages being "thrown away* don't need to be written out. */ /* ARGSUSED */ static int zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, int flags, cred_t *cr) { ASSERT(0); return (0); } #endif /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_tx_t *tx; rl_t *rl; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_range_unlock(rl); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_range_unlock(rl); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), &zp->z_size, sizeof (zp->z_size), tx)); vnode_pager_setsize(ZTOV(zp), end); zfs_range_unlock(rl); dmu_tx_commit(tx); return (0); } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; rl_t *rl; int error; /* * Lock the range being freed. */ rl = zfs_range_lock(zp, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { zfs_range_unlock(rl); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); if (error == 0) { /* * In FreeBSD we cannot free block in the middle of a file, * but only at the end of a file, so this code path should * never happen. */ vnode_pager_setsize(ZTOV(zp), off); } zfs_range_unlock(rl); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; rl_t *rl; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_range_unlock(rl); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { zfs_range_unlock(rl); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_range_unlock(rl); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); dmu_tx_commit(tx); /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ vnode_pager_setsize(vp, end); zfs_range_unlock(rl); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. * log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; else return (error); } /* * Check for any locks in the region to be freed. */ if (MANDLOCK(vp, (mode_t)mode)) { uint64_t length = (len ? len : zp->z_size - off); if (error = chklock(vp, FWRITE, off, length, flag, NULL)) return (error); } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) return (error); log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); return (0); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int error; int i; znode_t *rootzp = NULL; zfsvfs_t *zfsvfs; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT(error == 0); + + /* + * Give dmu_object_alloc() a hint about where to start + * allocating new objects. Otherwise, since the metadnode's + * dnode_phys_t structure isn't initialized yet, dmu_object_next() + * would fail and we'd have to skip to the next dnode block. + */ + os->os_obj_next = moid + 1; /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; char *name; ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT(error == 0); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT(version != 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT(error == 0); } else { sa_obj = 0; } /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* * Create root znode. Create minimal znode/vnode/zfsvfs * to allow zfs_mknode to work. */ VATTR_NULL(&vattr); vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_moved = 0; rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_is_sa = USE_SA(version, os); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT(error == 0); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); rootzp->z_zfsvfs = zfsvfs; VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); zfs_acl_ids_free(&acl_ids); POINTER_INVALIDATE(&rootzp->z_zfsvfs); sa_handle_destroy(rootzp->z_sa_hdl); kmem_cache_free(znode_cache, rootzp); /* * Create shares directory */ error = zfs_create_share_dir(zfsvfs, tx); ASSERT(error == 0); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } #endif /* _KERNEL */ static int zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) { uint64_t sa_obj = 0; int error; error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0 && error != ENOENT) return (error); error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); return (error); } static int zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, dmu_buf_t **db, void *tag) { dmu_object_info_t doi; int error; if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) return (error); dmu_object_info_from_db(*db, &doi); if ((doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) || doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)) { sa_buf_rele(*db, tag); return (SET_ERROR(ENOTSUP)); } error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); if (error != 0) { sa_buf_rele(*db, tag); return (error); } return (0); } void zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) { sa_handle_destroy(hdl); sa_buf_rele(db, tag); } /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. */ static int zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; uint64_t parent_mode; sa_bulk_attr_t bulk[3]; sa_handle_t *sa_hdl; dmu_buf_t *sa_db; int count = 0; int error; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, &parent, sizeof (parent)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, &pflags, sizeof (pflags)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &mode, sizeof (mode)); if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); /* * When a link is removed its parent pointer is not changed and will * be invalid. There are two cases where a link is removed but the * file stays around, when it goes to the delete queue and when there * are additional links. */ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); if (error != 0) return (error); error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); if (error != 0) return (error); *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); /* * Extended attributes can be applied to files, directories, etc. * Otherwise the parent must be a directory. */ if (!*is_xattrdir && !S_ISDIR(parent_mode)) return (SET_ERROR(EINVAL)); *pobjp = parent; return (0); } /* * Given an object number, return some zpl level statistics */ static int zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, zfs_stat_t *sb) { sa_bulk_attr_t bulk[4]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &sb->zs_mode, sizeof (sb->zs_mode)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, &sb->zs_gen, sizeof (sb->zs_gen)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, &sb->zs_links, sizeof (sb->zs_links)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, &sb->zs_ctime, sizeof (sb->zs_ctime)); return (sa_bulk_lookup(hdl, bulk, count)); } static int zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, sa_attr_type_t *sa_table, char *buf, int len) { sa_handle_t *sa_hdl; sa_handle_t *prevhdl = NULL; dmu_buf_t *prevdb = NULL; dmu_buf_t *sa_db = NULL; char *path = buf + len - 1; int error; *path = '\0'; sa_hdl = hdl; uint64_t deleteq_obj; VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); error = zap_lookup_int(osp, deleteq_obj, obj); if (error == 0) { return (ESTALE); } else if (error != ENOENT) { return (error); } error = 0; for (;;) { uint64_t pobj; char component[MAXNAMELEN + 2]; size_t complen; int is_xattrdir; if (prevdb) zfs_release_sa_handle(prevhdl, prevdb, FTAG); if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; if (pobj == obj) { if (path[0] != '/') *--path = '/'; break; } component[0] = '/'; if (is_xattrdir) { (void) sprintf(component + 1, ""); } else { error = zap_value_search(osp, pobj, obj, ZFS_DIRENT_OBJ(-1ULL), component + 1); if (error != 0) break; } complen = strlen(component); path -= complen; ASSERT(path >= buf); bcopy(component, path, complen); obj = pobj; if (sa_hdl != hdl) { prevhdl = sa_hdl; prevdb = sa_db; } error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); if (error != 0) { sa_hdl = prevhdl; sa_db = prevdb; break; } } if (sa_hdl != NULL && sa_hdl != hdl) { ASSERT(sa_db != NULL); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); } if (error == 0) (void) memmove(buf, path, buf + len - path); return (error); } int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, char *buf, int len) { char *path = buf + len - 1; sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; *path = '\0'; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_stats_impl(hdl, sa_table, sb); if (error != 0) { zfs_release_sa_handle(hdl, db, FTAG); return (error); } error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } #ifdef _KERNEL int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t parent; int is_xattrdir; int err; /* Extended attributes should not be visible as regular files. */ if ((zp->z_pflags & ZFS_XATTR) != 0) return (SET_ERROR(EINVAL)); err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, &parent, &is_xattrdir); if (err != 0) return (err); ASSERT0(is_xattrdir); /* No name as this is a root object. */ if (parent == zp->z_id) return (SET_ERROR(EINVAL)); err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, ZFS_DIRENT_OBJ(-1ULL), buf); if (err != 0) return (err); err = zfs_zget(zfsvfs, parent, dzpp); return (err); } #endif /* _KERNEL */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (revision 351076) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (revision 351077) @@ -1,3439 +1,3437 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system * calls that change the file system. Each itx has enough information to * be able to replay them after a system crash, power loss, or * equivalent failure mode. These are stored in memory until either: * * 1. they are committed to the pool by the DMU transaction group * (txg), at which point they can be discarded; or * 2. they are committed to the on-disk ZIL for the dataset being * modified (e.g. due to an fsync, O_DSYNC, or other synchronous * requirement). * * In the event of a crash or power loss, the itxs contained by each * dataset's on-disk ZIL will be replayed when that dataset is first * instantianted (e.g. if the dataset is a normal fileystem, when it is * first mounted). * * As hinted at above, there is one ZIL per dataset (both the in-memory * representation, and the on-disk representation). The on-disk format * consists of 3 parts: * - * - a single, per-dataset, ZIL header; which points to a chain of - * - zero or more ZIL blocks; each of which contains - * - zero or more ZIL records + * - a single, per-dataset, ZIL header; which points to a chain of + * - zero or more ZIL blocks; each of which contains + * - zero or more ZIL records * * A ZIL record holds the information necessary to replay a single * system call transaction. A ZIL block can hold many ZIL records, and * the blocks are chained together, similarly to a singly linked list. * * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL * block in the chain, and the ZIL header points to the first block in * the chain. * * Note, there is not a fixed place in the pool to hold these ZIL * blocks; they are dynamically allocated and freed as needed from the * blocks available on the pool, though they can be preferentially * allocated from a dedicated "log" vdev. */ /* * This controls the amount of time that a ZIL block (lwb) will remain * "open" when it isn't "full", and it has a thread waiting for it to be * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ int zfs_commit_timeout_pct = 5; /* * Disable intent logging replay. This global ZIL switch affects all pools. */ int zil_replay_disable = 0; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN, &zil_replay_disable, 0, "Disable intent logging replay"); /* * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to * the disk(s) by the ZIL after an LWB write has completed. Setting this * will cause ZIL corruption on power loss if a volatile out-of-order * write cache is enabled. */ boolean_t zil_nocacheflush = B_FALSE; SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_nocacheflush, CTLFLAG_RWTUN, &zil_nocacheflush, 0, "Disable ZIL cache flush"); boolean_t zfs_trim_enabled = B_TRUE; SYSCTL_DECL(_vfs_zfs_trim); SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0, "Enable ZFS TRIM"); /* * Limit SLOG write size per commit executed with synchronous priority. * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. */ uint64_t zil_slog_bulk = 768 * 1024; SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN, &zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority"); static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) static int zil_bp_compare(const void *x1, const void *x2) { const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (likely(cmp)) return (cmp); return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void zil_bp_tree_init(zilog_t *zilog) { avl_create(&zilog->zl_bp_tree, zil_bp_compare, sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void zil_bp_tree_fini(zilog_t *zilog) { avl_tree_t *t = &zilog->zl_bp_tree; zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; const dva_t *dva; zil_bp_node_t *zn; avl_index_t where; if (BP_IS_EMBEDDED(bp)) return (0); dva = BP_IDENTITY(bp); if (avl_find(t, dva, &where) != NULL) return (SET_ERROR(EEXIST)); zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); return (0); } static zil_header_t * zil_header_in_syncing_context(zilog_t *zilog) { return ((zil_header_t *)zilog->zl_header); } static void zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) { zio_cksum_t *zc = &bp->blk_cksum; zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, char **end) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) zio_flags |= ZIO_FLAG_SPECULATIVE; SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; /* * Validate the checksummed log block. * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. * * Also check the log chain linkage and size used. */ cksum.zc_word[ZIL_ZC_SEQ]++; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = abuf->b_data; char *lr = (char *)(zilc + 1); uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { error = SET_ERROR(ECKSUM); } else { ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); bcopy(lr, dst, len); *end = (char *)dst + len; *nbp = zilc->zc_next_blk; } } else { char *lr = abuf->b_data; uint64_t size = BP_GET_LSIZE(bp); zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { ASSERT3U(zilc->zc_nused, <=, SPA_OLD_MAXBLOCKSIZE); bcopy(lr, dst, zilc->zc_nused); *end = (char *)dst + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } arc_buf_destroy(abuf, &abuf); } return (error); } /* * Read a TX_WRITE log data block. */ static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (BP_IS_HOLE(bp)) { if (wbuf != NULL) bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { if (wbuf != NULL) bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); arc_buf_destroy(abuf, &abuf); } return (error); } /* * Parse the intent log, and call parse_func for each valid record within. */ int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) { const zil_header_t *zh = zilog->zl_header; boolean_t claimed = !!zh->zh_claim_txg; uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; uint64_t max_blk_seq = 0; uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk; char *lrbuf, *lrp; int error = 0; /* * Old logs didn't record the maximum zh_claim_lr_seq. */ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. * For each block in the chain we strongly check that block to * ensure its validity. We stop when an invalid block is found. * For each block pointer in the chain we call parse_blk_func(). * For each record in each valid block we call parse_lr_func(). * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. */ lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; char *end; if (blk_seq > claim_blk_seq) break; if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) break; ASSERT3U(max_blk_seq, <, blk_seq); max_blk_seq = blk_seq; blk_count++; if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) break; error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); if (error != 0) break; for (lrp = lrbuf; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); if (lr->lrc_seq > claim_lr_seq) goto done; if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) goto done; ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } } done: zilog->zl_parse_error = error; zilog->zl_parse_blk_seq = max_blk_seq; zilog->zl_parse_lr_seq = max_lr_seq; zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); zil_bp_tree_fini(zilog); zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); return (error); } /* ARGSUSED */ static int zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) { ASSERT(!BP_IS_HOLE(bp)); /* * As we call this function from the context of a rewind to a * checkpoint, each ZIL block whose txg is later than the txg * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. */ if (bp->blk_birth >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) return (0); zio_free(zilog->zl_spa, first_txg, bp); return (0); } /* ARGSUSED */ static int zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) { return (0); } static int zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) { /* * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); return (zio_wait(zio_claim(NULL, zilog->zl_spa, tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } static int zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; if (lrc->lrc_txtype != TX_WRITE) return (0); /* * If the block is not readable, don't claim it. This can happen * in normal operation when a log block is written to disk before * some of the dmu_sync() blocks it points to. In this case, the * transaction cannot have been committed to anyone (we would have * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ if (lr->lr_blkptr.blk_birth >= first_txg && (error = zil_read_log_data(zilog, lr, NULL)) != 0) return (error); return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } /* ARGSUSED */ static int zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; /* * If we previously claimed it, we need to free it. */ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int zil_lwb_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; return (AVL_CMP(v1, v2)); } static lwb_t * zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; lwb->lwb_blk = *bp; lwb->lwb_slog = slog; lwb->lwb_state = LWB_STATE_CLOSED; lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); lwb->lwb_max_txg = txg; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_tx = NULL; lwb->lwb_issued_timestamp = 0; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { lwb->lwb_nused = sizeof (zil_chain_t); lwb->lwb_sz = BP_GET_LSIZE(bp); } else { lwb->lwb_nused = 0; lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); } mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); mutex_exit(&zilog->zl_lock); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); VERIFY(list_is_empty(&lwb->lwb_waiters)); return (lwb); } static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); VERIFY(list_is_empty(&lwb->lwb_waiters)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || lwb->lwb_state == LWB_STATE_FLUSH_DONE); /* * Clear the zilog's field to indicate this lwb is no longer * valid, and prevent use-after-free errors. */ if (zilog->zl_last_lwb_opened == lwb) zilog->zl_last_lwb_opened = NULL; kmem_cache_free(zil_lwb_cache, lwb); } /* * Called when we create in-memory log transactions so that we know * to cleanup the itxs at the end of spa_sync(). */ void zilog_dirty(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(spa_writeable(zilog->zl_spa)); if (ds->ds_is_snapshot) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, zilog); zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); } } /* * Determine if the zil is dirty in the specified txg. Callers wanting to * ensure that the dirty state does not change must hold the itxg_lock for * the specified txg. Holding the lock will ensure that the zil cannot be * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current * state. */ boolean_t zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) return (B_TRUE); return (B_FALSE); } /* * Determine if the zil is dirty. The zil is considered dirty if it has * any pending itx records that have not been cleaned by zil_clean(). */ boolean_t zilog_is_dirty(zilog_t *zilog) { dsl_pool_t *dp = zilog->zl_dmu_pool; for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) return (B_TRUE); } return (B_FALSE); } /* * Create an on-disk intent log. */ static lwb_t * zil_create(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb = NULL; uint64_t txg = 0; dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; boolean_t slog = FALSE; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ASSERT(zh->zh_claim_txg == 0); ASSERT(zh->zh_replay_seq == 0); blk = zh->zh_log; /* * Allocate an initial log block if: * - there isn't one already * - the existing block is the wrong endianess */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); if (!BP_IS_HOLE(&blk)) { zio_free(zilog->zl_spa, txg, &blk); BP_ZERO(&blk); } error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL, ZIL_MIN_BLKSZ, &slog); if (error == 0) zil_init_log_chain(zilog, &blk); } /* * Allocate a log write block (lwb) for the first log block. */ if (error == 0) lwb = zil_alloc_lwb(zilog, &blk, slog, txg); /* * If we just allocated the first log block, commit our transaction * and wait for zil_sync() to stuff the block poiner into zh_log. * (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); return (lwb); } /* * In one tx, free all log blocks and clear the log header. If keep_first * is set, then we're replaying a log with no content. We want to keep the * first block, however, so that the first synchronous transaction doesn't * require a txg_wait_synced() in zil_create(). We don't need to * txg_wait_synced() here either when keep_first is set, because both * zil_create() and zil_destroy() will wait for any in-progress destroys * to complete. */ void zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) return; tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); mutex_enter(&zilog->zl_lock); ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { list_remove(&zilog->zl_lwb_list, lwb); if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { zil_destroy_sync(zilog, tx); } mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); } void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, zilog->zl_header->zh_claim_txg); } int zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) { dmu_tx_t *tx = txarg; zilog_t *zilog; uint64_t first_txg; zil_header_t *zh; objset_t *os; int error; error = dmu_objset_own_obj(dp, ds->ds_object, DMU_OST_ANY, B_FALSE, FTAG, &os); if (error != 0) { /* * EBUSY indicates that the objset is inconsistent, in which * case it can not have a ZIL. */ if (error != EBUSY) { cmn_err(CE_WARN, "can't open objset for %llu, error %u", (unsigned long long)ds->ds_object, error); } return (0); } zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); first_txg = spa_min_claim_txg(zilog->zl_spa); /* * If the spa_log_state is not set to be cleared, check whether * the current uberblock is a checkpoint one and if the current * header has been claimed before moving on. * * If the current uberblock is a checkpointed uberblock then * one of the following scenarios took place: * * 1] We are currently rewinding to the checkpoint of the pool. * 2] We crashed in the middle of a checkpoint rewind but we * did manage to write the checkpointed uberblock to the * vdev labels, so when we tried to import the pool again * the checkpointed uberblock was selected from the import * procedure. * * In both cases we want to zero out all the ZIL blocks, except * the ones that have been claimed at the time of the checkpoint * (their zh_claim_txg != 0). The reason is that these blocks * may be corrupted since we may have reused their locations on * disk after we took the checkpoint. * * We could try to set spa_log_state to SPA_LOG_CLEAR earlier * when we first figure out whether the current uberblock is * checkpointed or not. Unfortunately, that would discard all * the logs, including the ones that are claimed, and we would * leak space. */ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0)) { if (!BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_clear_log_block, zil_noop_log_record, tx, first_txg); } BP_ZERO(&zh->zh_log); dsl_dataset_dirty(dmu_objset_ds(os), tx); dmu_objset_disown(os, FTAG); return (0); } /* * If we are not rewinding and opening the pool normally, then * the min_claim_txg should be equal to the first txg of the pool. */ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can * read only part of the log now (e.g. due to a missing device), * but we can read the entire log later, we will not try to replay * or destroy beyond the last block we successfully claimed. */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, first_txg); zh->zh_claim_txg = first_txg; zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) zh->zh_flags |= ZIL_REPLAY_NEEDED; zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); dmu_objset_disown(os, FTAG); return (0); } /* * Check the log by walking the log chain. * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. */ /* ARGSUSED */ int zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) { zilog_t *zilog; objset_t *os; blkptr_t *bp; int error; ASSERT(tx == NULL); error = dmu_objset_from_ds(ds, &os); if (error != 0) { cmn_err(CE_WARN, "can't open objset %llu, error %d", (unsigned long long)ds->ds_object, error); return (0); } zilog = dmu_objset_zil(os); bp = (blkptr_t *)&zilog->zl_header->zh_log; if (!BP_IS_HOLE(bp)) { vdev_t *vd; boolean_t valid = B_TRUE; /* * Check the first block and determine if it's on a log device * which may have been removed or faulted prior to loading this * pool. If so, there's no point in checking the rest of the * log as its content should have already been synced to the * pool. */ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (vd->vdev_islog && vdev_is_dead(vd)) valid = vdev_log_state_valid(vd); spa_config_exit(os->os_spa, SCL_STATE, FTAG); if (!valid) return (0); /* * Check whether the current uberblock is checkpointed (e.g. * we are rewinding) and whether the current header has been * claimed or not. If it hasn't then skip verifying it. We * do this because its ZIL blocks may be part of the pool's * state before the rewind, which is no longer valid. */ zil_header_t *zh = zil_header_in_syncing_context(zilog); if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0) return (0); } /* * Because tx == NULL, zil_claim_log_block() will not actually claim * any blocks, but just determine whether it is possible to do so. * In addition to checking the log chain, zil_claim_log_block() * will invoke zio_claim() with a done func of spa_claim_notify(), * which will update spa_max_claim_txg. See spa_load() for details. */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, zilog->zl_header->zh_claim_txg ? -1ULL : spa_min_claim_txg(os->os_spa)); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } /* * When an itx is "skipped", this function is used to properly mark the * waiter as "done, and signal any thread(s) waiting on it. An itx can * be skipped (and not committed to an lwb) for a variety of reasons, * one of them being that the itx was committed via spa_sync(), prior to * it being committed to an lwb; this can happen if a thread calling * zil_commit() is racing with spa_sync(). */ static void zil_commit_waiter_skip(zil_commit_waiter_t *zcw) { mutex_enter(&zcw->zcw_lock); ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } /* * This function is used when the given waiter is to be linked into an * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb. * At this point, the waiter will no longer be referenced by the itx, * and instead, will be referenced by the lwb. */ static void zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's * zl_lock, thus it must be held when calling this function. */ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); ASSERT3P(zcw->zcw_lwb, ==, NULL); ASSERT3P(lwb, !=, NULL); ASSERT(lwb->lwb_state == LWB_STATE_OPENED || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE); list_insert_tail(&lwb->lwb_waiters, zcw); zcw->zcw_lwb = lwb; mutex_exit(&zcw->zcw_lock); } /* * This function is used when zio_alloc_zil() fails to allocate a ZIL * block, and the given waiter must be linked to the "nolwb waiters" * list inside of zil_process_commit_list(). */ static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); ASSERT3P(zcw->zcw_lwb, ==, NULL); list_insert_tail(nolwb, zcw); mutex_exit(&zcw->zcw_lock); } void zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) { avl_tree_t *t = &lwb->lwb_vdev_tree; avl_index_t where; zil_vdev_node_t *zv, zvsearch; int ndvas = BP_GET_NDVAS(bp); int i; if (zil_nocacheflush) return; mutex_enter(&lwb->lwb_vdev_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { zv = kmem_alloc(sizeof (*zv), KM_SLEEP); zv->zv_vdev = zvsearch.zv_vdev; avl_insert(t, zv, where); } } mutex_exit(&lwb->lwb_vdev_lock); } static void zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) { avl_tree_t *src = &lwb->lwb_vdev_tree; avl_tree_t *dst = &nlwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); /* * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does * not need the protection of lwb_vdev_lock (it will only be modified * while holding zilog->zl_lock) as its writes and those of its * children have all completed. The younger 'nlwb' may be waiting on * future writes to additional vdevs. */ mutex_enter(&nlwb->lwb_vdev_lock); /* * Tear down the 'lwb' vdev tree, ensuring that entries which do not * exist in 'nlwb' are moved to it, freeing any would-be duplicates. */ while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { avl_index_t where; if (avl_find(dst, zv, &where) == NULL) { avl_insert(dst, zv, where); } else { kmem_free(zv, sizeof (*zv)); } } mutex_exit(&nlwb->lwb_vdev_lock); } void zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) { lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); } /* * This function is a called after all vdevs associated with a given lwb * write have completed their DKIOCFLUSHWRITECACHE command; or as soon * as the lwb write completes, if "zil_nocacheflush" is set. Further, * all "previous" lwb's will have completed before this function is * called; i.e. this function is called for all previous lwbs before * it's called for "this" lwb (enforced via zio the dependencies * configured in zil_lwb_set_zio_dependency()). * * The intention is for this function to be called as soon as the * contents of an lwb are considered "stable" on disk, and will survive * any sudden loss of power. At this point, any threads waiting for the * lwb to reach this state are signalled, and the "waiter" structures * are marked "done". */ static void zil_lwb_flush_vdevs_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; dmu_tx_t *tx = lwb->lwb_tx; zil_commit_waiter_t *zcw; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); /* * Ensure the lwb buffer pointer is cleared before releasing the * txg. If we have had an allocation failure and the txg is * waiting to sync then we want zil_sync() to remove the lwb so * that it's not picked up as the next new one in * zil_process_commit_list(). zil_sync() will only remove the * lwb if lwb_buf is null. */ lwb->lwb_buf = NULL; lwb->lwb_tx = NULL; ASSERT3U(lwb->lwb_issued_timestamp, >, 0); zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp; lwb->lwb_root_zio = NULL; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); lwb->lwb_state = LWB_STATE_FLUSH_DONE; if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number * for ztest. We only update this value when all the log * writes succeeded, because ztest wants to ASSERT that * it got the whole log chain. */ zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); ASSERT(list_link_active(&zcw->zcw_node)); list_remove(&lwb->lwb_waiters, zcw); ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; zcw->zcw_zio_error = zio->io_error; ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } mutex_exit(&zilog->zl_lock); /* * Now that we've written this log block, we have a stable pointer * to the next block in the chain, so it's OK to let the txg in * which we allocated the next block sync. */ dmu_tx_commit(tx); } /* * This is called when an lwb's write zio completes. The callback's * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved * in writing out this specific lwb's data, and in the case that cache * flushes have been deferred, vdevs involved in writing the data for * previous lwbs. The writes corresponding to all the vdevs in the * lwb_vdev_tree will have completed by the time this is called, due to * the zio dependencies configured in zil_lwb_set_zio_dependency(), * which takes deferred flushes into account. The lwb will be "done" * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio * completion callback for the lwb's root zio. */ static void zil_lwb_write_done(zio_t *zio) { lwb_t *lwb = zio->io_private; spa_t *spa = zio->io_spa; zilog_t *zilog = lwb->lwb_zilog; avl_tree_t *t = &lwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; lwb_t *nlwb; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); ASSERT(!BP_IS_GANG(zio->io_bp)); ASSERT(!BP_IS_HOLE(zio->io_bp)); ASSERT(BP_GET_FILL(zio->io_bp) == 0); abd_put(zio->io_abd); mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; lwb->lwb_write_zio = NULL; nlwb = list_next(&zilog->zl_lwb_list, lwb); mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) return; /* * If there was an IO error, we're not going to call zio_flush() * on these vdevs, so we simply empty the tree and free the * nodes. We avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. */ if (zio->io_error != 0) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; } /* * If this lwb does not have any threads waiting for it to * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE * command to the vdevs written to by "this" lwb, and instead * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE * command for those vdevs. Thus, we merge the vdev tree of * "this" lwb with the vdev tree of the "next" lwb in the list, * and assume the "next" lwb will handle flushing the vdevs (or * deferring the flush(s) again). * * This is a useful performance optimization, especially for * workloads with lots of async write activity and few sync * write and/or fsync activity, as it has the potential to * coalesce multiple flush commands to a vdev into one. */ if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); return; } while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); if (vd != NULL) zio_flush(lwb->lwb_root_zio, vd); kmem_free(zv, sizeof (*zv)); } } static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zilog->zl_lock)); /* * The zilog's "zl_last_lwb_opened" field is used to build the * lwb/zio dependency chain, which is used to preserve the * ordering of lwb completions that is required by the semantics * of the ZIL. Each new lwb zio becomes a parent of the * "previous" lwb zio, such that the new lwb's zio cannot * complete until the "previous" lwb's zio completes. * * This is required by the semantics of zil_commit(); the commit * waiters attached to the lwbs will be woken in the lwb zio's * completion callback, so this zio dependency graph ensures the * waiters are woken in the correct order (the same order the * lwbs were created). */ if (last_lwb_opened != NULL && last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) { ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || last_lwb_opened->lwb_state == LWB_STATE_ISSUED || last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE); ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); zio_add_child(lwb->lwb_root_zio, last_lwb_opened->lwb_root_zio); /* * If the previous lwb's write hasn't already completed, * we also want to order the completion of the lwb write * zios (above, we only order the completion of the lwb * root zios). This is required because of how we can * defer the DKIOCFLUSHWRITECACHE commands for each lwb. * * When the DKIOCFLUSHWRITECACHE commands are defered, * the previous lwb will rely on this lwb to flush the * vdevs written to by that previous lwb. Thus, we need * to ensure this lwb doesn't issue the flush until * after the previous lwb's write completes. We ensure * this ordering by setting the zio parent/child * relationship here. * * Without this relationship on the lwb's write zio, * it's possible for this lwb's write to complete prior * to the previous lwb's write completing; and thus, the * vdevs for the previous lwb would be flushed prior to * that lwb's data being written to those vdevs (the * vdevs are flushed in the lwb write zio's completion * handler, zil_lwb_write_done()). */ if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) { ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || last_lwb_opened->lwb_state == LWB_STATE_ISSUED); ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL); zio_add_child(lwb->lwb_write_zio, last_lwb_opened->lwb_write_zio); } } } /* * This function's purpose is to "open" an lwb such that it is ready to * accept new itxs being committed to it. To do this, the lwb's zio * structures are created, and linked to the lwb. This function is * idempotent; if the passed in lwb has already been opened, this * function is essentially a no-op. */ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { zbookmark_phys_t zb; zio_priority_t prio; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); if (lwb->lwb_root_zio == NULL) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk)); if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; lwb->lwb_root_zio = zio_root(zilog->zl_spa, zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); ASSERT3P(lwb->lwb_root_zio, !=, NULL); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); ASSERT3P(lwb->lwb_write_zio, !=, NULL); lwb->lwb_state = LWB_STATE_OPENED; mutex_enter(&zilog->zl_lock); zil_lwb_set_zio_dependency(zilog, lwb); zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); } ASSERT3P(lwb->lwb_root_zio, !=, NULL); ASSERT3P(lwb->lwb_write_zio, !=, NULL); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); } /* * Define a limited set of intent log block sizes. * * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. */ uint64_t zil_block_buckets[] = { 4096, /* non TX_WRITE */ 8192+4096, /* data base */ - 32*1024 + 4096, /* NFS writes */ + 32*1024 + 4096, /* NFS writes */ UINT64_MAX }; /* * Start a log block write and advance to the next log block. * Calls are serialized. */ static lwb_t * zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { lwb_t *nlwb = NULL; zil_chain_t *zilc; spa_t *spa = zilog->zl_spa; blkptr_t *bp; dmu_tx_t *tx; uint64_t txg; uint64_t zil_blksz, wsz; int i, error; boolean_t slog; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb->lwb_root_zio, !=, NULL); ASSERT3P(lwb->lwb_write_zio, !=, NULL); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { zilc = (zil_chain_t *)lwb->lwb_buf; bp = &zilc->zc_next_blk; } else { zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); bp = &zilc->zc_next_blk; } ASSERT(lwb->lwb_nused <= lwb->lwb_sz); /* * Allocate the next block and save its address in this block * before writing it in order to establish the log chain. * Note that if the allocation of nlwb synced before we wrote * the block that points at it (lwb), we'd leak it if we crashed. * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). * We dirty the dataset to ensure that zil_sync() will be called * to clean up in the event of allocation failure or I/O failure. */ tx = dmu_tx_create(zilog->zl_os); /* * Since we are not going to create any new dirty data, and we * can even help with clearing the existing dirty data, we * should not be subject to the dirty data based delays. We * use TXG_NOTHROTTLE to bypass the delay mechanism. */ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); lwb->lwb_tx = tx; /* * Log blocks are pre-allocated. Here we select the size of the next * block, based on size used in the last block. * - first find the smallest bucket that will fit the block from a * limited set of block sizes. This is because it's faster to write * blocks allocated from the same metaslab as they are adjacent or * close. * - next find the maximum from the new suggested size and an array of * previous sizes. This lessens a picket fence effect of wrongly * guesssing the size if we have a stream of say 2k, 64k, 2k, 64k * requests. * * Note we only write what is used, but we can't just allocate * the maximum block size because we can exhaust the available * pool log space. */ zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i]; i++) continue; zil_blksz = zil_block_buckets[i]; if (zil_blksz == UINT64_MAX) zil_blksz = SPA_OLD_MAXBLOCKSIZE; zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); BP_ZERO(bp); /* pass the old blkptr in order to spread log blocks across devs */ error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object, txg, bp, &lwb->lwb_blk, zil_blksz, &slog); if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; /* * Allocate a new log write block (lwb). */ nlwb = zil_alloc_lwb(zilog, bp, slog, txg); } if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { /* For Slim ZIL only write what is used. */ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); ASSERT3U(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); } else { wsz = lwb->lwb_sz; } zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; /* * clear unused data for security */ bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); zil_lwb_add_block(lwb, &lwb->lwb_blk); lwb->lwb_issued_timestamp = gethrtime(); lwb->lwb_state = LWB_STATE_ISSUED; zio_nowait(lwb->lwb_root_zio); zio_nowait(lwb->lwb_write_zio); /* * If there was an allocation failure then nlwb will be null which * forces a txg_wait_synced(). */ return (nlwb); } static lwb_t * zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) { lr_t *lrcb, *lrc; lr_write_t *lrwb, *lrw; char *lr_buf; uint64_t dlen, dnow, lwb_sp, reclen, txg; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb->lwb_buf, !=, NULL); zil_lwb_write_open(zilog, lwb); lrc = &itx->itx_lr; lrw = (lr_write_t *)lrc; /* * A commit itx doesn't represent any on-disk state; instead * it's simply used as a place holder on the commit list, and * provides a mechanism for attaching a "commit waiter" onto the * correct lwb (such that the waiter can be signalled upon * completion of that lwb). Thus, we don't process this itx's * log record if it's a commit itx (these itx's don't have log * records), and instead link the itx's waiter onto the lwb's * list of waiters. * * For more details, see the comment above zil_commit(). */ if (lrc->lrc_txtype == TX_COMMIT) { mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); itx->itx_private = NULL; mutex_exit(&zilog->zl_lock); return (lwb); } if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { dlen = 0; } reclen = lrc->lrc_reclen; zilog->zl_cur_used += (reclen + dlen); txg = lrc->lrc_txg; ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); cont: /* * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. */ lwb_sp = lwb->lwb_sz - lwb->lwb_nused; if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 || lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) { lwb = zil_lwb_write_issue(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_open(zilog, lwb); ASSERT(LWB_EMPTY(lwb)); lwb_sp = lwb->lwb_sz - lwb->lwb_nused; ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } dnow = MIN(dlen, lwb_sp - reclen); lr_buf = lwb->lwb_buf + lwb->lwb_nused; bcopy(lrc, lr_buf, reclen); lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ /* * If it's a write, fetch the data or get its blkptr as appropriate. */ if (lrc->lrc_txtype == TX_WRITE) { if (txg > spa_freeze_txg(zilog->zl_spa)) txg_wait_synced(zilog->zl_dmu_pool, txg); if (itx->itx_wr_state != WR_COPIED) { char *dbuf; int error; if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; lrcb->lrc_reclen += dnow; if (lrwb->lr_length > dnow) lrwb->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; } else { ASSERT(itx->itx_wr_state == WR_INDIRECT); dbuf = NULL; } /* * We pass in the "lwb_write_zio" rather than * "lwb_root_zio" so that the "lwb_write_zio" * becomes the parent of any zio's created by * the "zl_get_data" callback. The vdevs are * flushed after the "lwb_write_zio" completes, * so we want to make sure that completion * callback waits for these additional zio's, * such that the vdevs used by those zio's will * be included in the lwb's vdev tree, and those * vdevs will be properly flushed. If we passed * in "lwb_root_zio" here, then these additional * vdevs may not be flushed; e.g. if these zio's * completed after "lwb_write_zio" completed. */ error = zilog->zl_get_data(itx->itx_private, lrwb, dbuf, lwb, lwb->lwb_write_zio); if (error == EIO) { txg_wait_synced(zilog->zl_dmu_pool, txg); return (lwb); } if (error != 0) { ASSERT(error == ENOENT || error == EEXIST || error == EALREADY); return (lwb); } } } /* * We're actually making an entry, so update lrc_seq to be the * log record sequence number. Note that this is generally not * equal to the itx sequence number because not all transactions * are synchronous, and sometimes spa_sync() gets there first. */ lrcb->lrc_seq = ++zilog->zl_lr_seq; lwb->lwb_nused += reclen + dnow; zil_lwb_add_txg(lwb, txg); ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); dlen -= dnow; if (dlen > 0) { zilog->zl_cur_used += reclen; goto cont; } return (lwb); } itx_t * zil_itx_create(uint64_t txtype, size_t lrsize) { itx_t *itx; lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_lr.lrc_seq = 0; /* defensive */ itx->itx_sync = B_TRUE; /* default is synchronous */ return (itx); } void zil_itx_destroy(itx_t *itx) { kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } /* * Free up the sync and async itxs. The itxs_t has already been detached * so no locks are needed. */ static void zil_itxg_clean(itxs_t *itxs) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; itx_async_node_t *ian; list = &itxs->i_sync_list; while ((itx = list_head(list)) != NULL) { /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via * zil_lwb_commit(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * * - a thread calls zil_commit() which assigns the * commit itx to a per-txg i_sync_list * - zil_itxg_clean() is called (e.g. via spa_sync()) * while the waiter is still on the i_sync_list * * There's nothing to prevent syncing the txg while the * waiter is on the i_sync_list. This normally doesn't * happen because spa_sync() is slower than zil_commit(), * but if zil_commit() calls txg_wait_synced() (e.g. * because zil_create() or zil_commit_writer_stall() is * called) we will hit this case. */ if (itx->itx_lr.lrc_txtype == TX_COMMIT) zil_commit_waiter_skip(itx->itx_private); list_remove(list, itx); zil_itx_destroy(itx); } cookie = NULL; t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; while ((itx = list_head(list)) != NULL) { list_remove(list, itx); /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(list); kmem_free(ian, sizeof (itx_async_node_t)); } avl_destroy(t); kmem_free(itxs, sizeof (itxs_t)); } static int zil_aitx_compare(const void *x1, const void *x2) { const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; return (AVL_CMP(o1, o2)); } /* * Remove all async itx with the given oid. */ static void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; list_t clean_list; itx_t *itx; ASSERT(oid != 0); list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; ian = avl_find(t, &oid, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } while ((itx = list_head(&clean_list)) != NULL) { list_remove(&clean_list, itx); /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(&clean_list); } void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t txg; itxg_t *itxg; itxs_t *itxs, *clean = NULL; /* * Object ids can be re-instantiated in the next txg so * remove any async transactions to avoid future leaks. * This can happen if a fsync occurs on the re-instantiated * object for a WR_INDIRECT or WR_NEED_COPY write, which gets * the new file data and flushes a write record for the old object. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) zil_remove_async(zilog, itx->itx_oid); /* * Ensure the data of a renamed file is committed before the rename. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); itxs = itxg->itxg_itxs; if (itxg->itxg_txg != txg) { if (itxs != NULL) { /* * The zil_clean callback hasn't got around to cleaning * this itxg. Save the itxs for release below. * This should be rare. */ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " "txg %llu", itxg->itxg_txg); clean = itxg->itxg_itxs; } itxg->itxg_txg = txg; itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); list_create(&itxs->i_sync_list, sizeof (itx_t), offsetof(itx_t, itx_node)); avl_create(&itxs->i_async_tree, zil_aitx_compare, sizeof (itx_async_node_t), offsetof(itx_async_node_t, ia_node)); } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); } else { avl_tree_t *t = &itxs->i_async_tree; uint64_t foid = LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); itx_async_node_t *ian; avl_index_t where; ian = avl_find(t, &foid, &where); if (ian == NULL) { ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); list_create(&ian->ia_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ian->ia_foid = foid; avl_insert(t, ian, where); } list_insert_tail(&ian->ia_list, itx); } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); /* * We don't want to dirty the ZIL using ZILTEST_TXG, because * zil_clean() will never be called using ZILTEST_TXG. Thus, we * need to be careful to always dirty the ZIL using the "real" * TXG (not itxg_txg) even when the SPA is frozen. */ zilog_dirty(zilog, dmu_tx_get_txg(tx)); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ if (clean != NULL) zil_itxg_clean(clean); } /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we * have written out the uberblocks (i.e. txg has been comitted) so that * don't inadvertently clean out in-memory log records that would be required * by zil_commit(). */ void zil_clean(zilog_t *zilog, uint64_t synced_txg) { itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; itxs_t *clean_me; ASSERT3U(synced_txg, <, ZILTEST_TXG); mutex_enter(&itxg->itxg_lock); if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); return; } ASSERT3U(itxg->itxg_txg, <=, synced_txg); ASSERT3U(itxg->itxg_txg, !=, 0); clean_me = itxg->itxg_itxs; itxg->itxg_itxs = NULL; itxg->itxg_txg = 0; mutex_exit(&itxg->itxg_lock); /* * Preferably start a task queue to free up the old itxs but * if taskq_dispatch can't allocate resources to do that then * free it in-line. This should be rare. Note, using TQ_SLEEP * created a bad performance problem. */ ASSERT3P(zilog->zl_dmu_pool, !=, NULL); ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0) zil_itxg_clean(clean_me); } /* * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ static void zil_get_commit_list(zilog_t *zilog) { uint64_t otxg, txg; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. That's okay since we'll * only commit things in the future. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If we're adding itx records to the zl_itx_commit_list, * then the zil better be dirty in this "txg". We can assert * that here since we're holding the itxg_lock which will * prevent spa_sync from cleaning it. Once we add the itxs * to the zl_itx_commit_list we must commit it to disk even * if it's unnecessary (i.e. the txg was synced). */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); mutex_exit(&itxg->itxg_lock); } } /* * Move the async itxs for a specified object to commit into sync lists. */ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If a foid is specified then find that node and append its * list. Otherwise walk the tree appending all the lists * to the sync list. We add to the end rather than the * beginning to ensure the create has happened. */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { ian = avl_find(t, &foid, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); } } else { void *cookie = NULL; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); list_destroy(&ian->ia_list); kmem_free(ian, sizeof (itx_async_node_t)); } } mutex_exit(&itxg->itxg_lock); } } /* * This function will prune commit itxs that are at the head of the * commit list (it won't prune past the first non-commit itx), and * either: a) attach them to the last lwb that's still pending * completion, or b) skip them altogether. * * This is used as a performance optimization to prevent commit itxs * from generating new lwbs when it's unnecessary to do so. */ static void zil_prune_commit_list(zilog_t *zilog) { itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); while (itx = list_head(&zilog->zl_itx_commit_list)) { lr_t *lrc = &itx->itx_lr; if (lrc->lrc_txtype != TX_COMMIT) break; mutex_enter(&zilog->zl_lock); lwb_t *last_lwb = zilog->zl_last_lwb_opened; if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) { /* * All of the itxs this waiter was waiting on * must have already completed (or there were * never any itx's for it to wait on), so it's * safe to skip this waiter and mark it done. */ zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); itx->itx_private = NULL; } mutex_exit(&zilog->zl_lock); list_remove(&zilog->zl_itx_commit_list, itx); zil_itx_destroy(itx); } IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); } static void zil_commit_writer_stall(zilog_t *zilog) { /* * When zio_alloc_zil() fails to allocate the next lwb block on * disk, we must call txg_wait_synced() to ensure all of the * lwbs in the zilog's zl_lwb_list are synced and then freed (in * zil_sync()), such that any subsequent ZIL writer (i.e. a call * to zil_process_commit_list()) will have to call zil_create(), * and start a new ZIL chain. * * Since zil_alloc_zil() failed, the lwb that was previously * issued does not have a pointer to the "next" lwb on disk. * Thus, if another ZIL writer thread was to allocate the "next" * on-disk lwb, that block could be leaked in the event of a * crash (because the previous lwb on-disk would not point to * it). * * We must hold the zilog's zl_issuer_lock while we do this, to * ensure no new threads enter zil_process_commit_list() until * all lwb's in the zl_lwb_list have been synced and freed * (which is achieved via the txg_wait_synced() call). */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); txg_wait_synced(zilog->zl_dmu_pool, 0); ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); } /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly * created lwbs. Additionally, as a new lwb is created, the previous * lwb will be issued to the zio layer to be written to disk. */ static void zil_process_commit_list(zilog_t *zilog) { spa_t *spa = zilog->zl_spa; list_t nolwb_waiters; lwb_t *lwb; itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); /* * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). */ if (list_head(&zilog->zl_itx_commit_list) == NULL) return; list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { lwb = zil_create(zilog); } else { ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); } while (itx = list_head(&zilog->zl_itx_commit_list)) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; ASSERT3U(txg, !=, 0); if (lrc->lrc_txtype == TX_COMMIT) { DTRACE_PROBE2(zil__process__commit__itx, zilog_t *, zilog, itx_t *, itx); } else { DTRACE_PROBE2(zil__process__normal__itx, zilog_t *, zilog, itx_t *, itx); } boolean_t synced = txg <= spa_last_synced_txg(spa); boolean_t frozen = txg > spa_freeze_txg(spa); /* * If the txg of this itx has already been synced out, then * we don't need to commit this itx to an lwb. This is * because the data of this itx will have already been * written to the main pool. This is inherently racy, and * it's still ok to commit an itx whose txg has already * been synced; this will result in a write that's * unnecessary, but will do no harm. * * With that said, we always want to commit TX_COMMIT itxs * to an lwb, regardless of whether or not that itx's txg * has been synced out. We do this to ensure any OPENED lwb * will always have at least one zil_commit_waiter_t linked * to the lwb. * * As a counter-example, if we skipped TX_COMMIT itx's * whose txg had already been synced, the following * situation could occur if we happened to be racing with * spa_sync: * * 1. we commit a non-TX_COMMIT itx to an lwb, where the * itx's txg is 10 and the last synced txg is 9. * 2. spa_sync finishes syncing out txg 10. * 3. we move to the next itx in the list, it's a TX_COMMIT * whose txg is 10, so we skip it rather than committing * it to the lwb used in (1). * * If the itx that is skipped in (3) is the last TX_COMMIT * itx in the commit list, than it's possible for the lwb * used in (1) to remain in the OPENED state indefinitely. * * To prevent the above scenario from occuring, ensuring * that once an lwb is OPENED it will transition to ISSUED * and eventually DONE, we always commit TX_COMMIT itx's to * an lwb here, even if that itx's txg has already been * synced. * * Finally, if the pool is frozen, we _always_ commit the * itx. The point of freezing the pool is to prevent data * from being written to the main pool via spa_sync, and * instead rely solely on the ZIL to persistently store the * data; i.e. when the pool is frozen, the last synced txg * value can't be trusted. */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { lwb = zil_lwb_commit(zilog, itx, lwb); } else if (lrc->lrc_txtype == TX_COMMIT) { ASSERT3P(lwb, ==, NULL); zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } } list_remove(&zilog->zl_itx_commit_list, itx); zil_itx_destroy(itx); } if (lwb == NULL) { /* * This indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this happens, we must stall * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. */ zil_commit_writer_stall(zilog); /* * Additionally, we have to signal and mark the "nolwb" * waiters as "done" here, since without an lwb, we * can't do this via zil_lwb_flush_vdevs_done() like * normal. */ zil_commit_waiter_t *zcw; while (zcw = list_head(&nolwb_waiters)) { zil_commit_waiter_skip(zcw); list_remove(&nolwb_waiters, zcw); } } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); /* * At this point, the ZIL block pointed at by the "lwb" * variable is in one of the following states: "closed" * or "open". * * If its "closed", then no itxs have been committed to * it, so there's no point in issuing its zio (i.e. * it's "empty"). * * If its "open" state, then it contains one or more * itxs that eventually need to be committed to stable * storage. In this case we intentionally do not issue * the lwb's zio to disk yet, and instead rely on one of * the following two mechanisms for issuing the zio: * * 1. Ideally, there will be more ZIL activity occuring * on the system, such that this function will be * immediately called again (not necessarily by the same * thread) and this lwb's zio will be issued via * zil_lwb_commit(). This way, the lwb is guaranteed to * be "full" when it is issued to disk, and we'll make * use of the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occuring on * the system, such that this lwb's zio isn't issued via * zil_lwb_commit(), zil_commit_waiter() will issue the * lwb's zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount * of ZIL activity occuring on the system at that time. * * We do this for a couple of reasons: * * 1. To try and reduce the number of IOPs needed to * write the same number of itxs. If an lwb has space * available in it's buffer for more itxs, and more itxs * will be committed relatively soon (relative to the * latency of performing a write), then it's beneficial * to wait for these "next" itxs. This way, more itxs * can be committed to stable storage with fewer writes. * * 2. To try and use the largest lwb block size that the * incoming rate of itxs can support. Again, this is to * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. */ } } /* * This function is responsible for ensuring the passed in commit waiter * (and associated commit itx) is committed to an lwb. If the waiter is * not already committed to an lwb, all itxs in the zilog's queue of * itxs will be processed. The assumption is the passed in waiter's * commit itx will found in the queue just like the other non-commit * itxs, such that when the entire queue is processed, the waiter will * have been commited to an lwb. * * The lwb associated with the passed in waiter is not guaranteed to * have been issued by the time this function completes. If the lwb is * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). */ static void zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { /* * It's possible that, while we were waiting to acquire * the "zl_issuer_lock", another thread committed this * waiter to an lwb. If that occurs, we bail out early, * without processing any of the zilog's queue of itxs. * * On certain workloads and system configurations, the * "zl_issuer_lock" can become highly contended. In an * attempt to reduce this contention, we immediately drop * the lock if the waiter has already been processed. * * We've measured this optimization to reduce CPU spent * contending on this lock by up to 5%, using a system * with 32 CPUs, low latency storage (~50 usec writes), * and 1024 threads performing sync writes. */ goto out; } zil_get_commit_list(zilog); zil_prune_commit_list(zilog); zil_process_commit_list(zilog); out: mutex_exit(&zilog->zl_issuer_lock); } static void zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); ASSERT3B(zcw->zcw_done, ==, B_FALSE); lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); /* * If the lwb has already been issued by another thread, we can * immediately return since there's no work to be done (the * point of this function is to issue the lwb). Additionally, we * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ if (lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE) return; /* * In order to call zil_lwb_write_issue() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are aquired in the opposite order * elsewhere. */ mutex_exit(&zcw->zcw_lock); mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zcw->zcw_lock); /* * Since we just dropped and re-acquired the commit waiter's * lock, we have to re-check to see if the waiter was marked * "done" during that process. If the waiter was marked "done", * the "lwb" pointer is no longer valid (it can be free'd after * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ if (zcw->zcw_done) goto out; ASSERT3P(lwb, ==, zcw->zcw_lwb); /* * We've already checked this above, but since we hadn't acquired * the zilog's zl_issuer_lock, we have to perform this check a * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb * _can_ transition from ISSUED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in * the ISSUED or DONE states. * * The important thing, is we treat the lwb differently depending on * if it's ISSUED or OPENED, and block any other threads that might * attempt to issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call * zil_lwb_write_issue() if the lwb had already been issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE) goto out; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); /* * As described in the comments above zil_commit_waiter() and * zil_process_commit_list(), we need to issue this lwb's zio * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED); /* * Since the lwb's zio hadn't been issued by the time this thread * reached its timeout, we reset the zilog's "zl_cur_used" field * to influence the zil block size selection algorithm. * * By having to issue the lwb's zio here, it means the size of the * lwb was too large, given the incoming throughput of itxs. By * setting "zl_cur_used" to zero, we communicate this fact to the * block size selection algorithm, so it can take this informaiton * into account, and potentially select a smaller size for the * next lwb block that is allocated. */ zilog->zl_cur_used = 0; if (nlwb == NULL) { /* * When zil_lwb_write_issue() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. * * We must drop the commit waiter's lock prior to * calling zil_commit_writer_stall() or else we can wind * up with the following deadlock: * * - This thread is waiting for the txg to sync while * holding the waiter's lock; txg_wait_synced() is * used within txg_commit_writer_stall(). * * - The txg can't sync because it is waiting for this * lwb's zio callback to call dmu_tx_commit(). * * - The lwb's zio callback can't call dmu_tx_commit() * because it's blocked trying to acquire the waiter's * lock, which occurs prior to calling dmu_tx_commit() */ mutex_exit(&zcw->zcw_lock); zil_commit_writer_stall(zilog); mutex_enter(&zcw->zcw_lock); } out: mutex_exit(&zilog->zl_issuer_lock); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); } /* * This function is responsible for performing the following two tasks: * * 1. its primary responsibility is to block until the given "commit * waiter" is considered "done". * * 2. its secondary responsibility is to issue the zio for the lwb that * the given "commit waiter" is waiting on, if this function has * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using * the ZIL, the lwb's zio will be issued via the zil_lwb_commit() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is not other synchronous * activity on the system. * * For more details, see zil_process_commit_list(); more specifically, * the comment at the bottom of that function. */ static void zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(spa_writeable(zilog->zl_spa)); mutex_enter(&zcw->zcw_lock); /* * The timeout is scaled based on the lwb latency to avoid * significantly impacting the latency of each individual itx. * For more details, see the comment at the bottom of the * zil_process_commit_list() function. */ int pct = MAX(zfs_commit_timeout_pct, 1); #if defined(illumos) || !defined(_KERNEL) hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; hrtime_t wakeup = gethrtime() + sleep; #else sbintime_t sleep = nstosbt((zilog->zl_last_lwb_latency * pct) / 100); sbintime_t wakeup = getsbinuptime() + sleep; #endif boolean_t timedout = B_FALSE; while (!zcw->zcw_done) { ASSERT(MUTEX_HELD(&zcw->zcw_lock)); lwb_t *lwb = zcw->zcw_lwb; /* * Usually, the waiter will have a non-NULL lwb field here, * but it's possible for it to be NULL as a result of * zil_commit() racing with spa_sync(). * * When zil_clean() is called, it's possible for the itxg * list (which may be cleaned via a taskq) to contain * commit itxs. When this occurs, the commit waiters linked * off of these commit itxs will not be committed to an * lwb. Additionally, these commit waiters will not be * marked done until zil_commit_waiter_skip() is called via * zil_itxg_clean(). * * Thus, it's possible for this commit waiter (i.e. the * "zcw" variable) to be found in this "in between" state; * where it's "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so it's "zcw_done" field is still B_FALSE. */ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); /* * If the lwb hasn't been issued yet, then we * need to wait with a timeout, in case this * function needs to issue the lwb after the * timeout is reached; responsibility (2) from * the comment above this function. */ #if defined(illumos) || !defined(_KERNEL) clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv, &zcw->zcw_lock, wakeup, USEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE); if (timeleft >= 0 || zcw->zcw_done) continue; #else int wait_err = cv_timedwait_sbt(&zcw->zcw_cv, &zcw->zcw_lock, wakeup, SBT_1NS, C_ABSOLUTE); if (wait_err != EWOULDBLOCK || zcw->zcw_done) continue; #endif timedout = B_TRUE; zil_commit_waiter_timeout(zilog, zcw); if (!zcw->zcw_done) { /* * If the commit waiter has already been * marked "done", it's possible for the * waiter's lwb structure to have already * been freed. Thus, we can only reliably * make these assertions if the waiter * isn't done. */ ASSERT3P(lwb, ==, zcw->zcw_lwb); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); } } else { /* * If the lwb isn't open, then it must have already * been issued. In that case, there's no need to * use a timeout when waiting for the lwb to * complete. * * Additionally, if the lwb is NULL, the waiter * will soon be signalled and marked done via * zil_clean() and zil_itxg_clean(), so no timeout * is required. */ IMPLY(lwb != NULL, lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); } } mutex_exit(&zcw->zcw_lock); } static zil_commit_waiter_t * zil_alloc_commit_waiter() { zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&zcw->zcw_node); zcw->zcw_lwb = NULL; zcw->zcw_done = B_FALSE; zcw->zcw_zio_error = 0; return (zcw); } static void zil_free_commit_waiter(zil_commit_waiter_t *zcw) { ASSERT(!list_link_active(&zcw->zcw_node)); ASSERT3P(zcw->zcw_lwb, ==, NULL); ASSERT3B(zcw->zcw_done, ==, B_TRUE); mutex_destroy(&zcw->zcw_lock); cv_destroy(&zcw->zcw_cv); kmem_cache_free(zil_zcw_cache, zcw); } /* * This function is used to create a TX_COMMIT itx and assign it. This * way, it will be linked into the ZIL's list of synchronous itxs, and * then later committed to an lwb (or skipped) when * zil_process_commit_list() is called. */ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; itx->itx_private = zcw; zil_itx_assign(zilog, itx, tx); dmu_tx_commit(tx); } /* * Commit ZFS Intent Log transactions (itxs) to stable storage. * * When writing ZIL transactions to the on-disk representation of the * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple * itxs can be committed to a single lwb. Once a lwb is written and * committed to stable storage (i.e. the lwb is written, and vdevs have * been flushed), each itx that was committed to that lwb is also * considered to be committed to stable storage. * * When an itx is committed to an lwb, the log record (lr_t) contained * by the itx is copied into the lwb's zio buffer, and once this buffer * is written to disk, it becomes an on-disk ZIL block. * * As itxs are generated, they're inserted into the ZIL's queue of * uncommitted itxs. The semantics of zil_commit() are such that it will * block until all itxs that were in the queue when it was called, are * committed to stable storage. * * If "foid" is zero, this means all "synchronous" and "asynchronous" * itxs, for all objects in the dataset, will be committed to stable * storage prior to zil_commit() returning. If "foid" is non-zero, all * "synchronous" itxs for all objects, but only "asynchronous" itxs * that correspond to the foid passed in, will be committed to stable * storage prior to zil_commit() returning. * * Generally speaking, when zil_commit() is called, the consumer doesn't * actually care about _all_ of the uncommitted itxs. Instead, they're * simply trying to waiting for a specific itx to be committed to disk, * but the interface(s) for interacting with the ZIL don't allow such * fine-grained communication. A better interface would allow a consumer * to create and assign an itx, and then pass a reference to this itx to * zil_commit(); such that zil_commit() would return as soon as that * specific itx was committed to disk (instead of waiting for _all_ * itxs to be committed). * * When a thread calls zil_commit() a special "commit itx" will be * generated, along with a corresponding "waiter" for this commit itx. * zil_commit() will wait on this waiter's CV, such that when the waiter * is marked done, and signalled, zil_commit() will return. * * This commit itx is inserted into the queue of uncommitted itxs. This * provides an easy mechanism for determining which itxs were in the * queue prior to zil_commit() having been called, and which itxs were * added after zil_commit() was called. * * The commit it is special; it doesn't have any on-disk representation. * When a commit itx is "committed" to an lwb, the waiter associated * with it is linked onto the lwb's list of waiters. Then, when that lwb * completes, each waiter on the lwb's list is marked done and signalled * -- allowing the thread waiting on the waiter to return from zil_commit(). * * It's important to point out a few critical factors that allow us * to make use of the commit itxs, commit waiters, per-lwb lists of * commit waiters, and zio completion callbacks like we're doing: * * 1. The list of waiters for each lwb is traversed, and each commit * waiter is marked "done" and signalled, in the zio completion * callback of the lwb's zio[*]. * * * Actually, the waiters are signalled in the zio completion * callback of the root zio for the DKIOCFLUSHWRITECACHE commands * that are sent to the vdevs upon completion of the lwb zio. * * 2. When the itxs are inserted into the ZIL's queue of uncommitted * itxs, the order in which they are inserted is preserved[*]; as * itxs are added to the queue, they are added to the tail of * in-memory linked lists. * * When committing the itxs to lwbs (to be written to disk), they * are committed in the same order in which the itxs were added to * the uncommitted queue's linked list(s); i.e. the linked list of * itxs to commit is traversed from head to tail, and each itx is * committed to an lwb in that order. * * * To clarify: * * - the order of "sync" itxs is preserved w.r.t. other * "sync" itxs, regardless of the corresponding objects. * - the order of "async" itxs is preserved w.r.t. other * "async" itxs corresponding to the same object. * - the order of "async" itxs is *not* preserved w.r.t. other * "async" itxs corresponding to different objects. * - the order of "sync" itxs w.r.t. "async" itxs (or vice * versa) is *not* preserved, even for itxs that correspond * to the same object. * * For more details, see: zil_itx_assign(), zil_async_to_sync(), * zil_get_commit_list(), and zil_process_commit_list(). * * 3. The lwbs represent a linked list of blocks on disk. Thus, any * lwb cannot be considered committed to stable storage, until its * "previous" lwb is also committed to stable storage. This fact, * coupled with the fact described above, means that itxs are * committed in (roughly) the order in which they were generated. * This is essential because itxs are dependent on prior itxs. * Thus, we *must not* deem an itx as being committed to stable * storage, until *all* prior itxs have also been committed to * stable storage. * * To enforce this ordering of lwb zio's, while still leveraging as * much of the underlying storage performance as possible, we rely * on two fundamental concepts: * * 1. The creation and issuance of lwb zio's is protected by * the zilog's "zl_issuer_lock", which ensures only a single * thread is creating and/or issuing lwb's at a time * 2. The "previous" lwb is a child of the "current" lwb * (leveraging the zio parent-child depenency graph) * * By relying on this parent-child zio relationship, we can have * many lwb zio's concurrently issued to the underlying storage, * but the order in which they complete will be the same order in * which they were created. */ void zil_commit(zilog_t *zilog, uint64_t foid) { /* * We should never attempt to call zil_commit on a snapshot for * a couple of reasons: * * 1. A snapshot may never be modified, thus it cannot have any * in-flight itxs that would have modified the dataset. * * 2. By design, when zil_commit() is called, a commit itx will * be assigned to this zilog; as a result, the zilog will be * dirtied. We must not dirty the zilog of a snapshot; there's * checks in the code that enforce this invariant, and will * cause a panic if it's not upheld. */ ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); if (zilog->zl_sync == ZFS_SYNC_DISABLED) return; if (!spa_writeable(zilog->zl_spa)) { /* * If the SPA is not writable, there should never be any * pending itxs waiting to be committed to disk. If that * weren't true, we'd skip writing those itxs out, and * would break the sematics of zil_commit(); thus, we're * verifying that truth before we return to the caller. */ ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); for (int i = 0; i < TXG_SIZE; i++) ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); return; } /* * If the ZIL is suspended, we don't want to dirty it by calling * zil_commit_itx_assign() below, nor can we write out * lwbs like would be done in zil_commit_write(). Thus, we * simply rely on txg_wait_synced() to maintain the necessary * semantics, and avoid calling those functions altogether. */ if (zilog->zl_suspend > 0) { txg_wait_synced(zilog->zl_dmu_pool, 0); return; } zil_commit_impl(zilog, foid); } void zil_commit_impl(zilog_t *zilog, uint64_t foid) { /* * Move the "async" itxs for the specified foid to the "sync" * queues, such that they will be later committed (or skipped) * to an lwb when zil_process_commit_list() is called. * * Since these "async" itxs must be committed prior to this * call to zil_commit returning, we must perform this operation * before we call zil_commit_itx_assign(). */ zil_async_to_sync(zilog, foid); /* * We allocate a new "waiter" structure which will initially be * linked to the commit itx using the itx's "itx_private" field. * Since the commit itx doesn't represent any on-disk state, * when it's committed to an lwb, rather than copying the its * lr_t into the lwb's buffer, the commit itx's "waiter" will be * added to the lwb's list of waiters. Then, when the lwb is * committed to stable storage, each waiter in the lwb's list of * waiters will be marked "done", and signalled. * * We must create the waiter and assign the commit itx prior to * calling zil_commit_writer(), or else our specific commit itx * is not guaranteed to be committed to an lwb prior to calling * zil_commit_waiter(). */ zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { /* * If there was an error writing out the ZIL blocks that * this thread is waiting on, then we fallback to * relying on spa_sync() to write out the data this * thread is waiting on. Obviously this has performance * implications, but the expectation is for this to be * an exceptional case, and shouldn't occur often. */ DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); } zil_free_commit_waiter(zcw); } /* * Called in syncing context to free committed log blocks and update log header. */ void zil_sync(zilog_t *zilog, dmu_tx_t *tx) { zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; /* * We don't zero out zl_destroy_txg, so make sure we don't try * to destroy it twice. */ if (spa_sync_pass(spa) != 1) return; mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); zh->zh_replay_seq = *replayed_seq; *replayed_seq = 0; } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; ASSERT(list_head(&zilog->zl_lwb_list) == NULL); bzero(zh, sizeof (zil_header_t)); bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* * If this block was part of log chain that couldn't * be claimed because a device was missing during * zil_claim(), but that device later returns, * then this block could erroneously appear valid. * To guard against this, assign a new GUID to the new * log chain so it doesn't matter what blk points to. */ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* * If we don't have anything left in the lwb list then * we've had an allocation failure and we need to zero * out the zil_header blkptr so that we don't end * up freeing the same block twice. */ if (list_head(&zilog->zl_lwb_list) == NULL) BP_ZERO(&zh->zh_log); } mutex_exit(&zilog->zl_lock); } /* ARGSUSED */ static int zil_lwb_cons(void *vbuf, void *unused, int kmflag) { lwb_t *lwb = vbuf; list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } /* ARGSUSED */ static void zil_lwb_dest(void *vbuf, void *unused) { lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); avl_destroy(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); } void zil_init(void) { zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); zil_zcw_cache = kmem_cache_create("zil_zcw_cache", sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zil_fini(void) { kmem_cache_destroy(zil_zcw_cache); kmem_cache_destroy(zil_lwb_cache); } void zil_set_sync(zilog_t *zilog, uint64_t sync) { zilog->zl_sync = sync; } void zil_set_logbias(zilog_t *zilog, uint64_t logbias) { zilog->zl_logbias = logbias; } zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { zilog_t *zilog; zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); zilog->zl_header = zh_phys; zilog->zl_os = os; zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; zilog->zl_logbias = dmu_objset_logbias(os); zilog->zl_sync = dmu_objset_syncprop(os); zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); for (int i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, MUTEX_DEFAULT, NULL); } list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); return (zilog); } void zil_free(zilog_t *zilog) { zilog->zl_stop_sync = 1; ASSERT0(zilog->zl_suspend); ASSERT0(zilog->zl_suspending); ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); list_destroy(&zilog->zl_itx_commit_list); for (int i = 0; i < TXG_SIZE; i++) { /* * It's possible for an itx to be generated that doesn't dirty * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() * callback to remove the entry. We remove those here. * * Also free up the ziltest itxs. */ if (zilog->zl_itxg[i].itxg_itxs) zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); mutex_destroy(&zilog->zl_itxg[i].itxg_lock); } mutex_destroy(&zilog->zl_issuer_lock); mutex_destroy(&zilog->zl_lock); cv_destroy(&zilog->zl_cv_suspend); kmem_free(zilog, sizeof (zilog_t)); } /* * Open an intent log. */ zilog_t * zil_open(objset_t *os, zil_get_data_t *get_data) { zilog_t *zilog = dmu_objset_zil(os); ASSERT3P(zilog->zl_get_data, ==, NULL); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; return (zilog); } /* * Close an intent log. */ void zil_close(zilog_t *zilog) { lwb_t *lwb; uint64_t txg; if (!dmu_objset_is_snapshot(zilog->zl_os)) { zil_commit(zilog, 0); } else { ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); ASSERT0(zilog->zl_dirty_max_txg); ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); } mutex_enter(&zilog->zl_lock); lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) txg = zilog->zl_dirty_max_txg; else txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); mutex_exit(&zilog->zl_lock); /* * We need to use txg_wait_synced() to wait long enough for the * ZIL to be clean, and to wait for all pending lwbs to be * written out. */ if (txg) txg_wait_synced(zilog->zl_dmu_pool, txg); - if (zilog_is_dirty(zilog)) - zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); if (txg < spa_freeze_txg(zilog->zl_spa)) - VERIFY(!zilog_is_dirty(zilog)); + ASSERT(!zilog_is_dirty(zilog)); zilog->zl_get_data = NULL; /* * We should have only one lwb left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); lwb = list_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); list_remove(&zilog->zl_lwb_list, lwb); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); } static char *suspend_tag = "zil suspending"; /* * Suspend an intent log. While in suspended mode, we still honor * synchronous semantics, but we rely on txg_wait_synced() to do it. * On old version pools, we suspend the log briefly when taking a * snapshot so that it will have an empty intent log. * * Long holds are not really intended to be used the way we do here -- * held for such a short time. A concurrent caller of dsl_dataset_long_held() * could fail. Therefore we take pains to only put a long hold if it is * actually necessary. Fortunately, it will only be necessary if the * objset is currently mounted (or the ZVOL equivalent). In that case it * will already have a long hold, so we are not really making things any worse. * * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or * zvol_state_t), and use their mechanism to prevent their hold from being * dropped (e.g. VFS_HOLD()). However, that would be even more pain for * very little gain. * * if cookiep == NULL, this does both the suspend & resume. * Otherwise, it returns with the dataset "long held", and the cookie * should be passed into zil_resume(). */ int zil_suspend(const char *osname, void **cookiep) { objset_t *os; zilog_t *zilog; const zil_header_t *zh; int error; error = dmu_objset_hold(osname, suspend_tag, &os); if (error != 0) return (error); zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } /* * Don't put a long hold in the cases where we can avoid it. This * is when there is no cookie so we are doing a suspend & resume * (i.e. called from zil_vdev_offline()), and there's nothing to do * for the suspend because it's already suspended, or there's no ZIL. */ if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (0); } dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; if (zilog->zl_suspend > 1) { /* * Someone else is already suspending it. * Just wait for them to finish. */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } /* * If there is no pointer to an on-disk block, this ZIL must not * be active (e.g. filesystem not mounted), so there's nothing * to clean up. */ if (BP_IS_HOLE(&zh->zh_log)) { ASSERT(cookiep != NULL); /* fast path already handled */ *cookiep = os; mutex_exit(&zilog->zl_lock); return (0); } zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); /* * We need to use zil_commit_impl to ensure we wait for all * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be * LWB_STATE_FLUSH_DONE before returning. */ zil_commit_impl(zilog, 0); /* * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we * use txg_wait_synced() to ensure the data from the zilog has * migrated to the main pool before calling zil_destroy(). */ txg_wait_synced(zilog->zl_dmu_pool, 0); zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); zilog->zl_suspending = B_FALSE; cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } void zil_resume(void *cookie) { objset_t *os = cookie; zilog_t *zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_suspend != 0); zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); } typedef struct zil_replay_arg { zil_replay_func_t **zr_replay; void *zr_arg; boolean_t zr_byteswap; char *zr_lr; } zil_replay_arg_t; static int zil_replay_error(zilog_t *zilog, lr_t *lr, int error) { char name[ZFS_MAX_DATASET_NAME_LEN]; zilog->zl_replaying_seq--; /* didn't actually replay this one */ dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)(lr->lrc_txtype & ~TX_CI), (lr->lrc_txtype & TX_CI) ? "CI" : ""); return (error); } static int zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; int error = 0; zilog->zl_replaying_seq = lr->lrc_seq; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return (0); if (lr->lrc_txg < claim_txg) /* already committed */ return (0); /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; if (txtype == 0 || txtype >= TX_MAX_TYPE) return (zil_replay_error(zilog, lr, EINVAL)); /* * If this record type can be logged out of order, the object * (lr_foid) may no longer exist. That's legitimate, not an error. */ if (TX_OOO(txtype)) { error = dmu_object_info(zilog->zl_os, LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); if (error == ENOENT || error == EEXIST) return (0); } /* * Make a copy of the data so we can revise and extend it. */ bcopy(lr, zr->zr_lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. */ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { error = zil_read_log_data(zilog, (lr_write_t *)lr, zr->zr_lr + reclen); if (error != 0) return (zil_replay_error(zilog, lr, error)); } /* * The log block containing this lr may have been byteswapped * so that we can easily examine common fields like lrc_txtype. * However, the log is a mix of different record types, and only the * replay vectors know how to byteswap their records. Therefore, if * the lr was byteswapped, undo it before invoking the replay vector. */ if (zr->zr_byteswap) byteswap_uint64_array(zr->zr_lr, reclen); /* * We must now do two things atomically: replay this log record, * and update the log header sequence number to reflect the fact that * we did so. At the end of each replay function the sequence number * is updated if we are in replay mode. */ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); if (error != 0) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with * EEXIST. So if we receive any error we try syncing out * any removes then retry the transaction. Note that we * specify B_FALSE for byteswap now, so we don't do it twice. */ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); if (error != 0) return (zil_replay_error(zilog, lr, error)); } return (0); } /* ARGSUSED */ static int zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zilog->zl_replay_blks++; return (0); } /* * If this dataset has a non-empty intent log, replay it and destroy it. */ void zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { zil_destroy(zilog, B_TRUE); return; } zr.zr_replay = replay_func; zr.zr_arg = arg; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* * Wait for in-progress removes to sync before starting replay. */ txg_wait_synced(zilog->zl_dmu_pool, 0); zilog->zl_replay = B_TRUE; zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg); kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; } boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx) { if (zilog->zl_sync == ZFS_SYNC_DISABLED) return (B_TRUE); if (zilog->zl_replay) { dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = zilog->zl_replaying_seq; return (B_TRUE); } return (B_FALSE); } /* ARGSUSED */ int zil_reset(const char *osname, void *arg) { int error; error = zil_suspend(osname, NULL); if (error != 0) return (SET_ERROR(EEXIST)); return (0); } Index: head/sys/cddl/contrib/opensolaris =================================================================== --- head/sys/cddl/contrib/opensolaris (revision 351076) +++ head/sys/cddl/contrib/opensolaris (revision 351077) Property changes on: head/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,1 +0,0 ## Reverse-merged /vendor-sys/illumos/dist:r350898,351075