diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c index 32d29fd44814..eedd17c30710 100644 --- a/cmd/zhack/zhack.c +++ b/cmd/zhack/zhack.c @@ -1,668 +1,569 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ /* * zhack is a debugging tool that can write changes to ZFS pool using libzpool * for testing purposes. Altering pools with zhack is unsupported and may * result in corrupted pools. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef ZFS_MAXNAMELEN #include extern boolean_t zfeature_checks_disable; const char cmdname[] = "zhack"; libzfs_handle_t *g_zfs; static importargs_t g_importargs; static char *g_pool; static boolean_t g_readonly; -static boolean_t g_force = B_FALSE; static void usage(void) { (void) fprintf(stderr, - "Usage: %s [-c cachefile] [-d dir] [-f] <subcommand> <args> ...\n" + "Usage: %s [-c cachefile] [-d dir] <subcommand> <args> ...\n" "where <subcommand> <args> is one of the following:\n" "\n", cmdname); (void) fprintf(stderr, " feature stat <pool>\n" " print information about enabled features\n" " feature enable [-d desc] <pool> <feature>\n" " add a new enabled feature to the pool\n" " -d <desc> sets the feature's description\n" - " feature disable <pool> <feature>\n" - " remove an enabled, but not active, feature\n" - " from the pool.\n" " feature ref [-md] <pool> <feature>\n" " change the refcount on the given feature\n" " -d decrease instead of increase the refcount\n" " -m add the feature to the label if increasing refcount\n" "\n" " <feature> : should be a feature guid\n"); exit(1); } static void fatal(spa_t *spa, void *tag, const char *fmt, ...) { va_list ap; if (spa != NULL) { spa_close(spa, tag); (void) spa_export(g_pool, NULL, B_TRUE, B_FALSE); } va_start(ap, fmt); (void) fprintf(stderr, "%s: ", cmdname); (void) vfprintf(stderr, fmt, ap); va_end(ap); (void) fprintf(stderr, "\n"); exit(1); } /* ARGSUSED */ static int space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) { /* * Is it a valid type of object to track? */ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (ENOENT); (void) fprintf(stderr, "modifying object that needs user accounting"); abort(); /* NOTREACHED */ } /* * Target is the dataset whose pool we want to open. */ static void import_pool(const char *target, boolean_t readonly) { nvlist_t *config; nvlist_t *pools; int error; char *sepp; spa_t *spa; nvpair_t *elem; nvlist_t *props; char *name; kernel_init(readonly ?
FREAD : (FREAD | FWRITE)); g_zfs = libzfs_init(); ASSERT(g_zfs != NULL); dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb); g_readonly = readonly; /* * If we only want readonly access, it's OK if we find * a potentially-active (ie, imported into the kernel) pool from the * default cachefile. */ if (readonly && spa_open(target, &spa, FTAG) == 0) { spa_close(spa, FTAG); return; } g_importargs.unique = B_TRUE; g_importargs.can_be_active = readonly; g_pool = strdup(target); if ((sepp = strpbrk(g_pool, "/@")) != NULL) *sepp = '\0'; g_importargs.poolname = g_pool; pools = zpool_search_import(g_zfs, &g_importargs); if (nvlist_empty(pools)) { if (!g_importargs.can_be_active) { g_importargs.can_be_active = B_TRUE; if (zpool_search_import(g_zfs, &g_importargs) != NULL || spa_open(target, &spa, FTAG) == 0) { fatal(spa, FTAG, "cannot import '%s': pool is " "active; run " "\"zpool export %s\" " "first\n", g_pool, g_pool); } } fatal(NULL, FTAG, "cannot import '%s': no such pool " "available\n", g_pool); } elem = nvlist_next_nvpair(pools, NULL); name = nvpair_name(elem); VERIFY(nvpair_value_nvlist(elem, &config) == 0); props = NULL; if (readonly) { VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0); } zfeature_checks_disable = B_TRUE; error = spa_import(name, config, props, ZFS_IMPORT_NORMAL); zfeature_checks_disable = B_FALSE; if (error == EEXIST) error = 0; if (error) fatal(NULL, FTAG, "can't import '%s': %s", name, strerror(error)); } static void zhack_spa_open(const char *target, boolean_t readonly, void *tag, spa_t **spa) { int err; import_pool(target, readonly); zfeature_checks_disable = B_TRUE; err = spa_open(target, spa, tag); zfeature_checks_disable = B_FALSE; if (err != 0) fatal(*spa, FTAG, "cannot open '%s': %s", target, strerror(err)); if (spa_version(*spa) < SPA_VERSION_FEATURES) { fatal(*spa, FTAG, "'%s' has version %d, features not enabled", target, (int)spa_version(*spa)); } } static void dump_obj(objset_t *os, uint64_t obj, const char *name) { zap_cursor_t zc; zap_attribute_t za; (void) printf("%s_obj:\n", name); for (zap_cursor_init(&zc, os, obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { if (za.za_integer_length == 8) { ASSERT(za.za_num_integers == 1); (void) printf("\t%s = %llu\n", za.za_name, (u_longlong_t)za.za_first_integer); } else { ASSERT(za.za_integer_length == 1); char val[1024]; VERIFY(zap_lookup(os, obj, za.za_name, 1, sizeof (val), val) == 0); (void) printf("\t%s = %s\n", za.za_name, val); } } zap_cursor_fini(&zc); } static void dump_mos(spa_t *spa) { nvlist_t *nv = spa->spa_label_features; nvpair_t *pair; (void) printf("label config:\n"); for (pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { (void) printf("\t%s\n", nvpair_name(pair)); } } static void zhack_do_feature_stat(int argc, char **argv) { spa_t *spa; objset_t *os; char *target; argc--; argv++; if (argc < 1) { (void) fprintf(stderr, "error: missing pool name\n"); usage(); } target = argv[0]; zhack_spa_open(target, B_TRUE, FTAG, &spa); os = spa->spa_meta_objset; dump_obj(os, spa->spa_feat_for_read_obj, "for_read"); dump_obj(os, spa->spa_feat_for_write_obj, "for_write"); dump_obj(os, spa->spa_feat_desc_obj, "descriptions"); if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { dump_obj(os, spa->spa_feat_enabled_txg_obj, "enabled_txg"); } dump_mos(spa); spa_close(spa, FTAG); } static void zhack_feature_enable_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = 
dmu_tx_pool(tx)->dp_spa; zfeature_info_t *feature = arg; feature_enable_sync(spa, feature, tx); spa_history_log_internal(spa, "zhack enable feature", tx, "name=%s flags=%u", feature->fi_guid, feature->fi_flags); } static void zhack_do_feature_enable(int argc, char **argv) { char c; char *desc, *target; spa_t *spa; objset_t *mos; zfeature_info_t feature; spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; /* * Features are not added to the pool's label until their refcounts * are incremented, so fi_mos can just be left as false for now. */ desc = NULL; feature.fi_uname = "zhack"; feature.fi_flags = 0; feature.fi_depends = nodeps; feature.fi_feature = SPA_FEATURE_NONE; optind = 1; while ((c = getopt(argc, argv, "rmd:")) != -1) { switch (c) { case 'r': feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT; break; case 'd': desc = strdup(optarg); break; default: usage(); break; } } if (desc == NULL) desc = strdup("zhack injected"); feature.fi_desc = desc; argc -= optind; argv += optind; if (argc < 2) { (void) fprintf(stderr, "error: missing feature or pool name\n"); usage(); } target = argv[0]; feature.fi_guid = argv[1]; if (!zfeature_is_valid_guid(feature.fi_guid)) fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid); zhack_spa_open(target, B_FALSE, FTAG, &spa); mos = spa->spa_meta_objset; - if (zfeature_is_supported(feature.fi_guid) && (g_force == B_FALSE)) - fatal(spa, FTAG, - "'%s' is a real feature, will not enable\n" - "provide the -f option to force override", feature.fi_guid); + if (zfeature_is_supported(feature.fi_guid)) + fatal(spa, FTAG, "'%s' is a real feature, will not enable", feature.fi_guid); if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid)) fatal(spa, FTAG, "feature already enabled: %s", feature.fi_guid); VERIFY0(dsl_sync_task(spa_name(spa), NULL, zhack_feature_enable_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL)); spa_close(spa, FTAG); free(desc); } -static void -zhack_feature_disable_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - zfeature_info_t *feature = arg; - - feature_disable_sync(spa, feature, tx); - - spa_history_log_internal(spa, "zhack disable feature", tx, - "name=%s can_readonly=%u", - feature->fi_guid, feature->fi_can_readonly); -} - -static void -zhack_do_feature_disable(int argc, char **argv) -{ - char c; - char *target; - uint64_t count; - spa_t *spa; - objset_t *mos; - zfeature_info_t feature; - spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; - - /* - * fi_desc does not matter here because it was written to disk - * when the feature was enabled, but we need to properly set the - * feature for read or write based on the information we read off - * disk later.
- */ - feature.fi_uname = "zhack"; - feature.fi_mos = B_TRUE; - feature.fi_desc = NULL; - feature.fi_depends = nodeps; - feature.fi_feature = SPA_FEATURE_NONE; - - optind = 1; - while ((c = getopt(argc, argv, "")) != -1) { - switch (c) { - default: - usage(); - break; - } - } - argc -= optind; - argv += optind; - - if (argc < 2) { - (void) fprintf(stderr, "error: missing feature or pool name\n"); - usage(); - } - target = argv[0]; - feature.fi_guid = argv[1]; - - if (!zfeature_is_valid_guid(feature.fi_guid)) - fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid); - - zhack_spa_open(target, B_FALSE, FTAG, &spa); - mos = spa->spa_meta_objset; - - if (zfeature_is_supported(feature.fi_guid) && (g_force == B_FALSE)) { - fatal(spa, FTAG, - "'%s' is a real feature, will not disable\n" - "provide the -f option to force override", feature.fi_guid); - } - - if (0 == zap_contains(mos, spa->spa_feat_for_read_obj, - feature.fi_guid)) { - feature.fi_can_readonly = B_FALSE; - } else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj, - feature.fi_guid)) { - feature.fi_can_readonly = B_TRUE; - } else { - fatal(spa, FTAG, "feature is not enabled: %s", feature.fi_guid); - } - - if (feature_get_refcount_from_disk(spa, &feature, &count) == 0 && - count > 0) { - fatal(spa, FTAG, "feature '%s' is active, can not disable", - feature.fi_guid); - } - - VERIFY0(dsl_sync_task(spa_name(spa), NULL, - zhack_feature_disable_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL)); - - spa_close(spa, FTAG); -} - static void feature_incr_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; zfeature_info_t *feature = arg; uint64_t refcount; VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); feature_sync(spa, feature, refcount + 1, tx); spa_history_log_internal(spa, "zhack feature incr", tx, "name=%s", feature->fi_guid); } static void feature_decr_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; zfeature_info_t *feature = arg; uint64_t refcount; VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); feature_sync(spa, feature, refcount - 1, tx); spa_history_log_internal(spa, "zhack feature decr", tx, "name=%s", feature->fi_guid); } static void zhack_do_feature_ref(int argc, char **argv) { char c; char *target; boolean_t decr = B_FALSE; spa_t *spa; objset_t *mos; zfeature_info_t feature; spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; /* * fi_desc does not matter here because it was written to disk * when the feature was enabled, but we need to properly set the * feature for read or write based on the information we read off * disk later. 
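 *
 * As a purely illustrative example of the calling convention documented in
 * usage() above (the pool name and guid here are hypothetical):
 *
 *	zhack feature ref -m tank com.example:injected_feature
 *
 * increments the refcount of that already-enabled feature and, because of
 * -m, also adds it to the pool's label.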
*/ feature.fi_uname = "zhack"; feature.fi_flags = 0; feature.fi_desc = NULL; feature.fi_depends = nodeps; feature.fi_feature = SPA_FEATURE_NONE; optind = 1; while ((c = getopt(argc, argv, "md")) != -1) { switch (c) { case 'm': feature.fi_flags |= ZFEATURE_FLAG_MOS; break; case 'd': decr = B_TRUE; break; default: usage(); break; } } argc -= optind; argv += optind; if (argc < 2) { (void) fprintf(stderr, "error: missing feature or pool name\n"); usage(); } target = argv[0]; feature.fi_guid = argv[1]; if (!zfeature_is_valid_guid(feature.fi_guid)) fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid); zhack_spa_open(target, B_FALSE, FTAG, &spa); mos = spa->spa_meta_objset; if (zfeature_is_supported(feature.fi_guid)) { fatal(spa, FTAG, "'%s' is a real feature, will not change refcount"); } if (0 == zap_contains(mos, spa->spa_feat_for_read_obj, feature.fi_guid)) { feature.fi_flags &= ~ZFEATURE_FLAG_READONLY_COMPAT; } else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj, feature.fi_guid)) { feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT; } else { fatal(spa, FTAG, "feature is not enabled: %s", feature.fi_guid); } if (decr) { uint64_t count; if (feature_get_refcount_from_disk(spa, &feature, &count) == 0 && count == 0) { fatal(spa, FTAG, "feature refcount already 0: %s", feature.fi_guid); } } VERIFY0(dsl_sync_task(spa_name(spa), NULL, decr ? feature_decr_sync : feature_incr_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL)); spa_close(spa, FTAG); } static int zhack_do_feature(int argc, char **argv) { char *subcommand; argc--; argv++; if (argc == 0) { (void) fprintf(stderr, "error: no feature operation specified\n"); usage(); } subcommand = argv[0]; if (strcmp(subcommand, "stat") == 0) { zhack_do_feature_stat(argc, argv); } else if (strcmp(subcommand, "enable") == 0) { zhack_do_feature_enable(argc, argv); - } else if (strcmp(subcommand, "disable") == 0) { - zhack_do_feature_disable(argc, argv); } else if (strcmp(subcommand, "ref") == 0) { zhack_do_feature_ref(argc, argv); } else { (void) fprintf(stderr, "error: unknown subcommand: %s\n", subcommand); usage(); } return (0); } #define MAX_NUM_PATHS 1024 int main(int argc, char **argv) { extern void zfs_prop_init(void); char *path[MAX_NUM_PATHS]; const char *subcommand; int rv = 0; char c; g_importargs.path = path; dprintf_setup(&argc, argv); zfs_prop_init(); - while ((c = getopt(argc, argv, "c:d:f")) != -1) { + while ((c = getopt(argc, argv, "c:d:")) != -1) { switch (c) { case 'c': g_importargs.cachefile = optarg; break; case 'd': assert(g_importargs.paths < MAX_NUM_PATHS); g_importargs.path[g_importargs.paths++] = optarg; break; - case 'f': - g_force = B_TRUE; - break; default: usage(); break; } } argc -= optind; argv += optind; optind = 1; if (argc == 0) { (void) fprintf(stderr, "error: no command specified\n"); usage(); } subcommand = argv[0]; if (strcmp(subcommand, "feature") == 0) { rv = zhack_do_feature(argc, argv); } else { (void) fprintf(stderr, "error: unknown subcommand: %s\n", subcommand); usage(); } if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_FALSE) != 0) { fatal(NULL, FTAG, "pool export failed; " "changes may not be committed to disk\n"); } libzfs_fini(g_zfs); kernel_fini(); return (rv); } diff --git a/include/sys/zfeature.h b/include/sys/zfeature.h index 5ea77f847e10..5abde149a615 100644 --- a/include/sys/zfeature.h +++ b/include/sys/zfeature.h @@ -1,75 +1,73 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). 
* You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_ZFEATURE_H #define _SYS_ZFEATURE_H #include #include #include "zfeature_common.h" #ifdef __cplusplus extern "C" { #endif #define VALID_FEATURE_FID(fid) ((fid) >= 0 && (fid) < SPA_FEATURES) #define VALID_FEATURE_OR_NONE(fid) ((fid) == SPA_FEATURE_NONE || \ VALID_FEATURE_FID(fid)) struct spa; struct dmu_tx; struct objset; extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *); extern void spa_feature_enable(struct spa *, spa_feature_t, struct dmu_tx *); extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *); extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *); extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t); extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t); extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg); extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t); extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *); /* * These functions are only exported for zhack and zdb; normal callers should * use the above interfaces. */ extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *); extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, uint64_t *res); extern void feature_enable_sync(struct spa *, zfeature_info_t *, struct dmu_tx *); -extern void feature_disable_sync(struct spa *, zfeature_info_t *, - struct dmu_tx *); extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t, struct dmu_tx *); #ifdef __cplusplus } #endif #endif /* _SYS_ZFEATURE_H */ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index f14552374b6c..89e474c65d44 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -1,1418 +1,1420 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Emulation of kernel services in userland. */ int aok; uint64_t physmem; vnode_t *rootdir = (vnode_t *)0xabcd1234; char hw_serial[HW_HOSTID_LEN]; struct utsname hw_utsname; vmem_t *zio_arena = NULL; /* If set, all blocks read will be copied to the specified directory. */ char *vn_dumpdir = NULL; /* this only exists to have its address taken */ struct proc p0; /* * ========================================================================= * threads * ========================================================================= */ pthread_cond_t kthread_cond = PTHREAD_COND_INITIALIZER; pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER; pthread_key_t kthread_key; int kthread_nr = 0; void thread_init(void) { kthread_t *kt; VERIFY3S(pthread_key_create(&kthread_key, NULL), ==, 0); /* Create entry for primary kthread */ kt = umem_zalloc(sizeof (kthread_t), UMEM_NOFAIL); kt->t_tid = pthread_self(); kt->t_func = NULL; VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0); /* Only the main thread should be running at the moment */ ASSERT3S(kthread_nr, ==, 0); kthread_nr = 1; } void thread_fini(void) { kthread_t *kt = curthread; ASSERT(pthread_equal(kt->t_tid, pthread_self())); ASSERT3P(kt->t_func, ==, NULL); umem_free(kt, sizeof (kthread_t)); /* Wait for all threads to exit via thread_exit() */ VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0); kthread_nr--; /* Main thread is exiting */ while (kthread_nr > 0) VERIFY0(pthread_cond_wait(&kthread_cond, &kthread_lock)); ASSERT3S(kthread_nr, ==, 0); VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0); VERIFY3S(pthread_key_delete(kthread_key), ==, 0); } kthread_t * zk_thread_current(void) { kthread_t *kt = pthread_getspecific(kthread_key); ASSERT3P(kt, !=, NULL); return (kt); } void * zk_thread_helper(void *arg) { kthread_t *kt = (kthread_t *) arg; VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0); VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0); kthread_nr++; VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0); (void) setpriority(PRIO_PROCESS, 0, kt->t_pri); kt->t_tid = pthread_self(); ((thread_func_arg_t) kt->t_func)(kt->t_arg); /* Unreachable, thread must exit with thread_exit() */ abort(); return (NULL); } kthread_t * zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg, size_t len, proc_t *pp, int state, pri_t pri, int detachstate) { kthread_t *kt; pthread_attr_t attr; char *stkstr; ASSERT0(state & ~TS_RUN); kt = umem_zalloc(sizeof (kthread_t), UMEM_NOFAIL); kt->t_func = func; kt->t_arg = arg; kt->t_pri = pri; VERIFY0(pthread_attr_init(&attr)); VERIFY0(pthread_attr_setdetachstate(&attr, detachstate)); /* * We allow the default stack size in user space to be specified by * setting the ZFS_STACK_SIZE environment variable. This allows us * the convenience of observing and debugging stack overruns in * user space. Explicitly specified stack sizes will be honored. * The usage of ZFS_STACK_SIZE is discussed further in the * ENVIRONMENT VARIABLES sections of the ztest(1) man page. */ if (stksize == 0) { stkstr = getenv("ZFS_STACK_SIZE"); if (stkstr == NULL) stksize = TS_STACK_MAX; else stksize = MAX(atoi(stkstr), TS_STACK_MIN); } VERIFY3S(stksize, >, 0); stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE); /* * If this ever fails, it may be because the stack size is not a * multiple of system page size. 
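 * As a worked example (assuming 4 KiB pages and a ZFS_STACK_SIZE value of
 * 300000, taken to be above TS_STACK_MIN): P2ROUNDUP(300000, 4096) yields
 * 303104 bytes (74 pages), so the size handed to pthread_attr_setstacksize()
 * below is always page-aligned.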
*/ VERIFY0(pthread_attr_setstacksize(&attr, stksize)); VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE)); VERIFY0(pthread_create(&kt->t_tid, &attr, &zk_thread_helper, kt)); VERIFY0(pthread_attr_destroy(&attr)); return (kt); } void zk_thread_exit(void) { kthread_t *kt = curthread; ASSERT(pthread_equal(kt->t_tid, pthread_self())); umem_free(kt, sizeof (kthread_t)); VERIFY0(pthread_mutex_lock(&kthread_lock)); kthread_nr--; VERIFY0(pthread_mutex_unlock(&kthread_lock)); VERIFY0(pthread_cond_broadcast(&kthread_cond)); pthread_exit((void *)TS_MAGIC); } void zk_thread_join(kt_did_t tid) { void *ret; pthread_join((pthread_t)tid, &ret); VERIFY3P(ret, ==, (void *)TS_MAGIC); } /* * ========================================================================= * kstats * ========================================================================= */ /*ARGSUSED*/ kstat_t * kstat_create(const char *module, int instance, const char *name, const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag) { return (NULL); } /*ARGSUSED*/ void kstat_install(kstat_t *ksp) {} /*ARGSUSED*/ void kstat_delete(kstat_t *ksp) {} /*ARGSUSED*/ void kstat_waitq_enter(kstat_io_t *kiop) {} /*ARGSUSED*/ void kstat_waitq_exit(kstat_io_t *kiop) {} /*ARGSUSED*/ void kstat_runq_enter(kstat_io_t *kiop) {} /*ARGSUSED*/ void kstat_runq_exit(kstat_io_t *kiop) {} /*ARGSUSED*/ void kstat_waitq_to_runq(kstat_io_t *kiop) {} /*ARGSUSED*/ void kstat_runq_back_to_waitq(kstat_io_t *kiop) {} void kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) {} /* * ========================================================================= * mutexes * ========================================================================= */ void mutex_init(kmutex_t *mp, char *name, int type, void *cookie) { ASSERT3S(type, ==, MUTEX_DEFAULT); ASSERT3P(cookie, ==, NULL); mp->m_owner = MTX_INIT; mp->m_magic = MTX_MAGIC; VERIFY3S(pthread_mutex_init(&mp->m_lock, NULL), ==, 0); } void mutex_destroy(kmutex_t *mp) { ASSERT3U(mp->m_magic, ==, MTX_MAGIC); ASSERT3P(mp->m_owner, ==, MTX_INIT); ASSERT0(pthread_mutex_destroy(&(mp)->m_lock)); mp->m_owner = MTX_DEST; mp->m_magic = 0; } void mutex_enter(kmutex_t *mp) { ASSERT3U(mp->m_magic, ==, MTX_MAGIC); ASSERT3P(mp->m_owner, !=, MTX_DEST); ASSERT3P(mp->m_owner, !=, curthread); VERIFY3S(pthread_mutex_lock(&mp->m_lock), ==, 0); ASSERT3P(mp->m_owner, ==, MTX_INIT); mp->m_owner = curthread; } int mutex_tryenter(kmutex_t *mp) { int err; ASSERT3U(mp->m_magic, ==, MTX_MAGIC); ASSERT3P(mp->m_owner, !=, MTX_DEST); if (0 == (err = pthread_mutex_trylock(&mp->m_lock))) { ASSERT3P(mp->m_owner, ==, MTX_INIT); mp->m_owner = curthread; return (1); } else { VERIFY3S(err, ==, EBUSY); return (0); } } void mutex_exit(kmutex_t *mp) { ASSERT3U(mp->m_magic, ==, MTX_MAGIC); ASSERT3P(mutex_owner(mp), ==, curthread); mp->m_owner = MTX_INIT; VERIFY3S(pthread_mutex_unlock(&mp->m_lock), ==, 0); } void * mutex_owner(kmutex_t *mp) { ASSERT3U(mp->m_magic, ==, MTX_MAGIC); return (mp->m_owner); } int mutex_held(kmutex_t *mp) { return (mp->m_owner == curthread); } /* * ========================================================================= * rwlocks * ========================================================================= */ void rw_init(krwlock_t *rwlp, char *name, int type, void *arg) { ASSERT3S(type, ==, RW_DEFAULT); ASSERT3P(arg, ==, NULL); VERIFY3S(pthread_rwlock_init(&rwlp->rw_lock, NULL), ==, 0); rwlp->rw_owner = RW_INIT; rwlp->rw_wr_owner = 
RW_INIT; rwlp->rw_readers = 0; rwlp->rw_magic = RW_MAGIC; } void rw_destroy(krwlock_t *rwlp) { ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC); ASSERT(rwlp->rw_readers == 0 && rwlp->rw_wr_owner == RW_INIT); VERIFY3S(pthread_rwlock_destroy(&rwlp->rw_lock), ==, 0); rwlp->rw_magic = 0; } void rw_enter(krwlock_t *rwlp, krw_t rw) { ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC); ASSERT3P(rwlp->rw_owner, !=, curthread); ASSERT3P(rwlp->rw_wr_owner, !=, curthread); if (rw == RW_READER) { VERIFY3S(pthread_rwlock_rdlock(&rwlp->rw_lock), ==, 0); ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT); atomic_inc_uint(&rwlp->rw_readers); } else { VERIFY3S(pthread_rwlock_wrlock(&rwlp->rw_lock), ==, 0); ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT); ASSERT3U(rwlp->rw_readers, ==, 0); rwlp->rw_wr_owner = curthread; } rwlp->rw_owner = curthread; } void rw_exit(krwlock_t *rwlp) { ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC); ASSERT(RW_LOCK_HELD(rwlp)); if (RW_READ_HELD(rwlp)) atomic_dec_uint(&rwlp->rw_readers); else rwlp->rw_wr_owner = RW_INIT; rwlp->rw_owner = RW_INIT; VERIFY3S(pthread_rwlock_unlock(&rwlp->rw_lock), ==, 0); } int rw_tryenter(krwlock_t *rwlp, krw_t rw) { int rv; ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC); if (rw == RW_READER) rv = pthread_rwlock_tryrdlock(&rwlp->rw_lock); else rv = pthread_rwlock_trywrlock(&rwlp->rw_lock); if (rv == 0) { ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT); if (rw == RW_READER) atomic_inc_uint(&rwlp->rw_readers); else { ASSERT3U(rwlp->rw_readers, ==, 0); rwlp->rw_wr_owner = curthread; } rwlp->rw_owner = curthread; return (1); } VERIFY3S(rv, ==, EBUSY); return (0); } int rw_tryupgrade(krwlock_t *rwlp) { ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC); return (0); } /* * ========================================================================= * condition variables * ========================================================================= */ void cv_init(kcondvar_t *cv, char *name, int type, void *arg) { ASSERT3S(type, ==, CV_DEFAULT); cv->cv_magic = CV_MAGIC; VERIFY0(pthread_cond_init(&cv->cv, NULL)); } void cv_destroy(kcondvar_t *cv) { ASSERT3U(cv->cv_magic, ==, CV_MAGIC); VERIFY0(pthread_cond_destroy(&cv->cv)); cv->cv_magic = 0; } void cv_wait(kcondvar_t *cv, kmutex_t *mp) { ASSERT3U(cv->cv_magic, ==, CV_MAGIC); ASSERT3P(mutex_owner(mp), ==, curthread); mp->m_owner = MTX_INIT; VERIFY0(pthread_cond_wait(&cv->cv, &mp->m_lock)); mp->m_owner = curthread; } clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) { int error; struct timeval tv; timestruc_t ts; clock_t delta; ASSERT3U(cv->cv_magic, ==, CV_MAGIC); delta = abstime - ddi_get_lbolt(); if (delta <= 0) return (-1); VERIFY(gettimeofday(&tv, NULL) == 0); ts.tv_sec = tv.tv_sec + delta / hz; ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz); if (ts.tv_nsec >= NANOSEC) { ts.tv_sec++; ts.tv_nsec -= NANOSEC; } ASSERT3P(mutex_owner(mp), ==, curthread); mp->m_owner = MTX_INIT; error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts); mp->m_owner = curthread; if (error == ETIMEDOUT) return (-1); VERIFY0(error); return (1); } /*ARGSUSED*/ clock_t cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { int error; timestruc_t ts; hrtime_t delta; ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE); delta = tim; if (flag & CALLOUT_FLAG_ABSOLUTE) delta -= gethrtime(); if (delta <= 0) return (-1); ts.tv_sec = delta / NANOSEC; ts.tv_nsec = delta % NANOSEC; ASSERT(mutex_owner(mp) == curthread); mp->m_owner = NULL; error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts); mp->m_owner = curthread; if (error == ETIMEDOUT) return 
(-1); VERIFY0(error); return (1); } void cv_signal(kcondvar_t *cv) { ASSERT3U(cv->cv_magic, ==, CV_MAGIC); VERIFY0(pthread_cond_signal(&cv->cv)); } void cv_broadcast(kcondvar_t *cv) { ASSERT3U(cv->cv_magic, ==, CV_MAGIC); VERIFY0(pthread_cond_broadcast(&cv->cv)); } /* * ========================================================================= * vnode operations * ========================================================================= */ /* * Note: for the xxxat() versions of these functions, we assume that the * starting vp is always rootdir (which is true for spa_directory.c, the only * ZFS consumer of these interfaces). We assert this is true, and then emulate * them by adding '/' in front of the path. */ /*ARGSUSED*/ int vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) { int fd; int dump_fd; vnode_t *vp; int old_umask = 0; char *realpath; struct stat64 st; int err; realpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); /* * If we're accessing a real disk from userland, we need to use * the character interface to avoid caching. This is particularly * important if we're trying to look at a real in-kernel storage * pool from userland, e.g. via zdb, because otherwise we won't * see the changes occurring under the segmap cache. * On the other hand, the stupid character device returns zero * for its size. So -- gag -- we open the block device to get * its size, and remember it for subsequent VOP_GETATTR(). */ #if defined(__sun__) || defined(__sun) if (strncmp(path, "/dev/", 5) == 0) { #else if (0) { #endif char *dsk; fd = open64(path, O_RDONLY); if (fd == -1) { err = errno; free(realpath); return (err); } if (fstat64(fd, &st) == -1) { err = errno; close(fd); free(realpath); return (err); } close(fd); (void) sprintf(realpath, "%s", path); dsk = strstr(path, "/dsk/"); if (dsk != NULL) (void) sprintf(realpath + (dsk - path) + 1, "r%s", dsk + 1); } else { (void) sprintf(realpath, "%s", path); if (!(flags & FCREAT) && stat64(realpath, &st) == -1) { err = errno; free(realpath); return (err); } } if (!(flags & FCREAT) && S_ISBLK(st.st_mode)) { #ifdef __linux__ flags |= O_DIRECT; #endif + /* We shouldn't be writing to block devices in userspace */ + VERIFY(!(flags & FWRITE)); } if (flags & FCREAT) old_umask = umask(0); /* * The construct 'flags - FREAD' conveniently maps combinations of * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. 
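 * (With the usual values FREAD == 0x01, FWRITE == 0x02, O_RDONLY == 0,
 * O_WRONLY == 1 and O_RDWR == 2, the arithmetic works out to:
 *	FREAD            - FREAD == 0 == O_RDONLY
 *	FWRITE           - FREAD == 1 == O_WRONLY
 *	(FREAD | FWRITE) - FREAD == 2 == O_RDWR.)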
*/ fd = open64(realpath, flags - FREAD, mode); err = errno; if (flags & FCREAT) (void) umask(old_umask); if (vn_dumpdir != NULL) { char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); (void) snprintf(dumppath, MAXPATHLEN, "%s/%s", vn_dumpdir, basename(realpath)); dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); umem_free(dumppath, MAXPATHLEN); if (dump_fd == -1) { err = errno; free(realpath); close(fd); return (err); } } else { dump_fd = -1; } free(realpath); if (fd == -1) return (err); if (fstat64_blk(fd, &st) == -1) { err = errno; close(fd); return (err); } (void) fcntl(fd, F_SETFD, FD_CLOEXEC); *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); vp->v_fd = fd; vp->v_size = st.st_size; vp->v_path = spa_strdup(path); vp->v_dump_fd = dump_fd; return (0); } /*ARGSUSED*/ int vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3, vnode_t *startvp, int fd) { char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL); int ret; ASSERT(startvp == rootdir); (void) sprintf(realpath, "/%s", path); /* fd ignored for now, need if want to simulate nbmand support */ ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3); umem_free(realpath, strlen(path) + 2); return (ret); } /*ARGSUSED*/ int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) { ssize_t rc, done = 0, split; if (uio == UIO_READ) { rc = pread64(vp->v_fd, addr, len, offset); if (vp->v_dump_fd != -1) { int status; status = pwrite64(vp->v_dump_fd, addr, rc, offset); ASSERT(status != -1); } } else { /* * To simulate partial disk writes, we split writes into two * system calls so that the process can be killed in between. */ int sectors = len >> SPA_MINBLOCKSHIFT; split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT; rc = pwrite64(vp->v_fd, addr, split, offset); if (rc != -1) { done = rc; rc = pwrite64(vp->v_fd, (char *)addr + split, len - split, offset + split); } } #ifdef __linux__ if (rc == -1 && errno == EINVAL) { /* * Under Linux, this most likely means an alignment issue * (memory or disk) due to O_DIRECT, so we abort() in order to * catch the offender. */ abort(); } #endif if (rc == -1) return (errno); done += rc; if (residp) *residp = len - done; else if (done != len) return (EIO); return (0); } void vn_close(vnode_t *vp) { close(vp->v_fd); if (vp->v_dump_fd != -1) close(vp->v_dump_fd); spa_strfree(vp->v_path); umem_free(vp, sizeof (vnode_t)); } /* * At a minimum we need to update the size since vdev_reopen() * will no longer call vn_openat(). */ int fop_getattr(vnode_t *vp, vattr_t *vap) { struct stat64 st; int err; if (fstat64_blk(vp->v_fd, &st) == -1) { err = errno; close(vp->v_fd); return (err); } vap->va_size = st.st_size; return (0); } /* * ========================================================================= * Figure out which debugging statements to print * ========================================================================= */ static char *dprintf_string; static int dprintf_print_all; int dprintf_find_string(const char *string) { char *tmp_str = dprintf_string; int len = strlen(string); /* * Find out if this is a string we want to print. 
* String format: file1.c,function_name1,file2.c,file3.c */ while (tmp_str != NULL) { if (strncmp(tmp_str, string, len) == 0 && (tmp_str[len] == ',' || tmp_str[len] == '\0')) return (1); tmp_str = strchr(tmp_str, ','); if (tmp_str != NULL) tmp_str++; /* Get rid of , */ } return (0); } void dprintf_setup(int *argc, char **argv) { int i, j; /* * Debugging can be specified two ways: by setting the * environment variable ZFS_DEBUG, or by including a * "debug=..." argument on the command line. The command * line setting overrides the environment variable. */ for (i = 1; i < *argc; i++) { int len = strlen("debug="); /* First look for a command line argument */ if (strncmp("debug=", argv[i], len) == 0) { dprintf_string = argv[i] + len; /* Remove from args */ for (j = i; j < *argc; j++) argv[j] = argv[j+1]; argv[j] = NULL; (*argc)--; } } if (dprintf_string == NULL) { /* Look for ZFS_DEBUG environment variable */ dprintf_string = getenv("ZFS_DEBUG"); } /* * Are we just turning on all debugging? */ if (dprintf_find_string("on")) dprintf_print_all = 1; if (dprintf_string != NULL) zfs_flags |= ZFS_DEBUG_DPRINTF; } /* * ========================================================================= * debug printfs * ========================================================================= */ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; va_list adx; /* * Get rid of annoying "../common/" prefix to filename. */ newfile = strrchr(file, '/'); if (newfile != NULL) { newfile = newfile + 1; /* Get rid of leading / */ } else { newfile = file; } if (dprintf_print_all || dprintf_find_string(newfile) || dprintf_find_string(func)) { /* Print out just the function name if requested */ flockfile(stdout); if (dprintf_find_string("pid")) (void) printf("%d ", getpid()); if (dprintf_find_string("tid")) (void) printf("%u ", (uint_t) pthread_self()); if (dprintf_find_string("cpu")) (void) printf("%u ", getcpuid()); if (dprintf_find_string("time")) (void) printf("%llu ", gethrtime()); if (dprintf_find_string("long")) (void) printf("%s, line %d: ", newfile, line); (void) printf("%s: ", func); va_start(adx, fmt); (void) vprintf(fmt, adx); va_end(adx); funlockfile(stdout); } } /* * ========================================================================= * cmn_err() and panic() * ========================================================================= */ static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; void vpanic(const char *fmt, va_list adx) { (void) fprintf(stderr, "error: "); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "\n"); abort(); /* think of it as a "user-level crash dump" */ } void panic(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vpanic(fmt, adx); va_end(adx); } void vcmn_err(int ce, const char *fmt, va_list adx) { if (ce == CE_PANIC) vpanic(fmt, adx); if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ (void) fprintf(stderr, "%s", ce_prefix[ce]); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "%s", ce_suffix[ce]); } } /*PRINTFLIKE2*/ void cmn_err(int ce, const char *fmt, ...) 
{ va_list adx; va_start(adx, fmt); vcmn_err(ce, fmt, adx); va_end(adx); } /* * ========================================================================= * kobj interfaces * ========================================================================= */ struct _buf * kobj_open_file(char *name) { struct _buf *file; vnode_t *vp; /* set vp as the _fd field of the file */ if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, -1) != 0) return ((void *)-1UL); file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL); file->_fd = (intptr_t)vp; return (file); } int kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) { ssize_t resid; if (vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off, UIO_SYSSPACE, 0, 0, 0, &resid) != 0) return (-1); return (size - resid); } void kobj_close_file(struct _buf *file) { vn_close((vnode_t *)file->_fd); umem_free(file, sizeof (struct _buf)); } int kobj_get_filesize(struct _buf *file, uint64_t *size) { struct stat64 st; vnode_t *vp = (vnode_t *)file->_fd; if (fstat64(vp->v_fd, &st) == -1) { vn_close(vp); return (errno); } *size = st.st_size; return (0); } /* * ========================================================================= * misc routines * ========================================================================= */ void delay(clock_t ticks) { poll(0, 0, ticks * (1000 / hz)); } /* * Find highest one bit set. * Returns bit number + 1 of highest bit that is set, otherwise returns 0. * High order bit is 31 (or 63 in _LP64 kernel). */ int highbit64(uint64_t i) { register int h = 1; if (i == 0) return (0); if (i & 0xffffffff00000000ULL) { h += 32; i >>= 32; } if (i & 0xffff0000) { h += 16; i >>= 16; } if (i & 0xff00) { h += 8; i >>= 8; } if (i & 0xf0) { h += 4; i >>= 4; } if (i & 0xc) { h += 2; i >>= 2; } if (i & 0x2) { h += 1; } return (h); } /* * Find lowest one bit set. * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. * This is basically a reimplementation of ffsll(), which is GNU specific. 
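 * For example, lowbit64(0x18) returns 4: 0x18 is binary 11000, its lowest
 * set bit is bit 3, and the function reports bit number + 1; lowbit64(0)
 * returns 0.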
*/ int lowbit64(uint64_t i) { register int h = 64; if (i == 0) return (0); if (i & 0x00000000ffffffffULL) h -= 32; else i >>= 32; if (i & 0x0000ffff) h -= 16; else i >>= 16; if (i & 0x00ff) h -= 8; else i >>= 8; if (i & 0x0f) h -= 4; else i >>= 4; if (i & 0x3) h -= 2; else i >>= 2; if (i & 0x1) h -= 1; return (h); } static int random_fd = -1, urandom_fd = -1; static int random_get_bytes_common(uint8_t *ptr, size_t len, int fd) { size_t resid = len; ssize_t bytes; ASSERT(fd != -1); while (resid != 0) { bytes = read(fd, ptr, resid); ASSERT3S(bytes, >=, 0); ptr += bytes; resid -= bytes; } return (0); } int random_get_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, random_fd)); } int random_get_pseudo_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, urandom_fd)); } int ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result) { char *end; *result = strtoul(hw_serial, &end, base); if (*result == 0) return (errno); return (0); } int ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result) { char *end; *result = strtoull(str, &end, base); if (*result == 0) return (errno); return (0); } utsname_t * utsname(void) { return (&hw_utsname); } /* * ========================================================================= * kernel emulation setup & teardown * ========================================================================= */ static int umem_out_of_memory(void) { char errmsg[] = "out of memory -- generating core dump\n"; (void) fprintf(stderr, "%s", errmsg); abort(); return (0); } static unsigned long get_spl_hostid(void) { FILE *f; unsigned long hostid; f = fopen("/sys/module/spl/parameters/spl_hostid", "r"); if (!f) return (0); if (fscanf(f, "%lu", &hostid) != 1) hostid = 0; fclose(f); return (hostid & 0xffffffff); } unsigned long get_system_hostid(void) { unsigned long system_hostid = get_spl_hostid(); if (system_hostid == 0) system_hostid = gethostid() & 0xffffffff; return (system_hostid); } void kernel_init(int mode) { extern uint_t rrw_tsd_key; umem_nofail_callback(umem_out_of_memory); physmem = sysconf(_SC_PHYS_PAGES); dprintf("physmem = %llu pages (%.2f GB)\n", physmem, (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", (mode & FWRITE) ? 
get_system_hostid() : 0); VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1); VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1); VERIFY0(uname(&hw_utsname)); thread_init(); system_taskq_init(); spa_init(mode); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); } void kernel_fini(void) { spa_fini(); system_taskq_fini(); thread_fini(); close(random_fd); close(urandom_fd); random_fd = -1; urandom_fd = -1; } uid_t crgetuid(cred_t *cr) { return (0); } uid_t crgetruid(cred_t *cr) { return (0); } gid_t crgetgid(cred_t *cr) { return (0); } int crgetngroups(cred_t *cr) { return (0); } gid_t * crgetgroups(cred_t *cr) { return (NULL); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (0); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { return (0); } ksiddomain_t * ksid_lookupdomain(const char *dom) { ksiddomain_t *kd; kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); kd->kd_name = spa_strdup(dom); return (kd); } void ksiddomain_rele(ksiddomain_t *ksid) { spa_strfree(ksid->kd_name); umem_free(ksid, sizeof (ksiddomain_t)); } char * kmem_vasprintf(const char *fmt, va_list adx) { char *buf = NULL; va_list adx_copy; va_copy(adx_copy, adx); VERIFY(vasprintf(&buf, fmt, adx_copy) != -1); va_end(adx_copy); return (buf); } char * kmem_asprintf(const char *fmt, ...) { char *buf = NULL; va_list adx; va_start(adx, fmt); VERIFY(vasprintf(&buf, fmt, adx) != -1); va_end(adx); return (buf); } /* ARGSUSED */ int zfs_onexit_fd_hold(int fd, minor_t *minorp) { *minorp = 0; return (0); } /* ARGSUSED */ void zfs_onexit_fd_rele(int fd) { } /* ARGSUSED */ int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, uint64_t *action_handle) { return (0); } /* ARGSUSED */ int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) { return (0); } /* ARGSUSED */ int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) { return (0); } fstrans_cookie_t spl_fstrans_mark(void) { return ((fstrans_cookie_t) 0); } void spl_fstrans_unmark(fstrans_cookie_t cookie) { } int spl_fstrans_check(void) { return (0); } void zvol_create_minors(spa_t *spa, const char *name, boolean_t async) { } void zvol_remove_minor(spa_t *spa, const char *name, boolean_t async) { } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { } void zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, boolean_t async) { } diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c index c83b145687d1..bda9548293d0 100644 --- a/module/zfs/zfeature.c +++ b/module/zfs/zfeature.c @@ -1,551 +1,514 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 
*/ #include #include #include #include #include #include #include "zfeature_common.h" #include /* * ZFS Feature Flags * ----------------- * * ZFS feature flags are used to provide fine-grained versioning to the ZFS * on-disk format. Once enabled on a pool feature flags replace the old * spa_version() number. * * Each new on-disk format change will be given a uniquely identifying string * guid rather than a version number. This avoids the problem of different * organizations creating new on-disk formats with the same version number. To * keep feature guids unique they should consist of the reverse dns name of the * organization which implemented the feature and a short name for the feature, * separated by a colon (e.g. com.delphix:async_destroy). * * Reference Counts * ---------------- * * Within each pool features can be in one of three states: disabled, enabled, * or active. These states are differentiated by a reference count stored on * disk for each feature: * * 1) If there is no reference count stored on disk the feature is disabled. * 2) If the reference count is 0 a system administrator has enabled the * feature, but the feature has not been used yet, so no on-disk * format changes have been made. * 3) If the reference count is greater than 0 the feature is active. * The format changes required by the feature are currently on disk. * Note that if the feature's format changes are reversed the feature * may choose to set its reference count back to 0. * * Feature flags makes no differentiation between non-zero reference counts * for an active feature (e.g. a reference count of 1 means the same thing as a * reference count of 27834721), but feature implementations may choose to use * the reference count to store meaningful information. For example, a new RAID * implementation might set the reference count to the number of vdevs using * it. If all those disks are removed from the pool the feature goes back to * having a reference count of 0. * * It is the responsibility of the individual features to maintain a non-zero * reference count as long as the feature's format changes are present on disk. * * Dependencies * ------------ * * Each feature may depend on other features. The only effect of this * relationship is that when a feature is enabled all of its dependencies are * automatically enabled as well. Any future work to support disabling of * features would need to ensure that features cannot be disabled if other * enabled features depend on them. * * On-disk Format * -------------- * * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES * (5000). In order for this to work the pool is automatically upgraded to * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk * format changes will be in use. * * Information about features is stored in 3 ZAP objects in the pool's MOS. * These objects are linked to by the following names in the pool directory * object: * * 1) features_for_read: feature guid -> reference count * Features needed to open the pool for reading. * 2) features_for_write: feature guid -> reference count * Features needed to open the pool for writing. * 3) feature_descriptions: feature guid -> descriptive string * A human readable string. * * All enabled features appear in either features_for_read or * features_for_write, but not both. * * To open a pool in read-only mode only the features listed in * features_for_read need to be supported. 
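 * For example, a readonly-compatible feature such as com.delphix:async_destroy
 * is tracked in features_for_write, so a pool on which it is active can still
 * be imported read-only by software that does not implement it.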
* * To open the pool in read-write mode features in both features_for_read and * features_for_write need to be supported. * * Some features may be required to read the ZAP objects containing feature * information. To allow software to check for compatibility with these features * before the pool is opened their names must be stored in the label in a * new "features_for_read" entry (note that features that are only required * to write to a pool never need to be stored in the label since the * features_for_write ZAP object can be read before the pool is written to). * To save space in the label features must be explicitly marked as needing to * be written to the label. Also, reference counts are not stored in the label, * instead any feature whose reference count drops to 0 is removed from the * label. * * Adding New Features * ------------------- * * Features must be registered in zpool_feature_init() function in * zfeature_common.c using the zfeature_register() function. This function * has arguments to specify if the feature should be stored in the * features_for_read or features_for_write ZAP object and if it needs to be * written to the label when active. * * Once a feature is registered it will appear as a "feature@" * property which can be set by an administrator. Feature implementors should * use the spa_feature_is_enabled() and spa_feature_is_active() functions to * query the state of a feature and the spa_feature_incr() and * spa_feature_decr() functions to change an enabled feature's reference count. * Reference counts may only be updated in the syncing context. * * Features may not perform enable-time initialization. Instead, any such * initialization should occur when the feature is first used. This design * enforces that on-disk changes be made only when features are used. Code * should only check if a feature is enabled using spa_feature_is_enabled(), * not by relying on any feature specific metadata existing. If a feature is * enabled, but the feature's metadata is not on disk yet then it should be * created as needed. * * As an example, consider the com.delphix:async_destroy feature. This feature * relies on the existence of a bptree in the MOS that store blocks for * asynchronous freeing. This bptree is not created when async_destroy is * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is * called to check if async_destroy is enabled. If it is and the bptree object * does not exist yet, the bptree object is created as part of the dataset * destroy and async_destroy's reference count is incremented to indicate it * has made an on-disk format change. Later, after the destroyed dataset's * blocks have all been asynchronously freed there is no longer any use for the * bptree object, so it is destroyed and async_destroy's reference count is * decremented back to 0 to indicate that it has undone its on-disk format * changes. */ typedef enum { FEATURE_ACTION_INCR, FEATURE_ACTION_DECR, } feature_action_t; /* * Checks that the active features in the pool are supported by * this software. Adds each unsupported feature (name -> description) to * the supplied nvlist. */ boolean_t spa_features_check(spa_t *spa, boolean_t for_write, nvlist_t *unsup_feat, nvlist_t *enabled_feat) { objset_t *os = spa->spa_meta_objset; boolean_t supported; zap_cursor_t *zc; zap_attribute_t *za; uint64_t obj = for_write ? 
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; char *buf; zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); supported = B_TRUE; for (zap_cursor_init(zc, os, obj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { ASSERT(za->za_integer_length == sizeof (uint64_t) && za->za_num_integers == 1); if (NULL != enabled_feat) { fnvlist_add_uint64(enabled_feat, za->za_name, za->za_first_integer); } if (za->za_first_integer != 0 && !zfeature_is_supported(za->za_name)) { supported = B_FALSE; if (NULL != unsup_feat) { char *desc = ""; if (zap_lookup(os, spa->spa_feat_desc_obj, za->za_name, 1, MAXPATHLEN, buf) == 0) desc = buf; VERIFY(nvlist_add_string(unsup_feat, za->za_name, desc) == 0); } } } zap_cursor_fini(zc); kmem_free(buf, MAXPATHLEN); kmem_free(za, sizeof (zap_attribute_t)); kmem_free(zc, sizeof (zap_cursor_t)); return (supported); } /* * Use an in-memory cache of feature refcounts for quick retrieval. * * Note: well-designed features will not need to use this; they should * use spa_feature_is_enabled() and spa_feature_is_active() instead. * However, this is non-static for zdb, zhack, and spa_add_feature_stats(). */ int feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) { ASSERT(VALID_FEATURE_FID(feature->fi_feature)); if (spa->spa_feat_refcount_cache[feature->fi_feature] == SPA_FEATURE_DISABLED) { return (SET_ERROR(ENOTSUP)); } *res = spa->spa_feat_refcount_cache[feature->fi_feature]; return (0); } /* * Note: well-designed features will not need to use this; they should * use spa_feature_is_enabled() and spa_feature_is_active() instead. * However, this is non-static for zdb and zhack. */ int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, uint64_t *res) { int err; uint64_t refcount; uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; /* * If the pool is currently being created, the feature objects may not * have been allocated yet. Act as though all features are disabled. */ if (zapobj == 0) return (SET_ERROR(ENOTSUP)); err = zap_lookup(spa->spa_meta_objset, zapobj, feature->fi_guid, sizeof (uint64_t), 1, &refcount); if (err != 0) { if (err == ENOENT) return (SET_ERROR(ENOTSUP)); else return (err); } *res = refcount; return (0); } static int feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) { ASSERTV(uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj); ASSERT(zfeature_depends_on(feature->fi_feature, SPA_FEATURE_ENABLED_TXG)); if (!spa_feature_is_enabled(spa, feature->fi_feature)) { return (SET_ERROR(ENOTSUP)); } ASSERT(enabled_txg_obj != 0); VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj, feature->fi_guid, sizeof (uint64_t), 1, res)); return (0); } /* * This function is non-static for zhack; it should otherwise not be used * outside this file. */ void feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, dmu_tx_t *tx) { uint64_t zapobj; ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, sizeof (uint64_t), 1, &refcount, tx)); /* * feature_sync is called directly from zhack, allowing the * creation of arbitrary features whose fi_feature field may * be greater than SPA_FEATURES. 
When called from zhack, the * zfeature_info_t object's fi_feature field will be set to * SPA_FEATURE_NONE. */ if (feature->fi_feature != SPA_FEATURE_NONE) { uint64_t *refcount_cache = &spa->spa_feat_refcount_cache[feature->fi_feature]; VERIFY3U(*refcount_cache, ==, atomic_swap_64(refcount_cache, refcount)); } if (refcount == 0) spa_deactivate_mos_feature(spa, feature->fi_guid); else if (feature->fi_flags & ZFEATURE_FLAG_MOS) spa_activate_mos_feature(spa, feature->fi_guid, tx); } /* * This function is non-static for zhack; it should otherwise not be used * outside this file. */ void feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) { uint64_t initial_refcount = (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0; uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; int i; ASSERT(0 != zapobj); ASSERT(zfeature_is_valid_guid(feature->fi_guid)); ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); /* * If the feature is already enabled, ignore the request. */ if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0) return; for (i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) spa_feature_enable(spa, feature->fi_depends[i], tx); VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj, feature->fi_guid, 1, strlen(feature->fi_desc) + 1, feature->fi_desc, tx)); feature_sync(spa, feature, initial_refcount, tx); if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { uint64_t enabling_txg = dmu_tx_get_txg(tx); if (spa->spa_feat_enabled_txg_obj == 0ULL) { spa->spa_feat_enabled_txg_obj = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FEATURE_ENABLED_TXG, tx); } spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx); VERIFY0(zap_add(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj, feature->fi_guid, sizeof (uint64_t), 1, &enabling_txg, tx)); } } -/* - * This function is non-static for zhack; it should otherwise not be used - * outside this file. - */ -void -feature_disable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) -{ - uint64_t descobj = spa->spa_feat_desc_obj; - uint64_t zapobj = feature->fi_can_readonly ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; - - ASSERT(0 != zapobj); - ASSERT(zfeature_is_valid_guid(feature->fi_guid)); - ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); - - if (zap_contains(spa->spa_meta_objset, descobj, feature->fi_guid) == 0) - VERIFY0(zap_remove(spa->spa_meta_objset, descobj, - feature->fi_guid, tx)); - - if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0) - VERIFY0(zap_remove(spa->spa_meta_objset, zapobj, - feature->fi_guid, tx)); - - spa_deactivate_mos_feature(spa, feature->fi_guid); - - if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { - uint64_t txgobj = spa->spa_feat_enabled_txg_obj; - - if (txgobj && (zap_contains(spa->spa_meta_objset, - txgobj, feature->fi_guid) == 0)) { - spa_feature_decr(spa, SPA_FEATURE_ENABLED_TXG, tx); - VERIFY0(zap_remove(spa->spa_meta_objset, txgobj, - feature->fi_guid, tx)); - } - } -} - static void feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, dmu_tx_t *tx) { uint64_t refcount = 0; zfeature_info_t *feature = &spa_feature_table[fid]; ASSERTV(uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj); ASSERT(VALID_FEATURE_FID(fid)); ASSERT(0 != zapobj); ASSERT(zfeature_is_valid_guid(feature->fi_guid)); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); switch (action) { case FEATURE_ACTION_INCR: VERIFY3U(refcount, !=, UINT64_MAX); refcount++; break; case FEATURE_ACTION_DECR: VERIFY3U(refcount, !=, 0); refcount--; break; default: ASSERT(0); break; } feature_sync(spa, feature, refcount, tx); } void spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) { /* * We create feature flags ZAP objects in two instances: during pool * creation and during pool upgrade. */ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on && tx->tx_txg == TXG_INITIAL)); spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FEATURES_FOR_READ, tx); spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FEATURES_FOR_WRITE, tx); spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FEATURE_DESCRIPTIONS, tx); } /* * Enable any required dependencies, then enable the requested feature. */ void spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) { ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); ASSERT(VALID_FEATURE_FID(fid)); feature_enable_sync(spa, &spa_feature_table[fid], tx); } void spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) { feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx); } void spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) { feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx); } boolean_t spa_feature_is_enabled(spa_t *spa, spa_feature_t fid) { int err; uint64_t refcount = 0; ASSERT(VALID_FEATURE_FID(fid)); if (spa_version(spa) < SPA_VERSION_FEATURES) return (B_FALSE); err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); ASSERT(err == 0 || err == ENOTSUP); return (err == 0); } boolean_t spa_feature_is_active(spa_t *spa, spa_feature_t fid) { int err; uint64_t refcount = 0; ASSERT(VALID_FEATURE_FID(fid)); if (spa_version(spa) < SPA_VERSION_FEATURES) return (B_FALSE); err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); ASSERT(err == 0 || err == ENOTSUP); return (err == 0 && refcount > 0); } /* * For the feature specified by fid (which must depend on * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the * OUT txg argument. * * Returns B_TRUE if the feature is enabled, in which case txg will be filled * with the transaction group in which the specified feature was enabled. * Returns B_FALSE otherwise (i.e. if the feature is not enabled). */ boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) { int err; ASSERT(VALID_FEATURE_FID(fid)); if (spa_version(spa) < SPA_VERSION_FEATURES) return (B_FALSE); err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg); ASSERT(err == 0 || err == ENOTSUP); return (err == 0); }
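Illustrative addendum, not part of the patch: a minimal sketch of the enable/incr/decr consumer pattern described in the zfeature.c block comment, using only the interfaces declared in the include/sys/zfeature.h hunk above. The function names are hypothetical, and SPA_FEATURE_ASYNC_DESTROY is assumed to be the feature id registered for com.delphix:async_destroy.

#include <sys/spa.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>

/*
 * Hypothetical syncing-context hook that makes the feature's first
 * on-disk use.
 */
static void
example_feature_first_use(spa_t *spa, dmu_tx_t *tx)
{
	/* Check enablement only; never rely on feature metadata existing. */
	if (!spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY))
		return;

	/*
	 * ... create the feature's on-disk metadata here, then record the
	 * on-disk format change by bumping the refcount (syncing context).
	 */
	spa_feature_incr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
}

/*
 * Hypothetical syncing-context hook that undoes the on-disk change so the
 * feature can drop back to a refcount of 0 (enabled but not active).
 */
static void
example_feature_last_use(spa_t *spa, dmu_tx_t *tx)
{
	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY))
		spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
}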