diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index 29252e6a24f4..e89eb3bea770 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -1,10564 +1,10566 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. * Copyright 2016 Igor Kozhukhov . * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K * Copyright (c) 2021, Colm Buckley */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" #include "statcommon.h" libzfs_handle_t *g_zfs; static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); static int zpool_do_add(int, char **); static int zpool_do_remove(int, char **); static int zpool_do_labelclear(int, char **); static int zpool_do_checkpoint(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); static int zpool_do_status(int, char **); static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); static int zpool_do_reopen(int, char **); static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); static int zpool_do_initialize(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_resilver(int, char **); static int zpool_do_trim(int, char **); static int zpool_do_import(int, char **); static int zpool_do_export(int, char **); static int zpool_do_upgrade(int, char **); static int zpool_do_history(int, char **); static int zpool_do_events(int, char **); static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); static int zpool_do_sync(int, char **); static int zpool_do_version(int, char **); static int zpool_do_wait(int, char **); static zpool_compat_status_t zpool_do_load_compat( const char *, boolean_t *); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_ADD, HELP_ATTACH, HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, HELP_HISTORY, HELP_IMPORT, HELP_IOSTAT, HELP_LABELCLEAR, HELP_LIST, HELP_OFFLINE, HELP_ONLINE, HELP_REPLACE, HELP_REMOVE, HELP_INITIALIZE, HELP_SCRUB, HELP_RESILVER, HELP_TRIM, HELP_STATUS, HELP_UPGRADE, HELP_EVENTS, HELP_GET, HELP_SET, HELP_SPLIT, HELP_SYNC, HELP_REGUID, HELP_REOPEN, HELP_VERSION, HELP_WAIT } zpool_help_t; /* * Flags for stats to display with "zpool iostats" */ enum iostat_type { IOS_DEFAULT = 0, IOS_LATENCY = 1, IOS_QUEUES = 2, IOS_L_HISTO = 3, IOS_RQ_HISTO = 4, IOS_COUNT, /* always last element */ }; /* iostat_type entries as bitmasks */ #define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) #define IOS_LATENCY_M (1ULL << IOS_LATENCY) #define IOS_QUEUES_M (1ULL << IOS_QUEUES) #define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) #define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) /* Mask of all the histo bits */ #define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) /* * Lookup table for iostat flags to nvlist names. Basically a list * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. */ static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, NULL}, [IOS_LATENCY] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, NULL}, [IOS_QUEUES] = { ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, NULL}, [IOS_RQ_HISTO] = { ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, NULL}, }; /* * Given a cb->cb_flags with a histogram bit set, return the iostat_type. * Right now, only one histo bit is ever set at one time, so we can * just do a highbit64(a) */ #define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) typedef struct zpool_command { const char *name; int (*func)(int, char **); zpool_help_t usage; } zpool_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. */ static zpool_command_t command_table[] = { { "version", zpool_do_version, HELP_VERSION }, { NULL }, { "create", zpool_do_create, HELP_CREATE }, { "destroy", zpool_do_destroy, HELP_DESTROY }, { NULL }, { "add", zpool_do_add, HELP_ADD }, { "remove", zpool_do_remove, HELP_REMOVE }, { NULL }, { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, { NULL }, { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, { "status", zpool_do_status, HELP_STATUS }, { NULL }, { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "initialize", zpool_do_initialize, HELP_INITIALIZE }, { "resilver", zpool_do_resilver, HELP_RESILVER }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { "trim", zpool_do_trim, HELP_TRIM }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "events", zpool_do_events, HELP_EVENTS }, { NULL }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, { "sync", zpool_do_sync, HELP_SYNC }, { NULL }, { "wait", zpool_do_wait, HELP_WAIT }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) #define VDEV_ALLOC_CLASS_LOGS "logs" static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: return (gettext("\tadd [-fgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_CHECKPOINT: return (gettext("\tcheckpoint [-d [-w]] ...\n")); case HELP_DESTROY: return (gettext("\tdestroy [-f] \n")); case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: return (gettext("\texport [-af] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]]\n" "\t [--rewind-to-checkpoint] [newpool]\n")); case HELP_IOSTAT: return (gettext("\tiostat [[[-c [script1,script2,...]" "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n" "\t [[pool ...]|[pool vdev ...]|[vdev ...]]" " [[-n] interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: return (gettext("\tlist [-gHLpPv] [-o property[,...]] " "[-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [-f] [-t] ...\n")); case HELP_ONLINE: return (gettext("\tonline [-e] ...\n")); case HELP_REPLACE: return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); case HELP_REMOVE: return (gettext("\tremove [-npsw] ...\n")); case HELP_REOPEN: return (gettext("\treopen [-n] \n")); case HELP_INITIALIZE: return (gettext("\tinitialize [-c | -s] [-w] " "[ ...]\n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] [-w] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_TRIM: return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [-c [script1,script2,...]] " "[-igLpPstvxD] [-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" "\tupgrade [-V version] <-a | pool ...>\n")); case HELP_EVENTS: return (gettext("\tevents [-vHf [pool] | -c]\n")); case HELP_GET: return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " "<\"all\" | property[,...]> ...\n")); case HELP_SET: return (gettext("\tset \n")); case HELP_SPLIT: return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: return (gettext("\treguid \n")); case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); case HELP_VERSION: return (gettext("\tversion\n")); case HELP_WAIT: return (gettext("\twait [-Hp] [-T d|u] [-t [,...]] " " [interval]\n")); } abort(); /* NOTREACHED */ } static void zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) { uint_t children = 0; nvlist_t **child; uint_t i; (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children); if (children == 0) { char *path = zpool_vdev_name(g_zfs, zhp, nvroot, VDEV_NAME_PATH); if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && strcmp(path, VDEV_TYPE_HOLE) != 0) fnvlist_add_boolean(res, path); free(path); return; } for (i = 0; i < children; i++) { zpool_collect_leaves(zhp, child[i], res); } } /* * Callback routine that will print out a pool property value. */ static int print_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static void usage(boolean_t requested) { FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { int i; (void) fprintf(fp, gettext("usage: zpool command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && ((strcmp(current_command->name, "set") == 0) || (strcmp(current_command->name, "get") == 0) || (strcmp(current_command->name, "list") == 0))) { (void) fprintf(fp, gettext("\nthe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-19s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); (void) fprintf(fp, "\t%-19s ", "feature@..."); (void) fprintf(fp, "YES disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties must be " "appended with a feature name.\nSee zpool-features(5).\n")); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * zpool initialize [-c | -s] [-w] [ ...] * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool * if none specified. * * -c Cancel. Ends active initializing. * -s Suspend. Initializing can then be restarted with no flags. * -w Wait. Blocks until initializing has completed. */ int zpool_do_initialize(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; nvlist_t *vdevs; int err = 0; boolean_t wait = B_FALSE; struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; while ((c = getopt_long(argc, argv, "csw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_CANCEL; break; case 's': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_SUSPEND; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_INITIALIZE_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " "-s\n")); usage(B_FALSE); } poolname = argv[0]; zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); } else { for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } if (wait) err = zpool_initialize_wait(zhp, cmd_type, vdevs); else err = zpool_initialize(zhp, cmd_type, vdevs); fnvlist_free(vdevs); zpool_close(zhp); return (err); } /* * print a pool vdev config for dry runs */ static void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, const char *match, int name_flags) { nvlist_t **child; uint_t c, children; char *vname; boolean_t printed = B_FALSE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (name != NULL) (void) printf("\t%*s%s\n", indent, "", name); return; } for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; char *class = ""; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole == B_TRUE) { continue; } (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) class = VDEV_ALLOC_BIAS_LOG; (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &class); if (strcmp(match, class) != 0) continue; if (!printed && name != NULL) { (void) printf("\t%*s%s\n", indent, "", name); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); print_vdev_tree(zhp, vname, child[c], indent + 2, "", name_flags); free(vname); } } /* * Print the list of l2cache devices for dry runs. */ static void print_cache_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "cache"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } /* * Print the list of spares for dry runs. */ static void print_spare_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "spares"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } static boolean_t prop_list_contains_feature(nvlist_t *proplist) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; nvp = nvlist_next_nvpair(proplist, nvp)) { if (zpool_prop_feature(nvpair_name(nvp))) return (B_TRUE); } return (B_FALSE); } /* * Add a property pair (name, string-value) into a property nvlist. */ static int add_prop_list(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { zpool_prop_t prop = ZPOOL_PROP_INVAL; nvlist_t *proplist; const char *normnm; char *strval; if (*props == NULL && nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); return (1); } proplist = *props; if (poolprop) { const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); const char *fname = zpool_prop_to_name(ZPOOL_PROP_COMPATIBILITY); if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL && !zpool_prop_feature(propname)) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool property\n"), propname); return (2); } /* * feature@ properties and version should not be specified * at the same time. */ if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && prop_list_contains_feature(proplist))) { (void) fprintf(stderr, gettext("'feature@' and " "'version' properties cannot be specified " "together\n")); return (2); } /* * compatibility property and version should not be specified * at the same time. */ if ((prop == ZPOOL_PROP_COMPATIBILITY && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && nvlist_exists(proplist, fname))) { (void) fprintf(stderr, gettext("'compatibility' and " "'version' properties cannot be specified " "together\n")); return (2); } if (zpool_prop_feature(propname)) normnm = propname; else normnm = zpool_prop_to_name(prop); } else { zfs_prop_t fsprop = zfs_name_to_prop(propname); if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM, B_FALSE)) { normnm = zfs_prop_to_name(fsprop); } else if (zfs_prop_user(propname) || zfs_prop_userquota(propname)) { normnm = propname; } else { (void) fprintf(stderr, gettext("property '%s' is " "not a valid filesystem property\n"), propname); return (2); } } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && prop != ZPOOL_PROP_CACHEFILE) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (2); } if (nvlist_add_string(proplist, normnm, propval) != 0) { (void) fprintf(stderr, gettext("internal " "error: out of memory\n")); return (1); } return (0); } /* * Set a default property pair (name, string-value) in a property nvlist */ static int add_prop_list_default(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { char *pval; if (nvlist_lookup_string(*props, propname, &pval) == 0) return (0); return (add_prop_list(propname, propval, props, B_TRUE)); } /* * zpool add [-fgLnP] [-o property=value] ... * * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not add the devices, but display the resulting layout if * they were to be added. * -o Set property=value. * -P Display full path for vdev name. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by make_root_vdev(), which constructs the nvlist needed to pass to * libzfs. */ int zpool_do_add(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; char *poolname; int ret; zpool_handle_t *zhp; nvlist_t *config; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, "fgLno:P")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'g': name_flags |= VDEV_NAME_GUID; break; case 'L': name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'n': dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 'P': name_flags |= VDEV_NAME_PATH; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); usage(B_FALSE); } poolname = argv[0]; argc--; argv++; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } /* pass off to make_root_vdev for processing */ nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); } if (dryrun) { nvlist_t *poolnvroot; nvlist_t **l2child, **sparechild; uint_t l2children, sparechildren, c; char *vname; boolean_t hadcache = B_FALSE, hadspare = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " "configuration:\n\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, "", name_flags | VDEV_NAME_TYPE_ID); print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); /* print other classes: 'dedup', 'special', and 'log' */ if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", poolnvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", poolnvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } if (num_logs(poolnvroot) > 0) { print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } else if (num_logs(nvroot) > 0) { print_vdev_tree(zhp, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } /* Do the same for the caches */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { hadcache = B_TRUE; (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { if (!hadcache) (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } /* And finaly the spares */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { hadspare = B_TRUE; (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { if (!hadspare) (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } ret = 0; } else { ret = (zpool_add(zhp, nvroot) != 0); } nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool remove [-npsw] ... * * Removes the given vdev from the pool. */ int zpool_do_remove(int argc, char **argv) { char *poolname; int i, ret = 0; zpool_handle_t *zhp = NULL; boolean_t stop = B_FALSE; int c; boolean_t noop = B_FALSE; boolean_t parsable = B_FALSE; boolean_t wait = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "npsw")) != -1) { switch (c) { case 'n': noop = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 's': stop = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if (stop && noop) { (void) fprintf(stderr, gettext("stop request ignored\n")); return (0); } if (stop) { if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (zpool_vdev_remove_cancel(zhp) != 0) ret = 1; if (wait) { (void) fprintf(stderr, gettext("invalid option " "combination: -w cannot be used with -s\n")); usage(B_FALSE); } } else { if (argc < 2) { (void) fprintf(stderr, gettext("missing device\n")); usage(B_FALSE); } for (i = 1; i < argc; i++) { if (noop) { uint64_t size; if (zpool_vdev_indirect_size(zhp, argv[i], &size) != 0) { ret = 1; break; } if (parsable) { (void) printf("%s %llu\n", argv[i], (unsigned long long)size); } else { char valstr[32]; zfs_nicenum(size, valstr, sizeof (valstr)); (void) printf("Memory that will be " "used after removing %s: %s\n", argv[i], valstr); } } else { if (zpool_vdev_remove(zhp, argv[i]) != 0) ret = 1; } } if (ret == 0 && wait) ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE); } zpool_close(zhp); return (ret); } /* * zpool labelclear [-f] * * -f Force clearing the label for the vdevs which are members of * the exported or foreign pools. * * Verifies that the vdev is not active and zeros out the label information * on the device. */ int zpool_do_labelclear(int argc, char **argv) { char vdev[MAXPATHLEN]; char *name = NULL; struct stat st; int c, fd = -1, ret = 0; nvlist_t *config; pool_state_t state; boolean_t inuse = B_FALSE; boolean_t force = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get vdev name */ if (argc < 1) { (void) fprintf(stderr, gettext("missing vdev name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * Check if we were given absolute path and use it as is. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. */ (void) strlcpy(vdev, argv[0], sizeof (vdev)); if (vdev[0] != '/' && stat(vdev, &st) != 0) { int error; error = zfs_resolve_shortname(argv[0], vdev, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(vdev)) { if (zfs_append_partition(vdev, MAXPATHLEN) == -1) error = ENOENT; } if (error || (stat(vdev, &st) != 0)) { (void) fprintf(stderr, gettext( "failed to find device %s, try specifying absolute " "path instead\n"), argv[0]); return (1); } } if ((fd = open(vdev, O_RDWR)) < 0) { (void) fprintf(stderr, gettext("failed to open %s: %s\n"), vdev, strerror(errno)); return (1); } /* * Flush all dirty pages for the block device. This should not be * fatal when the device does not support BLKFLSBUF as would be the * case for a file vdev. */ if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY)) (void) fprintf(stderr, gettext("failed to invalidate " "cache for %s: %s\n"), vdev, strerror(errno)); if (zpool_read_label(fd, &config, NULL) != 0) { (void) fprintf(stderr, gettext("failed to read label from %s\n"), vdev); ret = 1; goto errout; } nvlist_free(config); ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); if (ret != 0) { (void) fprintf(stderr, gettext("failed to check state for %s\n"), vdev); ret = 1; goto errout; } if (!inuse) goto wipe_label; switch (state) { default: case POOL_STATE_ACTIVE: case POOL_STATE_SPARE: case POOL_STATE_L2CACHE: (void) fprintf(stderr, gettext( "%s is a member (%s) of pool \"%s\"\n"), vdev, zpool_pool_state_to_name(state), name); ret = 1; goto errout; case POOL_STATE_EXPORTED: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of exported pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_POTENTIALLY_ACTIVE: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of potentially active pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_DESTROYED: /* inuse should never be set for a destroyed pool */ assert(0); break; } wipe_label: ret = zpool_clear_label(fd); if (ret != 0) { (void) fprintf(stderr, gettext("failed to clear label for %s\n"), vdev); } errout: free(name); (void) close(fd); return (ret); } /* * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] ... * * -f Force creation, even if devices appear in use * -n Do not create the pool, but display the resulting layout if it * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. By default it's * '/' * -o Set property=value. * -o Set feature@feature=enabled|disabled. * -d Don't automatically enable all supported pool features * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c. * Once we get the nvlist back from make_root_vdev(), we either print out the * contents (if '-n' was specified), or pass it to libzfs to do the creation. */ int zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t enable_pool_features = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; char *tname = NULL; int ret = 1; char *altroot = NULL; char *compat = NULL; char *mountpoint = NULL; nvlist_t *fsprops = NULL; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'd': enable_pool_features = B_FALSE; break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; break; case 'm': /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); goto errout; } *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; /* * If the user is creating a pool that doesn't support * feature flags, don't enable any features. */ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { char *end; u_longlong_t ver; ver = strtoull(propval, &end, 10); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_pool_features = B_FALSE; } } if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) altroot = propval; if (zpool_name_to_prop(optarg) == ZPOOL_PROP_COMPATIBILITY) compat = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -O option\n")); goto errout; } *propval = '\0'; propval++; /* * Mountpoints are checked and then added later. * Uniquely among properties, they can be specified * more than once, to avoid conflict with -m. */ if (0 == strcmp(optarg, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { mountpoint = propval; } else if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) { goto errout; } break; case 't': /* * Sanity check temporary pool name. */ if (strchr(optarg, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create " "'%s': invalid character '/' in temporary " "name\n"), optarg); (void) fprintf(stderr, gettext("use 'zfs " "create' to create a dataset\n")); goto errout; } if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; tname = optarg; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); goto badusage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); goto badusage; } poolname = argv[0]; /* * As a special case, check for use of '/' in the name, and direct the * user to use 'zfs create' instead. */ if (strchr(poolname, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create '%s': invalid " "character '/' in pool name\n"), poolname); (void) fprintf(stderr, gettext("use 'zfs create' to " "create a dataset\n")); goto errout; } /* pass off to make_root_vdev for bulk processing */ nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, argc - 1, argv + 1); if (nvroot == NULL) goto errout; /* make_root_vdev() allows 0 toplevel children if there are spares */ if (!zfs_allocatable_devs(nvroot)) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto errout; } if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); goto errout; } /* * Check the validity of the mountpoint and direct the user to use the * '-m' mountpoint option if it looks like its in use. */ if (mountpoint == NULL || (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; DIR *dirp; if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " "'%s': must be an absolute path, 'legacy', or " "'none'\n"), mountpoint); goto errout; } if (mountpoint == NULL) { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s/%s", altroot, poolname); else (void) snprintf(buf, sizeof (buf), "/%s", poolname); } else { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s%s", altroot, mountpoint); else (void) snprintf(buf, sizeof (buf), "%s", mountpoint); } if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); (void) fprintf(stderr, gettext("use '-m' " "option to provide a different default\n")); goto errout; } else if (dirp) { int count = 0; while (count < 3 && readdir(dirp) != NULL) count++; (void) closedir(dirp); if (count > 2) { (void) fprintf(stderr, gettext("mountpoint " "'%s' exists and is not empty\n"), buf); (void) fprintf(stderr, gettext("use '-m' " "option to provide a " "different default\n")); goto errout; } } } /* * Now that the mountpoint's validity has been checked, ensure that * the property is set appropriately prior to creating the pool. */ if (mountpoint != NULL) { ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), mountpoint, &fsprops, B_FALSE); if (ret != 0) goto errout; } ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run * through all the vdevs in the list and print out in an * appropriate hierarchy. */ (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); print_vdev_tree(NULL, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); print_vdev_tree(NULL, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, 0); print_cache_list(nvroot, 0); print_spare_list(nvroot, 0); ret = 0; } else { /* * Load in feature set. * Note: if compatibility property not given, we'll have * NULL, which means 'all features'. */ boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) goto errout; /* * props contains list of features to enable. * For each feature: * - remove it if feature@name=disabled * - leave it there if feature@name=enabled * - add it if: * - enable_pool_features (ie: no '-d' or '-o version') * - it's supported by the kernel module * - it's in the requested feature set */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { char propname[MAXPATHLEN]; char *propval; zfeature_info_t *feat = &spa_feature_table[i]; (void) snprintf(propname, sizeof (propname), "feature@%s", feat->fi_uname); if (!nvlist_lookup_string(props, propname, &propval)) { if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) (void) nvlist_remove_all(props, propname); } else if ( enable_pool_features && feat->fi_zfs_mod_supported && requested_features[i]) { ret = add_prop_list(propname, ZFS_FEATURE_ENABLED, &props, B_TRUE); if (ret != 0) goto errout; } } ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { if (zfs_mount(pool, NULL, 0) == 0) { ret = zfs_shareall(pool); zfs_commit_all_shares(); } zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { (void) fprintf(stderr, gettext("pool name may have " "been omitted\n")); } } errout: nvlist_free(nvroot); nvlist_free(fsprops); nvlist_free(props); return (ret); badusage: nvlist_free(fsprops); nvlist_free(props); usage(B_FALSE); return (2); } /* * zpool destroy * * -f Forcefully unmount any datasets * * Destroy the given pool. Automatically unmounts any datasets in the pool. */ int zpool_do_destroy(int argc, char **argv) { boolean_t force = B_FALSE; int c; char *pool; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { /* * As a special case, check for use of '/' in the name, and * direct the user to use 'zfs destroy' instead. */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("use 'zfs destroy' to " "destroy a dataset\n")); return (1); } if (zpool_disable_datasets(zhp, force) != 0) { (void) fprintf(stderr, gettext("could not destroy '%s': " "could not unmount datasets\n"), zpool_get_name(zhp)); zpool_close(zhp); return (1); } /* The history must be logged as part of the export */ log_history = B_FALSE; ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); return (ret); } typedef struct export_cbdata { boolean_t force; boolean_t hardforce; } export_cbdata_t; /* * Export one pool */ static int zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; if (zpool_disable_datasets(zhp, cb->force) != 0) return (1); /* The history must be logged as part of the export */ log_history = B_FALSE; if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) return (1); } else if (zpool_export(zhp, cb->force, history_str) != 0) { return (1); } return (0); } /* * zpool export [-f] ... * * -a Export all pools * -f Forcefully unmount datasets * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, * then the datasets will be forcefully unmounted. */ int zpool_do_export(int argc, char **argv) { export_cbdata_t cb; boolean_t do_all = B_FALSE; boolean_t force = B_FALSE; boolean_t hardforce = B_FALSE; int c, ret; /* check options */ while ((c = getopt(argc, argv, "afF")) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'f': force = B_TRUE; break; case 'F': hardforce = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.force = force; cb.hardforce = hardforce; argc -= optind; argv += optind; if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_export_one, &cb)); } /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_export_one, &cb); return (ret); } /* * Given a vdev configuration, determine the maximum width needed for the device * name column. */ static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, int name_flags) { char *name; nvlist_t **child; uint_t c, children; int ret; name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); if (strlen(name) + depth > max) max = strlen(name) + depth; free(name); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } return (max); } typedef struct spare_cbdata { uint64_t cb_guid; zpool_handle_t *cb_zhp; } spare_cbdata_t; static boolean_t find_vdev(nvlist_t *nv, uint64_t search) { uint64_t guid; nvlist_t **child; uint_t c, children; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && search == guid) return (B_TRUE); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev(child[c], search)) return (B_TRUE); } return (B_FALSE); } static int find_spare(zpool_handle_t *zhp, void *data) { spare_cbdata_t *cbp = data; nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (find_vdev(nvroot, cbp->cb_guid)) { cbp->cb_zhp = zhp; return (1); } zpool_close(zhp); return (0); } typedef struct status_cbdata { int cb_count; int cb_name_flags; int cb_namewidth; boolean_t cb_allpools; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */ static int is_blank_str(char *str) { while (str != NULL && *str != '\0') { if (!isblank(*str)) return (0); str++; } return (1); } /* Print command output lines for specific vdev in a specific pool */ static void zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path) { vdev_cmd_data_t *data; int i, j; char *val; for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) != 0) || (strcmp(vcdl->data[i].pool, pool) != 0)) { /* Not the vdev we're looking for */ continue; } data = &vcdl->data[i]; /* Print out all the output values for this vdev */ for (j = 0; j < vcdl->uniq_cols_cnt; j++) { val = NULL; /* Does this vdev have values for this column? */ for (int k = 0; k < data->cols_cnt; k++) { if (strcmp(data->cols[k], vcdl->uniq_cols[j]) == 0) { /* yes it does, record the value */ val = data->lines[k]; break; } } /* * Mark empty values with dashes to make output * awk-able. */ if (val == NULL || is_blank_str(val)) val = "-"; printf("%*s", vcdl->uniq_cols_width[j], val); if (j < vcdl->uniq_cols_cnt - 1) printf(" "); } /* Print out any values that aren't in a column at the end */ for (j = data->cols_cnt; j < data->lines_cnt; j++) { /* Did we have any columns? If so print a spacer. */ if (vcdl->uniq_cols_cnt > 0) printf(" "); val = data->lines[j]; printf("%s", val ? val : ""); } break; } } /* * Print vdev initialization status for leaves */ static void print_status_initialize(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_initialize_action_time; int initialize_pct = 100; if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) { initialize_pct = (vs->vs_initialize_bytes_done * 100 / (vs->vs_initialize_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_initialize_state) { case VDEV_INITIALIZE_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_INITIALIZE_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_INITIALIZE_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% initialized%s)"), initialize_pct, zbuf); } else { (void) printf(gettext(" (uninitialized)")); } } else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) printf(gettext(" (initializing)")); } } /* * Print vdev TRIM status for leaves */ static void print_status_trim(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_trim_state == VDEV_TRIM_ACTIVE || vs->vs_trim_state == VDEV_TRIM_SUSPENDED || vs->vs_trim_state == VDEV_TRIM_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_trim_action_time; int trim_pct = 100; if (vs->vs_trim_state != VDEV_TRIM_COMPLETE) { trim_pct = (vs->vs_trim_bytes_done * 100 / (vs->vs_trim_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_trim_state) { case VDEV_TRIM_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_TRIM_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_TRIM_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% trimmed%s)"), trim_pct, zbuf); } else if (vs->vs_trim_notsup) { (void) printf(gettext(" (trim unsupported)")); } else { (void) printf(gettext(" (untrimmed)")); } } else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE) { (void) printf(gettext(" (trimming)")); } } /* * Return the color associated with a health string. This includes returning * NULL for no color change. */ static char * health_str_to_color(const char *health) { if (strcmp(health, gettext("FAULTED")) == 0 || strcmp(health, gettext("SUSPENDED")) == 0 || strcmp(health, gettext("UNAVAIL")) == 0) { return (ANSI_RED); } if (strcmp(health, gettext("OFFLINE")) == 0 || strcmp(health, gettext("DEGRADED")) == 0 || strcmp(health, gettext("REMOVED")) == 0) { return (ANSI_YELLOW); } return (NULL); } /* * Print out configuration state as requested by status_callback. */ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) { nvlist_t **child, *root; uint_t c, i, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t spare_cb; const char *state; char *type; char *path = NULL; char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) return; state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for * online drives. */ if (vs->vs_aux == VDEV_AUX_SPARED) state = gettext("INUSE"); else if (vs->vs_state == VDEV_STATE_HEALTHY) state = gettext("AVAIL"); } printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); if (!isspare) { if (vs->vs_read_errors) rcolor = ANSI_RED; if (vs->vs_write_errors) wcolor = ANSI_RED; if (vs->vs_checksum_errors) ccolor = ANSI_RED; if (cb->cb_literal) { printf(" "); printf_color(rcolor, "%5llu", (u_longlong_t)vs->vs_read_errors); printf(" "); printf_color(wcolor, "%5llu", (u_longlong_t)vs->vs_write_errors); printf(" "); printf_color(ccolor, "%5llu", (u_longlong_t)vs->vs_checksum_errors); } else { zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); printf(" "); printf_color(rcolor, "%5s", rbuf); printf(" "); printf_color(wcolor, "%5s", wbuf); printf(" "); printf_color(ccolor, "%5s", cbuf); } if (cb->cb_print_slow_ios) { if (children == 0) { /* Only leafs vdevs have slow IOs */ zfs_nicenum(vs->vs_slow_ios, rbuf, sizeof (rbuf)); } else { snprintf(rbuf, sizeof (rbuf), "-"); } if (cb->cb_literal) printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); else printf(" %5s", rbuf); } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ¬present) == 0) { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); (void) printf(" %s %s", gettext("was"), path); } else if (vs->vs_aux != 0) { (void) printf(" "); color_start(ANSI_RED); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ASHIFT_TOO_BIG: (void) printf(gettext("unsupported minimum blocksize")); break; case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &spare_cb.cb_guid) == 0); if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { if (strcmp(zpool_get_name(spare_cb.cb_zhp), zpool_get_name(zhp)) == 0) (void) printf(gettext("currently in " "use")); else (void) printf(gettext("in use by " "pool '%s'"), zpool_get_name(spare_cb.cb_zhp)); zpool_close(spare_cb.cb_zhp); } else { (void) printf(gettext("currently in use")); } break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_IO_FAILURE: (void) printf(gettext("experienced I/O failures")); break; case VDEV_AUX_BAD_LOG: (void) printf(gettext("bad intent log")); break; case VDEV_AUX_EXTERNAL: (void) printf(gettext("external device fault")); break; case VDEV_AUX_SPLIT_POOL: (void) printf(gettext("split into new pool")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; default: (void) printf(gettext("corrupted data")); break; } color_end(); } else if (children == 0 && !isspare && getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") == NULL && VDEV_STAT_VALID(vs_physical_ashift, vsc) && vs->vs_configured_ashift < vs->vs_physical_ashift) { (void) printf( gettext(" block size: %dB configured, %dB native"), 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); } /* The root vdev has the scrub/resilver stats */ root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) { if (vs->vs_scan_processed != 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? "resilvering" : "repairing"); } else if (vs->vs_resilver_deferred) { (void) printf(gettext(" (awaiting resilver)")); } } /* The top-level vdevs have the rebuild stats */ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && children == 0) { if (vs->vs_rebuild_processed != 0) { (void) printf(gettext(" (resilvering)")); } } if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } /* Display vdev initialization and trim status for leaves. */ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); } (void) printf("\n"); for (c = 0; c < children; c++) { uint64_t islog = B_FALSE, ishole = B_FALSE; /* Don't print logs or holes here */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole); if (islog || ishole) continue; /* Only print normal classes here */ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; /* Provide vdev_rebuild_stats to children if available */ if (vrs == NULL) { (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_status_config(zhp, cb, vname, child[c], depth + 2, isspare, vrs); free(vname); } } /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ static void print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth) { nvlist_t **child; uint_t c, children; vdev_stat_t *vs; char *type, *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MISSING) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; default: (void) printf(gettext("corrupted data")); break; } } (void) printf("\n"); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_import_config(cb, vname, child[c], depth + 2); free(vname); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } } /* * Print specialized class vdevs. * * These are recorded as top level vdevs in the main pool child array * but with "is_log" set to 1 or an "alloc_bias" string. We use either * print_status_config() or print_import_config() to print the top level * class vdevs then any of their children (eg mirrored slogs) are printed * recursively - which works because only the top level vdev is marked. */ static void print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, const char *class) { uint_t c, children; nvlist_t **child; boolean_t printed = B_FALSE; assert(zhp != NULL || !cb->cb_verbose); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; char *bias = NULL; char *type = NULL; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class) != 0) continue; if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { (void) printf("\t%s\t\n", gettext(class)); printed = B_TRUE; } char *name = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, B_FALSE, NULL); else print_import_config(cb, name, child[c], 2); free(name); } } /* * Display the status for the given pool. */ static void show_import(nvlist_t *config) { uint64_t pool_state; vdev_stat_t *vs; char *name; uint64_t guid; uint64_t hostid = 0; char *msgid; char *hostname = "unknown"; nvlist_t *nvroot, *nvinfo; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t vsc; char *comment; status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); reason = zpool_import_status(config, &msgid, &errata); (void) printf(gettext(" pool: %s\n"), name); (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); (void) printf(gettext(" state: %s"), health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices contains" " corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf( gettext(" status: The pool data is corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "an incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported and " "requested features are not enabled on the pool.\n")); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Error reading or parsing " "the file(s) indicated by the 'compatibility'\n" "property.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool uses the following " "feature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_HOSTID_ACTIVE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is currently " "imported by another system.\n")); break; case ZPOOL_STATUS_HOSTID_REQUIRED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has the " "multihost property on. It cannot\n\tbe safely imported " "when the system hostid is not set.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool was last accessed " "by another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record cannot " "be read.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices were " "being resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); break; default: /* * No other status can be seen when importing pools. */ assert(reason == ZPOOL_STATUS_OK); } /* * Print out an action according to the overall state of the pool. */ if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); } else if (reason == ZPOOL_STATUS_COMPATIBILITY_ERR) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric\n\tidentifier, " "though the file(s) indicated by its " "'compatibility'\n\tproperty cannot be parsed at " "this time.\n")); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); } else if (reason == ZPOOL_STATUS_ERRATA) { switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: (void) printf(gettext(" action: The pool can " "be imported using its name or numeric " "identifier,\n\thowever there is a compat" "ibility issue which should be corrected" "\n\tby running 'zpool scrub'\n")); break; case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: (void) printf(gettext(" action: The pool can" "not be imported with this version of ZFS " "due to\n\tan active asynchronous destroy. " "Revert to an earlier version\n\tand " "allow the destroy to complete before " "updating.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted datasets contain an on-disk " "incompatibility, which\n\tneeds to be " "corrected. Backup these datasets to new " "encrypted datasets\n\tand destroy the " "old ones.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted snapshots and bookmarks contain " "an on-disk\n\tincompatibility. This may " "cause on-disk corruption if they are used" "\n\twith 'zfs recv'. To correct the " "issue, enable the bookmark_v2 feature.\n\t" "No additional action is needed if there " "are no encrypted snapshots or\n\t" "bookmarks. If preserving the encrypted " "snapshots and bookmarks is\n\trequired, " "use a non-raw send to backup and restore " "them. Alternately,\n\tthey may be removed" " to resolve the incompatibility.\n")); break; default: /* * All errata must contain an action message. */ assert(0); } } else { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. The\n\tfault " "tolerance of the pool may be compromised if imported.\n")); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" action: The pool cannot be " "imported. Access the pool on a system running " "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported. Access the pool on a system that " "supports\n\tthe required feature(s), or recreate " "the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported in read-write mode. Import the pool " "with\n" "\t\"-o readonly=on\", access the pool on a system " "that supports the\n\trequired feature(s), or " "recreate the pool from backup.\n")); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" action: The pool cannot be " "imported. Attach the missing\n\tdevices and try " "again.\n")); break; case ZPOOL_STATUS_HOSTID_ACTIVE: VERIFY0(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo)); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) printf(gettext(" action: The pool must be " "exported from %s (hostid=%lx)\n\tbefore it " "can be safely imported.\n"), hostname, (unsigned long) hostid); break; case ZPOOL_STATUS_HOSTID_REQUIRED: (void) printf(gettext(" action: Set a unique system " "hostid with the zgenhostid(8) command.\n")); break; default: (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); } } /* Print the comment attached to the pool. */ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) (void) printf(gettext("comment: %s\n"), comment); /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": */ if (((vs->vs_state == VDEV_STATE_CLOSED) || (vs->vs_state == VDEV_STATE_CANT_OPEN)) && (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext("\tThe pool was destroyed, " "but can be imported using the '-Df' flags.\n")); else if (pool_state != POOL_STATE_EXPORTED) (void) printf(gettext("\tThe pool may be active on " "another system, but can be imported using\n\t" "the '-f' flag.\n")); } if (msgid != NULL) { (void) printf(gettext( " see: https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); } (void) printf(gettext(" config:\n\n")); cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), VDEV_NAME_TYPE_ID); if (cb.cb_namewidth < 10) cb.cb_namewidth = 10; print_import_config(&cb, name, nvroot, 0); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " "be part of this pool, though their\n\texact " "configuration cannot be determined.\n")); } } static boolean_t zfs_force_import_required(nvlist_t *config) { uint64_t state; uint64_t hostid = 0; nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state != MMP_STATE_INACTIVE) return (B_TRUE); } return (B_FALSE); } /* * Perform the import for the given configuration. This passes the heavy * lifting off to zpool_import_props(), and then mounts the datasets contained * within the pool. */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, nvlist_t *props, int flags) { int ret = 0; zpool_handle_t *zhp; char *name; uint64_t version; name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " "is formatted using an unsupported ZFS version\n"), name); return (1); } else if (zfs_force_import_required(config) && !(flags & ZFS_IMPORT_ANY_HOST)) { mmp_state_t mmp_state = MMP_STATE_INACTIVE; nvlist_t *nvinfo; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state == MMP_STATE_ACTIVE) { char *hostname = ""; uint64_t hostid = 0; if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool is imported on %s (hostid: " "0x%lx)\nExport the pool on the other system, " "then run 'zpool import'.\n"), name, hostname, (unsigned long) hostid); } else if (mmp_state == MMP_STATE_NO_HOSTID) { (void) fprintf(stderr, gettext("Cannot import '%s': " "pool has the multihost property on and the\n" "system's hostid is not set. Set a unique hostid " "with the zgenhostid(8) command.\n"), name); } else { char *hostname = ""; uint64_t timestamp = 0; uint64_t hostid = 0; if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP)) timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool was previously in use from another system.\n" "Last accessed by %s (hostid=%lx) at %s" "The pool can be imported, use 'zpool import -f' " "to import the pool.\n"), name, hostname, (unsigned long)hostid, ctime((time_t *)×tamp)); } return (1); } if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) return (1); if (newname != NULL) name = (char *)newname; if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) return (1); /* * Loading keys is best effort. We don't want to return immediately * if it fails but we do want to give the error to the caller. */ if (flags & ZFS_IMPORT_LOAD_KEYS) { ret = zfs_crypto_attempt_load_keys(g_zfs, name); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY) && zpool_enable_datasets(zhp, mntopts, 0) != 0) { zpool_close(zhp); return (1); } zpool_close(zhp); return (ret); } typedef struct target_exists_args { const char *poolname; uint64_t poolguid; } target_exists_args_t; static int name_or_guid_exists(zpool_handle_t *zhp, void *data) { target_exists_args_t *args = data; nvlist_t *config = zpool_get_config(zhp, NULL); int found = 0; if (config == NULL) return (0); if (args->poolname != NULL) { char *pool_name; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); if (strcmp(pool_name, args->poolname) == 0) found = 1; } else { uint64_t pool_guid; verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); if (pool_guid == args->poolguid) found = 1; } zpool_close(zhp); return (found); } /* * zpool checkpoint * checkpoint --discard * * -d Discard the checkpoint from a checkpointed * --discard pool. * * -w Wait for discarding a checkpoint to complete. * --wait * * Checkpoints the specified pool, by taking a "snapshot" of its * current state. A pool can only have one checkpoint at a time. */ int zpool_do_checkpoint(int argc, char **argv) { boolean_t discard, wait; char *pool; zpool_handle_t *zhp; int c, err; struct option long_options[] = { {"discard", no_argument, NULL, 'd'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; discard = B_FALSE; wait = B_FALSE; while ((c = getopt_long(argc, argv, ":dw", long_options, NULL)) != -1) { switch (c) { case 'd': discard = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (wait && !discard) { (void) fprintf(stderr, gettext("--wait only valid when " "--discard also specified\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open(g_zfs, pool)) == NULL) { /* As a special case, check for use of '/' in the name */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("'zpool checkpoint' " "doesn't work on datasets. To save the state " "of a dataset from a specific point in time " "please use 'zfs snapshot'\n")); return (1); } if (discard) { err = (zpool_discard_checkpoint(zhp) != 0); if (err == 0 && wait) err = zpool_wait(zhp, ZPOOL_WAIT_CKPT_DISCARD); } else { err = (zpool_checkpoint(zhp) != 0); } zpool_close(zhp); return (err); } #define CHECKPOINT_OPT 1024 /* * zpool import [-d dir] [-D] * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile] [-f] [-n] [-F] [newpool] * * -c Read pool information from a cachefile instead of searching * devices. * * -d Scan in a specific directory, other than /dev/. More than * one directory can be specified using multiple '-d' options. * * -D Scan for previously destroyed pools or import all or only * specified destroyed pools. * * -R Temporarily import the pool, with all mountpoints relative to * the given root. The pool will remain exported when the machine * is rebooted. * * -V Import even in the presence of faulted vdevs. This is an * intentionally undocumented option for testing purposes, and * treats the pool configuration as complete, leaving any bad * vdevs in the FAULTED state. In other words, it does verbatim * import. * * -f Force import, even if it appears that the pool is active. * * -F Attempt rewind if necessary. * * -n See if rewind would work, but don't actually rewind. * * -N Import the pool but don't mount datasets. * * -T Specify a starting txg to use for import. This option is * intentionally undocumented option for testing purposes. * * -a Import all pools found. * * -l Load encryption keys while importing. * * -o Set property=value and/or temporary mount options (without '='). * * -s Scan using the default search path, the libblkid cache will * not be consulted. * * --rewind-to-checkpoint * Import the pool and revert back to the checkpoint. * * The import command scans for pools to import, and import pools based on pool * name and GUID. The pool can also be renamed as part of the import process. */ int zpool_do_import(int argc, char **argv) { char **searchdirs = NULL; char *env, *envdup = NULL; int nsearch = 0; int c; int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; nvpair_t *elem; nvlist_t *config; uint64_t searchguid = 0; char *searchname = NULL; char *propval; nvlist_t *found_config; nvlist_t *policy = NULL; nvlist_t *props = NULL; boolean_t first; int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; uint64_t pool_state, txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; char *endptr; struct option long_options[] = { {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX", long_options, NULL)) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'c': cachefile = optarg; break; case 'd': if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); } else { char **tmp = safe_malloc((nsearch + 1) * sizeof (char *)); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); free(searchdirs); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 'D': do_destroyed = B_TRUE; break; case 'f': flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; case 'l': flags |= ZFS_IMPORT_LOAD_KEYS; break; case 'm': flags |= ZFS_IMPORT_MISSING_LOG; break; case 'n': dryrun = B_TRUE; break; case 'N': flags |= ZFS_IMPORT_ONLY; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto error; } else { mntopts = optarg; } break; case 'R': if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto error; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 's': do_scan = B_TRUE; break; case 't': flags |= ZFS_IMPORT_TEMP_NAME; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 'T': errno = 0; txg = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("invalid txg value\n")); usage(B_FALSE); } rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; break; case 'V': flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind = B_TRUE; break; case CHECKPOINT_OPT: flags |= ZFS_IMPORT_CHECKPOINT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (cachefile && nsearch != 0) { (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) { (void) fprintf(stderr, gettext("-l is incompatible with -N\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) { (void) fprintf(stderr, gettext("-l is only meaningful during " "an import\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In the future, we can capture further policy and include it here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) goto error; /* check argument count */ if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } /* * Check for the effective uid. We do this explicitly here because * otherwise any attempt to discover pools will silently fail. */ if (argc == 0 && geteuid() != 0) { (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); if (searchdirs != NULL) free(searchdirs); nvlist_free(props); nvlist_free(policy); return (1); } /* * Depending on the arguments given, we do one of the following: * * Iterate through all pools and display information about * each one. * * -a Iterate through all pools and try to import each one. * * Find the pool that corresponds to the given GUID/pool * name and import that one. * * -D Above options applies only to destroyed pools. */ if (argc != 0) { char *endptr; errno = 0; searchguid = strtoull(argv[0], &endptr, 10); if (errno != 0 || *endptr != '\0') { searchname = argv[0]; searchguid = 0; } found_config = NULL; /* * User specified a name or guid. Ensure it's unique. */ target_exists_args_t search = {searchname, searchguid}; pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search); } /* * Check the environment for the preferred search path. */ if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) { char *dir; envdup = strdup(env); dir = strtok(envdup, ":"); while (dir != NULL) { if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); } else { char **tmp = safe_malloc((nsearch + 1) * sizeof (char *)); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); free(searchdirs); searchdirs = tmp; } searchdirs[nsearch++] = dir; dir = strtok(NULL, ":"); } } idata.path = searchdirs; idata.paths = nsearch; idata.poolname = searchname; idata.guid = searchguid; idata.cachefile = cachefile; idata.scan = do_scan; idata.policy = policy; pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); if (pools != NULL && pool_exists && (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name already exists\n"), argv[0]); (void) fprintf(stderr, gettext("use the form '%s " " ' to give it a new name\n"), "zpool import"); err = 1; } else if (pools == NULL && pool_exists) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name is already created/imported,\n"), argv[0]); (void) fprintf(stderr, gettext("and no additional pools " "with that name were found\n")); err = 1; } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } err = 1; } if (err == 1) { if (searchdirs != NULL) free(searchdirs); if (envdup != NULL) free(envdup); nvlist_free(policy); nvlist_free(pools); nvlist_free(props); return (1); } /* * At this point we have a list of import candidate configs. Even if * we were searching by pool name or guid, we still need to * post-process the list to deal with pool state and possible * duplicate names. */ err = 0; elem = NULL; first = B_TRUE; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, &config) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) continue; if (do_destroyed && pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, policy) == 0); if (argc == 0) { if (first) first = B_FALSE; else if (!do_all) (void) printf("\n"); if (do_all) { err |= do_import(config, NULL, mntopts, props, flags); } else { show_import(config); } } else if (searchname != NULL) { char *name; /* * We are searching for a pool based on name. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); if (strcmp(name, searchname) == 0) { if (found_config != NULL) { (void) fprintf(stderr, gettext( "cannot import '%s': more than " "one matching pool\n"), searchname); (void) fprintf(stderr, gettext( "import by numeric ID instead\n")); err = B_TRUE; } found_config = config; } } else { uint64_t guid; /* * Search for a pool by guid. */ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); if (guid == searchguid) found_config = config; } } /* * If we were searching for a specific pool, verify that we found a * pool, and then do the import. */ if (argc != 0 && err == 0) { if (found_config == NULL) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); err = B_TRUE; } else { err |= do_import(found_config, argc == 1 ? NULL : argv[1], mntopts, props, flags); } } /* * If we were just looking for pools, report an error if none were * found. */ if (argc == 0 && first) (void) fprintf(stderr, gettext("no pools available to import\n")); error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); if (searchdirs != NULL) free(searchdirs); if (envdup != NULL) free(envdup); return (err ? 1 : 0); } /* * zpool sync [-f] [pool] ... * * -f (undocumented) force uberblock (and config including zpool cache file) * update. * * Sync the specified pool(s). * Without arguments "zpool sync" will sync all pools. * This command initiates TXG sync(s) and will return after the TXG(s) commit. * */ static int zpool_do_sync(int argc, char **argv) { int ret; boolean_t force = B_FALSE; /* check options */ while ((ret = getopt(argc, argv, "f")) != -1) { switch (ret) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_sync_one on all pools */ ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, zpool_sync_one, &force); return (ret); } typedef struct iostat_cbdata { uint64_t cb_flags; int cb_name_flags; int cb_namewidth; int cb_iteration; char **cb_vdev_names; /* Only show these vdevs */ unsigned int cb_vdev_names_count; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_scripted; zpool_list_t *cb_list; vdev_cmd_data_list_t *vcdl; } iostat_cbdata_t; /* iostat labels */ typedef struct name_and_columns { const char *name; /* Column name */ unsigned int columns; /* Center name to this number of columns */ } name_and_columns_t; #define IOSTAT_MAX_LABELS 13 /* Max number of labels on one line */ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}, [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {NULL}}, [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, {"trimq_write", 2}, {NULL}}, [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, {"trim", 2}, {NULL}}, }; /* Shorthand - if "columns" field not set, default to 1 column */ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, {"write"}, {NULL}}, [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {NULL}}, [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, }; static const char *histo_to_title[] = { [IOS_L_HISTO] = "latency", [IOS_RQ_HISTO] = "req_size", }; /* * Return the number of labels in a null-terminated name_and_columns_t * array. * */ static unsigned int label_array_len(const name_and_columns_t *labels) { int i = 0; while (labels[i].name) i++; return (i); } /* * Return the number of strings in a null-terminated string array. * For example: * * const char foo[] = {"bar", "baz", NULL} * * returns 2 */ static uint64_t str_array_len(const char *array[]) { uint64_t i = 0; while (array[i]) i++; return (i); } /* * Return a default column width for default/latency/queue columns. This does * not include histograms, which have their columns autosized. */ static unsigned int default_column_width(iostat_cbdata_t *cb, enum iostat_type type) { unsigned long column_width = 5; /* Normal niceprint */ static unsigned long widths[] = { /* * Choose some sane default column sizes for printing the * raw numbers. */ [IOS_DEFAULT] = 15, /* 1PB capacity */ [IOS_LATENCY] = 10, /* 1B ns = 10sec */ [IOS_QUEUES] = 6, /* 1M queue entries */ [IOS_L_HISTO] = 10, /* 1B ns = 10sec */ [IOS_RQ_HISTO] = 6, /* 1M queue entries */ }; if (cb->cb_literal) column_width = widths[type]; return (column_width); } /* * Print the column labels, i.e: * * capacity operations bandwidth * alloc free read write read write ... * * If force_column_width is set, use it for the column width. If not set, use * the default column width. */ static void print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) { int i, idx, s; int text_start, rw_column_width, spaces_to_end; uint64_t flags = cb->cb_flags; uint64_t f; unsigned int column_width = force_column_width; /* For each bit set in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; if (!force_column_width) column_width = default_column_width(cb, idx); /* Print our top labels centered over "read write" label. */ for (i = 0; i < label_array_len(labels[idx]); i++) { const char *name = labels[idx][i].name; /* * We treat labels[][].columns == 0 as shorthand * for one column. It makes writing out the label * tables more concise. */ unsigned int columns = MAX(1, labels[idx][i].columns); unsigned int slen = strlen(name); rw_column_width = (column_width * columns) + (2 * (columns - 1)); text_start = (int)((rw_column_width) / columns - slen / columns); if (text_start < 0) text_start = 0; printf(" "); /* Two spaces between columns */ /* Space from beginning of column to label */ for (s = 0; s < text_start; s++) printf(" "); printf("%s", name); /* Print space after label to end of column */ spaces_to_end = rw_column_width - text_start - slen; if (spaces_to_end < 0) spaces_to_end = 0; for (s = 0; s < spaces_to_end; s++) printf(" "); } } } /* * print_cmd_columns - Print custom column titles from -c * * If the user specified the "zpool status|iostat -c" then print their custom * column titles in the header. For example, print_cmd_columns() would print * the " col1 col2" part of this: * * $ zpool iostat -vc 'echo col1=val1; echo col2=val2' * ... * capacity operations bandwidth * pool alloc free read write read write col1 col2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- * mypool 269K 1008M 0 0 107 946 * mirror 269K 1008M 0 0 107 946 * sdb - - 0 0 102 473 val1 val2 * sdc - - 0 0 5 473 val1 val2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- */ static void print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) { int i, j; vdev_cmd_data_t *data = &vcdl->data[0]; if (vcdl->count == 0 || data == NULL) return; /* * Each vdev cmd should have the same column names unless the user did * something weird with their cmd. Just take the column names from the * first vdev and assume it works for all of them. */ for (i = 0; i < vcdl->uniq_cols_cnt; i++) { printf(" "); if (use_dashes) { for (j = 0; j < vcdl->uniq_cols_width[i]; j++) printf("-"); } else { printf_color(ANSI_BOLD, "%*s", vcdl->uniq_cols_width[i], vcdl->uniq_cols[i]); } } } /* * Utility function to print out a line of dashes like: * * -------------------------------- ----- ----- ----- ----- ----- * * ...or a dashed named-row line like: * * logs - - - - - * * @cb: iostat data * * @force_column_width If non-zero, use the value as the column width. * Otherwise use the default column widths. * * @name: Print a dashed named-row line starting * with @name. Otherwise, print a regular * dashed line. */ static void print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, const char *name) { int i; unsigned int namewidth; uint64_t flags = cb->cb_flags; uint64_t f; int idx; const name_and_columns_t *labels; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), name ? strlen(name) : 0); if (name) { printf("%-*s", namewidth, name); } else { for (i = 0; i < namewidth; i++) (void) printf("-"); } /* For each bit in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { unsigned int column_width; idx = lowbit64(f) - 1; if (force_column_width) column_width = force_column_width; else column_width = default_column_width(cb, idx); labels = iostat_bottom_labels[idx]; for (i = 0; i < label_array_len(labels); i++) { if (name) printf(" %*s-", column_width - 1, " "); else printf(" %.*s", column_width, "--------------------"); } } } static void print_iostat_separator_impl(iostat_cbdata_t *cb, unsigned int force_column_width) { print_iostat_dashes(cb, force_column_width, NULL); } static void print_iostat_separator(iostat_cbdata_t *cb) { print_iostat_separator_impl(cb, 0); } static void print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, const char *histo_vdev_name) { unsigned int namewidth; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), histo_vdev_name ? strlen(histo_vdev_name) : 0); if (histo_vdev_name) printf("%-*s", namewidth, histo_vdev_name); else printf("%*s", namewidth, ""); print_iostat_labels(cb, force_column_width, iostat_top_labels); printf("\n"); printf("%-*s", namewidth, title); print_iostat_labels(cb, force_column_width, iostat_bottom_labels); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 0); printf("\n"); print_iostat_separator_impl(cb, force_column_width); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 1); printf("\n"); } static void print_iostat_header(iostat_cbdata_t *cb) { print_iostat_header_impl(cb, 0, NULL); } /* * Display a single statistic. */ static void print_one_stat(uint64_t value, enum zfs_nicenum_format format, unsigned int column_size, boolean_t scripted) { char buf[64]; zfs_nicenum_format(value, buf, sizeof (buf), format); if (scripted) printf("\t%s", buf); else printf(" %*s", column_size, buf); } /* * Calculate the default vdev stats * * Subtract oldvs from newvs, apply a scaling factor, and save the resulting * stats into calcvs. */ static void calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, vdev_stat_t *calcvs) { int i; memcpy(calcvs, newvs, sizeof (*calcvs)); for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); } /* * Internal representation of the extended iostats data. * * The extended iostat stats are exported in nvlists as either uint64_t arrays * or single uint64_t's. We make both look like arrays to make them easier * to process. In order to make single uint64_t's look like arrays, we set * __data to the stat data, and then set *data = &__data with count = 1. Then, * we can just use *data and count. */ struct stat_array { uint64_t *data; uint_t count; /* Number of entries in data[] */ uint64_t __data; /* Only used when data is a single uint64_t */ }; static uint64_t stat_histo_max(struct stat_array *nva, unsigned int len) { uint64_t max = 0; int i; for (i = 0; i < len; i++) max = MAX(max, array64_max(nva[i].data, nva[i].count)); return (max); } /* * Helper function to lookup a uint64_t array or uint64_t value and store its * data as a stat_array. If the nvpair is a single uint64_t value, then we make * it look like a one element array to make it easier to process. */ static int nvpair64_to_stat_array(nvlist_t *nvl, const char *name, struct stat_array *nva) { nvpair_t *tmp; int ret; verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); switch (nvpair_type(tmp)) { case DATA_TYPE_UINT64_ARRAY: ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); break; case DATA_TYPE_UINT64: ret = nvpair_value_uint64(tmp, &nva->__data); nva->data = &nva->__data; nva->count = 1; break; default: /* Not a uint64_t */ ret = EINVAL; break; } return (ret); } /* * Given a list of nvlist names, look up the extended stats in newnv and oldnv, * subtract them, and return the results in a newly allocated stat_array. * You must free the returned array after you are done with it with * free_calc_stats(). * * Additionally, you can set "oldnv" to NULL if you simply want the newnv * values. */ static struct stat_array * calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, nvlist_t *newnv) { nvlist_t *oldnvx = NULL, *newnvx; struct stat_array *oldnva, *newnva, *calcnva; int i, j; unsigned int alloc_size = (sizeof (struct stat_array)) * len; /* Extract our extended stats nvlist from the main list */ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, &newnvx) == 0); if (oldnv) { verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, &oldnvx) == 0); } newnva = safe_malloc(alloc_size); oldnva = safe_malloc(alloc_size); calcnva = safe_malloc(alloc_size); for (j = 0; j < len; j++) { verify(nvpair64_to_stat_array(newnvx, names[j], &newnva[j]) == 0); calcnva[j].count = newnva[j].count; alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); calcnva[j].data = safe_malloc(alloc_size); memcpy(calcnva[j].data, newnva[j].data, alloc_size); if (oldnvx) { verify(nvpair64_to_stat_array(oldnvx, names[j], &oldnva[j]) == 0); for (i = 0; i < oldnva[j].count; i++) calcnva[j].data[i] -= oldnva[j].data[i]; } } free(newnva); free(oldnva); return (calcnva); } static void free_calc_stats(struct stat_array *nva, unsigned int len) { int i; for (i = 0; i < len; i++) free(nva[i].data); free(nva); } static void print_iostat_histo(struct stat_array *nva, unsigned int len, iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, double scale) { int i, j; char buf[6]; uint64_t val; enum zfs_nicenum_format format; unsigned int buckets; unsigned int start_bucket; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; /* All these histos are the same size, so just use nva[0].count */ buckets = nva[0].count; if (cb->cb_flags & IOS_RQ_HISTO_M) { /* Start at 512 - req size should never be lower than this */ start_bucket = 9; } else { start_bucket = 0; } for (j = start_bucket; j < buckets; j++) { /* Print histogram bucket label */ if (cb->cb_flags & IOS_L_HISTO_M) { /* Ending range of this bucket */ val = (1UL << (j + 1)) - 1; zfs_nicetime(val, buf, sizeof (buf)); } else { /* Request size (starting range of bucket) */ val = (1UL << j); zfs_nicenum(val, buf, sizeof (buf)); } if (cb->cb_scripted) printf("%llu", (u_longlong_t)val); else printf("%-*s", namewidth, buf); /* Print the values on the line */ for (i = 0; i < len; i++) { print_one_stat(nva[i].data[j] * scale, format, column_width, cb->cb_scripted); } printf("\n"); } } static void print_solid_separator(unsigned int length) { while (length--) printf("-"); printf("\n"); } static void print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv, double scale, const char *name) { unsigned int column_width; unsigned int namewidth; unsigned int entire_width; enum iostat_type type; struct stat_array *nva; const char **names; unsigned int names_len; /* What type of histo are we? */ type = IOS_HISTO_IDX(cb->cb_flags); /* Get NULL-terminated array of nvlist names for our histo */ names = vsx_type_to_nvlist[type]; names_len = str_array_len(names); /* num of names */ nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); if (cb->cb_literal) { column_width = MAX(5, (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); } else { column_width = 5; } namewidth = MAX(cb->cb_namewidth, strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); /* * Calculate the entire line width of what we're printing. The * +2 is for the two spaces between columns: */ /* read write */ /* ----- ----- */ /* |___| <---------- column_width */ /* */ /* |__________| <--- entire_width */ /* */ entire_width = namewidth + (column_width + 2) * label_array_len(iostat_bottom_labels[type]); if (cb->cb_scripted) printf("%s\n", name); else print_iostat_header_impl(cb, column_width, name); print_iostat_histo(nva, names_len, cb, column_width, namewidth, scale); free_calc_stats(nva, names_len); if (!cb->cb_scripted) print_solid_separator(entire_width); } /* * Calculate the average latency of a power-of-two latency histogram */ static uint64_t single_histo_average(uint64_t *histo, unsigned int buckets) { int i; uint64_t count = 0, total = 0; for (i = 0; i < buckets; i++) { /* * Our buckets are power-of-two latency ranges. Use the * midpoint latency of each bucket to calculate the average. * For example: * * Bucket Midpoint * 8ns-15ns: 12ns * 16ns-31ns: 24ns * ... */ if (histo[i] != 0) { total += histo[i] * (((1UL << i) + ((1UL << i)/2))); count += histo[i]; } } /* Prevent divide by zero */ return (count == 0 ? 0 : total / count); } static void print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_QUEUES); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; for (i = 0; i < ARRAY_SIZE(names); i++) { val = nva[i].data[0]; print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } static void print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_LATENCY); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAWTIME; else format = ZFS_NICENUM_TIME; /* Print our avg latencies on the line */ for (i = 0; i < ARRAY_SIZE(names); i++) { /* Compute average latency for a latency histo */ val = single_histo_average(nva[i].data, nva[i].count); print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } /* * Print default statistics (capacity/operations/bandwidth) */ static void print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) { unsigned int column_width = default_column_width(cb, IOS_DEFAULT); enum zfs_nicenum_format format; char na; /* char to print for "not applicable" values */ if (cb->cb_literal) { format = ZFS_NICENUM_RAW; na = '0'; } else { format = ZFS_NICENUM_1024; na = '-'; } /* only toplevel vdevs have capacity stats */ if (vs->vs_space == 0) { if (cb->cb_scripted) printf("\t%c\t%c", na, na); else printf(" %*c %*c", column_width, na, column_width, na); } else { print_one_stat(vs->vs_alloc, format, column_width, cb->cb_scripted); print_one_stat(vs->vs_space - vs->vs_alloc, format, column_width, cb->cb_scripted); } print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); } static const char *class_name[] = { VDEV_ALLOC_BIAS_DEDUP, VDEV_ALLOC_BIAS_SPECIAL, VDEV_ALLOC_CLASS_LOGS }; /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. * * Returns the number of stat lines printed. */ static unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; uint_t c, children, oldchildren; vdev_stat_t *oldvs, *newvs, *calcvs; vdev_stat_t zerovs = { 0 }; char *vname; int i; int ret = 0; uint64_t tdelta; double scale; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return (ret); calcvs = safe_malloc(sizeof (*calcvs)); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } /* Do we only want to see a specific vdev? */ for (i = 0; i < cb->cb_vdev_names_count; i++) { /* Yes we do. Is this the vdev? */ if (strcmp(name, cb->cb_vdev_names[i]) == 0) { /* * This is our vdev. Since it is the only vdev we * will be displaying, make depth = 0 so that it * doesn't get indented. */ depth = 0; break; } } if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) { /* Couldn't match the name */ goto children; } verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); /* * Print the vdev name unless it's is a histogram. Histograms * display the vdev name in the header itself. */ if (!(cb->cb_flags & IOS_ANYHISTO_M)) { if (cb->cb_scripted) { printf("%s", name); } else { if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); } } /* Calculate our scaling factor */ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { /* * If we specify printing histograms with no time interval, then * print the histogram numbers over the entire lifetime of the * vdev. */ scale = 1; } else { if (tdelta == 0) scale = 1.0; else scale = (double)NANOSEC / tdelta; } if (cb->cb_flags & IOS_DEFAULT_M) { calc_default_iostats(oldvs, newvs, calcvs); print_iostat_default(calcvs, cb, scale); } if (cb->cb_flags & IOS_LATENCY_M) print_iostat_latency(cb, oldnv, newnv); if (cb->cb_flags & IOS_QUEUES_M) print_iostat_queues(cb, oldnv, newnv); if (cb->cb_flags & IOS_ANYHISTO_M) { printf("\n"); print_iostat_histos(cb, oldnv, newnv, scale, name); } if (cb->vcdl != NULL) { char *path; if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } if (!(cb->cb_flags & IOS_ANYHISTO_M)) printf("\n"); ret++; children: free(calcvs); if (!cb->cb_verbose) return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } /* * print normal top-level devices */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, &ishole); (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (ishole || islog) continue; if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } /* * print all other top-level devices */ for (uint_t n = 0; n < 3; n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; char *bias = NULL; char *type = NULL; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, class_name[n]); } printf("\n"); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } if (children > 0) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, "cache"); } printf("\n"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } return (ret); } static int refresh_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; boolean_t missing; /* * If the pool has disappeared, remove it from the list and continue. */ if (zpool_refresh_stats(zhp, &missing) != 0) return (-1); if (missing) pool_list_remove(cb->cb_list, zhp); return (0); } /* * Callback to print out the iostats for the given pool. */ static int print_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; int ret; newconfig = zpool_get_config(zhp, &oldconfig); if (cb->cb_iteration == 1) oldconfig = NULL; verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, &newnvroot) == 0); if (oldconfig == NULL) oldnvroot = NULL; else verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) { print_iostat_separator(cb); if (cb->vcdl != NULL) { print_cmd_columns(cb->vcdl, 1); } printf("\n"); } return (ret); } static int get_columns(void) { struct winsize ws; int columns = 80; int error; if (isatty(STDOUT_FILENO)) { error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); if (error == 0) columns = ws.ws_col; } else { columns = 999; } return (columns); } /* * Return the required length of the pool/vdev name column. The minimum * allowed width and output formatting flags must be provided. */ static int get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose) { nvlist_t *config, *nvroot; int width = min_width; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); unsigned int poolname_len = strlen(zpool_get_name(zhp)); if (verbose == B_FALSE) { width = MAX(poolname_len, min_width); } else { width = MAX(poolname_len, max_width(zhp, nvroot, 0, min_width, flags)); } } return (width); } /* * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { float interval = 0; unsigned long count = 0; int argc = *argcp; /* * Determine if the last argument is an integer or a pool name */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { /* * If this is not a valid number, just plow on. The * user will get a more informative error message later * on. */ interval = 0; } } /* * If the last argument is also an integer, then we have both a count * and an interval. */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { interval = 0; } } *iv = interval; *cnt = count; *argcp = argc; } static void get_timestamp_arg(char c) { if (c == 'u') timestamp_fmt = UDATE; else if (c == 'd') timestamp_fmt = DDATE; else usage(B_FALSE); } /* * Return stat flags that are supported by all pools by both the module and * zpool iostat. "*data" should be initialized to all 0xFFs before running. * It will get ANDed down until only the flags that are supported on all pools * remain. */ static int get_stat_flags_cb(zpool_handle_t *zhp, void *data) { uint64_t *mask = data; nvlist_t *config, *nvroot, *nvx; uint64_t flags = 0; int i, j; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); /* Default stats are always supported, but for completeness.. */ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) flags |= IOS_DEFAULT_M; /* Get our extended stats nvlist from the main list */ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) != 0) { /* * No extended stats; they're probably running an older * module. No big deal, we support that too. */ goto end; } /* For each extended stat, make sure all its nvpairs are supported */ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { if (!vsx_type_to_nvlist[j][0]) continue; /* Start off by assuming the flag is supported, then check */ flags |= (1ULL << j); for (i = 0; vsx_type_to_nvlist[j][i]; i++) { if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { /* flag isn't supported */ flags = flags & ~(1ULL << j); break; } } } end: *mask = *mask & flags; return (0); } /* * Return a bitmask of stats that are supported on all pools by both the module * and zpool iostat. */ static uint64_t get_stat_flags(zpool_list_t *list) { uint64_t mask = -1; /* * get_stat_flags_cb() will lop off bits from "mask" until only the * flags that are supported on all pools remain. */ pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); return (mask); } /* * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. */ static int is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data) { iostat_cbdata_t *cb = cb_data; char *name = NULL; int ret = 0; name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags); if (strcmp(name, cb->cb_vdev_names[0]) == 0) ret = 1; /* match */ free(name); return (ret); } /* * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise. */ static int is_vdev(zpool_handle_t *zhp, void *cb_data) { return (for_each_vdev(zhp, is_vdev_cb, cb_data)); } /* * Check if vdevs are in a pool * * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise * return 0. If pool_name is NULL, then search all pools. */ static int are_vdevs_in_pool(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { char **tmp_name; int ret = 0; int i; int pool_count = 0; if ((argc == 0) || !*argv) return (0); if (pool_name) pool_count = 1; /* Temporarily hijack cb_vdev_names for a second... */ tmp_name = cb->cb_vdev_names; /* Go though our list of prospective vdev names */ for (i = 0; i < argc; i++) { cb->cb_vdev_names = argv + i; /* Is this name a vdev in our pools? */ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, B_FALSE, is_vdev, cb); if (!ret) { /* No match */ break; } } cb->cb_vdev_names = tmp_name; return (ret); } static int is_pool_cb(zpool_handle_t *zhp, void *data) { char *name = data; if (strcmp(name, zpool_get_name(zhp)) == 0) return (1); return (0); } /* * Do we have a pool named *name? If so, return 1, otherwise 0. */ static int is_pool(char *name) { return (for_each_pool(0, NULL, B_TRUE, NULL, B_FALSE, is_pool_cb, name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ static int are_all_pools(int argc, char **argv) { if ((argc == 0) || !*argv) return (0); while (--argc >= 0) if (!is_pool(argv[argc])) return (0); return (1); } /* * Helper function to print out vdev/pool names we can't resolve. Used for an * error message. */ static void error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { int i; char *name; char *str; for (i = 0; i < argc; i++) { name = argv[i]; if (is_pool(name)) str = gettext("pool"); else if (are_vdevs_in_pool(1, &name, pool_name, cb)) str = gettext("vdev in this pool"); else if (are_vdevs_in_pool(1, &name, NULL, cb)) str = gettext("vdev in another pool"); else str = gettext("unknown"); fprintf(stderr, "\t%s (%s)\n", name, str); } } /* * Same as get_interval_count(), but with additional checks to not misinterpret * guids as interval/count values. Assumes VDEV_NAME_GUID is set in * cb.cb_name_flags. */ static void get_interval_count_filter_guids(int *argc, char **argv, float *interval, unsigned long *count, iostat_cbdata_t *cb) { char **tmpargv = argv; int argc_for_interval = 0; /* Is the last arg an interval value? Or a guid? */ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) { /* * The last arg is not a guid, so it's probably an * interval value. */ argc_for_interval++; if (*argc >= 2 && !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) { /* * The 2nd to last arg is not a guid, so it's probably * an interval value. */ argc_for_interval++; } } /* Point to our list of possible intervals */ tmpargv = &argv[*argc - argc_for_interval]; *argc = *argc - argc_for_interval; get_interval_count(&argc_for_interval, tmpargv, interval, count); } /* * Floating point sleep(). Allows you to pass in a floating point value for * seconds. */ static void fsleep(float sec) { struct timespec req; req.tv_sec = floor(sec); req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; nanosleep(&req, NULL); } /* * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or * if we were unable to determine its size. */ static int terminal_height(void) { struct winsize win; if (isatty(STDOUT_FILENO) == 0) return (-1); if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) != -1 && win.ws_row > 0) return (win.ws_row); return (-1); } /* * Run one of the zpool status/iostat -c scripts with the help (-h) option and * print the result. * * name: Short name of the script ('iostat'). * path: Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat'); */ static void print_zpool_script_help(char *name, char *path) { char *argv[] = {path, "-h", NULL}; char **lines = NULL; int lines_cnt = 0; int rc; rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &lines, &lines_cnt); if (rc != 0 || lines == NULL || lines_cnt <= 0) { if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); return; } for (int i = 0; i < lines_cnt; i++) if (!is_blank_str(lines[i])) printf(" %-14s %s\n", name, lines[i]); libzfs_free_str_array(lines, lines_cnt); } /* * Go though the zpool status/iostat -c scripts in the user's path, run their * help option (-h), and print out the results. */ static void print_zpool_dir_scripts(char *dirpath) { DIR *dir; struct dirent *ent; char fullpath[MAXPATHLEN]; struct stat dir_stat; if ((dir = opendir(dirpath)) != NULL) { /* print all the files and directories within directory */ while ((ent = readdir(dir)) != NULL) { sprintf(fullpath, "%s/%s", dirpath, ent->d_name); /* Print the scripts */ if (stat(fullpath, &dir_stat) == 0) if (dir_stat.st_mode & S_IXUSR && S_ISREG(dir_stat.st_mode)) print_zpool_script_help(ent->d_name, fullpath); } closedir(dir); } } /* * Print out help text for all zpool status/iostat -c scripts. */ static void print_zpool_script_list(char *subcommand) { char *dir, *sp; printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand); sp = zpool_get_cmd_search_path(); if (sp == NULL) return; dir = strtok(sp, ":"); while (dir != NULL) { print_zpool_dir_scripts(dir); dir = strtok(NULL, ":"); } free(sp); } /* * Set the minimum pool/vdev name column width. The width must be at least 10, * but may be as large as the column width - 42 so it still fits on one line. * NOTE: 42 is the width of the default capacity/operations/bandwidth output */ static int get_namewidth_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; int width, available_width; /* * get_namewidth() returns the maximum width of any name in that column * for any pool/vdev/device line that will be output. */ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, cb->cb_verbose); /* * The width we are calculating is the width of the header and also the * padding width for names that are less than maximum width. The stats * take up 42 characters, so the width available for names is: */ available_width = get_columns() - 42; /* * If the maximum width fits on a screen, then great! Make everything * line up by justifying all lines to the same width. If that max * width is larger than what's available, the name plus stats won't fit * on one line, and justifying to that width would cause every line to * wrap on the screen. We only want lines with long names to wrap. * Limit the padding to what won't wrap. */ if (width > available_width) width = available_width; /* * And regardless of whatever the screen width is (get_columns can * return 0 if the width is not known or less than 42 for a narrow * terminal) have the width be a minimum of 10. */ if (width < 10) width = 10; /* Save the calculated width */ cb->cb_namewidth = width; return (0); } /* * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLpPvy] [-n name] * [-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]] * [interval [count]] * * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display statistics for individual vdevs * -h Display help * -p Display values in parsable (exact) format. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -l Display average latency * -q Display queue depths * -w Display latency histograms * -r Display request size histogram * -T Display a timestamp in date(1) or Unix format * -n Only print headers once * * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely * on pool_list_update() to detect the addition of new pools. Configuration * changes are all handled within libzfs. */ int zpool_do_iostat(int argc, char **argv) { int c; int ret; int npools; float interval = 0; unsigned long count = 0; int winheight = 24; zpool_list_t *list; boolean_t verbose = B_FALSE; boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE; boolean_t omit_since_boot = B_FALSE; boolean_t guid = B_FALSE; boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; boolean_t headers_once = B_FALSE; iostat_cbdata_t cb = { 0 }; char *cmd = NULL; /* Used for printing error message */ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; uint64_t unsupported_flags; /* check options */ while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwnH")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; verbose = B_TRUE; break; case 'g': guid = B_TRUE; break; case 'L': follow_links = B_TRUE; break; case 'P': full_name = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': verbose = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'l': latency = B_TRUE; break; case 'q': queues = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'w': l_histo = B_TRUE; break; case 'r': rq_histo = B_TRUE; break; case 'y': omit_since_boot = B_TRUE; break; case 'n': headers_once = B_TRUE; break; case 'h': usage(B_FALSE); break; case '?': if (optopt == 'c') { print_zpool_script_list("iostat"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; cb.cb_literal = parsable; cb.cb_scripted = scripted; if (guid) cb.cb_name_flags |= VDEV_NAME_GUID; if (follow_links) cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; if (full_name) cb.cb_name_flags |= VDEV_NAME_PATH; cb.cb_iteration = 0; cb.cb_namewidth = 0; cb.cb_verbose = verbose; /* Get our interval and count values (if any) */ if (guid) { get_interval_count_filter_guids(&argc, argv, &interval, &count, &cb); } else { get_interval_count(&argc, argv, &interval, &count); } if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) { /* All the args are vdevs */ cb.cb_vdev_names = argv; cb.cb_vdev_names_count = argc; argc = 0; /* No pools to process */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) { /* ...and the rest are vdev names */ cb.cb_vdev_names = argv + 1; cb.cb_vdev_names_count = argc - 1; argc = 1; /* One pool to process */ } else { fprintf(stderr, gettext("Expected either a list of ")); fprintf(stderr, gettext("pools, or list of vdevs in")); fprintf(stderr, " \"%s\", ", argv[0]); fprintf(stderr, gettext("but got:\n")); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { /* * The args don't make sense. The first arg isn't a pool name, * nor are all the args vdevs. */ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); fprintf(stderr, "\n"); return (1); } if (cb.cb_vdev_names_count != 0) { /* * If user specified vdevs, it implies verbose. */ cb.cb_verbose = B_TRUE; } /* * Construct the list of all interesting pools. */ ret = 0; if ((list = pool_list_get(argc, argv, NULL, parsable, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { pool_list_free(list); return (1); } if (pool_list_count(list) == 0 && interval == 0) { pool_list_free(list); (void) fprintf(stderr, gettext("no pools available\n")); return (1); } if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) { pool_list_free(list); (void) fprintf(stderr, gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n")); usage(B_FALSE); return (1); } if (l_histo && rq_histo) { pool_list_free(list); (void) fprintf(stderr, gettext("Only one of [-r|-w] can be passed at a time\n")); usage(B_FALSE); return (1); } /* * Enter the main iostat loop. */ cb.cb_list = list; if (l_histo) { /* * Histograms tables look out of place when you try to display * them with the other stats, so make a rule that you can only * print histograms by themselves. */ cb.cb_flags = IOS_L_HISTO_M; } else if (rq_histo) { cb.cb_flags = IOS_RQ_HISTO_M; } else { cb.cb_flags = IOS_DEFAULT_M; if (latency) cb.cb_flags |= IOS_LATENCY_M; if (queues) cb.cb_flags |= IOS_QUEUES_M; } /* * See if the module supports all the stats we want to display. */ unsupported_flags = cb.cb_flags & ~get_stat_flags(list); if (unsupported_flags) { uint64_t f; int idx; fprintf(stderr, gettext("The loaded zfs module doesn't support:")); /* for each bit set in unsupported_flags */ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; fprintf(stderr, " -%c", flag_to_arg[idx]); } fprintf(stderr, ". Try running a newer module.\n"); pool_list_free(list); return (1); } for (;;) { if ((npools = pool_list_count(list)) == 0) (void) fprintf(stderr, gettext("no pools available\n")); else { /* * If this is the first iteration and -y was supplied * we skip any printing. */ boolean_t skip = (omit_since_boot && cb.cb_iteration == 0); /* * Refresh all statistics. This is done as an * explicit step before calculating the maximum name * width, so that any * configuration changes are * properly accounted for. */ (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_iostat, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL && cb.cb_verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) { cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, g_zfs, cb.cb_vdev_names, cb.cb_vdev_names_count, cb.cb_name_flags); } else { cb.vcdl = NULL; } /* * Check terminal size so we can print headers * even when terminal window has its height * changed. */ winheight = terminal_height(); /* * Are we connected to TTY? If not, headers_once * should be true, to avoid breaking scripts. */ if (winheight < 0) headers_once = B_TRUE; /* * If it's the first time and we're not skipping it, * or either skip or verbose mode, print the header. * * The histogram code explicitly prints its header on * every vdev, so skip this for histograms. */ if (((++cb.cb_iteration == 1 && !skip) || (skip != verbose) || (!headers_once && (cb.cb_iteration % winheight) == 0)) && (!(cb.cb_flags & IOS_ANYHISTO_M)) && !cb.cb_scripted) print_iostat_header(&cb); if (skip) { (void) fsleep(interval); continue; } pool_list_iter(list, B_FALSE, print_iostat, &cb); /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), * then print a separator. * * In addition, if we're printing specific vdevs then * we also want an ending separator. */ if (((npools > 1 && !verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) || (!(cb.cb_flags & IOS_ANYHISTO_M) && cb.cb_vdev_names_count)) && !cb.cb_scripted) { print_iostat_separator(&cb); if (cb.vcdl != NULL) print_cmd_columns(cb.vcdl, 1); printf("\n"); } if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); } /* * Flush the output so that redirection to a file isn't buffered * indefinitely. */ (void) fflush(stdout); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } pool_list_free(list); return (ret); } typedef struct list_cbdata { boolean_t cb_verbose; int cb_name_flags; int cb_namewidth; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_literal; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) (void) printf(" "); else first = B_FALSE; right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { header = zpool_prop_column_name(pl->pl_prop); right_justify = zpool_prop_align_right(pl->pl_prop); } else { int i; for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", (int)width, header); else (void) printf("%-*s", (int)width, header); } (void) printf("\n"); } /* * Given a pool and a list of properties, print out all the properties according * to the described layout. Used by zpool_do_list(). */ static void print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; char *propstr; boolean_t right_justify; size_t width; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) { if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { if (zpool_get_prop(zhp, pl->pl_prop, property, sizeof (property), NULL, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); } else if ((zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop)) && zpool_prop_get_feature(zhp, pl->pl_user_prop, property, sizeof (property)) == 0) { propstr = property; } else { propstr = "-"; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", (int)width, propstr); else (void) printf("%-*s", (int)width, propstr); } (void) printf("\n"); } static void print_one_column(zpool_prop_t prop, uint64_t value, const char *str, boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format) { char propval[64]; boolean_t fixed; size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else zfs_nicenum_format(value, propval, sizeof (propval), format); break; case ZPOOL_PROP_FRAGMENTATION: if (value == ZFS_FRAG_INVALID) { (void) strlcpy(propval, "-", sizeof (propval)); } else if (format == ZFS_NICENUM_RAW) { (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value); } else { (void) snprintf(propval, sizeof (propval), "%llu%%", (unsigned long long)value); } break; case ZPOOL_PROP_CAPACITY: /* capacity value is in parts-per-10,000 (aka permyriad) */ if (format == ZFS_NICENUM_RAW) (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value / 100); else (void) snprintf(propval, sizeof (propval), value < 1000 ? "%1.2f%%" : value < 10000 ? "%2.1f%%" : "%3.0f%%", value / 100.0); break; case ZPOOL_PROP_HEALTH: width = 8; snprintf(propval, sizeof (propval), "%-*s", (int)width, str); break; default: zfs_nicenum_format(value, propval, sizeof (propval), format); } if (!valid) (void) strlcpy(propval, "-", sizeof (propval)); if (scripted) (void) printf("\t%s", propval); else (void) printf(" %*s", (int)width, propval); } /* * print static default line per vdev * not compatible with '-o' option */ static void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth, boolean_t isspare) { nvlist_t **child; vdev_stat_t *vs; uint_t c, children; char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; char *dashes = "%-*s - - - - " "- - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (name != NULL) { boolean_t toplevel = (vs->vs_space != 0); uint64_t cap; enum zfs_nicenum_format format; const char *state; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return; if (scripted) (void) printf("\t%s", name); else if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); /* * Print the properties for the individual vdevs. Some * properties are only applicable to toplevel vdevs. The * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. */ print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_CHECKPOINT, vs->vs_checkpoint_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL, scripted, B_TRUE, format); print_one_column(ZPOOL_PROP_FRAGMENTATION, vs->vs_fragmentation, NULL, scripted, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), format); cap = (vs->vs_space == 0) ? 0 : (vs->vs_alloc * 10000 / vs->vs_space); print_one_column(ZPOOL_PROP_CAPACITY, cap, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_DEDUPRATIO, 0, NULL, scripted, toplevel, format); state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { if (vs->vs_aux == VDEV_AUX_SPARED) state = "INUSE"; else if (vs->vs_state == VDEV_STATE_HEALTHY) state = "AVAIL"; } print_one_column(ZPOOL_PROP_HEALTH, 0, state, scripted, B_TRUE, format); (void) printf("\n"); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; /* list the normal vdevs first */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) continue; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } /* list the classes: 'logs', 'dedup', and 'special' */ for (uint_t n = 0; n < 3; n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { char *bias = NULL; char *type = NULL; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, class_name[n]); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "spare"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_TRUE); free(vname); } } } /* * Generic callback function to list a pool. */ static int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; print_pool(zhp, cbp); if (cbp->cb_verbose) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); print_list_stats(zhp, NULL, nvroot, cbp, 0, B_FALSE); } return (0); } /* * Set the minimum pool/vdev name column width. The width must be at least 9, * but may be as large as needed. */ static int get_namewidth_list(zpool_handle_t *zhp, void *data) { list_cbdata_t *cb = data; int width; width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, cb->cb_verbose); if (width < 9) width = 9; cb->cb_namewidth = width; return (0); } /* * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -g Display guid for individual vdev name. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -L Follow links when resolving vdev path name. * -o List of properties to display. Defaults to * "name,size,allocated,free,expandsize,fragmentation,capacity," * "dedupratio,health,altroot" * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. */ int zpool_do_list(int argc, char **argv) { int c; int ret = 0; list_cbdata_t cb = { 0 }; static char default_props[] = "name,size,allocated,free,checkpoint,expandsize,fragmentation," "capacity,dedupratio,health,altroot"; char *props = default_props; float interval = 0; unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { switch (c) { case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'o': props = optarg; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 'p': cb.cb_literal = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': cb.cb_verbose = B_TRUE; cb.cb_namewidth = 8; /* 8 until precalc is avail */ break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, cb.cb_literal, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) break; cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (!cb.cb_scripted && (first || cb.cb_verbose)) { print_header(&cb); first = B_FALSE; } ret = pool_list_iter(list, B_TRUE, list_callback, &cb); if (interval == 0) break; if (count != 0 && --count == 0) break; pool_list_free(list); (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { (void) printf(gettext("no pools available\n")); ret = 0; } pool_list_free(list); zprop_free_list(cb.cb_proplist); return (ret); } static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; boolean_t rebuild = B_FALSE; boolean_t wait = B_FALSE; int c; nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; nvlist_t *props = NULL; char *propval; int ret; /* check options */ while ((c = getopt(argc, argv, "fo:sw")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 's': rebuild = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } old_disk = argv[1]; if (argc < 3) { if (!replacing) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } new_disk = old_disk; argc -= 1; argv += 1; } else { new_disk = argv[2]; argc -= 2; argv += 2; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { nvlist_free(props); return (1); } if (zpool_get_config(zhp, NULL) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); nvlist_free(props); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp); nvlist_free(props); return (1); } ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); if (ret == 0 && wait) ret = zpool_wait(zhp, replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool replace [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for replacing to complete before returning * * Replace with . */ /* ARGSUSED */ int zpool_do_replace(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); } /* * zpool attach [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for resilvering to complete before returning * * Attach to the mirror containing . If is not * part of a mirror, then will be transformed into a mirror of * and . In either case, will begin life * with a DTL of [0, now], and will immediately begin to resilver itself. */ int zpool_do_attach(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); } /* * zpool detach [-f] * * -f Force detach of , even if DTLs argue against it * (not supported yet) * * Detach a device from a mirror. The operation will be refused if * is the last device in the mirror, or if the DTLs indicate that this device * has the only valid copy of some data. */ /* ARGSUSED */ int zpool_do_detach(int argc, char **argv) { int c; char *poolname, *path; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } poolname = argv[0]; path = argv[1]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_vdev_detach(zhp, path); zpool_close(zhp); return (ret); } /* * zpool split [-gLnP] [-o prop=val] ... * [-o mntopt] ... * [-R altroot] [ ...] * * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not split the pool, but display the resulting layout if * it were to be split. * -o Set property=value, or set mount options. * -P Display full path for vdev name. * -R Mount the split-off pool under an alternate root. * -l Load encryption keys while importing. * * Splits the named pool and gives it the new pool name. Devices to be split * off may be listed, provided that no more than one device is specified * per top-level vdev mirror. The newly split pool is left in an exported * state unless -R is specified. * * Restrictions: the top-level of the pool pool must only be made up of * mirrors; all devices in the pool must be healthy; no device may be * undergoing a resilvering operation. */ int zpool_do_split(int argc, char **argv) { char *srcpool, *newpool, *propval; char *mntopts = NULL; splitflags_t flags; int c, ret = 0; boolean_t loadkeys = B_FALSE; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; flags.dryrun = B_FALSE; flags.import = B_FALSE; flags.name_flags = 0; /* check options */ while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) { switch (c) { case 'g': flags.name_flags |= VDEV_NAME_GUID; break; case 'L': flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'R': flags.import = B_TRUE; if (add_prop_list( zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } break; case 'l': loadkeys = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } } else { mntopts = optarg; } break; case 'P': flags.name_flags |= VDEV_NAME_PATH; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); break; } } if (!flags.import && mntopts != NULL) { (void) fprintf(stderr, gettext("setting mntopts is only " "valid when importing the pool\n")); usage(B_FALSE); } if (!flags.import && loadkeys) { (void) fprintf(stderr, gettext("loading keys is only " "valid when importing the pool\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("Missing new pool name\n")); usage(B_FALSE); } srcpool = argv[0]; newpool = argv[1]; argc -= 2; argv += 2; if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) { nvlist_free(props); return (1); } config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); if (config == NULL) { ret = 1; } else { if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); print_vdev_tree(NULL, "dedup", config, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", config, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); } } zpool_close(zhp); if (ret != 0 || flags.dryrun || !flags.import) { nvlist_free(config); nvlist_free(props); return (ret); } /* * The split was successful. Now we need to open the new * pool and import it. */ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) { nvlist_free(config); nvlist_free(props); return (1); } if (loadkeys) { ret = zfs_crypto_attempt_load_keys(g_zfs, newpool); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && zpool_enable_datasets(zhp, mntopts, 0) != 0) { ret = 1; (void) fprintf(stderr, gettext("Split was successful, but " "the datasets could not all be mounted\n")); (void) fprintf(stderr, gettext("Try doing '%s' with a " "different altroot\n"), "zpool import"); } zpool_close(zhp); nvlist_free(config); nvlist_free(props); return (ret); } /* * zpool online ... */ int zpool_do_online(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; vdev_state_t newstate; int flags = 0; /* check options */ while ((c = getopt(argc, argv, "e")) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), argv[i]); if (newstate == VDEV_STATE_FAULTED) (void) printf(gettext("use 'zpool " "clear' to restore a faulted " "device\n")); else (void) printf(gettext("use 'zpool " "replace' to replace devices " "that are no longer present\n")); } } else { ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool offline [-ft] ... * * -f Force the device into a faulted state. * * -t Only take the device off-line temporarily. The offline/faulted * state will not be persistent across reboots. */ /* ARGSUSED */ int zpool_do_offline(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; boolean_t istmp = B_FALSE; boolean_t fault = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "ft")) != -1) { switch (c) { case 'f': fault = B_TRUE; break; case 't': istmp = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (fault) { uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); vdev_aux_t aux; if (istmp == B_FALSE) { /* Force the fault to persist across imports */ aux = VDEV_AUX_EXTERNAL_PERSIST; } else { aux = VDEV_AUX_EXTERNAL; } if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0) ret = 1; } else { if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool clear [device] * * Clear all errors associated with a pool or a particular device. */ int zpool_do_clear(int argc, char **argv) { int c; int ret = 0; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; /* check options */ while ((c = getopt(argc, argv, "FnX")) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'X': xtreme_rewind = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In future, further rewind policy choices can be passed along here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) { return (1); } pool = argv[0]; device = argc == 2 ? argv[1] : NULL; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { nvlist_free(policy); return (1); } if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); nvlist_free(policy); return (ret); } /* * zpool reguid */ int zpool_do_reguid(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; int ret = 0; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_reguid(zhp); zpool_close(zhp); return (ret); } /* * zpool reopen * * Reopen the pool so that the kernel can update the sizes of all vdevs. */ int zpool_do_reopen(int argc, char **argv) { int c; int ret = 0; boolean_t scrub_restart = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "n")) != -1) { switch (c) { case 'n': scrub_restart = B_FALSE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_reopen_one on all pools */ ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_reopen_one, &scrub_restart); return (ret); } typedef struct scrub_cbdata { int cb_type; pool_scrub_cmd_t cb_scrub_cmd; } scrub_cbdata_t; static boolean_t zpool_has_checkpoint(zpool_handle_t *zhp) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); if (config != NULL) { pool_checkpoint_stat_t *pcs = NULL; uint_t c; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs == NULL || pcs->pcs_state == CS_NONE) return (B_FALSE); assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS || pcs->pcs_state == CS_CHECKPOINT_DISCARDING); return (B_TRUE); } return (B_FALSE); } static int scrub_callback(zpool_handle_t *zhp, void *data) { scrub_cbdata_t *cb = data; int err; /* * Ignore faulted pools. */ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { (void) fprintf(stderr, gettext("cannot scan '%s': pool is " "currently unavailable\n"), zpool_get_name(zhp)); return (1); } err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); if (err == 0 && zpool_has_checkpoint(zhp) && cb->cb_type == POOL_SCAN_SCRUB) { (void) printf(gettext("warning: will not scrub state that " "belongs to the checkpoint of pool '%s'\n"), zpool_get_name(zhp)); } return (err != 0); } static int wait_callback(zpool_handle_t *zhp, void *data) { zpool_wait_activity_t *act = data; return (zpool_wait(zhp, *act)); } /* * zpool scrub [-s | -p] [-w] ... * * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. * -w Wait. Blocks until scrub has completed. */ int zpool_do_scrub(int argc, char **argv) { int c; scrub_cbdata_t cb; boolean_t wait = B_FALSE; int error; cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "spw")) != -1) { switch (c) { case 's': cb.cb_type = POOL_SCAN_NONE; break; case 'p': cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (cb.cb_type == POOL_SCAN_NONE && cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) { (void) fprintf(stderr, gettext("invalid option combination: " "-s and -p are mutually exclusive\n")); usage(B_FALSE); } if (wait && (cb.cb_type == POOL_SCAN_NONE || cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) { (void) fprintf(stderr, gettext("invalid option combination: " "-w cannot be used with -p or -s\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, scrub_callback, &cb); if (wait && !error) { zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB; error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, wait_callback, &act); } return (error); } /* * zpool resilver ... * * Restarts any in-progress resilver */ int zpool_do_resilver(int argc, char **argv) { int c; scrub_cbdata_t cb; cb.cb_type = POOL_SCAN_RESILVER; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, scrub_callback, &cb)); } /* * zpool trim [-d] [-r ] [-c | -s] [ ...] * * -c Cancel. Ends any in-progress trim. * -d Secure trim. Requires kernel and device support. * -r Sets the TRIM rate in bytes (per second). Supports * adding a multiplier suffix such as 'k' or 'm'. * -s Suspend. TRIM can then be restarted with no flags. * -w Wait. Blocks until trimming has completed. */ int zpool_do_trim(int argc, char **argv) { struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"secure", no_argument, NULL, 'd'}, {"rate", required_argument, NULL, 'r'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_trim_func_t cmd_type = POOL_TRIM_START; uint64_t rate = 0; boolean_t secure = B_FALSE; boolean_t wait = B_FALSE; int c; while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_CANCEL; break; case 'd': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-d cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } secure = B_TRUE; break; case 'r': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-r cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } if (zfs_nicestrtonum(NULL, optarg, &rate) == -1) { (void) fprintf(stderr, gettext("invalid value for rate\n")); usage(B_FALSE); } break; case 's': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_SUSPEND; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_TRIM_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " "-s\n")); usage(B_FALSE); } char *poolname = argv[0]; zpool_handle_t *zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); trimflags_t trim_flags = { .secure = secure, .rate = rate, .wait = wait, }; nvlist_t *vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); trim_flags.fullpool = B_TRUE; } else { trim_flags.fullpool = B_FALSE; for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags); fnvlist_free(vdevs); zpool_close(zhp); return (error); } /* * Converts a total number of seconds to a human readable string broken * down in to days/hours/minutes/seconds. */ static void secs_to_dhms(uint64_t total, char *buf) { uint64_t days = total / 60 / 60 / 24; uint64_t hours = (total / 60 / 60) % 24; uint64_t mins = (total / 60) % 60; uint64_t secs = (total % 60); if (days > 0) { (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", (u_longlong_t)days, (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } else { (void) sprintf(buf, "%02llu:%02llu:%02llu", (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } } /* * Print out detailed scrub status. */ static void print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t pass_scanned, scanned, pass_issued, issued, total; uint64_t elapsed, scan_rate, issue_rate; double fraction_done; char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; char srate_buf[7], irate_buf[7], time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } start = ps->pss_start_time; end = ps->pss_end_time; pause = ps->pss_pass_scrub_pause; zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || ps->pss_func == POOL_SCAN_RESILVER); /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { secs_to_dhms(end - start, time_buf); if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub repaired %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilvered %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub canceled on %s"), ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); } return; } assert(ps->pss_state == DSS_SCANNING); /* Scan is in progress. Resilvers can't be paused. */ if (ps->pss_func == POOL_SCAN_SCRUB) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else { (void) printf(gettext("scrub paused since %s"), ctime(&pause)); (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } scanned = ps->pss_examined; pass_scanned = ps->pss_pass_exam; issued = ps->pss_issued; pass_issued = ps->pss_pass_issued; total = ps->pss_to_examine; /* we are only done with a block once we have issued the IO for it */ fraction_done = (double)issued / total; /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; elapsed -= ps->pss_pass_scrub_spent_paused; elapsed = (elapsed != 0) ? elapsed : 1; scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? ((total - issued) / issue_rate) : UINT64_MAX; secs_to_dhms(total_secs_left, time_buf); /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); zfs_nicebytes(total, total_buf, sizeof (total_buf)); zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); /* do not print estimated time if we have a paused scrub */ if (pause == 0) { (void) printf(gettext("\t%s scanned at %s/s, " "%s issued at %s/s, %s total\n"), scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); } else { (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), scanned_buf, issued_buf, total_buf); } if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); } else if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } if (pause == 0) { if (total_secs_left != UINT64_MAX && issue_rate >= 10 * 1024 * 1024) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } static void print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) { if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) return; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); uint64_t bytes_scanned = vrs->vrs_bytes_scanned; uint64_t bytes_issued = vrs->vrs_bytes_issued; uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; uint64_t bytes_est = vrs->vrs_bytes_est; uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / (vrs->vrs_pass_time_ms + 1)) * 1000; uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / (vrs->vrs_pass_time_ms + 1)) * 1000; double scan_pct = MIN((double)bytes_scanned * 100 / (bytes_est + 1), 100); /* Format all of the numbers we will be reporting */ char bytes_scanned_buf[7], bytes_issued_buf[7]; char bytes_rebuilt_buf[7], bytes_est_buf[7]; char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; zfs_nicebytes(bytes_scanned, bytes_scanned_buf, sizeof (bytes_scanned_buf)); zfs_nicebytes(bytes_issued, bytes_issued_buf, sizeof (bytes_issued_buf)); zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, sizeof (bytes_rebuilt_buf)); zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); time_t start = vrs->vrs_start_time; time_t end = vrs->vrs_end_time; /* Rebuild is finished or canceled. */ if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); (void) printf(gettext("resilvered (%s) %s in %s " "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { (void) printf(gettext("resilver (%s) canceled on %s"), vdev_name, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { (void) printf(gettext("resilver (%s) in progress since %s"), vdev_name, ctime(&start)); } assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / MAX(scan_rate, 1), time_buf); (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " "%s total\n"), bytes_scanned_buf, scan_rate_buf, bytes_issued_buf, issue_rate_buf, bytes_est_buf); (void) printf(gettext("\t%s resilvered, %.2f%% done"), bytes_rebuilt_buf, scan_pct); if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { if (scan_rate >= 10 * 1024 * 1024) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } /* * Print rebuild status for top-level vdevs. */ static void print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { char *name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); print_rebuild_status_impl(vrs, name); free(name); } } } /* * As we don't scrub checkpointed blocks, we want to warn the user that we * skipped scanning some blocks if a checkpoint exists or existed at any * time during the scan. If a sequential instead of healing reconstruction * was performed then the blocks were reconstructed. However, their checksums * have not been verified so we still print the warning. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) { if (ps == NULL || pcs == NULL) return; if (pcs->pcs_state == CS_NONE || pcs->pcs_state == CS_CHECKPOINT_DISCARDING) return; assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); if (ps->pss_state == DSS_NONE) return; if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && ps->pss_end_time < pcs->pcs_start_time) return; if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { (void) printf(gettext(" scan warning: skipped blocks " "that are only referenced by the checkpoint.\n")); } else { assert(ps->pss_state == DSS_SCANNING); (void) printf(gettext(" scan warning: skipping blocks " "that are only referenced by the checkpoint.\n")); } } /* * Returns B_TRUE if there is an active rebuild in progress. Otherwise, * B_FALSE is returned and 'rebuild_end_time' is set to the end time for * the last completed (or cancelled) rebuild. */ static boolean_t check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) { nvlist_t **child; uint_t children; boolean_t rebuilding = B_FALSE; uint64_t end_time = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { if (vrs->vrs_end_time > end_time) end_time = vrs->vrs_end_time; if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { rebuilding = B_TRUE; end_time = 0; break; } } } if (rebuild_end_time != NULL) *rebuild_end_time = end_time; return (rebuilding); } /* * Print the scan status. */ static void print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) { uint64_t rebuild_end_time = 0, resilver_end_time = 0; boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; boolean_t active_resilver = B_FALSE; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; uint_t c; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c) == 0) { if (ps->pss_func == POOL_SCAN_RESILVER) { resilver_end_time = ps->pss_end_time; active_resilver = (ps->pss_state == DSS_SCANNING); } have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); /* Always print the scrub status when available. */ if (have_scrub) print_scan_scrub_resilver_status(ps); /* * When there is an active resilver or rebuild print its status. * Otherwise print the status of the last resilver or rebuild. */ if (active_resilver || (!active_rebuild && have_resilver && resilver_end_time && resilver_end_time > rebuild_end_time)) { print_scan_scrub_resilver_status(ps); } else if (active_rebuild || (!active_resilver && have_rebuild && rebuild_end_time && rebuild_end_time > resilver_end_time)) { print_rebuild_status(zhp, nvroot); } (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_scan_warning(ps, pcs); } /* * Print out detailed removal status. */ static void print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) { char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; time_t start, end; nvlist_t *config, *nvroot; nvlist_t **child; uint_t children; char *vdev_name; if (prs == NULL || prs->prs_state == DSS_NONE) return; /* * Determine name of vdev. */ config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(prs->prs_removing_vdev < children); vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); printf_color(ANSI_BOLD, gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); /* * Removal is finished or canceled. */ if (prs->prs_state == DSS_FINISHED) { uint64_t minutes_taken = (end - start) / 60; (void) printf(gettext("Removal of vdev %llu copied %s " "in %lluh%um, completed on %s"), (longlong_t)prs->prs_removing_vdev, copied_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), ctime((time_t *)&end)); } else if (prs->prs_state == DSS_CANCELED) { (void) printf(gettext("Removal of %s canceled on %s"), vdev_name, ctime(&end)); } else { uint64_t copied, total, elapsed, mins_left, hours_left; double fraction_done; uint_t rate; assert(prs->prs_state == DSS_SCANNING); /* * Removal is in progress. */ (void) printf(gettext( "Evacuation of %s in progress since %s"), vdev_name, ctime(&start)); copied = prs->prs_copied > 0 ? prs->prs_copied : 1; total = prs->prs_to_copy; fraction_done = (double)copied / total; /* elapsed time for this pass */ elapsed = time(NULL) - prs->prs_start_time; elapsed = elapsed > 0 ? elapsed : 1; rate = copied / elapsed; rate = rate > 0 ? rate : 1; mins_left = ((total - copied) / rate) / 60; hours_left = mins_left / 60; zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than * 30 days */ (void) printf(gettext(" %s copied out of %s at %s/s, " "%.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); } else { (void) printf(gettext( ", (copy is slow, no estimated time)\n")); } } free(vdev_name); if (prs->prs_mapping_memory > 0) { char mem_buf[7]; zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); (void) printf(gettext(" %s memory used for " "removed device mappings\n"), mem_buf); } } static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { time_t start; char space_buf[7]; if (pcs == NULL || pcs->pcs_state == CS_NONE) return; (void) printf(gettext("checkpoint: ")); start = pcs->pcs_start_time; zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { char *date = ctime(&start); /* * ctime() adds a newline at the end of the generated * string, thus the weird format specifier and the * strlen() call used to chop it off from the output. */ (void) printf(gettext("created %.*s, consumes %s\n"), (int)(strlen(date) - 1), date, space_buf); return; } assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); (void) printf(gettext("discarding, %s remaining.\n"), space_buf); } static void print_error_log(zpool_handle_t *zhp) { nvlist_t *nverrlist = NULL; nvpair_t *elem; char *pathname; size_t len = MAXPATHLEN * 2; if (zpool_get_errlog(zhp, &nverrlist) != 0) return; (void) printf("errors: Permanent errors have been " "detected in the following files:\n\n"); pathname = safe_malloc(len); elem = NULL; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nvlist_t *nv; uint64_t dsobj, obj; verify(nvpair_value_nvlist(elem, &nv) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, &dsobj) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); zpool_obj_to_path(zhp, dsobj, obj, pathname, len); (void) printf("%7s %s\n", "", pathname); } free(pathname); nvlist_free(nverrlist); } static void print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, uint_t nspares) { uint_t i; char *name; if (nspares == 0) return; (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); free(name); } } static void print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, uint_t nl2cache) { uint_t i; char *name; if (nl2cache == 0) return; (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE, NULL); free(name); } } static void print_dedup_stats(nvlist_t *config) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; char dspace[6], mspace[6]; /* * If the pool was faulted then we may not have been able to * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); (void) printf(gettext(" dedup: ")); if (ddo->ddo_count == 0) { (void) printf(gettext("no DDT entries\n")); return; } zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace)); zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace)); (void) printf("DDT entries %llu, size %s on disk, %s in core\n", (u_longlong_t)ddo->ddo_count, dspace, mspace); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t **)&ddh, &c) == 0); zpool_dump_ddt(dds, ddh); } /* * Display a summary of pool status. Displays a summary such as: * * pool: tank * status: DEGRADED * reason: One or more devices ... * see: https://openzfs.github.io/openzfs-docs/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK * c2t0d0 UNAVAIL * * When given the '-v' option, we print out the complete config. If the '-e' * option is specified, then we print out error rate information as well. */ static int status_callback(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; nvlist_t *config, *nvroot; char *msgid; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t c; vdev_stat_t *vs; config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid, &errata); cbp->cb_count++; /* * If we were given 'zpool status -x', only report those pools with * problems. */ if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED || reason == ZPOOL_STATUS_COMPATIBILITY_ERR)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); if (cbp->cb_first) cbp->cb_first = B_FALSE; } return (0); } if (cbp->cb_first) cbp->cb_first = B_FALSE; else (void) printf("\n"); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_get_state_str(zhp); printf(" "); printf_color(ANSI_BOLD, gettext("pool:")); printf(" %s\n", zpool_get_name(zhp)); printf(" "); printf_color(ANSI_BOLD, gettext("state: ")); printf_color(health_str_to_color(health), "%s", health); printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. Sufficient replicas exist for\n\tthe pool " "to continue functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_MISSING_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. There are insufficient\n\treplicas for the" " pool to continue functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing or\n\tinvalid. " "Sufficient replicas exist for the pool to continue\n\t" "functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the device using " "'zpool replace'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_FAILING_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an unrecoverable error. An\n\tattempt was " "made to correct the error. Applications are " "unaffected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Determine if the " "device needs to be replaced, and clear the errors\n\tusing" " 'zpool clear' or replace the device with 'zpool " "replace'.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been taken offline by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using 'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_REMOVED_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been removed by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices is " "currently being resilvered. The pool will\n\tcontinue " "to function, possibly in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Wait for the resilver to " "complete.\n")); break; case ZPOOL_STATUS_REBUILD_SCRUB: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices have " "been sequentially resilvered, scrubbing\n\tthe pool " "is recommended.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to " "verify all data checksums.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an error resulting in data\n\tcorruption. " "Applications may be affected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Restore the file in question" " if possible. Otherwise restore the\n\tentire pool from " "backup.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted and the pool cannot be opened.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk format. The pool can\n\tstill be used, " "but some features are unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Upgrade the pool using " "'zpool upgrade'. Once this is done, the\n\tpool will no " "longer be accessible on software that does not support\n\t" "feature flags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has been upgraded " "to a newer, incompatible on-disk version.\n\tThe pool " "cannot be accessed on this system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system running more recent software, or\n\trestore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported and " "requested features are not enabled on the pool.\n\t" "The pool can still be used, but some features are " "unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Enable all features using " "'zpool upgrade'. Once this is done,\n\tthe pool may no " "longer be accessible by software that does not support\n\t" "the features. See zpool-features(5) for details.\n")); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("This pool has a " "compatibility list specified, but it could not be\n\t" "read/parsed at this time. The pool can still be used, " "but this\n\tshould be investigated.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Check the value of the " "'compatibility' property against the\n\t" "appropriate file in " ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "on this system because it uses the\n\tfollowing feature(s)" " not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system that supports the required feature(s),\n\tor " "restore the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "in read-write mode. Import the pool with\n" "\t\"-o readonly=on\", access the pool from a system that " "supports the\n\trequired feature(s), or restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors.\n\tSufficient " "replicas exist for the pool to continue functioning " "in a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the faulted device, " "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors. There are " "insufficient replicas for the pool to\n\tcontinue " "functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Destroy and re-create the " "pool from a backup source. Manually marking the device\n" "\trepaired using 'zpool clear' may allow some data " "to be recovered.\n")); break; case ZPOOL_STATUS_IO_FAILURE_MMP: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is suspended " "because multihost writes failed or were delayed;\n\t" "another system could import the pool undetected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " "pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: case ZPOOL_STATUS_IO_FAILURE_CONTINUE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to IO failures.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the affected " "devices are connected, then run 'zpool clear'.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record " "could not be read.\n" "\tWaiting for administrator intervention to fix the " "faulted pool.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Either restore the affected " "device(s) and run 'zpool online',\n" "\tor ignore the intent log records by running " "'zpool clear'.\n")); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: (void) printf(gettext("status: One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); (void) printf(gettext("action: Replace affected devices with " "devices that support the\n\tconfigured block size, or " "migrate data to a properly configured\n\tpool.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid" " and system hostid on imported pool.\n\tThis pool was " "previously imported into a system with a different " "hostid,\n\tand then was verbatim imported into this " "system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Export this pool on all " "systems on which it is imported.\n" "\tThen import it to correct the mismatch.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " run 'zpool scrub'.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext("\tExisting encrypted datasets " "contain an on-disk incompatibility\n\twhich " "needs to be corrected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " backup existing encrypted datasets to new\n\t" "encrypted datasets and destroy the old ones. " "'zfs mount -o ro' can\n\tbe used to temporarily " "mount existing encrypted datasets readonly.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext("\tExisting encrypted snapshots " "and bookmarks contain an on-disk\n\tincompat" "ibility. This may cause on-disk corruption if " "they are used\n\twith 'zfs recv'.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the" "issue, enable the bookmark_v2 feature. No " "additional\n\taction is needed if there are no " "encrypted snapshots or bookmarks.\n\tIf preserving" "the encrypted snapshots and bookmarks is required," " use\n\ta non-raw send to backup and restore them." " Alternately, they may be\n\tremoved to resolve " "the incompatibility.\n")); break; default: /* * All errata which allow the pool to be imported * must contain an action message. */ assert(0); } break; default: /* * The remaining errors can't actually be generated, yet. */ assert(reason == ZPOOL_STATUS_OK); } if (msgid != NULL) { printf(" "); printf_color(ANSI_BOLD, gettext("see:")); printf(gettext( " https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); } if (config != NULL) { uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_checkpoint_stat_t *pcs = NULL; pool_removal_stat_t *prs = NULL; print_scan_status(zhp, nvroot); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; color_start(ANSI_BOLD); (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); color_end(); if (cbp->cb_print_slow_ios) { printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); } if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); printf("\n"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, B_FALSE, NULL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, cbp, l2cache, nl2cache); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) print_spares(zhp, cbp, spares, nspares); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { nvlist_t *nverrlist = NULL; /* * If the approximate error count is small, get a * precise count by fetching the entire log and * uniquifying the results. */ if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && zpool_get_errlog(zhp, &nverrlist) == 0) { nvpair_t *elem; elem = NULL; nerr = 0; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nerr++; } } nvlist_free(nverrlist); (void) printf("\n"); if (nerr == 0) (void) printf(gettext("errors: No known data " "errors\n")); else if (!cbp->cb_verbose) (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); else print_error_log(zhp); } if (cbp->cb_dedup_stats) print_dedup_stats(config); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); } return (0); } /* * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ... * [interval [count]] * * -c CMD For each vdev, run command CMD * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -s Display slow IOs column. * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format * * Describes the health status of all pools or some subset. */ int zpool_do_status(int argc, char **argv) { int c; int ret; float interval = 0; unsigned long count = 0; status_cbdata_t cb = { 0 }; char *cmd = NULL; /* check options */ while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'p': cb.cb_literal = B_TRUE; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 's': cb.cb_print_slow_ios = B_TRUE; break; case 'v': cb.cb_verbose = B_TRUE; break; case 'x': cb.cb_explain = B_TRUE; break; case 'D': cb.cb_dedup_stats = B_TRUE; break; case 't': cb.cb_print_vdev_trim = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case '?': if (optopt == 'c') { print_zpool_script_list("status"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; cb.cb_first = B_TRUE; cb.cb_print_status = B_TRUE; for (;;) { if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); ret = for_each_pool(argc, argv, B_TRUE, NULL, cb.cb_literal, status_callback, &cb); if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) (void) printf(gettext("all pools are healthy\n")); if (ret != 0) return (ret); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } return (0); } typedef struct upgrade_cbdata { int cb_first; int cb_argc; uint64_t cb_version; char **cb_argv; } upgrade_cbdata_t; static int check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) { int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int *count = (int *)unsupp_fs; if (zfs_version > ZPL_VERSION) { (void) printf(gettext("%s (v%d) is not supported by this " "implementation of ZFS.\n"), zfs_get_name(zhp), zfs_version); (*count)++; } zfs_iter_filesystems(zhp, check_unsupp_fs, unsupp_fs); zfs_close(zhp); return (0); } static int upgrade_version(zpool_handle_t *zhp, uint64_t version) { int ret; nvlist_t *config; uint64_t oldversion; int unsupp_fs = 0; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &oldversion) == 0); assert(SPA_VERSION_IS_SUPPORTED(oldversion)); assert(oldversion < version); ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs); if (ret != 0) return (ret); if (unsupp_fs) { (void) fprintf(stderr, gettext("Upgrade not performed due " "to %d unsupported filesystems (max v%d).\n"), unsupp_fs, (int)ZPL_VERSION); return (1); } ret = zpool_upgrade(zhp, version); if (ret != 0) return (ret); if (version >= SPA_VERSION_FEATURES) { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to feature flags.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion); } else { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to version %llu.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion, (u_longlong_t)version); } return (0); } static int upgrade_enable_all(zpool_handle_t *zhp, int *countp) { int i, ret, count; boolean_t firstff = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) return (-1); count = 0; for (i = 0; i < SPA_FEATURES; i++) { const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid) && requested_features[i]) { char *propname; verify(-1 != asprintf(&propname, "feature@%s", fname)); ret = zpool_set_prop(zhp, propname, ZFS_FEATURE_ENABLED); if (ret != 0) { free(propname); return (ret); } count++; if (firstff) { (void) printf(gettext("Enabled the " "following features on '%s':\n"), zpool_get_name(zhp)); firstff = B_FALSE; } (void) printf(gettext(" %s\n"), fname); free(propname); } } if (countp != NULL) *countp = count; return (0); } static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; boolean_t printnl = B_FALSE; int ret; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < cbp->cb_version) { cbp->cb_first = B_FALSE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); printnl = B_TRUE; /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count > 0) { cbp->cb_first = B_FALSE; printnl = B_TRUE; } } if (printnl) { (void) printf(gettext("\n")); } return (0); } static int upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " "formatted with legacy version numbers and can\n" "be upgraded to use feature flags. After " "being upgraded, these pools\nwill no " "longer be accessible by software that does not " "support feature\nflags.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; } (void) printf("%2llu %s\n", (u_longlong_t)version, zpool_get_name(zhp)); } return (0); } static int upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if (version >= SPA_VERSION_FEATURES) { int i; boolean_t poolfirst = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; const char *fname = spa_feature_table[i].fi_uname; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid)) { if (cbp->cb_first) { (void) printf(gettext("\nSome " "supported features are not " "enabled on the following pools. " "Once a\nfeature is enabled the " "pool may become incompatible with " "software\nthat does not support " "the feature. See " "zpool-features(5) for " "details.\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" "---------\n")); cbp->cb_first = B_FALSE; } if (poolfirst) { (void) printf(gettext("%s\n"), zpool_get_name(zhp)); poolfirst = B_FALSE; } (void) printf(gettext(" %s\n"), fname); } /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } } return (0); } /* ARGSUSED */ static int upgrade_one(zpool_handle_t *zhp, void *data) { boolean_t printnl = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; if (strcmp("log", zpool_get_name(zhp)) == 0) { (void) fprintf(stderr, gettext("'log' is now a reserved word\n" "Pool 'log' must be renamed using export and import" " to upgrade.\n")); return (1); } cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using more current version '%llu'.\n\n"), zpool_get_name(zhp), (u_longlong_t)cur_version); return (0); } if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using version %llu.\n\n"), zpool_get_name(zhp), (u_longlong_t)cbp->cb_version); return (0); } if (cur_version != cbp->cb_version) { printnl = B_TRUE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count = 0; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count != 0) { printnl = B_TRUE; } else if (cur_version == SPA_VERSION) { (void) printf(gettext("Pool '%s' already has all " "supported and requested features enabled.\n"), zpool_get_name(zhp)); } } if (printnl) { (void) printf(gettext("\n")); } return (0); } /* * zpool upgrade * zpool upgrade -v * zpool upgrade [-V version] <-a | pool ...> * * With no arguments, display downrev'd ZFS pool available for upgrade. * Individual pools can be upgraded by specifying the pool, and '-a' will * upgrade all pools. */ int zpool_do_upgrade(int argc, char **argv) { int c; upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; boolean_t upgradeall = B_FALSE; char *end; /* check options */ while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); if (*end != '\0' || !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; } else if (!upgradeall && argc == 0) { (void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); usage(B_FALSE); } } (void) printf(gettext("This system supports ZFS pool feature " "flags.\n\n")); if (showversions) { int i; (void) printf(gettext("The following features are " "supported:\n\n")); (void) printf(gettext("FEAT DESCRIPTION\n")); (void) printf("----------------------------------------------" "---------------\n"); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; + if (!fi->fi_zfs_mod_supported) + continue; const char *ro = (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? " (read-only compatible)" : ""; (void) printf("%-37s%s\n", fi->fi_uname, ro); (void) printf(" %s\n", fi->fi_desc); } (void) printf("\n"); (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS version\n")); (void) printf(gettext(" 2 Ditto blocks " "(replicated metadata)\n")); (void) printf(gettext(" 3 Hot spares and double parity " "RAID-Z\n")); (void) printf(gettext(" 4 zpool history\n")); (void) printf(gettext(" 5 Compression using the gzip " "algorithm\n")); (void) printf(gettext(" 6 bootfs pool property\n")); (void) printf(gettext(" 7 Separate intent log devices\n")); (void) printf(gettext(" 8 Delegated administration\n")); (void) printf(gettext(" 9 refquota and refreservation " "properties\n")); (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); (void) printf(gettext(" 14 passthrough-x aclinherit\n")); (void) printf(gettext(" 15 user/group space accounting\n")); (void) printf(gettext(" 16 stmf property support\n")); (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); (void) printf(gettext(" 18 Snapshot user holds\n")); (void) printf(gettext(" 19 Log device removal\n")); (void) printf(gettext(" 20 Compression using zle " "(zero-length encoding)\n")); (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext(" 25 Improved scrub stats\n")); (void) printf(gettext(" 26 Improved snapshot deletion " "performance\n")); (void) printf(gettext(" 27 Improved snapshot creation " "performance\n")); (void) printf(gettext(" 28 Multiple vdev replacements\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); } else if (argc == 0 && upgradeall) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); if (ret == 0 && cb.cb_first) { if (cb.cb_version == SPA_VERSION) { (void) printf(gettext("All pools are already " "formatted using feature flags.\n\n")); (void) printf(gettext("Every feature flags " "pool already has all supported and " "requested features enabled.\n")); } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), (u_longlong_t)cb.cb_version); } } } else if (argc == 0) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("All pools are formatted " "using feature flags.\n\n")); } else { (void) printf(gettext("\nUse 'zpool upgrade -v' " "for a list of available legacy versions.\n")); } cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("Every feature flags pool has " "all supported and requested features enabled.\n")); } else { (void) printf(gettext("\n")); } } else { ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, upgrade_one, &cb); } return (ret); } typedef struct hist_cbdata { boolean_t first; boolean_t longfmt; boolean_t internal; } hist_cbdata_t; static void print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) { nvlist_t **records; uint_t numrecords; int i; verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { nvlist_t *rec = records[i]; char tbuf[64] = ""; if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } if (nvlist_exists(rec, ZPOOL_HIST_ELAPSED_NS)) { uint64_t elapsed_ns = fnvlist_lookup_int64(records[i], ZPOOL_HIST_ELAPSED_NS); (void) snprintf(tbuf + strlen(tbuf), sizeof (tbuf) - strlen(tbuf), " (%lldms)", (long long)elapsed_ns / 1000 / 1000); } if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { (void) printf("%s %s", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { int ievent = fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); if (!cb->internal) continue; if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); continue; } (void) printf("%s [internal %s txg:%lld] %s", tbuf, zfs_history_event_names[ievent], (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { if (!cb->internal) continue; (void) printf("%s [txg:%lld] %s", tbuf, (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(rec, ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64(rec, ZPOOL_HIST_DSID)); } (void) printf(" %s", fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { if (!cb->internal) continue; (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_SIZE)) { (void) printf(" output nvlist omitted; " "original size: %lldKB\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_OUTPUT_SIZE) / 1024); } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_ERRNO)); } } else { if (!cb->internal) continue; (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); } if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); struct passwd *pwd = getpwuid(who); (void) printf("user %d ", (int)who); if (pwd != NULL) (void) printf("(%s) ", pwd->pw_name); } if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { (void) printf("on %s", fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); } if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { (void) printf(":%s", fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } (void) printf("]"); (void) printf("\n"); } } /* * Print out the command history for a specific pool. */ static int get_history_one(zpool_handle_t *zhp, void *data) { nvlist_t *nvhis; int ret; hist_cbdata_t *cb = (hist_cbdata_t *)data; uint64_t off = 0; boolean_t eof = B_FALSE; cb->first = B_FALSE; (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); while (!eof) { if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0) return (ret); print_history_records(nvhis, cb); nvlist_free(nvhis); } (void) printf("\n"); return (ret); } /* * zpool history * * Displays the history of commands that modified pools. */ int zpool_do_history(int argc, char **argv) { hist_cbdata_t cbdata = { 0 }; int ret; int c; cbdata.first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': cbdata.longfmt = B_TRUE; break; case 'i': cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { (void) fprintf(stderr, gettext("no pools available\n")); return (0); } return (ret); } typedef struct ev_opts { int verbose; int scripted; int follow; int clear; char poolname[ZFS_MAX_DATASET_NAME_LEN]; } ev_opts_t; static void zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts) { char ctime_str[26], str[32], *ptr; int64_t *tv; uint_t n; verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); memset(str, ' ', 32); (void) ctime_r((const time_t *)&tv[0], ctime_str); (void) memcpy(str, ctime_str+4, 6); /* 'Jun 30' */ (void) memcpy(str+7, ctime_str+20, 4); /* '1993' */ (void) memcpy(str+12, ctime_str+11, 8); /* '21:49:08' */ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ if (opts->scripted) (void) printf(gettext("%s\t"), str); else (void) printf(gettext("%s "), str); verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); (void) printf(gettext("%s\n"), ptr); } static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { data_type_t type = nvpair_type(nvp); const char *name = nvpair_name(nvp); boolean_t b; uint8_t i8; uint16_t i16; uint32_t i32; uint64_t i64; char *str; nvlist_t *cnv; printf(gettext("%*s%s = "), depth, "", name); switch (type) { case DATA_TYPE_BOOLEAN: printf(gettext("%s"), "1"); break; case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(nvp, &b); printf(gettext("%s"), b ? "1" : "0"); break; case DATA_TYPE_BYTE: (void) nvpair_value_byte(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT8: (void) nvpair_value_int8(nvp, (void *)&i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_UINT8: (void) nvpair_value_uint8(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT16: (void) nvpair_value_int16(nvp, (void *)&i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_UINT16: (void) nvpair_value_uint16(nvp, &i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_INT32: (void) nvpair_value_int32(nvp, (void *)&i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_INT64: (void) nvpair_value_int64(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); /* * translate vdev state values to readable * strings to aide zpool events consumers */ if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } break; case DATA_TYPE_HRTIME: (void) nvpair_value_hrtime(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &str); printf(gettext("\"%s\""), str ? str : ""); break; case DATA_TYPE_NVLIST: printf(gettext("(embedded nvlist)\n")); (void) nvpair_value_nvlist(nvp, &cnv); zpool_do_events_nvprint(cnv, depth + 8); printf(gettext("%*s(end %s)"), depth, "", name); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **val; uint_t i, nelem; (void) nvpair_value_nvlist_array(nvp, &val, &nelem); printf(gettext("(%d embedded nvlists)\n"), nelem); for (i = 0; i < nelem; i++) { printf(gettext("%*s%s[%d] = %s\n"), depth, "", name, i, "(embedded nvlist)"); zpool_do_events_nvprint(val[i], depth + 8); printf(gettext("%*s(end %s[%i])\n"), depth, "", name, i); } printf(gettext("%*s(end %s)\n"), depth, "", name); } break; case DATA_TYPE_INT8_ARRAY: { int8_t *val; uint_t i, nelem; (void) nvpair_value_int8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val; uint_t i, nelem; (void) nvpair_value_uint8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val; uint_t i, nelem; (void) nvpair_value_int16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val; uint_t i, nelem; (void) nvpair_value_uint16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val; uint_t i, nelem; (void) nvpair_value_int32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val; uint_t i, nelem; (void) nvpair_value_uint32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val; uint_t i, nelem; (void) nvpair_value_int64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val; uint_t i, nelem; (void) nvpair_value_uint64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_STRING_ARRAY: { char **str; uint_t i, nelem; (void) nvpair_value_string_array(nvp, &str, &nelem); for (i = 0; i < nelem; i++) printf(gettext("\"%s\" "), str[i] ? str[i] : ""); break; } case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_DOUBLE: case DATA_TYPE_DONTCARE: case DATA_TYPE_UNKNOWN: printf(gettext("")); break; } printf(gettext("\n")); } } static int zpool_do_events_next(ev_opts_t *opts) { nvlist_t *nvl; int zevent_fd, ret, dropped; char *pool; zevent_fd = open(ZFS_DEV, O_RDWR); VERIFY(zevent_fd >= 0); if (!opts->scripted) (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS"); while (1) { ret = zpool_events_next(g_zfs, &nvl, &dropped, (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); if (ret || nvl == NULL) break; if (dropped > 0) (void) printf(gettext("dropped %d events\n"), dropped); if (strlen(opts->poolname) > 0 && nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 && strcmp(opts->poolname, pool) != 0) continue; zpool_do_events_short(nvl, opts); if (opts->verbose) { zpool_do_events_nvprint(nvl, 8); printf(gettext("\n")); } (void) fflush(stdout); nvlist_free(nvl); } VERIFY(0 == close(zevent_fd)); return (ret); } static int zpool_do_events_clear(ev_opts_t *opts) { int count, ret; ret = zpool_events_clear(g_zfs, &count); if (!ret) (void) printf(gettext("cleared %d events\n"), count); return (ret); } /* * zpool events [-vHf [pool] | -c] * * Displays events logs by ZFS. */ int zpool_do_events(int argc, char **argv) { ev_opts_t opts = { 0 }; int ret; int c; /* check options */ while ((c = getopt(argc, argv, "vHfc")) != -1) { switch (c) { case 'v': opts.verbose = 1; break; case 'H': opts.scripted = 1; break; case 'f': opts.follow = 1; break; case 'c': opts.clear = 1; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } else if (argc == 1) { (void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname)); if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) { (void) fprintf(stderr, gettext("invalid pool name '%s'\n"), opts.poolname); usage(B_FALSE); } } if ((argc == 1 || opts.verbose || opts.scripted || opts.follow) && opts.clear) { (void) fprintf(stderr, gettext("invalid options combined with -c\n")); usage(B_FALSE); } if (opts.clear) ret = zpool_do_events_clear(&opts); else ret = zpool_do_events_next(&opts); return (ret); } static int get_callback(zpool_handle_t *zhp, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[MAXNAMELEN]; zprop_source_t srctype; zprop_list_t *pl; for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * Skip the special fake placeholder. This will also skip * over the name property when 'all' is specified. */ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL && (zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop))) { srctype = ZPROP_SRC_LOCAL; if (zpool_prop_get_feature(zhp, pl->pl_user_prop, value, sizeof (value)) == 0) { zprop_print_one_property(zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL); } } else { if (zpool_get_prop(zhp, pl->pl_prop, value, sizeof (value), &srctype, cbp->cb_literal) != 0) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, NULL); } } return (0); } /* * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of columns to display. Defaults to * "name,property,value,source". * -p Display values in parsable (exact) format. * * Get properties of pools in the system. Output space statistics * for each one as well as other attributes. */ int zpool_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; zprop_list_t fake_name = { 0 }; int ret; int c, i; char *value; cb.cb_first = B_TRUE; /* * Set up default columns and sources. */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; /* check options */ while ((c = getopt(argc, argv, ":Hpo:")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'o': bzero(&cb.cb_columns, sizeof (cb.cb_columns)); i = 0; while (*optarg != '\0') { static char *col_subopts[] = { "name", "property", "value", "source", "all", NULL }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } switch (getsubopt(&optarg, col_subopts, &value)) { case 0: cb.cb_columns[i++] = GET_COL_NAME; break; case 1: cb.cb_columns[i++] = GET_COL_PROPERTY; break; case 2: cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: cb.cb_columns[i++] = GET_COL_SOURCE; break; case 4: if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; i = ZFS_GET_NCOLS; break; default: (void) fprintf(stderr, gettext("invalid column name " "'%s'\n"), value); usage(B_FALSE); } } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); argc--; argv++; if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZPOOL_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, cb.cb_literal, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } typedef struct set_cbdata { char *cb_propname; char *cb_value; boolean_t cb_any_successful; } set_cbdata_t; static int set_callback(zpool_handle_t *zhp, void *data) { int error; set_cbdata_t *cb = (set_cbdata_t *)data; error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); if (!error) cb->cb_any_successful = B_TRUE; return (error); } int zpool_do_set(int argc, char **argv) { set_cbdata_t cb = { 0 }; int error; if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing property=value " "argument\n")); usage(B_FALSE); } if (argc < 3) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many pool names\n")); usage(B_FALSE); } cb.cb_propname = argv[1]; cb.cb_value = strchr(cb.cb_propname, '='); if (cb.cb_value == NULL) { (void) fprintf(stderr, gettext("missing value in " "property=value argument\n")); usage(B_FALSE); } *(cb.cb_value) = '\0'; cb.cb_value++; error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, B_FALSE, set_callback, &cb); return (error); } /* Add up the total number of bytes left to initialize/trim across all vdevs */ static uint64_t vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) { uint64_t bytes_remaining; nvlist_t **child; uint_t c, children; vdev_stat_t *vs; assert(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (activity == ZPOOL_WAIT_INITIALIZE && vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) bytes_remaining = vs->vs_initialize_bytes_est - vs->vs_initialize_bytes_done; else if (activity == ZPOOL_WAIT_TRIM && vs->vs_trim_state == VDEV_TRIM_ACTIVE) bytes_remaining = vs->vs_trim_bytes_est - vs->vs_trim_bytes_done; else bytes_remaining = 0; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) bytes_remaining += vdev_activity_remaining(child[c], activity); return (bytes_remaining); } /* Add up the total number of bytes left to rebuild across top-level vdevs */ static uint64_t vdev_activity_top_remaining(nvlist_t *nv) { uint64_t bytes_remaining = 0; nvlist_t **child; uint_t children; int error; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; error = nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); if (error == 0) { if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { bytes_remaining += (vrs->vrs_bytes_est - vrs->vrs_bytes_rebuilt); } } } return (bytes_remaining); } /* Whether any vdevs are 'spare' or 'replacing' vdevs */ static boolean_t vdev_any_spare_replacing(nvlist_t *nv) { nvlist_t **child; uint_t c, children; char *vdev_type; (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 || strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) { return (B_TRUE); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) { if (vdev_any_spare_replacing(child[c])) return (B_TRUE); } return (B_FALSE); } typedef struct wait_data { char *wd_poolname; boolean_t wd_scripted; boolean_t wd_exact; boolean_t wd_headers_once; boolean_t wd_should_exit; /* Which activities to wait for */ boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES]; float wd_interval; pthread_cond_t wd_cv; pthread_mutex_t wd_mutex; } wait_data_t; /* * Print to stdout a single line, containing one column for each activity that * we are waiting for specifying how many bytes of work are left for that * activity. */ static void print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) { nvlist_t *config, *nvroot; uint_t c; int i; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; char *headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { /* * Make sure we have enough space in the col for pretty-printed * numbers and for the column header, and then leave a couple * spaces between cols for readability. */ col_widths[i] = MAX(strlen(headers[i]), 6) + 2; } /* Print header if appropriate */ int term_height = terminal_height(); boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 && row % (term_height-1) == 0); if (!wd->wd_scripted && (row == 0 || reprint_header)) { for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { if (wd->wd_enabled[i]) (void) printf("%*s", col_widths[i], headers[i]); } (void) printf("\n"); } /* Bytes of work remaining in each activity */ int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0}; bytes_rem[ZPOOL_WAIT_FREE] = zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL); config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING) bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); if (prs != NULL && prs->prs_state == DSS_SCANNING) bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy - prs->prs_copied; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c); if (pss != NULL && pss->pss_state == DSS_SCANNING && pss->pss_pass_scrub_pause == 0) { int64_t rem = pss->pss_to_examine - pss->pss_issued; if (pss->pss_func == POOL_SCAN_SCRUB) bytes_rem[ZPOOL_WAIT_SCRUB] = rem; else bytes_rem[ZPOOL_WAIT_RESILVER] = rem; } else if (check_rebuilding(nvroot, NULL)) { bytes_rem[ZPOOL_WAIT_RESILVER] = vdev_activity_top_remaining(nvroot); } bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM); /* * A replace finishes after resilvering finishes, so the amount of work * left for a replace is the same as for resilvering. * * It isn't quite correct to say that if we have any 'spare' or * 'replacing' vdevs and a resilver is happening, then a replace is in * progress, like we do here. When a hot spare is used, the faulted vdev * is not removed after the hot spare is resilvered, so parent 'spare' * vdev is not removed either. So we could have a 'spare' vdev, but be * resilvering for a different reason. However, we use it as a heuristic * because we don't have access to the DTLs, which could tell us whether * or not we have really finished resilvering a hot spare. */ if (vdev_any_spare_replacing(nvroot)) bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER]; if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { char buf[64]; if (!wd->wd_enabled[i]) continue; if (wd->wd_exact) (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); else zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); if (wd->wd_scripted) (void) printf(i == 0 ? "%s" : "\t%s", buf); else (void) printf(" %*s", col_widths[i] - 1, buf); } (void) printf("\n"); (void) fflush(stdout); } static void * wait_status_thread(void *arg) { wait_data_t *wd = (wait_data_t *)arg; zpool_handle_t *zhp; if ((zhp = zpool_open(g_zfs, wd->wd_poolname)) == NULL) return (void *)(1); for (int row = 0; ; row++) { boolean_t missing; struct timespec timeout; int ret = 0; (void) clock_gettime(CLOCK_REALTIME, &timeout); if (zpool_refresh_stats(zhp, &missing) != 0 || missing || zpool_props_refresh(zhp) != 0) { zpool_close(zhp); return (void *)(uintptr_t)(missing ? 0 : 1); } print_wait_status_row(wd, zhp, row); timeout.tv_sec += floor(wd->wd_interval); long nanos = timeout.tv_nsec + (wd->wd_interval - floor(wd->wd_interval)) * NANOSEC; if (nanos >= NANOSEC) { timeout.tv_sec++; timeout.tv_nsec = nanos - NANOSEC; } else { timeout.tv_nsec = nanos; } pthread_mutex_lock(&wd->wd_mutex); if (!wd->wd_should_exit) ret = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex, &timeout); pthread_mutex_unlock(&wd->wd_mutex); if (ret == 0) { break; /* signaled by main thread */ } else if (ret != ETIMEDOUT) { (void) fprintf(stderr, gettext("pthread_cond_timedwait " "failed: %s\n"), strerror(ret)); zpool_close(zhp); return (void *)(uintptr_t)(1); } } zpool_close(zhp); return (void *)(0); } int zpool_do_wait(int argc, char **argv) { boolean_t verbose = B_FALSE; int c; char *value; int i; unsigned long count; pthread_t status_thr; int error = 0; zpool_handle_t *zhp; wait_data_t wd; wd.wd_scripted = B_FALSE; wd.wd_exact = B_FALSE; wd.wd_headers_once = B_FALSE; wd.wd_should_exit = B_FALSE; pthread_mutex_init(&wd.wd_mutex, NULL); pthread_cond_init(&wd.wd_cv, NULL); /* By default, wait for all types of activity. */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) wd.wd_enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "HpT:t:")) != -1) { switch (c) { case 'H': wd.wd_scripted = B_TRUE; break; case 'n': wd.wd_headers_once = B_TRUE; break; case 'p': wd.wd_exact = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 't': { static char *col_subopts[] = { "discard", "free", "initialize", "replace", "remove", "resilver", "scrub", "trim", NULL }; /* Reset activities array */ bzero(&wd.wd_enabled, sizeof (wd.wd_enabled)); while (*optarg != '\0') { int activity = getsubopt(&optarg, col_subopts, &value); if (activity < 0) { (void) fprintf(stderr, gettext("invalid activity '%s'\n"), value); usage(B_FALSE); } wd.wd_enabled[activity] = B_TRUE; } break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &wd.wd_interval, &count); if (count != 0) { /* This subcmd only accepts an interval, not a count */ (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (wd.wd_interval != 0) verbose = B_TRUE; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'pool' argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } wd.wd_poolname = argv[0]; if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL) return (1); if (verbose) { /* * We use a separate thread for printing status updates because * the main thread will call lzc_wait(), which blocks as long * as an activity is in progress, which can be a long time. */ if (pthread_create(&status_thr, NULL, wait_status_thread, &wd) != 0) { (void) fprintf(stderr, gettext("failed to create status" "thread: %s\n"), strerror(errno)); zpool_close(zhp); return (1); } } /* * Loop over all activities that we are supposed to wait for until none * of them are in progress. Note that this means we can end up waiting * for more activities to complete than just those that were in progress * when we began waiting; if an activity we are interested in begins * while we are waiting for another activity, we will wait for both to * complete before exiting. */ for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!wd.wd_enabled[i]) continue; error = zpool_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zpool_close(zhp); if (verbose) { uintptr_t status; pthread_mutex_lock(&wd.wd_mutex); wd.wd_should_exit = B_TRUE; pthread_cond_signal(&wd.wd_cv); pthread_mutex_unlock(&wd.wd_mutex); (void) pthread_join(status_thr, (void *)&status); if (status != 0) error = status; } pthread_mutex_destroy(&wd.wd_mutex); pthread_cond_destroy(&wd.wd_cv); return (error); } static int find_command_idx(char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } /* * Display version message */ static int zpool_do_version(int argc, char **argv) { if (zfs_version_print() == -1) return (1); return (0); } /* * Do zpool_load_compat() and print error message on failure */ static zpool_compat_status_t zpool_do_load_compat(const char *compat, boolean_t *list) { char badword[ZFS_MAXPROPLEN]; char badfile[MAXPATHLEN]; zpool_compat_status_t ret; switch (ret = zpool_load_compat(compat, list, badword, badfile)) { case ZPOOL_COMPATIBILITY_OK: break; case ZPOOL_COMPATIBILITY_READERR: (void) fprintf(stderr, gettext("error reading compatibility " "file '%s'\n"), badfile); break; case ZPOOL_COMPATIBILITY_BADFILE: (void) fprintf(stderr, gettext("compatibility file '%s' " "too large or not newline-terminated\n"), badfile); break; case ZPOOL_COMPATIBILITY_BADWORD: (void) fprintf(stderr, gettext("unknown feature '%s' in " "compatibility file '%s'\n"), badword, badfile); break; case ZPOOL_COMPATIBILITY_NOFILES: (void) fprintf(stderr, gettext("no compatibility files " "specified\n")); break; } return (ret); } int main(int argc, char **argv) { int ret = 0; int i = 0; char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); srand(time(NULL)); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zpool_do_version(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. */ if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=')) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { /* * 'freeze' is a vile debugging abomination, so we treat * it as such. */ zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc); if (ret != 0) { (void) fprintf(stderr, gettext("failed to freeze pool: %d\n"), errno); ret = 1; } log_history = 0; } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } diff --git a/sys/contrib/openzfs/cmd/ztest/ztest.c b/sys/contrib/openzfs/cmd/ztest/ztest.c index cfa1290d78d1..1a030280704a 100644 --- a/sys/contrib/openzfs/cmd/ztest/ztest.c +++ b/sys/contrib/openzfs/cmd/ztest/ztest.c @@ -1,8060 +1,8063 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ /* * The objective of this program is to provide a DMU/ZAP/SPA stress test * that runs entirely in userland, is easy to use, and easy to extend. * * The overall design of the ztest program is as follows: * * (1) For each major functional area (e.g. adding vdevs to a pool, * creating and destroying datasets, reading and writing objects, etc) * we have a simple routine to test that functionality. These * individual routines do not have to do anything "stressful". * * (2) We turn these simple functionality tests into a stress test by * running them all in parallel, with as many threads as desired, * and spread across as many datasets, objects, and vdevs as desired. * * (3) While all this is happening, we inject faults into the pool to * verify that self-healing data really works. * * (4) Every time we open a dataset, we change its checksum and compression * functions. Thus even individual objects vary from block to block * in which checksum they use and whether they're compressed. * * (5) To verify that we never lose on-disk consistency after a crash, * we run the entire test in a child of the main process. * At random times, the child self-immolates with a SIGKILL. * This is the software equivalent of pulling the power cord. * The parent then runs the test again, using the existing * storage pool, as many times as desired. If backwards compatibility * testing is enabled ztest will sometimes run the "older" version * of ztest after a SIGKILL. * * (6) To verify that we don't have future leaks or temporal incursions, * many of the functional tests record the transaction group number * as part of their data. When reading old data, they verify that * the transaction group number is less than the current, open txg. * If you add a new test, please do this if applicable. * * (7) Threads are created with a reduced stack size, for sanity checking. * Therefore, it's important not to allocate huge buffers on the stack. * * When run with no arguments, ztest runs for about five minutes and * produces no output if successful. To get a little bit of information, * specify -V. To get more information, specify -VV, and so on. * * To turn this into an overnight stress test, use -T to specify run time. * * You can ask more vdevs [-v], datasets [-d], or threads [-t] * to increase the pool capacity, fanout, and overall stress level. * * Use the -k option to set the desired frequency of kills. * * When ztest invokes itself it passes all relevant information through a * temporary file which is mmap-ed in the child process. This allows shared * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always * stored at offset 0 of this file and contains information on the size and * number of shared structures in the file. The information stored in this file * must remain backwards compatible with older versions of ztest so that * ztest can invoke them during backwards compatibility testing (-B). */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if (__GLIBC__ && !__UCLIBC__) #include /* for backtrace() */ #endif static int ztest_fd_data = -1; static int ztest_fd_rand = -1; typedef struct ztest_shared_hdr { uint64_t zh_hdr_size; uint64_t zh_opts_size; uint64_t zh_size; uint64_t zh_stats_size; uint64_t zh_stats_count; uint64_t zh_ds_size; uint64_t zh_ds_count; } ztest_shared_hdr_t; static ztest_shared_hdr_t *ztest_shared_hdr; enum ztest_class_state { ZTEST_VDEV_CLASS_OFF, ZTEST_VDEV_CLASS_ON, ZTEST_VDEV_CLASS_RND }; #define ZO_GVARS_MAX_ARGLEN ((size_t)64) #define ZO_GVARS_MAX_COUNT ((size_t)10) typedef struct ztest_shared_opts { char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; char zo_alt_ztest[MAXNAMELEN]; char zo_alt_libpath[MAXNAMELEN]; uint64_t zo_vdevs; uint64_t zo_vdevtime; size_t zo_vdev_size; int zo_ashift; int zo_mirrors; int zo_raid_children; int zo_raid_parity; char zo_raid_type[8]; int zo_draid_data; int zo_draid_spares; int zo_datasets; int zo_threads; uint64_t zo_passtime; uint64_t zo_killrate; int zo_verbose; int zo_init; uint64_t zo_time; uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; int zo_mmp_test; int zo_special_vdevs; int zo_dump_dbgmsg; int zo_gvars_count; char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN]; } ztest_shared_opts_t; static const ztest_shared_opts_t ztest_opts_defaults = { .zo_pool = "ztest", .zo_dir = "/tmp", .zo_alt_ztest = { '\0' }, .zo_alt_libpath = { '\0' }, .zo_vdevs = 5, .zo_ashift = SPA_MINBLOCKSHIFT, .zo_mirrors = 2, .zo_raid_children = 4, .zo_raid_parity = 1, .zo_raid_type = VDEV_TYPE_RAIDZ, .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ .zo_draid_data = 4, /* data drives */ .zo_draid_spares = 1, /* distributed spares */ .zo_datasets = 7, .zo_threads = 23, .zo_passtime = 60, /* 60 seconds */ .zo_killrate = 70, /* 70% kill rate */ .zo_verbose = 0, .zo_mmp_test = 0, .zo_init = 1, .zo_time = 300, /* 5 minutes */ .zo_maxloops = 50, /* max loops during spa_freeze() */ .zo_metaslab_force_ganging = 64 << 10, .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, .zo_gvars_count = 0, }; extern uint64_t metaslab_force_ganging; extern uint64_t metaslab_df_alloc_threshold; extern unsigned long zfs_deadman_synctime_ms; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; extern int zfs_abd_scatter_enabled; extern int dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; static char *ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345"; typedef struct ztest_shared_ds { uint64_t zd_seq; } ztest_shared_ds_t; static ztest_shared_ds_t *ztest_shared_ds; #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, ZTEST_IO_WRITE_PATTERN, ZTEST_IO_WRITE_ZEROES, ZTEST_IO_TRUNCATE, ZTEST_IO_SETATTR, ZTEST_IO_REWRITE, ZTEST_IO_TYPES }; typedef struct ztest_block_tag { uint64_t bt_magic; uint64_t bt_objset; uint64_t bt_object; uint64_t bt_dnodesize; uint64_t bt_offset; uint64_t bt_gen; uint64_t bt_txg; uint64_t bt_crtxg; } ztest_block_tag_t; typedef struct bufwad { uint64_t bw_index; uint64_t bw_txg; uint64_t bw_data; } bufwad_t; /* * It would be better to use a rangelock_t per object. Unfortunately * the rangelock_t is not a drop-in replacement for rl_t, because we * still need to map from object ID to rangelock_t. */ typedef enum { RL_READER, RL_WRITER, RL_APPEND } rl_type_t; typedef struct rll { void *rll_writer; int rll_readers; kmutex_t rll_lock; kcondvar_t rll_cv; } rll_t; typedef struct rl { uint64_t rl_object; uint64_t rl_offset; uint64_t rl_size; rll_t *rl_lock; } rl_t; #define ZTEST_RANGE_LOCKS 64 #define ZTEST_OBJECT_LOCKS 64 /* * Object descriptor. Used as a template for object lookup/create/remove. */ typedef struct ztest_od { uint64_t od_dir; uint64_t od_object; dmu_object_type_t od_type; dmu_object_type_t od_crtype; uint64_t od_blocksize; uint64_t od_crblocksize; uint64_t od_crdnodesize; uint64_t od_gen; uint64_t od_crgen; char od_name[ZFS_MAX_DATASET_NAME_LEN]; } ztest_od_t; /* * Per-dataset state. */ typedef struct ztest_ds { ztest_shared_ds_t *zd_shared; objset_t *zd_os; pthread_rwlock_t zd_zilog_lock; zilog_t *zd_zilog; ztest_od_t *zd_od; /* debugging aid */ char zd_name[ZFS_MAX_DATASET_NAME_LEN]; kmutex_t zd_dirobj_lock; rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; } ztest_ds_t; /* * Per-iteration state. */ typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); typedef struct ztest_info { ztest_func_t *zi_func; /* test function */ uint64_t zi_iters; /* iterations per execution */ uint64_t *zi_interval; /* execute every seconds */ const char *zi_funcname; /* name of test function */ } ztest_info_t; typedef struct ztest_shared_callstate { uint64_t zc_count; /* per-pass count */ uint64_t zc_time; /* per-pass time */ uint64_t zc_next; /* next time to call this function */ } ztest_shared_callstate_t; static ztest_shared_callstate_t *ztest_shared_callstate; #define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) ztest_func_t ztest_dmu_read_write; ztest_func_t ztest_dmu_write_parallel; ztest_func_t ztest_dmu_object_alloc_free; ztest_func_t ztest_dmu_object_next_chunk; ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; ztest_func_t ztest_zil_commit; ztest_func_t ztest_zil_remount; ztest_func_t ztest_dmu_read_write_zcopy; ztest_func_t ztest_dmu_objset_create_destroy; ztest_func_t ztest_dmu_prealloc; ztest_func_t ztest_fzap; ztest_func_t ztest_dmu_snapshot_create_destroy; ztest_func_t ztest_dsl_prop_get_set; ztest_func_t ztest_spa_prop_get_set; ztest_func_t ztest_spa_create_destroy; ztest_func_t ztest_fault_inject; ztest_func_t ztest_dmu_snapshot_hold; ztest_func_t ztest_mmp_enable_disable; ztest_func_t ztest_scrub; ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_class_add; ztest_func_t ztest_vdev_aux_add_remove; ztest_func_t ztest_split_pool; ztest_func_t ztest_reguid; ztest_func_t ztest_spa_upgrade; ztest_func_t ztest_device_removal; ztest_func_t ztest_spa_checkpoint_create_discard; ztest_func_t ztest_initialize; ztest_func_t ztest_trim; ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ #define ZTI_INIT(func, iters, interval) \ { .zi_func = (func), \ .zi_iters = (iters), \ .zi_interval = (interval), \ .zi_funcname = # func } ztest_info_t ztest_info[] = { ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), ZTI_INIT(ztest_zap, 30, &zopt_always), ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), ZTI_INIT(ztest_split_pool, 1, &zopt_always), ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), #if 0 ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), #endif ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), ZTI_INIT(ztest_reguid, 1, &zopt_rarely), ZTI_INIT(ztest_scrub, 1, &zopt_rarely), ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), ZTI_INIT(ztest_trim, 1, &zopt_sometimes), ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) /* * The following struct is used to hold a list of uncalled commit callbacks. * The callbacks are ordered by txg number. */ typedef struct ztest_cb_list { kmutex_t zcl_callbacks_lock; list_t zcl_callbacks; } ztest_cb_list_t; /* * Stuff we need to share writably between parent and child. */ typedef struct ztest_shared { boolean_t zs_do_init; hrtime_t zs_proc_start; hrtime_t zs_proc_stop; hrtime_t zs_thread_start; hrtime_t zs_thread_stop; hrtime_t zs_thread_kill; uint64_t zs_enospc_count; uint64_t zs_vdev_next_leaf; uint64_t zs_vdev_aux; uint64_t zs_alloc; uint64_t zs_space; uint64_t zs_splits; uint64_t zs_mirrors; uint64_t zs_metaslab_sz; uint64_t zs_metaslab_df_alloc_threshold; uint64_t zs_guid; } ztest_shared_t; #define ID_PARALLEL -1ULL static char ztest_dev_template[] = "%s/%s.%llua"; static char ztest_aux_template[] = "%s/%s.%s.%llu"; ztest_shared_t *ztest_shared; static spa_t *ztest_spa = NULL; static ztest_ds_t *ztest_ds; static kmutex_t ztest_vdev_lock; static boolean_t ztest_device_removal_active = B_FALSE; static boolean_t ztest_pool_scrubbed = B_FALSE; static kmutex_t ztest_checkpoint_lock; /* * The ztest_name_lock protects the pool and dataset namespace used by * the individual tests. To modify the namespace, consumers must grab * this lock as writer. Grabbing the lock as reader will ensure that the * namespace does not change while the lock is held. */ static pthread_rwlock_t ztest_name_lock; static boolean_t ztest_dump_core = B_TRUE; static boolean_t ztest_exiting; /* Global commit callback list */ static ztest_cb_list_t zcl; /* Commit cb delay */ static uint64_t zc_min_txg_delay = UINT64_MAX; static int zc_cb_counter = 0; /* * Minimum number of commit callbacks that need to be registered for us to check * whether the minimum txg delay is acceptable. */ #define ZTEST_COMMIT_CB_MIN_REG 100 /* * If a number of txgs equal to this threshold have been created after a commit * callback has been registered but not called, then we assume there is an * implementation bug. */ #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) enum ztest_object { ZTEST_META_DNODE = 0, ZTEST_DIROBJ, ZTEST_OBJECTS }; static void usage(boolean_t) __NORETURN; static int ztest_scrub_impl(spa_t *spa); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void dump_debug_buffer(void) { ssize_t ret __attribute__((unused)); if (!ztest_opts.zo_dump_dbgmsg) return; /* * We use write() instead of printf() so that this function * is safe to call from a signal handler. */ ret = write(STDOUT_FILENO, "\n", 1); zfs_dbgmsg_print("ztest"); } #define BACKTRACE_SZ 100 static void sig_handler(int signo) { struct sigaction action; #if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ int nptrs; void *buffer[BACKTRACE_SZ]; nptrs = backtrace(buffer, BACKTRACE_SZ); backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); #endif dump_debug_buffer(); /* * Restore default action and re-raise signal so SIGSEGV and * SIGABRT can trigger a core dump. */ action.sa_handler = SIG_DFL; sigemptyset(&action.sa_mask); action.sa_flags = 0; (void) sigaction(signo, &action, NULL); raise(signo); } #define FATAL_MSG_SZ 1024 char *fatal_msg; static void fatal(int do_perror, char *message, ...) { va_list args; int save_errno = errno; char *buf; (void) fflush(stdout); buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); va_start(args, message); (void) sprintf(buf, "ztest: "); /* LINTED */ (void) vsprintf(buf + strlen(buf), message, args); va_end(args); if (do_perror) { (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), ": %s", strerror(save_errno)); } (void) fprintf(stderr, "%s\n", buf); fatal_msg = buf; /* to ease debugging */ if (ztest_dump_core) abort(); else dump_debug_buffer(); exit(3); } static int str2shift(const char *buf) { const char *ends = "BKMGTPEZ"; int i; if (buf[0] == '\0') return (0); for (i = 0; i < strlen(ends); i++) { if (toupper(buf[0]) == ends[i]) break; } if (i == strlen(ends)) { (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); usage(B_FALSE); } if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { return (10*i); } (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); usage(B_FALSE); /* NOTREACHED */ } static uint64_t nicenumtoull(const char *buf) { char *end; uint64_t val; val = strtoull(buf, &end, 0); if (end == buf) { (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); usage(B_FALSE); } else if (end[0] == '.') { double fval = strtod(buf, &end); fval *= pow(2, str2shift(end)); /* * UINT64_MAX is not exactly representable as a double. * The closest representation is UINT64_MAX + 1, so we * use a >= comparison instead of > for the bounds check. */ if (fval >= (double)UINT64_MAX) { (void) fprintf(stderr, "ztest: value too large: %s\n", buf); usage(B_FALSE); } val = (uint64_t)fval; } else { int shift = str2shift(end); if (shift >= 64 || (val << shift) >> shift != val) { (void) fprintf(stderr, "ztest: value too large: %s\n", buf); usage(B_FALSE); } val <<= shift; } return (val); } static void usage(boolean_t requested) { const ztest_shared_opts_t *zo = &ztest_opts_defaults; char nice_vdev_size[NN_NUMBUF_SZ]; char nice_force_ganging[NN_NUMBUF_SZ]; FILE *fp = requested ? stdout : stderr; nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size)); nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging, sizeof (nice_force_ganging)); (void) fprintf(fp, "Usage: %s\n" "\t[-v vdevs (default: %llu)]\n" "\t[-s size_of_each_vdev (default: %s)]\n" "\t[-a alignment_shift (default: %d)] use 0 for random\n" "\t[-m mirror_copies (default: %d)]\n" "\t[-r raidz_disks / draid_disks (default: %d)]\n" "\t[-R raid_parity (default: %d)]\n" "\t[-K raid_kind (default: random)] raidz|draid|random\n" "\t[-D draid_data (default: %d)] in config\n" "\t[-S draid_spares (default: %d)]\n" "\t[-d datasets (default: %d)]\n" "\t[-t threads (default: %d)]\n" "\t[-g gang_block_threshold (default: %s)]\n" "\t[-i init_count (default: %d)] initialize pool i times\n" "\t[-k kill_percentage (default: %llu%%)]\n" "\t[-p pool_name (default: %s)]\n" "\t[-f dir (default: %s)] file directory for vdev files\n" "\t[-M] Multi-host simulate pool imported on remote host\n" "\t[-V] verbose (use multiple times for ever more blather)\n" "\t[-E] use existing pool instead of creating new one\n" "\t[-T time (default: %llu sec)] total run time\n" "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" "\t[-P passtime (default: %llu sec)] time per pass\n" "\t[-B alt_ztest (default: )] alternate ztest path\n" "\t[-C vdev class state (default: random)] special=on|off|random\n" "\t[-o variable=value] ... set global variable to an unsigned\n" "\t 32-bit integer value\n" "\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n" "\t[-h] (print help)\n" "", zo->zo_pool, (u_longlong_t)zo->zo_vdevs, /* -v */ nice_vdev_size, /* -s */ zo->zo_ashift, /* -a */ zo->zo_mirrors, /* -m */ zo->zo_raid_children, /* -r */ zo->zo_raid_parity, /* -R */ zo->zo_draid_data, /* -D */ zo->zo_draid_spares, /* -S */ zo->zo_datasets, /* -d */ zo->zo_threads, /* -t */ nice_force_ganging, /* -g */ zo->zo_init, /* -i */ (u_longlong_t)zo->zo_killrate, /* -k */ zo->zo_pool, /* -p */ zo->zo_dir, /* -f */ (u_longlong_t)zo->zo_time, /* -T */ (u_longlong_t)zo->zo_maxloops, /* -F */ (u_longlong_t)zo->zo_passtime); exit(requested ? 0 : 1); } static uint64_t ztest_random(uint64_t range) { uint64_t r; ASSERT3S(ztest_fd_rand, >=, 0); if (range == 0) return (0); if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) fatal(1, "short read from /dev/urandom"); return (r % range); } static void ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) { char name[32]; char *value; int state = ZTEST_VDEV_CLASS_RND; (void) strlcpy(name, input, sizeof (name)); value = strchr(name, '='); if (value == NULL) { (void) fprintf(stderr, "missing value in property=value " "'-C' argument (%s)\n", input); usage(B_FALSE); } *(value) = '\0'; value++; if (strcmp(value, "on") == 0) { state = ZTEST_VDEV_CLASS_ON; } else if (strcmp(value, "off") == 0) { state = ZTEST_VDEV_CLASS_OFF; } else if (strcmp(value, "random") == 0) { state = ZTEST_VDEV_CLASS_RND; } else { (void) fprintf(stderr, "invalid property value '%s'\n", value); usage(B_FALSE); } if (strcmp(name, "special") == 0) { zo->zo_special_vdevs = state; } else { (void) fprintf(stderr, "invalid property name '%s'\n", name); usage(B_FALSE); } if (zo->zo_verbose >= 3) (void) printf("%s vdev state is '%s'\n", name, value); } static void process_options(int argc, char **argv) { char *path; ztest_shared_opts_t *zo = &ztest_opts; int opt; uint64_t value; char altdir[MAXNAMELEN] = { 0 }; char raid_kind[8] = { "random" }; bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, "v:s:a:m:r:R:K:D:S:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { value = 0; switch (opt) { case 'v': case 's': case 'a': case 'm': case 'r': case 'R': case 'D': case 'S': case 'd': case 't': case 'g': case 'i': case 'k': case 'T': case 'P': case 'F': value = nicenumtoull(optarg); } switch (opt) { case 'v': zo->zo_vdevs = value; break; case 's': zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); break; case 'a': zo->zo_ashift = value; break; case 'm': zo->zo_mirrors = value; break; case 'r': zo->zo_raid_children = MAX(1, value); break; case 'R': zo->zo_raid_parity = MIN(MAX(value, 1), 3); break; case 'K': (void) strlcpy(raid_kind, optarg, sizeof (raid_kind)); break; case 'D': zo->zo_draid_data = MAX(1, value); break; case 'S': zo->zo_draid_spares = MAX(1, value); break; case 'd': zo->zo_datasets = MAX(1, value); break; case 't': zo->zo_threads = MAX(1, value); break; case 'g': zo->zo_metaslab_force_ganging = MAX(SPA_MINBLOCKSIZE << 1, value); break; case 'i': zo->zo_init = value; break; case 'k': zo->zo_killrate = value; break; case 'p': (void) strlcpy(zo->zo_pool, optarg, sizeof (zo->zo_pool)); break; case 'f': path = realpath(optarg, NULL); if (path == NULL) { (void) fprintf(stderr, "error: %s: %s\n", optarg, strerror(errno)); usage(B_FALSE); } else { (void) strlcpy(zo->zo_dir, path, sizeof (zo->zo_dir)); free(path); } break; case 'M': zo->zo_mmp_test = 1; break; case 'V': zo->zo_verbose++; break; case 'E': zo->zo_init = 0; break; case 'T': zo->zo_time = value; break; case 'P': zo->zo_passtime = MAX(1, value); break; case 'F': zo->zo_maxloops = MAX(1, value); break; case 'B': (void) strlcpy(altdir, optarg, sizeof (altdir)); break; case 'C': ztest_parse_name_value(optarg, zo); break; case 'o': if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { (void) fprintf(stderr, "max global var count (%zu) exceeded\n", ZO_GVARS_MAX_COUNT); usage(B_FALSE); } char *v = zo->zo_gvars[zo->zo_gvars_count]; if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= ZO_GVARS_MAX_ARGLEN) { (void) fprintf(stderr, "global var option '%s' is too long\n", optarg); usage(B_FALSE); } zo->zo_gvars_count++; break; case 'G': zo->zo_dump_dbgmsg = 1; break; case 'h': usage(B_TRUE); break; case '?': default: usage(B_FALSE); break; } } /* When raid choice is 'random' add a draid pool 50% of the time */ if (strcmp(raid_kind, "random") == 0) { (void) strlcpy(raid_kind, (ztest_random(2) == 0) ? "draid" : "raidz", sizeof (raid_kind)); if (ztest_opts.zo_verbose >= 3) (void) printf("choosing RAID type '%s'\n", raid_kind); } if (strcmp(raid_kind, "draid") == 0) { uint64_t min_devsize; /* With fewer disk use 256M, otherwise 128M is OK */ min_devsize = (ztest_opts.zo_raid_children < 16) ? (256ULL << 20) : (128ULL << 20); /* No top-level mirrors with dRAID for now */ zo->zo_mirrors = 0; /* Use more appropriate defaults for dRAID */ if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) zo->zo_vdevs = 1; if (zo->zo_raid_children == ztest_opts_defaults.zo_raid_children) zo->zo_raid_children = 16; if (zo->zo_ashift < 12) zo->zo_ashift = 12; if (zo->zo_vdev_size < min_devsize) zo->zo_vdev_size = min_devsize; if (zo->zo_draid_data + zo->zo_raid_parity > zo->zo_raid_children - zo->zo_draid_spares) { (void) fprintf(stderr, "error: too few draid " "children (%d) for stripe width (%d)\n", zo->zo_raid_children, zo->zo_draid_data + zo->zo_raid_parity); usage(B_FALSE); } (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, sizeof (zo->zo_raid_type)); } else /* using raidz */ { ASSERT0(strcmp(raid_kind, "raidz")); zo->zo_raid_parity = MIN(zo->zo_raid_parity, zo->zo_raid_children - 1); } zo->zo_vdevtime = (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : UINT64_MAX >> 2); if (strlen(altdir) > 0) { char *cmd; char *realaltdir; char *bin; char *ztest; char *isa; int isalen; cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); VERIFY3P(NULL, !=, realpath(getexecname(), cmd)); if (0 != access(altdir, F_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate ztest path: %s", altdir); } VERIFY3P(NULL, !=, realpath(altdir, realaltdir)); /* * 'cmd' should be of the form "/usr/bin//ztest". * We want to extract to determine if we should use * 32 or 64 bit binaries. */ bin = strstr(cmd, "/usr/bin/"); ztest = strstr(bin, "/ztest"); isa = bin + 9; isalen = ztest - isa; (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), "%s/usr/lib/%.*s", realaltdir, isalen, isa); if (0 != access(zo->zo_alt_ztest, X_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate ztest: %s", zo->zo_alt_ztest); } else if (0 != access(zo->zo_alt_libpath, X_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate lib directory %s", zo->zo_alt_libpath); } umem_free(cmd, MAXPATHLEN); umem_free(realaltdir, MAXPATHLEN); } } static void ztest_kill(ztest_shared_t *zs) { zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); /* * Before we kill off ztest, make sure that the config is updated. * See comment above spa_write_cachefile(). */ mutex_enter(&spa_namespace_lock); spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); mutex_exit(&spa_namespace_lock); (void) kill(getpid(), SIGKILL); } /* ARGSUSED */ static void ztest_record_enospc(const char *s) { ztest_shared->zs_enospc_count++; } static uint64_t ztest_get_ashift(void) { if (ztest_opts.zo_ashift == 0) return (SPA_MINBLOCKSHIFT + ztest_random(5)); return (ztest_opts.zo_ashift); } static boolean_t ztest_is_draid_spare(const char *name) { uint64_t spare_id = 0, parity = 0, vdev_id = 0; if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, (u_longlong_t *)&spare_id) == 3) { return (B_TRUE); } return (B_FALSE); } static nvlist_t * make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) { char *pathbuf; uint64_t vdev; nvlist_t *file; boolean_t draid_spare = B_FALSE; pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); if (ashift == 0) ashift = ztest_get_ashift(); if (path == NULL) { path = pathbuf; if (aux != NULL) { vdev = ztest_shared->zs_vdev_aux; (void) snprintf(path, MAXPATHLEN, ztest_aux_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, aux, vdev); } else { vdev = ztest_shared->zs_vdev_next_leaf++; (void) snprintf(path, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, vdev); } } else { draid_spare = ztest_is_draid_spare(path); } if (size != 0 && !draid_spare) { int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd == -1) fatal(1, "can't open %s", path); if (ftruncate(fd, size) != 0) fatal(1, "can't ftruncate %s", path); (void) close(fd); } file = fnvlist_alloc(); fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); umem_free(pathbuf, MAXPATHLEN); return (file); } static nvlist_t * make_vdev_raid(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int r) { nvlist_t *raid, **child; int c; if (r < 2) return (make_vdev_file(path, aux, pool, size, ashift)); child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < r; c++) child[c] = make_vdev_file(path, aux, pool, size, ashift); raid = fnvlist_alloc(); fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, ztest_opts.zo_raid_type); fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, ztest_opts.zo_raid_parity); fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, child, r); if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { uint64_t ndata = ztest_opts.zo_draid_data; uint64_t nparity = ztest_opts.zo_raid_parity; uint64_t nspares = ztest_opts.zo_draid_spares; uint64_t children = ztest_opts.zo_raid_children; uint64_t ngroups = 1; /* * Calculate the minimum number of groups required to fill a * slice. This is the LCM of the stripe width (data + parity) * and the number of data drives (children - spares). */ while (ngroups * (ndata + nparity) % (children - nspares) != 0) ngroups++; /* Store the basic dRAID configuration. */ fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); } for (c = 0; c < r; c++) fnvlist_free(child[c]); umem_free(child, r * sizeof (nvlist_t *)); return (raid); } static nvlist_t * make_vdev_mirror(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int r, int m) { nvlist_t *mirror, **child; int c; if (m < 1) return (make_vdev_raid(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); mirror = fnvlist_alloc(); fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, child, m); for (c = 0; c < m; c++) fnvlist_free(child[c]); umem_free(child, m * sizeof (nvlist_t *)); return (mirror); } static nvlist_t * make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, const char *class, int r, int m, int t) { nvlist_t *root, **child; int c; boolean_t log; ASSERT3S(t, >, 0); log = (class != NULL && strcmp(class, "log") == 0); child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < t; c++) { child[c] = make_vdev_mirror(path, aux, pool, size, ashift, r, m); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); if (class != NULL && class[0] != '\0') { ASSERT(m > 1 || log); /* expecting a mirror */ fnvlist_add_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, class); } } root = fnvlist_alloc(); fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, child, t); for (c = 0; c < t; c++) fnvlist_free(child[c]); umem_free(child, t * sizeof (nvlist_t *)); return (root); } /* * Find a random spa version. Returns back a random spa version in the * range [initial_version, SPA_VERSION_FEATURES]. */ static uint64_t ztest_random_spa_version(uint64_t initial_version) { uint64_t version = initial_version; if (version <= SPA_VERSION_BEFORE_FEATURES) { version = version + ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); } if (version > SPA_VERSION_BEFORE_FEATURES) version = SPA_VERSION_FEATURES; ASSERT(SPA_VERSION_IS_SUPPORTED(version)); return (version); } static int ztest_random_blocksize(void) { ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); /* * Choose a block size >= the ashift. * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. */ int maxbs = SPA_OLD_MAXBLOCKSHIFT; if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) maxbs = 20; uint64_t block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } static int ztest_random_dnodesize(void) { int slots; int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; if (max_slots == DNODE_MIN_SLOTS) return (DNODE_MIN_SIZE); /* * Weight the random distribution more heavily toward smaller * dnode sizes since that is more likely to reflect real-world * usage. */ ASSERT3U(max_slots, >, 4); switch (ztest_random(10)) { case 0: slots = 5 + ztest_random(max_slots - 4); break; case 1 ... 4: slots = 2 + ztest_random(3); break; default: slots = 1; break; } return (slots << DNODE_SHIFT); } static int ztest_random_ibshift(void) { return (DN_MIN_INDBLKSHIFT + ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); } static uint64_t ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) { uint64_t top; vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); do { top = ztest_random(rvd->vdev_children); tvd = rvd->vdev_child[top]; } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); return (top); } static uint64_t ztest_random_dsl_prop(zfs_prop_t prop) { uint64_t value; do { value = zfs_prop_random_value(prop, ztest_random(-1ULL)); } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); return (value); } static int ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, boolean_t inherit) { const char *propname = zfs_prop_to_name(prop); const char *valname; char *setpoint; uint64_t curval; int error; error = dsl_prop_set_int(osname, propname, (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (error); } ASSERT0(error); setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); if (ztest_opts.zo_verbose >= 6) { int err; err = zfs_prop_index_to_string(prop, curval, &valname); if (err) (void) printf("%s %s = %llu at '%s'\n", osname, propname, (unsigned long long)curval, setpoint); else (void) printf("%s %s = %s at '%s'\n", osname, propname, valname, setpoint); } umem_free(setpoint, MAXPATHLEN); return (error); } static int ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) { spa_t *spa = ztest_spa; nvlist_t *props = NULL; int error; props = fnvlist_alloc(); fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); error = spa_prop_set(spa, props); fnvlist_free(props); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (error); } ASSERT0(error); return (error); } static int ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) { int err; char *cp = NULL; char ddname[ZFS_MAX_DATASET_NAME_LEN]; strcpy(ddname, name); cp = strchr(ddname, '@'); if (cp != NULL) *cp = '\0'; err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); while (decrypt && err == EACCES) { dsl_crypto_params_t *dcp; nvlist_t *crypto_args = fnvlist_alloc(); fnvlist_add_uint8_array(crypto_args, "wkeydata", (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, crypto_args, &dcp)); err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); /* * Note: if there was an error loading, the wkey was not * consumed, and needs to be freed. */ dsl_crypto_params_free(dcp, (err != 0)); fnvlist_free(crypto_args); if (err == EINVAL) { /* * We couldn't load a key for this dataset so try * the parent. This loop will eventually hit the * encryption root since ztest only makes clones * as children of their origin datasets. */ cp = strrchr(ddname, '/'); if (cp == NULL) return (err); *cp = '\0'; err = EACCES; continue; } else if (err != 0) { break; } err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); break; } return (err); } static void ztest_rll_init(rll_t *rll) { rll->rll_writer = NULL; rll->rll_readers = 0; mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); } static void ztest_rll_destroy(rll_t *rll) { ASSERT3P(rll->rll_writer, ==, NULL); ASSERT0(rll->rll_readers); mutex_destroy(&rll->rll_lock); cv_destroy(&rll->rll_cv); } static void ztest_rll_lock(rll_t *rll, rl_type_t type) { mutex_enter(&rll->rll_lock); if (type == RL_READER) { while (rll->rll_writer != NULL) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_readers++; } else { while (rll->rll_writer != NULL || rll->rll_readers) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_writer = curthread; } mutex_exit(&rll->rll_lock); } static void ztest_rll_unlock(rll_t *rll) { mutex_enter(&rll->rll_lock); if (rll->rll_writer) { ASSERT0(rll->rll_readers); rll->rll_writer = NULL; } else { ASSERT3S(rll->rll_readers, >, 0); ASSERT3P(rll->rll_writer, ==, NULL); rll->rll_readers--; } if (rll->rll_writer == NULL && rll->rll_readers == 0) cv_broadcast(&rll->rll_cv); mutex_exit(&rll->rll_lock); } static void ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) { rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; ztest_rll_lock(rll, type); } static void ztest_object_unlock(ztest_ds_t *zd, uint64_t object) { rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; ztest_rll_unlock(rll); } static rl_t * ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, rl_type_t type) { uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; rl_t *rl; rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); rl->rl_object = object; rl->rl_offset = offset; rl->rl_size = size; rl->rl_lock = rll; ztest_rll_lock(rll, type); return (rl); } static void ztest_range_unlock(rl_t *rl) { rll_t *rll = rl->rl_lock; ztest_rll_unlock(rll); umem_free(rl, sizeof (*rl)); } static void ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) { zd->zd_os = os; zd->zd_zilog = dmu_objset_zil(os); zd->zd_shared = szd; dmu_objset_name(os, zd->zd_name); int l; if (zd->zd_shared != NULL) zd->zd_shared->zd_seq = 0; VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) ztest_rll_init(&zd->zd_object_lock[l]); for (l = 0; l < ZTEST_RANGE_LOCKS; l++) ztest_rll_init(&zd->zd_range_lock[l]); } static void ztest_zd_fini(ztest_ds_t *zd) { int l; mutex_destroy(&zd->zd_dirobj_lock); (void) pthread_rwlock_destroy(&zd->zd_zilog_lock); for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) ztest_rll_destroy(&zd->zd_object_lock[l]); for (l = 0; l < ZTEST_RANGE_LOCKS; l++) ztest_rll_destroy(&zd->zd_range_lock[l]); } #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) static uint64_t ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) { uint64_t txg; int error; /* * Attempt to assign tx to some transaction group. */ error = dmu_tx_assign(tx, txg_how); if (error) { if (error == ERESTART) { ASSERT3U(txg_how, ==, TXG_NOWAIT); dmu_tx_wait(tx); } else { ASSERT3U(error, ==, ENOSPC); ztest_record_enospc(tag); } dmu_tx_abort(tx); return (0); } txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); return (txg); } static void ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { bt->bt_magic = BT_MAGIC; bt->bt_objset = dmu_objset_id(os); bt->bt_object = object; bt->bt_dnodesize = dnodesize; bt->bt_offset = offset; bt->bt_gen = gen; bt->bt_txg = txg; bt->bt_crtxg = crtxg; } static void ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { ASSERT3U(bt->bt_magic, ==, BT_MAGIC); ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); ASSERT3U(bt->bt_object, ==, object); ASSERT3U(bt->bt_dnodesize, ==, dnodesize); ASSERT3U(bt->bt_offset, ==, offset); ASSERT3U(bt->bt_gen, <=, gen); ASSERT3U(bt->bt_txg, <=, txg); ASSERT3U(bt->bt_crtxg, ==, crtxg); } static ztest_block_tag_t * ztest_bt_bonus(dmu_buf_t *db) { dmu_object_info_t doi; ztest_block_tag_t *bt; dmu_object_info_from_db(db, &doi); ASSERT3U(doi.doi_bonus_size, <=, db->db_size); ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); return (bt); } /* * Generate a token to fill up unused bonus buffer space. Try to make * it unique to the object, generation, and offset to verify that data * is not getting overwritten by data from other dnodes. */ #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) /* * Fill up the unused bonus buffer region before the block tag with a * verifiable pattern. Filling the whole bonus area with non-zero data * helps ensure that all dnode traversal code properly skips the * interior regions of large dnodes. */ static void ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { uint64_t *bonusp; ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), gen, bonusp - (uint64_t *)db->db_data); *bonusp = token; } } /* * Verify that the unused area of a bonus buffer is filled with the * expected tokens. */ static void ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { uint64_t *bonusp; for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), gen, bonusp - (uint64_t *)db->db_data); VERIFY3U(*bonusp, ==, token); } } /* * ZIL logging ops */ #define lrz_type lr_mode #define lrz_blocksize lr_uid #define lrz_ibshift lr_gid #define lrz_bonustype lr_rdev #define lrz_dnodesize lr_crtime[1] static void ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) { char *name = (void *)(lr + 1); /* name follows lr */ size_t namesize = strlen(name) + 1; itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) + namesize - sizeof (lr_t)); zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) { char *name = (void *)(lr + 1); /* name follows lr */ size_t namesize = strlen(name) + 1; itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) + namesize - sizeof (lr_t)); itx->itx_oid = object; zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) { itx_t *itx; itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); if (zil_replaying(zd->zd_zilog, tx)) return; if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) write_state = WR_INDIRECT; itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); if (write_state == WR_COPIED && dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); write_state = WR_NEED_COPY; } itx->itx_private = zd; itx->itx_wr_state = write_state; itx->itx_sync = (ztest_random(8) == 0); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); itx->itx_sync = B_FALSE; zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); itx->itx_sync = B_FALSE; zil_itx_assign(zd->zd_zilog, itx, tx); } /* * ZIL replay ops */ static int ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_create_t *lr = arg2; char *name = (void *)(lr + 1); /* name follows lr */ objset_t *os = zd->zd_os; ztest_block_tag_t *bbt; dmu_buf_t *db; dmu_tx_t *tx; uint64_t txg; int error = 0; int bonuslen; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); ASSERT3S(name[0], !=, '\0'); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } else { dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); } txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) return (ENOSPC); ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { if (lr->lr_foid == 0) { lr->lr_foid = zap_create_dnsize(os, lr->lrz_type, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } else { error = zap_create_claim_dnsize(os, lr->lr_foid, lr->lrz_type, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } } else { if (lr->lr_foid == 0) { lr->lr_foid = dmu_object_alloc_dnsize(os, lr->lrz_type, 0, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } else { error = dmu_object_claim_dnsize(os, lr->lr_foid, lr->lrz_type, 0, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } } if (error) { ASSERT3U(error, ==, EEXIST); ASSERT(zd->zd_zilog->zl_replay); dmu_tx_commit(tx); return (error); } ASSERT3U(lr->lr_foid, !=, 0); if (lr->lrz_type != DMU_OT_ZAP_OTHER) VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, lr->lrz_blocksize, lr->lrz_ibshift, tx)); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); bbt = ztest_bt_bonus(db); dmu_buf_will_dirty(db, tx); ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, lr->lr_gen, txg, txg); ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); dmu_buf_rele(db, FTAG); VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, &lr->lr_foid, tx)); (void) ztest_log_create(zd, tx, lr); dmu_tx_commit(tx); return (0); } static int ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_remove_t *lr = arg2; char *name = (void *)(lr + 1); /* name follows lr */ objset_t *os = zd->zd_os; dmu_object_info_t doi; dmu_tx_t *tx; uint64_t object, txg; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); ASSERT3S(name[0], !=, '\0'); VERIFY0( zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); ASSERT3U(object, !=, 0); ztest_object_lock(zd, object, RL_WRITER); VERIFY0(dmu_object_info(os, object, &doi)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { ztest_object_unlock(zd, object); return (ENOSPC); } if (doi.doi_type == DMU_OT_ZAP_OTHER) { VERIFY0(zap_destroy(os, object, tx)); } else { VERIFY0(dmu_object_free(os, object, tx)); } VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); (void) ztest_log_remove(zd, tx, lr, object); dmu_tx_commit(tx); ztest_object_unlock(zd, object); return (0); } static int ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_write_t *lr = arg2; objset_t *os = zd->zd_os; void *data = lr + 1; /* data follows lr */ uint64_t offset, length; ztest_block_tag_t *bt = data; ztest_block_tag_t *bbt; uint64_t gen, txg, lrtxg, crtxg; dmu_object_info_t doi; dmu_tx_t *tx; dmu_buf_t *db; arc_buf_t *abuf = NULL; rl_t *rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } if (bt->bt_magic == BSWAP_64(BT_MAGIC)) byteswap_uint64_array(bt, sizeof (*bt)); if (bt->bt_magic != BT_MAGIC) bt = NULL; ztest_object_lock(zd, lr->lr_foid, RL_READER); rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); dmu_object_info_from_db(db, &doi); bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); gen = bbt->bt_gen; crtxg = bbt->bt_crtxg; lrtxg = lr->lr_common.lrc_txg; tx = dmu_tx_create(os); dmu_tx_hold_write(tx, lr->lr_foid, offset, length); if (ztest_random(8) == 0 && length == doi.doi_data_block_size && P2PHASE(offset, length) == 0) abuf = dmu_request_arcbuf(db, length); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { if (abuf != NULL) dmu_return_arcbuf(abuf); dmu_buf_rele(db, FTAG); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } if (bt != NULL) { /* * Usually, verify the old data before writing new data -- * but not always, because we also want to verify correct * behavior when the data was not recently read into cache. */ ASSERT0(offset % doi.doi_data_block_size); if (ztest_random(4) != 0) { int prefetch = ztest_random(2) ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; ztest_block_tag_t rbt; VERIFY(dmu_read(os, lr->lr_foid, offset, sizeof (rbt), &rbt, prefetch) == 0); if (rbt.bt_magic == BT_MAGIC) { ztest_bt_verify(&rbt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); } } /* * Writes can appear to be newer than the bonus buffer because * the ztest_get_data() callback does a dmu_read() of the * open-context data, which may be different than the data * as it was when the write was generated. */ if (zd->zd_zilog->zl_replay) { ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, MAX(gen, bt->bt_gen), MAX(txg, lrtxg), bt->bt_crtxg); } /* * Set the bt's gen/txg to the bonus buffer's gen/txg * so that all of the usual ASSERTs will work. */ ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); } if (abuf == NULL) { dmu_write(os, lr->lr_foid, offset, length, data, tx); } else { bcopy(data, abuf->b_data, length); dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx); } (void) ztest_log_write(zd, tx, lr); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (0); } static int ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_truncate_t *lr = arg2; objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; rl_t *rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ztest_object_lock(zd, lr->lr_foid, RL_READER); rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, RL_WRITER); tx = dmu_tx_create(os); dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, lr->lr_length, tx)); (void) ztest_log_truncate(zd, tx, lr); dmu_tx_commit(tx); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (0); } static int ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_setattr_t *lr = arg2; objset_t *os = zd->zd_os; dmu_tx_t *tx; dmu_buf_t *db; ztest_block_tag_t *bbt; uint64_t txg, lrtxg, crtxg, dnodesize; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ztest_object_lock(zd, lr->lr_foid, RL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, lr->lr_foid); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); crtxg = bbt->bt_crtxg; lrtxg = lr->lr_common.lrc_txg; dnodesize = bbt->bt_dnodesize; if (zd->zd_zilog->zl_replay) { ASSERT3U(lr->lr_size, !=, 0); ASSERT3U(lr->lr_mode, !=, 0); ASSERT3U(lrtxg, !=, 0); } else { /* * Randomly change the size and increment the generation. */ lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * sizeof (*bbt); lr->lr_mode = bbt->bt_gen + 1; ASSERT0(lrtxg); } /* * Verify that the current bonus buffer is not newer than our txg. */ ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, MAX(txg, lrtxg), crtxg); dmu_buf_will_dirty(db, tx); ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); ASSERT3U(lr->lr_size, <=, db->db_size); VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); bbt = ztest_bt_bonus(db); ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, txg, crtxg); ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); dmu_buf_rele(db, FTAG); (void) ztest_log_setattr(zd, tx, lr); dmu_tx_commit(tx); ztest_object_unlock(zd, lr->lr_foid); return (0); } zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { NULL, /* 0 no such transaction type */ ztest_replay_create, /* TX_CREATE */ NULL, /* TX_MKDIR */ NULL, /* TX_MKXATTR */ NULL, /* TX_SYMLINK */ ztest_replay_remove, /* TX_REMOVE */ NULL, /* TX_RMDIR */ NULL, /* TX_LINK */ NULL, /* TX_RENAME */ ztest_replay_write, /* TX_WRITE */ ztest_replay_truncate, /* TX_TRUNCATE */ ztest_replay_setattr, /* TX_SETATTR */ NULL, /* TX_ACL */ NULL, /* TX_CREATE_ACL */ NULL, /* TX_CREATE_ATTR */ NULL, /* TX_CREATE_ACL_ATTR */ NULL, /* TX_MKDIR_ACL */ NULL, /* TX_MKDIR_ATTR */ NULL, /* TX_MKDIR_ACL_ATTR */ NULL, /* TX_WRITE2 */ }; /* * ZIL get_data callbacks */ /* ARGSUSED */ static void ztest_get_done(zgd_t *zgd, int error) { ztest_ds_t *zd = zgd->zgd_private; uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); ztest_range_unlock((rl_t *)zgd->zgd_lr); ztest_object_unlock(zd, object); umem_free(zgd, sizeof (*zgd)); } static int ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { ztest_ds_t *zd = arg; objset_t *os = zd->zd_os; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; uint64_t txg = lr->lr_common.lrc_txg; uint64_t crtxg; dmu_object_info_t doi; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); ztest_object_lock(zd, object, RL_READER); error = dmu_bonus_hold(os, object, FTAG, &db); if (error) { ztest_object_unlock(zd, object); return (error); } crtxg = ztest_bt_bonus(db)->bt_crtxg; if (crtxg == 0 || crtxg > txg) { dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, object); return (ENOENT); } dmu_object_info_from_db(db, &doi); dmu_buf_rele(db, FTAG); db = NULL; zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); zgd->zgd_lwb = lwb; zgd->zgd_private = zd; if (buf != NULL) { /* immediate write */ zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, object, offset, size, RL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); ASSERT0(error); } else { size = doi.doi_data_block_size; if (ISP2(size)) { offset = P2ALIGN(offset, size); } else { ASSERT3U(offset, <, size); offset = 0; } zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, object, offset, size, RL_READER); error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT3U(db->db_offset, ==, offset); ASSERT3U(db->db_size, ==, size); error = dmu_sync(zio, lr->lr_common.lrc_txg, ztest_get_done, zgd); if (error == 0) return (0); } } ztest_get_done(zgd, error); return (error); } static void * ztest_lr_alloc(size_t lrsize, char *name) { char *lr; size_t namesize = name ? strlen(name) + 1 : 0; lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); if (name) bcopy(name, lr + lrsize, namesize); return (lr); } static void ztest_lr_free(void *lr, size_t lrsize, char *name) { size_t namesize = name ? strlen(name) + 1 : 0; umem_free(lr, lrsize + namesize); } /* * Lookup a bunch of objects. Returns the number of objects not found. */ static int ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int error; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); for (i = 0; i < count; i++, od++) { od->od_object = 0; error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, sizeof (uint64_t), 1, &od->od_object); if (error) { ASSERT3S(error, ==, ENOENT); ASSERT0(od->od_object); missing++; } else { dmu_buf_t *db; ztest_block_tag_t *bbt; dmu_object_info_t doi; ASSERT3U(od->od_object, !=, 0); ASSERT0(missing); /* there should be no gaps */ ztest_object_lock(zd, od->od_object, RL_READER); VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, FTAG, &db)); dmu_object_info_from_db(db, &doi); bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); od->od_type = doi.doi_type; od->od_blocksize = doi.doi_data_block_size; od->od_gen = bbt->bt_gen; dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, od->od_object); } } return (missing); } static int ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); for (i = 0; i < count; i++, od++) { if (missing) { od->od_object = 0; missing++; continue; } lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); lr->lr_doid = od->od_dir; lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ lr->lrz_type = od->od_crtype; lr->lrz_blocksize = od->od_crblocksize; lr->lrz_ibshift = ztest_random_ibshift(); lr->lrz_bonustype = DMU_OT_UINT64_OTHER; lr->lrz_dnodesize = od->od_crdnodesize; lr->lr_gen = od->od_crgen; lr->lr_crtime[0] = time(NULL); if (ztest_replay_create(zd, lr, B_FALSE) != 0) { ASSERT0(missing); od->od_object = 0; missing++; } else { od->od_object = lr->lr_foid; od->od_type = od->od_crtype; od->od_blocksize = od->od_crblocksize; od->od_gen = od->od_crgen; ASSERT3U(od->od_object, !=, 0); } ztest_lr_free(lr, sizeof (*lr), od->od_name); } return (missing); } static int ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int error; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); od += count - 1; for (i = count - 1; i >= 0; i--, od--) { if (missing) { missing++; continue; } /* * No object was found. */ if (od->od_object == 0) continue; lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); lr->lr_doid = od->od_dir; if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { ASSERT3U(error, ==, ENOSPC); missing++; } else { od->od_object = 0; } ztest_lr_free(lr, sizeof (*lr), od->od_name); } return (missing); } static int ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, void *data) { lr_write_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); lr->lr_foid = object; lr->lr_offset = offset; lr->lr_length = size; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); bcopy(data, lr + 1, size); error = ztest_replay_write(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr) + size, NULL); return (error); } static int ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) { lr_truncate_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr), NULL); lr->lr_foid = object; lr->lr_offset = offset; lr->lr_length = size; error = ztest_replay_truncate(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr), NULL); return (error); } static int ztest_setattr(ztest_ds_t *zd, uint64_t object) { lr_setattr_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr), NULL); lr->lr_foid = object; lr->lr_size = 0; lr->lr_mode = 0; error = ztest_replay_setattr(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr), NULL); return (error); } static void ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) { objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; rl_t *rl; txg_wait_synced(dmu_objset_pool(os), 0); ztest_object_lock(zd, object, RL_READER); rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, object, offset, size); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg != 0) { dmu_prealloc(os, object, offset, size, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); } else { (void) dmu_free_long_range(os, object, offset, size); } ztest_range_unlock(rl); ztest_object_unlock(zd, object); } static void ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) { int err; ztest_block_tag_t wbt; dmu_object_info_t doi; enum ztest_io_type io_type; uint64_t blocksize; void *data; VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); blocksize = doi.doi_data_block_size; data = umem_alloc(blocksize, UMEM_NOFAIL); /* * Pick an i/o type at random, biased toward writing block tags. */ io_type = ztest_random(ZTEST_IO_TYPES); if (ztest_random(2) == 0) io_type = ZTEST_IO_WRITE_TAG; (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); switch (io_type) { case ZTEST_IO_WRITE_TAG: ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, offset, 0, 0, 0); (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); break; case ZTEST_IO_WRITE_PATTERN: (void) memset(data, 'a' + (object + offset) % 5, blocksize); if (ztest_random(2) == 0) { /* * Induce fletcher2 collisions to ensure that * zio_ddt_collision() detects and resolves them * when using fletcher2-verify for deduplication. */ ((uint64_t *)data)[0] ^= 1ULL << 63; ((uint64_t *)data)[4] ^= 1ULL << 63; } (void) ztest_write(zd, object, offset, blocksize, data); break; case ZTEST_IO_WRITE_ZEROES: bzero(data, blocksize); (void) ztest_write(zd, object, offset, blocksize, data); break; case ZTEST_IO_TRUNCATE: (void) ztest_truncate(zd, object, offset, blocksize); break; case ZTEST_IO_SETATTR: (void) ztest_setattr(zd, object); break; default: break; case ZTEST_IO_REWRITE: (void) pthread_rwlock_rdlock(&ztest_name_lock); err = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), B_FALSE); VERIFY(err == 0 || err == ENOSPC); err = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COMPRESSION, ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), B_FALSE); VERIFY(err == 0 || err == ENOSPC); (void) pthread_rwlock_unlock(&ztest_name_lock); VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, DMU_READ_NO_PREFETCH)); (void) ztest_write(zd, object, offset, blocksize, data); break; } (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); umem_free(data, blocksize); } /* * Initialize an object description template. */ static void ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, uint64_t gen) { od->od_dir = ZTEST_DIROBJ; od->od_object = 0; od->od_crtype = type; od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); od->od_crgen = gen; od->od_type = DMU_OT_NONE; od->od_blocksize = 0; od->od_gen = 0; (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", tag, (longlong_t)id, (u_longlong_t)index); } /* * Lookup or create the objects for a test using the od template. * If the objects do not all exist, or if 'remove' is specified, * remove any existing objects and create new ones. Otherwise, * use the existing objects. */ static int ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) { int count = size / sizeof (*od); int rv = 0; mutex_enter(&zd->zd_dirobj_lock); if ((ztest_lookup(zd, od, count) != 0 || remove) && (ztest_remove(zd, od, count) != 0 || ztest_create(zd, od, count) != 0)) rv = -1; zd->zd_od = od; mutex_exit(&zd->zd_dirobj_lock); return (rv); } /* ARGSUSED */ void ztest_zil_commit(ztest_ds_t *zd, uint64_t id) { zilog_t *zilog = zd->zd_zilog; (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); /* * Remember the committed values in zd, which is in parent/child * shared memory. If we die, the next iteration of ztest_run() * will verify that the log really does contain this record. */ mutex_enter(&zilog->zl_lock); ASSERT3P(zd->zd_shared, !=, NULL); ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; mutex_exit(&zilog->zl_lock); (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); } /* * This function is designed to simulate the operations that occur during a * mount/unmount operation. We hold the dataset across these operations in an * attempt to expose any implicit assumptions about ZIL management. */ /* ARGSUSED */ void ztest_zil_remount(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; /* * We hold the ztest_vdev_lock so we don't cause problems with * other threads that wish to remove a log device, such as * ztest_device_removal(). */ mutex_enter(&ztest_vdev_lock); /* * We grab the zd_dirobj_lock to ensure that no other thread is * updating the zil (i.e. adding in-memory log records) and the * zd_zilog_lock to block any I/O. */ mutex_enter(&zd->zd_dirobj_lock); (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); /* zfsvfs_teardown() */ zil_close(zd->zd_zilog); /* zfsvfs_setup() */ VERIFY3P(zil_open(os, ztest_get_data), ==, zd->zd_zilog); zil_replay(os, zd, ztest_replay_vector); (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); mutex_exit(&zd->zd_dirobj_lock); mutex_exit(&ztest_vdev_lock); } /* * Verify that we can't destroy an active pool, create an existing pool, * or create a pool with a bad vdev spec. */ /* ARGSUSED */ void ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) { ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa; nvlist_t *nvroot; if (zo->zo_mmp_test) return; /* * Attempt to create using a bad file. */ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * Attempt to create using a bad mirror. */ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * Attempt to create an existing pool. It shouldn't matter * what's in the nvroot; we should fail with EEXIST. */ (void) pthread_rwlock_rdlock(&ztest_name_lock); nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * We open a reference to the spa and then we try to export it * expecting one of the following errors: * * EBUSY * Because of the reference we just opened. * * ZFS_ERR_EXPORT_IN_PROGRESS * For the case that there is another ztest thread doing * an export concurrently. */ VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); int error = spa_destroy(zo->zo_pool); if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { fatal(0, "spa_destroy(%s) returned unexpected value %d", spa->spa_name, error); } spa_close(spa, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Start and then stop the MMP threads to ensure the startup and shutdown code * works properly. Actual protection and property-related code tested via ZTS. */ /* ARGSUSED */ void ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) { ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa = ztest_spa; if (zo->zo_mmp_test) return; /* * Since enabling MMP involves setting a property, it could not be done * while the pool is suspended. */ if (spa_suspended(spa)) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&spa->spa_props_lock); zfs_multihost_fail_intervals = 0; if (!spa_multihost(spa)) { spa->spa_multihost = B_TRUE; mmp_thread_start(spa); } mutex_exit(&spa->spa_props_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); mmp_signal_all_threads(); txg_wait_synced(spa_get_dsl(spa), 0); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&spa->spa_props_lock); if (spa_multihost(spa)) { mmp_thread_stop(spa); spa->spa_multihost = B_FALSE; } mutex_exit(&spa->spa_props_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* ARGSUSED */ void ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) { spa_t *spa; uint64_t initial_version = SPA_VERSION_INITIAL; uint64_t version, newversion; nvlist_t *nvroot, *props; char *name; if (ztest_opts.zo_mmp_test) return; /* dRAID added after feature flags, skip upgrade test. */ if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) return; mutex_enter(&ztest_vdev_lock); name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); /* * Clean up from previous runs. */ (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the * initial version is capable of supporting that feature. */ switch (ztest_opts.zo_raid_parity) { case 0: case 1: initial_version = SPA_VERSION_INITIAL; break; case 2: initial_version = SPA_VERSION_RAIDZ2; break; case 3: initial_version = SPA_VERSION_RAIDZ3; break; } /* * Create a pool with a spa version that can be upgraded. Pick * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. */ do { version = ztest_random_spa_version(initial_version); } while (version > SPA_VERSION_BEFORE_FEATURES); props = fnvlist_alloc(); fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), version); VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); fnvlist_free(nvroot); fnvlist_free(props); VERIFY0(spa_open(name, &spa, FTAG)); VERIFY3U(spa_version(spa), ==, version); newversion = ztest_random_spa_version(version + 1); if (ztest_opts.zo_verbose >= 4) { (void) printf("upgrading spa version from %llu to %llu\n", (u_longlong_t)version, (u_longlong_t)newversion); } spa_upgrade(spa, newversion); VERIFY3U(spa_version(spa), >, version); VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, zpool_prop_to_name(ZPOOL_PROP_VERSION))); spa_close(spa, FTAG); kmem_strfree(name); mutex_exit(&ztest_vdev_lock); } static void ztest_spa_checkpoint(spa_t *spa) { ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); int error = spa_checkpoint(spa->spa_name); switch (error) { case 0: case ZFS_ERR_DEVRM_IN_PROGRESS: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_CHECKPOINT_EXISTS: break; case ENOSPC: ztest_record_enospc(FTAG); break; default: fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error); } } static void ztest_spa_discard_checkpoint(spa_t *spa) { ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); int error = spa_checkpoint_discard(spa->spa_name); switch (error) { case 0: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_NO_CHECKPOINT: break; default: fatal(0, "spa_discard_checkpoint(%s) = %d", spa->spa_name, error); } } /* ARGSUSED */ void ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; mutex_enter(&ztest_checkpoint_lock); if (ztest_random(2) == 0) { ztest_spa_checkpoint(spa); } else { ztest_spa_discard_checkpoint(spa); } mutex_exit(&ztest_checkpoint_lock); } static vdev_t * vdev_lookup_by_path(vdev_t *vd, const char *path) { vdev_t *mvd; int c; if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) return (vd); for (c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != NULL) return (mvd); return (NULL); } static int spa_num_top_vdevs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); return (rvd->vdev_children); } /* * Verify that vdev_add() works as expected. */ /* ARGSUSED */ void ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; nvlist_t *nvroot; int error; if (ztest_opts.zo_mmp_test) return; mutex_enter(&ztest_vdev_lock); leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; /* * If we have slogs then remove them 1/4 of the time. */ if (spa_has_slogs(spa) && ztest_random(4) == 0) { metaslab_group_t *mg; /* * find the first real slog in log allocation class */ mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; while (!mg->mg_vd->vdev_islog) mg = mg->mg_next; guid = mg->mg_vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); /* * We have to grab the zs_name_lock as writer to * prevent a race between removing a slog (dmu_objset_find) * and destroying a dataset. Removing the slog will * grab a reference on the dataset which may cause * dsl_destroy_head() to fail with EBUSY thus * leaving the dataset in an inconsistent state. */ pthread_rwlock_wrlock(&ztest_name_lock); error = spa_vdev_remove(spa, guid, B_FALSE); pthread_rwlock_unlock(&ztest_name_lock); switch (error) { case 0: case EEXIST: /* Generic zil_reset() error */ case EBUSY: /* Replay required */ case EACCES: /* Crypto key not loaded */ case ZFS_ERR_CHECKPOINT_EXISTS: case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: fatal(0, "spa_vdev_remove() = %d", error); } } else { spa_config_exit(spa, SCL_VDEV, FTAG); /* * Make 1/4 of the devices be log devices */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); fnvlist_free(nvroot); switch (error) { case 0: break; case ENOSPC: ztest_record_enospc("spa_vdev_add"); break; default: fatal(0, "spa_vdev_add() = %d", error); } } mutex_exit(&ztest_vdev_lock); } /* ARGSUSED */ void ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; uint64_t leaves; nvlist_t *nvroot; const char *class = (ztest_random(2) == 0) ? VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; int error; /* * By default add a special vdev 50% of the time */ if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && ztest_random(2) == 0)) { return; } mutex_enter(&ztest_vdev_lock); /* Only test with mirrors */ if (zs->zs_mirrors < 2) { mutex_exit(&ztest_vdev_lock); return; } /* requires feature@allocation_classes */ if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { mutex_exit(&ztest_vdev_lock); return; } leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); fnvlist_free(nvroot); if (error == ENOSPC) ztest_record_enospc("spa_vdev_add"); else if (error != 0) fatal(0, "spa_vdev_add() = %d", error); /* * 50% of the time allow small blocks in the special class */ if (error == 0 && spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { if (ztest_opts.zo_verbose >= 3) (void) printf("Enabling special VDEV small blocks\n"); (void) ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); } mutex_exit(&ztest_vdev_lock); if (ztest_opts.zo_verbose >= 3) { metaslab_class_t *mc; if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) mc = spa_special_class(spa); else mc = spa_dedup_class(spa); (void) printf("Added a %s mirrored vdev (of %d)\n", class, (int)mc->mc_groups); } } /* * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. */ /* ARGSUSED */ void ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; spa_aux_vdev_t *sav; char *aux; char *path; uint64_t guid = 0; int error, ignore_err = 0; if (ztest_opts.zo_mmp_test) return; path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); if (ztest_random(2) == 0) { sav = &spa->spa_spares; aux = ZPOOL_CONFIG_SPARES; } else { sav = &spa->spa_l2cache; aux = ZPOOL_CONFIG_L2CACHE; } mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); if (sav->sav_count != 0 && ztest_random(4) == 0) { /* * Pick a random device to remove. */ vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) ignore_err = ENOTSUP; guid = svd->vdev_guid; } else { /* * Find an unused device we can add. */ zs->zs_vdev_aux = 0; for (;;) { int c; (void) snprintf(path, MAXPATHLEN, ztest_aux_template, ztest_opts.zo_dir, ztest_opts.zo_pool, aux, zs->zs_vdev_aux); for (c = 0; c < sav->sav_count; c++) if (strcmp(sav->sav_vdevs[c]->vdev_path, path) == 0) break; if (c == sav->sav_count && vdev_lookup_by_path(rvd, path) == NULL) break; zs->zs_vdev_aux++; } } spa_config_exit(spa, SCL_VDEV, FTAG); if (guid == 0) { /* * Add a new device. */ nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); error = spa_vdev_add(spa, nvroot); switch (error) { case 0: break; default: fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); } fnvlist_free(nvroot); } else { /* * Remove an existing device. Sometimes, dirty its * vdev state first to make sure we handle removal * of devices that have pending state changes. */ if (ztest_random(2) == 0) (void) vdev_online(spa, guid, 0, NULL); error = spa_vdev_remove(spa, guid, B_FALSE); switch (error) { case 0: case EBUSY: case ZFS_ERR_CHECKPOINT_EXISTS: case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: if (error != ignore_err) fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); } } mutex_exit(&ztest_vdev_lock); umem_free(path, MAXPATHLEN); } /* * split a pool if it has mirror tlvdevs */ /* ARGSUSED */ void ztest_split_pool(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; nvlist_t *tree, **child, *config, *split, **schild; uint_t c, children, schildren = 0, lastlogid = 0; int error = 0; if (ztest_opts.zo_mmp_test) return; mutex_enter(&ztest_vdev_lock); /* ensure we have a usable config; mirrors of raidz aren't supported */ if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { mutex_exit(&ztest_vdev_lock); return; } /* clean up the old pool, if any */ (void) spa_destroy("splitp"); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* generate a config from the existing config */ mutex_enter(&spa->spa_props_lock); tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); mutex_exit(&spa->spa_props_lock); VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, &children)); schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); for (c = 0; c < children; c++) { vdev_t *tvd = rvd->vdev_child[c]; nvlist_t **mchild; uint_t mchildren; if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { schild[schildren] = fnvlist_alloc(); fnvlist_add_string(schild[schildren], ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); fnvlist_add_uint64(schild[schildren], ZPOOL_CONFIG_IS_HOLE, 1); if (lastlogid == 0) lastlogid = schildren; ++schildren; continue; } lastlogid = 0; VERIFY0(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); schild[schildren++] = fnvlist_dup(mchild[0]); } /* OK, create a config that can be used to split */ split = fnvlist_alloc(); fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, lastlogid != 0 ? lastlogid : schildren); config = fnvlist_alloc(); fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); for (c = 0; c < schildren; c++) fnvlist_free(schild[c]); free(schild); fnvlist_free(split); spa_config_exit(spa, SCL_VDEV, FTAG); (void) pthread_rwlock_wrlock(&ztest_name_lock); error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); (void) pthread_rwlock_unlock(&ztest_name_lock); fnvlist_free(config); if (error == 0) { (void) printf("successful split - results:\n"); mutex_enter(&spa_namespace_lock); show_pool_stats(spa); show_pool_stats(spa_lookup("splitp")); mutex_exit(&spa_namespace_lock); ++zs->zs_splits; --zs->zs_mirrors; } mutex_exit(&ztest_vdev_lock); } /* * Verify that we can attach and detach devices. */ /* ARGSUSED */ void ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; spa_aux_vdev_t *sav = &spa->spa_spares; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *pvd; nvlist_t *root; uint64_t leaves; uint64_t leaf, top; uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; uint64_t oldsize, newsize; char *oldpath, *newpath; int replacing; int oldvd_has_siblings = B_FALSE; int newvd_is_spare = B_FALSE; int newvd_is_dspare = B_FALSE; int oldvd_is_log; int error, expected_error; if (ztest_opts.zo_mmp_test) return; oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * If a vdev is in the process of being removed, its removal may * finish while we are in progress, leading to an unexpected error * value. Don't bother trying to attach while we are in the middle * of removal. */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_ALL, FTAG); goto out; } /* * Decide whether to do an attach or a replace. */ replacing = ztest_random(2); /* * Pick a random top-level vdev. */ top = ztest_random_vdev_top(spa, B_TRUE); /* * Pick a random leaf within it. */ leaf = ztest_random(leaves); /* * Locate this vdev. */ oldvd = rvd->vdev_child[top]; /* pick a child from the mirror */ if (zs->zs_mirrors >= 1) { ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; } /* pick a child out of the raidz group */ if (ztest_opts.zo_raid_children > 1) { if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); else ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; } /* * If we're already doing an attach or replace, oldvd may be a * mirror vdev -- in which case, pick a random child. */ while (oldvd->vdev_children != 0) { oldvd_has_siblings = B_TRUE; ASSERT3U(oldvd->vdev_children, >=, 2); oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; } oldguid = oldvd->vdev_guid; oldsize = vdev_get_min_asize(oldvd); oldvd_is_log = oldvd->vdev_top->vdev_islog; (void) strcpy(oldpath, oldvd->vdev_path); pvd = oldvd->vdev_parent; pguid = pvd->vdev_guid; /* * If oldvd has siblings, then half of the time, detach it. Prior * to the detach the pool is scrubbed in order to prevent creating * unrepairable blocks as a result of the data corruption injection. */ if (oldvd_has_siblings && ztest_random(2) == 0) { spa_config_exit(spa, SCL_ALL, FTAG); error = ztest_scrub_impl(spa); if (error) goto out; error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); if (error != 0 && error != ENODEV && error != EBUSY && error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && error != ZFS_ERR_DISCARDING_CHECKPOINT) fatal(0, "detach (%s) returned %d", oldpath, error); goto out; } /* * For the new vdev, choose with equal probability between the two * standard paths (ending in either 'a' or 'b') or a random hot spare. */ if (sav->sav_count != 0 && ztest_random(3) == 0) { newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; newvd_is_spare = B_TRUE; if (newvd->vdev_ops == &vdev_draid_spare_ops) newvd_is_dspare = B_TRUE; (void) strcpy(newpath, newvd->vdev_path); } else { (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + leaf); if (ztest_random(2) == 0) newpath[strlen(newpath) - 1] = 'b'; newvd = vdev_lookup_by_path(rvd, newpath); } if (newvd) { /* * Reopen to ensure the vdev's asize field isn't stale. */ vdev_reopen(newvd); newsize = vdev_get_min_asize(newvd); } else { /* * Make newsize a little bigger or smaller than oldsize. * If it's smaller, the attach should fail. * If it's larger, and we're doing a replace, * we should get dynamic LUN growth when we're done. */ newsize = 10 * oldsize / (9 + ztest_random(3)); } /* * If pvd is not a mirror or root, the attach should fail with ENOTSUP, * unless it's a replace; in that case any non-replacing parent is OK. * * If newvd is already part of the pool, it should fail with EBUSY. * * If newvd is too small, it should fail with EOVERFLOW. * * If newvd is a distributed spare and it's being attached to a * dRAID which is not its parent it should fail with EINVAL. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || pvd->vdev_ops == &vdev_replacing_ops || pvd->vdev_ops == &vdev_spare_ops)) expected_error = ENOTSUP; else if (newvd_is_spare && (!replacing || oldvd_is_log)) expected_error = ENOTSUP; else if (newvd == oldvd) expected_error = replacing ? 0 : EBUSY; else if (vdev_lookup_by_path(rvd, newpath) != NULL) expected_error = EBUSY; else if (!newvd_is_dspare && newsize < oldsize) expected_error = EOVERFLOW; else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) expected_error = ENOTSUP; else expected_error = 0; spa_config_exit(spa, SCL_ALL, FTAG); /* * Build the nvlist describing newpath. */ root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, ashift, NULL, 0, 0, 1); /* * When supported select either a healing or sequential resilver. */ boolean_t rebuilding = B_FALSE; if (pvd->vdev_ops == &vdev_mirror_ops || pvd->vdev_ops == &vdev_root_ops) { rebuilding = !!ztest_random(2); } error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); fnvlist_free(root); /* * If our parent was the replacing vdev, but the replace completed, * then instead of failing with ENOTSUP we may either succeed, * fail with ENODEV, or fail with EOVERFLOW. */ if (expected_error == ENOTSUP && (error == 0 || error == ENODEV || error == EOVERFLOW)) expected_error = error; /* * If someone grew the LUN, the replacement may be too small. */ if (error == EOVERFLOW || error == EBUSY) expected_error = error; if (error == ZFS_ERR_CHECKPOINT_EXISTS || error == ZFS_ERR_DISCARDING_CHECKPOINT || error == ZFS_ERR_RESILVER_IN_PROGRESS || error == ZFS_ERR_REBUILD_IN_PROGRESS) expected_error = error; if (error != expected_error && expected_error != EBUSY) { fatal(0, "attach (%s %llu, %s %llu, %d) " "returned %d, expected %d", oldpath, oldsize, newpath, newsize, replacing, error, expected_error); } out: mutex_exit(&ztest_vdev_lock); umem_free(oldpath, MAXPATHLEN); umem_free(newpath, MAXPATHLEN); } /* ARGSUSED */ void ztest_device_removal(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; vdev_t *vd; uint64_t guid; int error; mutex_enter(&ztest_vdev_lock); if (ztest_device_removal_active) { mutex_exit(&ztest_vdev_lock); return; } /* * Remove a random top-level vdev and wait for removal to finish. */ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); guid = vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); error = spa_vdev_remove(spa, guid, B_FALSE); if (error == 0) { ztest_device_removal_active = B_TRUE; mutex_exit(&ztest_vdev_lock); /* * spa->spa_vdev_removal is created in a sync task that * is initiated via dsl_sync_task_nowait(). Since the * task may not run before spa_vdev_remove() returns, we * must wait at least 1 txg to ensure that the removal * struct has been created. */ txg_wait_synced(spa_get_dsl(spa), 0); while (spa->spa_removing_phys.sr_state == DSS_SCANNING) txg_wait_synced(spa_get_dsl(spa), 0); } else { mutex_exit(&ztest_vdev_lock); return; } /* * The pool needs to be scrubbed after completing device removal. * Failure to do so may result in checksum errors due to the * strategy employed by ztest_fault_inject() when selecting which * offset are redundant and can be damaged. */ error = spa_scan(spa, POOL_SCAN_SCRUB); if (error == 0) { while (dsl_scan_scrubbing(spa_get_dsl(spa))) txg_wait_synced(spa_get_dsl(spa), 0); } mutex_enter(&ztest_vdev_lock); ztest_device_removal_active = B_FALSE; mutex_exit(&ztest_vdev_lock); } /* * Callback function which expands the physical size of the vdev. */ static vdev_t * grow_vdev(vdev_t *vd, void *arg) { spa_t *spa __maybe_unused = vd->vdev_spa; size_t *newsize = arg; size_t fsize; int fd; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); if ((fd = open(vd->vdev_path, O_RDWR)) == -1) return (vd); fsize = lseek(fd, 0, SEEK_END); VERIFY0(ftruncate(fd, *newsize)); if (ztest_opts.zo_verbose >= 6) { (void) printf("%s grew from %lu to %lu bytes\n", vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); } (void) close(fd); return (NULL); } /* * Callback function which expands a given vdev by calling vdev_online(). */ /* ARGSUSED */ static vdev_t * online_vdev(vdev_t *vd, void *arg) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint64_t guid = vd->vdev_guid; uint64_t generation = spa->spa_config_generation + 1; vdev_state_t newstate = VDEV_STATE_UNKNOWN; int error; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); /* Calling vdev_online will initialize the new metaslabs */ spa_config_exit(spa, SCL_STATE, spa); error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* * If vdev_online returned an error or the underlying vdev_open * failed then we abort the expand. The only way to know that * vdev_open fails is by checking the returned newstate. */ if (error || newstate != VDEV_STATE_HEALTHY) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Unable to expand vdev, state %llu, " "error %d\n", (u_longlong_t)newstate, error); } return (vd); } ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); /* * Since we dropped the lock we need to ensure that we're * still talking to the original vdev. It's possible this * vdev may have been detached/replaced while we were * trying to online it. */ if (generation != spa->spa_config_generation) { if (ztest_opts.zo_verbose >= 5) { (void) printf("vdev configuration has changed, " "guid %llu, state %llu, expected gen %llu, " "got gen %llu\n", (u_longlong_t)guid, (u_longlong_t)tvd->vdev_state, (u_longlong_t)generation, (u_longlong_t)spa->spa_config_generation); } return (vd); } return (NULL); } /* * Traverse the vdev tree calling the supplied function. * We continue to walk the tree until we either have walked all * children or we receive a non-NULL return from the callback. * If a NULL callback is passed, then we just return back the first * leaf vdev we encounter. */ static vdev_t * vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) { uint_t c; if (vd->vdev_ops->vdev_op_leaf) { if (func == NULL) return (vd); else return (func(vd, arg)); } for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) return (cvd); } return (NULL); } /* * Verify that dynamic LUN growth works as expected. */ /* ARGSUSED */ void ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; vdev_t *vd, *tvd; metaslab_class_t *mc; metaslab_group_t *mg; size_t psize, newsize; uint64_t top; uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; mutex_enter(&ztest_checkpoint_lock); mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* * If there is a vdev removal in progress, it could complete while * we are running, in which case we would not be able to verify * that the metaslab_class space increased (because it decreases * when the device removal completes). */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } top = ztest_random_vdev_top(spa, B_TRUE); tvd = spa->spa_root_vdev->vdev_child[top]; mg = tvd->vdev_mg; mc = mg->mg_class; old_ms_count = tvd->vdev_ms_count; old_class_space = metaslab_class_get_space(mc); /* * Determine the size of the first leaf vdev associated with * our top-level device. */ vd = vdev_walk_tree(tvd, NULL, NULL); ASSERT3P(vd, !=, NULL); ASSERT(vd->vdev_ops->vdev_op_leaf); psize = vd->vdev_psize; /* * We only try to expand the vdev if it's healthy, less than 4x its * original size, and it has a valid psize. */ if (tvd->vdev_state != VDEV_STATE_HEALTHY || psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } ASSERT3U(psize, >, 0); newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); ASSERT3U(newsize, >, psize); if (ztest_opts.zo_verbose >= 6) { (void) printf("Expanding LUN %s from %lu to %lu\n", vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); } /* * Growing the vdev is a two step process: * 1). expand the physical size (i.e. relabel) * 2). online the vdev to create the new metaslabs */ if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || vdev_walk_tree(tvd, online_vdev, NULL) != NULL || tvd->vdev_state != VDEV_STATE_HEALTHY) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not expand LUN because " "the vdev configuration changed.\n"); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } spa_config_exit(spa, SCL_STATE, spa); /* * Expanding the LUN will update the config asynchronously, * thus we must wait for the async thread to complete any * pending tasks before proceeding. */ for (;;) { boolean_t done; mutex_enter(&spa->spa_async_lock); done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); mutex_exit(&spa->spa_async_lock); if (done) break; txg_wait_synced(spa_get_dsl(spa), 0); (void) poll(NULL, 0, 100); } spa_config_enter(spa, SCL_STATE, spa, RW_READER); tvd = spa->spa_root_vdev->vdev_child[top]; new_ms_count = tvd->vdev_ms_count; new_class_space = metaslab_class_get_space(mc); if (tvd->vdev_mg != mg || mg->mg_class != mc) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not verify LUN expansion due to " "intervening vdev offline or remove.\n"); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } /* * Make sure we were able to grow the vdev. */ if (new_ms_count <= old_ms_count) { fatal(0, "LUN expansion failed: ms_count %llu < %llu\n", old_ms_count, new_ms_count); } /* * Make sure we were able to grow the pool. */ if (new_class_space <= old_class_space) { fatal(0, "LUN expansion failed: class_space %llu < %llu\n", old_class_space, new_class_space); } if (ztest_opts.zo_verbose >= 5) { char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); (void) printf("%s grew from %s to %s\n", spa->spa_name, oldnumbuf, newnumbuf); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); } /* * Verify that dmu_objset_{create,destroy,open,close} work as expected. */ /* ARGSUSED */ static void ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { /* * Create the objects common to all ztest datasets. */ VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); } static int ztest_dataset_create(char *dsname) { int err; uint64_t rand; dsl_crypto_params_t *dcp = NULL; /* * 50% of the time, we create encrypted datasets * using a random cipher suite and a hard-coded * wrapping key. */ rand = ztest_random(2); if (rand != 0) { nvlist_t *crypto_args = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); /* slight bias towards the default cipher suite */ rand = ztest_random(ZIO_CRYPT_FUNCTIONS); if (rand < ZIO_CRYPT_AES_128_CCM) rand = ZIO_CRYPT_ON; fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); fnvlist_add_uint8_array(crypto_args, "wkeydata", (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); /* * These parameters aren't really used by the kernel. They * are simply stored so that userspace knows how to load * the wrapping key. */ fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); fnvlist_add_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, crypto_args, &dcp)); /* * Cycle through all available encryption implementations * to verify interoperability. */ VERIFY0(gcm_impl_set("cycle")); VERIFY0(aes_impl_set("cycle")); fnvlist_free(crypto_args); fnvlist_free(props); } err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, ztest_objset_create_cb, NULL); dsl_crypto_params_free(dcp, !!err); rand = ztest_random(100); if (err || rand < 80) return (err); if (ztest_opts.zo_verbose >= 5) (void) printf("Setting dataset %s to sync always\n", dsname); return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, ZFS_SYNC_ALWAYS, B_FALSE)); } /* ARGSUSED */ static int ztest_objset_destroy_cb(const char *name, void *arg) { objset_t *os; dmu_object_info_t doi; int error; /* * Verify that the dataset contains a directory object. */ VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, FTAG, &os)); error = dmu_object_info(os, ZTEST_DIROBJ, &doi); if (error != ENOENT) { /* We could have crashed in the middle of destroying it */ ASSERT0(error); ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); ASSERT3S(doi.doi_physical_blocks_512, >=, 0); } dmu_objset_disown(os, B_TRUE, FTAG); /* * Destroy the dataset. */ if (strchr(name, '@') != NULL) { VERIFY0(dsl_destroy_snapshot(name, B_TRUE)); } else { error = dsl_destroy_head(name); if (error == ENOSPC) { /* There could be checkpoint or insufficient slop */ ztest_record_enospc(FTAG); } else if (error != EBUSY) { /* There could be a hold on this dataset */ ASSERT0(error); } } return (0); } static boolean_t ztest_snapshot_create(char *osname, uint64_t id) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); error = dmu_objset_snapshot_one(osname, snapname); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (B_FALSE); } if (error != 0 && error != EEXIST) { fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, snapname, error); } return (B_TRUE); } static boolean_t ztest_snapshot_destroy(char *osname, uint64_t id) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, (u_longlong_t)id); error = dsl_destroy_snapshot(snapname, B_FALSE); if (error != 0 && error != ENOENT) fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); return (B_TRUE); } /* ARGSUSED */ void ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) { ztest_ds_t *zdtmp; int iters; int error; objset_t *os, *os2; char name[ZFS_MAX_DATASET_NAME_LEN]; zilog_t *zilog; int i; zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) snprintf(name, sizeof (name), "%s/temp_%llu", ztest_opts.zo_pool, (u_longlong_t)id); /* * If this dataset exists from a previous run, process its replay log * half of the time. If we don't replay it, then dsl_destroy_head() * (invoked from ztest_objset_destroy_cb()) should just throw it away. */ if (ztest_random(2) == 0 && ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os) == 0) { ztest_zd_init(zdtmp, NULL, os); zil_replay(os, zdtmp, ztest_replay_vector); ztest_zd_fini(zdtmp); dmu_objset_disown(os, B_TRUE, FTAG); } /* * There may be an old instance of the dataset we're about to * create lying around from a previous run. If so, destroy it * and all of its snapshots. */ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); /* * Verify that the destroyed dataset is no longer in the namespace. */ VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, FTAG, &os)); /* * Verify that we can create a new dataset. */ error = ztest_dataset_create(name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_objset_create(%s) = %d", name, error); } VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os)); ztest_zd_init(zdtmp, NULL, os); /* * Open the intent log for it. */ zilog = zil_open(os, ztest_get_data); /* * Put some objects in there, do a little I/O to them, * and randomly take a couple of snapshots along the way. */ iters = ztest_random(5); for (i = 0; i < iters; i++) { ztest_dmu_object_alloc_free(zdtmp, id); if (ztest_random(iters) == 0) (void) ztest_snapshot_create(name, i); } /* * Verify that we cannot create an existing dataset. */ VERIFY3U(EEXIST, ==, dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); /* * Verify that we can hold an objset that is also owned. */ VERIFY0(dmu_objset_hold(name, FTAG, &os2)); dmu_objset_rele(os2, FTAG); /* * Verify that we cannot own an objset that is already owned. */ VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os2)); zil_close(zilog); dmu_objset_disown(os, B_TRUE, FTAG); ztest_zd_fini(zdtmp); out: (void) pthread_rwlock_unlock(&ztest_name_lock); umem_free(zdtmp, sizeof (ztest_ds_t)); } /* * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. */ void ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) { (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) ztest_snapshot_destroy(zd->zd_name, id); (void) ztest_snapshot_create(zd->zd_name, id); (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Cleanup non-standard snapshots and clones. */ static void ztest_dsl_dataset_cleanup(char *osname, uint64_t id) { char *snap1name; char *clone1name; char *snap2name; char *clone2name; char *snap3name; int error; snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%llu", osname, (u_longlong_t)id); (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%llu", osname, (u_longlong_t)id); (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%llu", clone1name, (u_longlong_t)id); (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%llu", osname, (u_longlong_t)id); (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%llu", clone1name, (u_longlong_t)id); error = dsl_destroy_head(clone2name); if (error && error != ENOENT) fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); error = dsl_destroy_snapshot(snap3name, B_FALSE); if (error && error != ENOENT) fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); error = dsl_destroy_snapshot(snap2name, B_FALSE); if (error && error != ENOENT) fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); error = dsl_destroy_head(clone1name); if (error && error != ENOENT) fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); error = dsl_destroy_snapshot(snap1name, B_FALSE); if (error && error != ENOENT) fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); } /* * Verify dsl_dataset_promote handles EBUSY */ void ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) { objset_t *os; char *snap1name; char *clone1name; char *snap2name; char *clone2name; char *snap3name; char *osname = zd->zd_name; int error; snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); (void) pthread_rwlock_rdlock(&ztest_name_lock); ztest_dsl_dataset_cleanup(osname, id); (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%llu", osname, (u_longlong_t)id); (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%llu", osname, (u_longlong_t)id); (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%llu", clone1name, (u_longlong_t)id); (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%llu", osname, (u_longlong_t)id); (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%llu", clone1name, (u_longlong_t)id); error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); } error = dmu_objset_clone(clone1name, snap1name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); } error = dmu_objset_clone(clone2name, snap3name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); } error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os); if (error) fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); error = dsl_dataset_promote(clone2name, NULL); if (error == ENOSPC) { dmu_objset_disown(os, B_TRUE, FTAG); ztest_record_enospc(FTAG); goto out; } if (error != EBUSY) fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, error); dmu_objset_disown(os, B_TRUE, FTAG); out: ztest_dsl_dataset_cleanup(osname, id); (void) pthread_rwlock_unlock(&ztest_name_lock); umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 4 /* * Verify that dmu_object_{alloc,free} work as expected. */ void ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) { ztest_od_t *od; int batchsize; int size; int b; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); batchsize = OD_ARRAY_SIZE; for (b = 0; b < batchsize; b++) ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0, 0); /* * Destroy the previous batch of objects, create a new batch, * and do some I/O on the new objects. */ if (ztest_object_init(zd, od, size, B_TRUE) != 0) return; while (ztest_random(4 * batchsize) != 0) ztest_io(zd, od[ztest_random(batchsize)].od_object, ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); umem_free(od, size); } /* * Rewind the global allocator to verify object allocation backfilling. */ void ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; uint64_t object; /* * Rewind the global allocator randomly back to a lower object number * to force backfilling and reclamation of recently freed dnodes. */ mutex_enter(&os->os_obj_lock); object = ztest_random(os->os_obj_next_chunk); os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); mutex_exit(&os->os_obj_lock); } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 2 /* * Verify that dmu_{read,write} work as expected. */ void ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) { int size; ztest_od_t *od; objset_t *os = zd->zd_os; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); dmu_tx_t *tx; int i, freeit, error; uint64_t n, s, txg; bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 40; int free_percent = 5; /* * This test uses two objects, packobj and bigobj, that are always * updated together (i.e. in the same tx) so that their contents are * in sync and can be compared. Their contents relate to each other * in a simple way: packobj is a dense array of 'bufwad' structures, * while bigobj is a sparse array of the same bufwads. Specifically, * for any index n, there are three bufwads that should be identical: * * packobj, at offset n * sizeof (bufwad_t) * bigobj, at the head of the nth chunk * bigobj, at the tail of the nth chunk * * The chunk size is arbitrary. It doesn't have to be a power of two, * and it doesn't have any relation to the object blocksize. * The only requirement is that it can hold at least two bufwads. * * Normally, we write the bufwad to each of these locations. * However, free_percent of the time we instead write zeroes to * packobj and perform a dmu_free_range() on bigobj. By comparing * bigobj to packobj, we can verify that the DMU is correctly * tracking which parts of an object are allocated and free, * and that the contents of the allocated blocks are correct. */ /* * Read the directory info. If it's the first time, set things up. */ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); if (ztest_object_init(zd, od, size, B_FALSE) != 0) { umem_free(od, size); return; } bigobj = od[0].od_object; packobj = od[1].od_object; chunksize = od[0].od_gen; ASSERT3U(chunksize, ==, od[1].od_gen); /* * Prefetch a random chunk of the big object. * Our aim here is to get some async reads in flight * for blocks that we may free below; the DMU should * handle this race correctly. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(2 * width - 1); dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, ZIO_PRIORITY_SYNC_READ); /* * Pick a random index and compute the offsets into packobj and bigobj. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(width - 1); packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); bigoff = n * chunksize; bigsize = s * chunksize; packbuf = umem_alloc(packsize, UMEM_NOFAIL); bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); /* * free_percent of the time, free a range of bigobj rather than * overwriting it. */ freeit = (ztest_random(100) < free_percent); /* * Read the current contents of our objects. */ error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); ASSERT0(error); /* * Get a tx for the mods to both packobj and bigobj. */ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, packobj, packoff, packsize); if (freeit) dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); else dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); /* This accounts for setting the checksum/compression. */ dmu_tx_hold_bonus(tx, bigobj); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(od, size); return; } enum zio_checksum cksum; do { cksum = (enum zio_checksum) ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); dmu_object_set_checksum(os, bigobj, cksum, tx); enum zio_compress comp; do { comp = (enum zio_compress) ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); dmu_object_set_compress(os, bigobj, comp, tx); /* * For each index from n to n + s, verify that the existing bufwad * in packobj matches the bufwads at the head and tail of the * corresponding chunk in bigobj. Then update all three bufwads * with the new values we want to write out. */ for (i = 0; i < s; i++) { /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); if (pack->bw_txg > txg) fatal(0, "future leak: got %llx, open txg is %llx", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) fatal(0, "wrong index: got %llx, wanted %llx+%llx", pack->bw_index, n, i); if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); if (freeit) { bzero(pack, sizeof (bufwad_t)); } else { pack->bw_index = n + i; pack->bw_txg = txg; pack->bw_data = 1 + ztest_random(-2ULL); } *bigH = *pack; *bigT = *pack; } /* * We've verified all the old bufwads, and made new ones. * Now write them out. */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (freeit) { if (ztest_opts.zo_verbose >= 7) { (void) printf("freeing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); } else { if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); } dmu_tx_commit(tx); /* * Sanity check the stuff we just wrote. */ { void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT0(bcmp(packbuf, packcheck, packsize)); ASSERT0(bcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); } umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(od, size); } static void compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) { uint64_t i; bufwad_t *pack; bufwad_t *bigH; bufwad_t *bigT; /* * For each index from n to n + s, verify that the existing bufwad * in packobj matches the bufwads at the head and tail of the * corresponding chunk in bigobj. Then update all three bufwads * with the new values we want to write out. */ for (i = 0; i < s; i++) { /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); if (pack->bw_txg > txg) fatal(0, "future leak: got %llx, open txg is %llx", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) fatal(0, "wrong index: got %llx, wanted %llx+%llx", pack->bw_index, n, i); if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); pack->bw_index = n + i; pack->bw_txg = txg; pack->bw_data = 1 + ztest_random(-2ULL); *bigH = *pack; *bigT = *pack; } } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 2 void ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; dmu_tx_t *tx; uint64_t i; int error; int size; uint64_t n, s, txg; bufwad_t *packbuf, *bigbuf; uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; uint64_t blocksize = ztest_random_blocksize(); uint64_t chunksize = blocksize; uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 9; dmu_buf_t *bonus_db; arc_buf_t **bigbuf_arcbufs; dmu_object_info_t doi; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); /* * This test uses two objects, packobj and bigobj, that are always * updated together (i.e. in the same tx) so that their contents are * in sync and can be compared. Their contents relate to each other * in a simple way: packobj is a dense array of 'bufwad' structures, * while bigobj is a sparse array of the same bufwads. Specifically, * for any index n, there are three bufwads that should be identical: * * packobj, at offset n * sizeof (bufwad_t) * bigobj, at the head of the nth chunk * bigobj, at the tail of the nth chunk * * The chunk size is set equal to bigobj block size so that * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. */ /* * Read the directory info. If it's the first time, set things up. */ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); if (ztest_object_init(zd, od, size, B_FALSE) != 0) { umem_free(od, size); return; } bigobj = od[0].od_object; packobj = od[1].od_object; blocksize = od[0].od_blocksize; chunksize = blocksize; ASSERT3U(chunksize, ==, od[1].od_gen); VERIFY0(dmu_object_info(os, bigobj, &doi)); VERIFY(ISP2(doi.doi_data_block_size)); VERIFY3U(chunksize, ==, doi.doi_data_block_size); VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); /* * Pick a random index and compute the offsets into packobj and bigobj. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(width - 1); packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); bigoff = n * chunksize; bigsize = s * chunksize; packbuf = umem_zalloc(packsize, UMEM_NOFAIL); bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); /* * Iteration 0 test zcopy for DB_UNCACHED dbufs. * Iteration 1 test zcopy to already referenced dbufs. * Iteration 2 test zcopy to dirty dbuf in the same txg. * Iteration 3 test zcopy to dbuf dirty in previous txg. * Iteration 4 test zcopy when dbuf is no longer dirty. * Iteration 5 test zcopy when it can't be done. * Iteration 6 one more zcopy write. */ for (i = 0; i < 7; i++) { uint64_t j; uint64_t off; /* * In iteration 5 (i == 5) use arcbufs * that don't match bigobj blksz to test * dmu_assign_arcbuf_by_dbuf() when it can't directly * assign an arcbuf to a dbuf. */ for (j = 0; j < s; j++) { if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bigbuf_arcbufs[j] = dmu_request_arcbuf(bonus_db, chunksize); } else { bigbuf_arcbufs[2 * j] = dmu_request_arcbuf(bonus_db, chunksize / 2); bigbuf_arcbufs[2 * j + 1] = dmu_request_arcbuf(bonus_db, chunksize / 2); } } /* * Get a tx for the mods to both packobj and bigobj. */ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, packobj, packoff, packsize); dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); for (j = 0; j < s; j++) { if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_return_arcbuf(bigbuf_arcbufs[j]); } else { dmu_return_arcbuf( bigbuf_arcbufs[2 * j]); dmu_return_arcbuf( bigbuf_arcbufs[2 * j + 1]); } } umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); umem_free(od, size); dmu_buf_rele(bonus_db, FTAG); return; } /* * 50% of the time don't read objects in the 1st iteration to * test dmu_assign_arcbuf_by_dbuf() for the case when there are * no existing dbufs for the specified offsets. */ if (i != 0 || ztest_random(2) != 0) { error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); ASSERT0(error); } compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, n, chunksize, txg); /* * We've verified all the old bufwads, and made new ones. * Now write them out. */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bcopy((caddr_t)bigbuf + (off - bigoff), bigbuf_arcbufs[j]->b_data, chunksize); } else { bcopy((caddr_t)bigbuf + (off - bigoff), bigbuf_arcbufs[2 * j]->b_data, chunksize / 2); bcopy((caddr_t)bigbuf + (off - bigoff) + chunksize / 2, bigbuf_arcbufs[2 * j + 1]->b_data, chunksize / 2); } if (i == 1) { VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off, bigbuf_arcbufs[j], tx)); } else { VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off, bigbuf_arcbufs[2 * j], tx)); VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off + chunksize / 2, bigbuf_arcbufs[2 * j + 1], tx)); } if (i == 1) { dmu_buf_rele(dbt, FTAG); } } dmu_tx_commit(tx); /* * Sanity check the stuff we just wrote. */ { void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT0(bcmp(packbuf, packcheck, packsize)); ASSERT0(bcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); } if (i == 2) { txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); } else if (i == 3) { txg_wait_synced(dmu_objset_pool(os), 0); } } dmu_buf_rele(bonus_db, FTAG); umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); umem_free(od, size); } /* ARGSUSED */ void ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) { ztest_od_t *od; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); uint64_t offset = (1ULL << (ztest_random(20) + 43)) + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); /* * Have multiple threads write to large offsets in an object * to verify that parallel writes to an object -- even to the * same blocks within the object -- doesn't cause any trouble. */ ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) return; while (ztest_random(10) != 0) ztest_io(zd, od->od_object, offset); umem_free(od, sizeof (ztest_od_t)); } void ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) { ztest_od_t *od; uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); uint64_t count = ztest_random(20) + 1; uint64_t blocksize = ztest_random_blocksize(); void *data; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } ztest_prealloc(zd, od->od_object, offset, count * blocksize); data = umem_zalloc(blocksize, UMEM_NOFAIL); while (ztest_random(count) != 0) { uint64_t randoff = offset + (ztest_random(count) * blocksize); if (ztest_write(zd, od->od_object, randoff, blocksize, data) != 0) break; while (ztest_random(4) != 0) ztest_io(zd, od->od_object, randoff); } umem_free(data, blocksize); umem_free(od, sizeof (ztest_od_t)); } /* * Verify that zap_{create,destroy,add,remove,update} work as expected. */ #define ZTEST_ZAP_MIN_INTS 1 #define ZTEST_ZAP_MAX_INTS 4 #define ZTEST_ZAP_MAX_PROPS 1000 void ztest_zap(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t object; uint64_t txg, last_txg; uint64_t value[ZTEST_ZAP_MAX_INTS]; uint64_t zl_ints, zl_intsize, prop; int i, ints; dmu_tx_t *tx; char propname[100], txgname[100]; int error; char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) goto out; object = od->od_object; /* * Generate a known hash collision, and verify that * we can lookup and remove both entries. */ tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; for (i = 0; i < 2; i++) { value[i] = i; VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); } for (i = 0; i < 2; i++) { VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); VERIFY0( zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); } for (i = 0; i < 2; i++) { VERIFY0(zap_remove(os, object, hc[i], tx)); } dmu_tx_commit(tx); /* * Generate a bunch of random entries. */ ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); prop = ztest_random(ZTEST_ZAP_MAX_PROPS); (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); bzero(value, sizeof (value)); last_txg = 0; /* * If these zap entries already exist, validate their contents. */ error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); if (error == 0) { ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); VERIFY0(zap_lookup(os, object, txgname, zl_intsize, zl_ints, &last_txg)); VERIFY0(zap_length(os, object, propname, &zl_intsize, &zl_ints)); ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, ints); VERIFY0(zap_lookup(os, object, propname, zl_intsize, zl_ints, value)); for (i = 0; i < ints; i++) { ASSERT3U(value[i], ==, last_txg + object + i); } } else { ASSERT3U(error, ==, ENOENT); } /* * Atomically update two entries in our zap object. * The first is named txg_%llu, and contains the txg * in which the property was last updated. The second * is named prop_%llu, and the nth element of its value * should be txg + object + n. */ tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; if (last_txg > txg) fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); for (i = 0; i < ints; i++) value[i] = txg + object + i; VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx)); VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), ints, value, tx)); dmu_tx_commit(tx); /* * Remove a random pair of entries. */ prop = ztest_random(ZTEST_ZAP_MAX_PROPS); (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); if (error == ENOENT) goto out; ASSERT0(error); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; VERIFY0(zap_remove(os, object, txgname, tx)); VERIFY0(zap_remove(os, object, propname, tx)); dmu_tx_commit(tx); out: umem_free(od, sizeof (ztest_od_t)); } /* * Test case to test the upgrading of a microzap to fatzap. */ void ztest_fzap(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t object, txg; int i; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) goto out; object = od->od_object; /* * Add entries to this ZAP and make sure it spills over * and gets upgraded to a fatzap. Also, since we are adding * 2050 entries we should see ptrtbl growth and leaf-block split. */ for (i = 0; i < 2050; i++) { char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t value = i; dmu_tx_t *tx; int error; (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", (u_longlong_t)id, (u_longlong_t)value); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, name); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; error = zap_add(os, object, name, sizeof (uint64_t), 1, &value, tx); ASSERT(error == 0 || error == EEXIST); dmu_tx_commit(tx); } out: umem_free(od, sizeof (ztest_od_t)); } /* ARGSUSED */ void ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; dmu_tx_t *tx; int i, namelen, error; int micro = ztest_random(2); char name[20], string_value[20]; void *data; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } object = od->od_object; /* * Generate a random name of the form 'xxx.....' where each * x is a random printable character and the dots are dots. * There are 94 such characters, and the name length goes from * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. */ namelen = ztest_random(sizeof (name) - 5) + 5 + 1; for (i = 0; i < 3; i++) name[i] = '!' + ztest_random('~' - '!' + 1); for (; i < namelen - 1; i++) name[i] = '.'; name[i] = '\0'; if ((namelen & 1) || micro) { wsize = sizeof (txg); wc = 1; data = &txg; } else { wsize = 1; wc = namelen; data = string_value; } count = -1ULL; VERIFY0(zap_count(os, object, &count)); ASSERT3S(count, !=, -1ULL); /* * Select an operation: length, lookup, add, update, remove. */ i = ztest_random(5); if (i >= 2) { tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(od, sizeof (ztest_od_t)); return; } bcopy(name, string_value, namelen); } else { tx = NULL; txg = 0; bzero(string_value, namelen); } switch (i) { case 0: error = zap_length(os, object, name, &zl_wsize, &zl_wc); if (error == 0) { ASSERT3U(wsize, ==, zl_wsize); ASSERT3U(wc, ==, zl_wc); } else { ASSERT3U(error, ==, ENOENT); } break; case 1: error = zap_lookup(os, object, name, wsize, wc, data); if (error == 0) { if (data == string_value && bcmp(name, data, namelen) != 0) fatal(0, "name '%s' != val '%s' len %d", name, data, namelen); } else { ASSERT3U(error, ==, ENOENT); } break; case 2: error = zap_add(os, object, name, wsize, wc, data, tx); ASSERT(error == 0 || error == EEXIST); break; case 3: VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); break; case 4: error = zap_remove(os, object, name, tx); ASSERT(error == 0 || error == ENOENT); break; } if (tx != NULL) dmu_tx_commit(tx); umem_free(od, sizeof (ztest_od_t)); } /* * Commit callback data. */ typedef struct ztest_cb_data { list_node_t zcd_node; uint64_t zcd_txg; int zcd_expected_err; boolean_t zcd_added; boolean_t zcd_called; spa_t *zcd_spa; } ztest_cb_data_t; /* This is the actual commit callback function */ static void ztest_commit_callback(void *arg, int error) { ztest_cb_data_t *data = arg; uint64_t synced_txg; VERIFY3P(data, !=, NULL); VERIFY3S(data->zcd_expected_err, ==, error); VERIFY(!data->zcd_called); synced_txg = spa_last_synced_txg(data->zcd_spa); if (data->zcd_txg > synced_txg) fatal(0, "commit callback of txg %" PRIu64 " called prematurely" ", last synced txg = %" PRIu64 "\n", data->zcd_txg, synced_txg); data->zcd_called = B_TRUE; if (error == ECANCELED) { ASSERT0(data->zcd_txg); ASSERT(!data->zcd_added); /* * The private callback data should be destroyed here, but * since we are going to check the zcd_called field after * dmu_tx_abort(), we will destroy it there. */ return; } ASSERT(data->zcd_added); ASSERT3U(data->zcd_txg, !=, 0); (void) mutex_enter(&zcl.zcl_callbacks_lock); /* See if this cb was called more quickly */ if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) zc_min_txg_delay = synced_txg - data->zcd_txg; /* Remove our callback from the list */ list_remove(&zcl.zcl_callbacks, data); (void) mutex_exit(&zcl.zcl_callbacks_lock); umem_free(data, sizeof (ztest_cb_data_t)); } /* Allocate and initialize callback data structure */ static ztest_cb_data_t * ztest_create_cb_data(objset_t *os, uint64_t txg) { ztest_cb_data_t *cb_data; cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); cb_data->zcd_txg = txg; cb_data->zcd_spa = dmu_objset_spa(os); list_link_init(&cb_data->zcd_node); return (cb_data); } /* * Commit callback test. */ void ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; dmu_tx_t *tx; ztest_cb_data_t *cb_data[3], *tmp_cb; uint64_t old_txg, txg; int i, error = 0; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } tx = dmu_tx_create(os); cb_data[0] = ztest_create_cb_data(os, 0); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); /* Every once in a while, abort the transaction on purpose */ if (ztest_random(100) == 0) error = -1; if (!error) error = dmu_tx_assign(tx, TXG_NOWAIT); txg = error ? 0 : dmu_tx_get_txg(tx); cb_data[0]->zcd_txg = txg; cb_data[1] = ztest_create_cb_data(os, txg); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); if (error) { /* * It's not a strict requirement to call the registered * callbacks from inside dmu_tx_abort(), but that's what * it's supposed to happen in the current implementation * so we will check for that. */ for (i = 0; i < 2; i++) { cb_data[i]->zcd_expected_err = ECANCELED; VERIFY(!cb_data[i]->zcd_called); } dmu_tx_abort(tx); for (i = 0; i < 2; i++) { VERIFY(cb_data[i]->zcd_called); umem_free(cb_data[i], sizeof (ztest_cb_data_t)); } umem_free(od, sizeof (ztest_od_t)); return; } cb_data[2] = ztest_create_cb_data(os, txg); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); /* * Read existing data to make sure there isn't a future leak. */ VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), &old_txg, DMU_READ_PREFETCH)); if (old_txg > txg) fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, old_txg, txg); dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); (void) mutex_enter(&zcl.zcl_callbacks_lock); /* * Since commit callbacks don't have any ordering requirement and since * it is theoretically possible for a commit callback to be called * after an arbitrary amount of time has elapsed since its txg has been * synced, it is difficult to reliably determine whether a commit * callback hasn't been called due to high load or due to a flawed * implementation. * * In practice, we will assume that if after a certain number of txgs a * commit callback hasn't been called, then most likely there's an * implementation bug.. */ tmp_cb = list_head(&zcl.zcl_callbacks); if (tmp_cb != NULL && tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { fatal(0, "Commit callback threshold exceeded, oldest txg: %" PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); } /* * Let's find the place to insert our callbacks. * * Even though the list is ordered by txg, it is possible for the * insertion point to not be the end because our txg may already be * quiescing at this point and other callbacks in the open txg * (from other objsets) may have sneaked in. */ tmp_cb = list_tail(&zcl.zcl_callbacks); while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); /* Add the 3 callbacks to the list */ for (i = 0; i < 3; i++) { if (tmp_cb == NULL) list_insert_head(&zcl.zcl_callbacks, cb_data[i]); else list_insert_after(&zcl.zcl_callbacks, tmp_cb, cb_data[i]); cb_data[i]->zcd_added = B_TRUE; VERIFY(!cb_data[i]->zcd_called); tmp_cb = cb_data[i]; } zc_cb_counter += 3; (void) mutex_exit(&zcl.zcl_callbacks_lock); dmu_tx_commit(tx); umem_free(od, sizeof (ztest_od_t)); } /* * Visit each object in the dataset. Verify that its properties * are consistent what was stored in the block tag when it was created, * and that its unused bonus buffer space has not been overwritten. */ /* ARGSUSED */ void ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; uint64_t obj; int err = 0; for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { ztest_block_tag_t *bt = NULL; dmu_object_info_t doi; dmu_buf_t *db; ztest_object_lock(zd, obj, RL_READER); if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { ztest_object_unlock(zd, obj); continue; } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_size >= sizeof (*bt)) bt = ztest_bt_bonus(db); if (bt && bt->bt_magic == BT_MAGIC) { ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, bt->bt_offset, bt->bt_gen, bt->bt_txg, bt->bt_crtxg); ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); } dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, obj); } } /* ARGSUSED */ void ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) { zfs_prop_t proplist[] = { ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_COPIES, ZFS_PROP_DEDUP }; int p; (void) pthread_rwlock_rdlock(&ztest_name_lock); for (p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, ztest_random_blocksize(), (int)ztest_random(2))); (void) pthread_rwlock_unlock(&ztest_name_lock); } /* ARGSUSED */ void ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) { nvlist_t *props = NULL; (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); VERIFY0(spa_prop_get(ztest_spa, &props)); if (ztest_opts.zo_verbose >= 6) dump_nvlist(props, 4); fnvlist_free(props); (void) pthread_rwlock_unlock(&ztest_name_lock); } static int user_release_one(const char *snapname, const char *holdname) { nvlist_t *snaps, *holds; int error; snaps = fnvlist_alloc(); holds = fnvlist_alloc(); fnvlist_add_boolean(holds, holdname); fnvlist_add_nvlist(snaps, snapname, holds); fnvlist_free(holds); error = dsl_dataset_user_release(snaps, NULL); fnvlist_free(snaps); return (error); } /* * Test snapshot hold/release and deferred destroy. */ void ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) { int error; objset_t *os = zd->zd_os; objset_t *origin; char snapname[100]; char fullname[100]; char clonename[100]; char tag[100]; char osname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *holds; (void) pthread_rwlock_rdlock(&ztest_name_lock); dmu_objset_name(os, osname); (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", (u_longlong_t)id); (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%llu", osname, (u_longlong_t)id); (void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id); /* * Clean up from any previous run. */ error = dsl_destroy_head(clonename); if (error != ENOENT) ASSERT0(error); error = user_release_one(fullname, tag); if (error != ESRCH && error != ENOENT) ASSERT0(error); error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != ENOENT) ASSERT0(error); /* * Create snapshot, clone it, mark snap for deferred destroy, * destroy clone, verify snap was also destroyed. */ error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); goto out; } fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); } error = dmu_objset_clone(clonename, fullname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_clone"); goto out; } fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = dsl_destroy_head(clonename); if (error) fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); error = dmu_objset_hold(fullname, FTAG, &origin); if (error != ENOENT) fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); /* * Create snapshot, add temporary hold, verify that we can't * destroy a held snapshot, mark for deferred destroy, * release hold, verify snapshot was destroyed. */ error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); goto out; } fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); } holds = fnvlist_alloc(); fnvlist_add_string(holds, fullname, tag); error = dsl_dataset_user_hold(holds, 0, NULL); fnvlist_free(holds); if (error == ENOSPC) { ztest_record_enospc("dsl_dataset_user_hold"); goto out; } else if (error) { fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", fullname, tag, error); } error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != EBUSY) { fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", fullname, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = user_release_one(fullname, tag); if (error) fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); out: (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Inject random faults into the on-disk data. */ /* ARGSUSED */ void ztest_fault_inject(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; int fd; uint64_t offset; uint64_t leaves; uint64_t bad = 0x1990c0ffeedecadeull; uint64_t top, leaf; char *path0; char *pathrand; size_t fsize; int bshift = SPA_MAXBLOCKSHIFT + 2; int iters = 1000; int maxfaults; int mirror_save; vdev_t *vd0 = NULL; uint64_t guid0 = 0; boolean_t islog = B_FALSE; path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); /* * Device removal is in progress, fault injection must be disabled * until it completes and the pool is scrubbed. The fault injection * strategy for damaging blocks does not take in to account evacuated * blocks which may have already been damaged. */ if (ztest_device_removal_active) { mutex_exit(&ztest_vdev_lock); goto out; } maxfaults = MAXFAULTS(zs); leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; mirror_save = zs->zs_mirrors; mutex_exit(&ztest_vdev_lock); ASSERT3U(leaves, >=, 1); /* * While ztest is running the number of leaves will not change. This * is critical for the fault injection logic as it determines where * errors can be safely injected such that they are always repairable. * * When restarting ztest a different number of leaves may be requested * which will shift the regions to be damaged. This is fine as long * as the pool has been scrubbed prior to using the new mapping. * Failure to do can result in non-repairable damage being injected. */ if (ztest_pool_scrubbed == B_FALSE) goto out; /* * Grab the name lock as reader. There are some operations * which don't like to have their vdevs changed while * they are in progress (i.e. spa_change_guid). Those * operations will have grabbed the name lock as writer. */ (void) pthread_rwlock_rdlock(&ztest_name_lock); /* * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (ztest_random(2) == 0) { /* * Inject errors on a normal data device or slog device. */ top = ztest_random_vdev_top(spa, B_TRUE); leaf = ztest_random(leaves) + zs->zs_splits; /* * Generate paths to the first leaf in this top-level vdev, * and to the random leaf we selected. We'll induce transient * write failures and random online/offline activity on leaf 0, * and we'll write random garbage to the randomly chosen leaf. */ (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + zs->zs_splits); (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + leaf); vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); if (vd0 != NULL && vd0->vdev_top->vdev_islog) islog = B_TRUE; /* * If the top-level vdev needs to be resilvered * then we only allow faults on the device that is * resilvering. */ if (vd0 != NULL && maxfaults != 1 && (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || vd0->vdev_resilver_txg != 0)) { /* * Make vd0 explicitly claim to be unreadable, * or unwriteable, or reach behind its back * and close the underlying fd. We can do this if * maxfaults == 0 because we'll fail and reexecute, * and we can do it if maxfaults >= 2 because we'll * have enough redundancy. If maxfaults == 1, the * combination of this with injection of random data * corruption below exceeds the pool's fault tolerance. */ vdev_file_t *vf = vd0->vdev_tsd; zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", (long long)vd0->vdev_id, (int)maxfaults); if (vf != NULL && ztest_random(3) == 0) { (void) close(vf->vf_file->f_fd); vf->vf_file->f_fd = -1; } else if (ztest_random(2) == 0) { vd0->vdev_cant_read = B_TRUE; } else { vd0->vdev_cant_write = B_TRUE; } guid0 = vd0->vdev_guid; } } else { /* * Inject errors on an l2cache device. */ spa_aux_vdev_t *sav = &spa->spa_l2cache; if (sav->sav_count == 0) { spa_config_exit(spa, SCL_STATE, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); goto out; } vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; guid0 = vd0->vdev_guid; (void) strcpy(path0, vd0->vdev_path); (void) strcpy(pathrand, vd0->vdev_path); leaf = 0; leaves = 1; maxfaults = INT_MAX; /* no limit on cache devices */ } spa_config_exit(spa, SCL_STATE, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); /* * If we can tolerate two or more faults, or we're dealing * with a slog, randomly online/offline vd0. */ if ((maxfaults >= 2 || islog) && guid0 != 0) { if (ztest_random(10) < 6) { int flags = (ztest_random(2) == 0 ? ZFS_OFFLINE_TEMPORARY : 0); /* * We have to grab the zs_name_lock as writer to * prevent a race between offlining a slog and * destroying a dataset. Offlining the slog will * grab a reference on the dataset which may cause * dsl_destroy_head() to fail with EBUSY thus * leaving the dataset in an inconsistent state. */ if (islog) (void) pthread_rwlock_wrlock(&ztest_name_lock); VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); if (islog) (void) pthread_rwlock_unlock(&ztest_name_lock); } else { /* * Ideally we would like to be able to randomly * call vdev_[on|off]line without holding locks * to force unpredictable failures but the side * effects of vdev_[on|off]line prevent us from * doing so. We grab the ztest_vdev_lock here to * prevent a race between injection testing and * aux_vdev removal. */ mutex_enter(&ztest_vdev_lock); (void) vdev_online(spa, guid0, 0, NULL); mutex_exit(&ztest_vdev_lock); } } if (maxfaults == 0) goto out; /* * We have at least single-fault tolerance, so inject data corruption. */ fd = open(pathrand, O_RDWR); if (fd == -1) /* we hit a gap in the device namespace */ goto out; fsize = lseek(fd, 0, SEEK_END); while (--iters != 0) { /* * The offset must be chosen carefully to ensure that * we do not inject a given logical block with errors * on two different leaf devices, because ZFS can not * tolerate that (if maxfaults==1). * * To achieve this we divide each leaf device into * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). * Each chunk is further divided into error-injection * ranges (can accept errors) and clear ranges (we do * not inject errors in those). Each error-injection * range can accept errors only for a single leaf vdev. * Error-injection ranges are separated by clear ranges. * * For example, with 3 leaves, each chunk looks like: * 0 to 32M: injection range for leaf 0 * 32M to 64M: clear range - no injection allowed * 64M to 96M: injection range for leaf 1 * 96M to 128M: clear range - no injection allowed * 128M to 160M: injection range for leaf 2 * 160M to 192M: clear range - no injection allowed * * Each clear range must be large enough such that a * single block cannot straddle it. This way a block * can't be a target in two different injection ranges * (on different leaf vdevs). */ offset = ztest_random(fsize / (leaves << bshift)) * (leaves << bshift) + (leaf << bshift) + (ztest_random(1ULL << (bshift - 1)) & -8ULL); /* * Only allow damage to the labels at one end of the vdev. * * If all labels are damaged, the device will be totally * inaccessible, which will result in loss of data, * because we also damage (parts of) the other side of * the mirror/raidz. * * Additionally, we will always have both an even and an * odd label, so that we can handle crashes in the * middle of vdev_config_sync(). */ if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) continue; /* * The two end labels are stored at the "end" of the disk, but * the end of the disk (vdev_psize) is aligned to * sizeof (vdev_label_t). */ uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); if ((leaf & 1) == 1 && offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; mutex_enter(&ztest_vdev_lock); if (mirror_save != zs->zs_mirrors) { mutex_exit(&ztest_vdev_lock); (void) close(fd); goto out; } if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) fatal(1, "can't inject bad word at 0x%llx in %s", offset, pathrand); mutex_exit(&ztest_vdev_lock); if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," " offset 0x%llx\n", pathrand, (u_longlong_t)offset); } (void) close(fd); out: umem_free(path0, MAXPATHLEN); umem_free(pathrand, MAXPATHLEN); } /* * By design ztest will never inject uncorrectable damage in to the pool. * Issue a scrub, wait for it to complete, and verify there is never any * persistent damage. * * Only after a full scrub has been completed is it safe to start injecting * data corruption. See the comment in zfs_fault_inject(). */ static int ztest_scrub_impl(spa_t *spa) { int error = spa_scan(spa, POOL_SCAN_SCRUB); if (error) return (error); while (dsl_scan_scrubbing(spa_get_dsl(spa))) txg_wait_synced(spa_get_dsl(spa), 0); if (spa_get_errlog_size(spa) > 0) return (ECKSUM); ztest_pool_scrubbed = B_TRUE; return (0); } /* * Scrub the pool. */ /* ARGSUSED */ void ztest_scrub(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; int error; /* * Scrub in progress by device removal. */ if (ztest_device_removal_active) return; /* * Start a scrub, wait a moment, then force a restart. */ (void) spa_scan(spa, POOL_SCAN_SCRUB); (void) poll(NULL, 0, 100); error = ztest_scrub_impl(spa); if (error == EBUSY) error = 0; ASSERT0(error); } /* * Change the guid for the pool. */ /* ARGSUSED */ void ztest_reguid(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; uint64_t orig, load; int error; if (ztest_opts.zo_mmp_test) return; orig = spa_guid(spa); load = spa_load_guid(spa); (void) pthread_rwlock_wrlock(&ztest_name_lock); error = spa_change_guid(spa); (void) pthread_rwlock_unlock(&ztest_name_lock); if (error != 0) return; if (ztest_opts.zo_verbose >= 4) { (void) printf("Changed guid old %llu -> %llu\n", (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); } VERIFY3U(orig, !=, spa_guid(spa)); VERIFY3U(load, ==, spa_load_guid(spa)); } void ztest_fletcher(ztest_ds_t *zd, uint64_t id) { hrtime_t end = gethrtime() + NANOSEC; while (gethrtime() <= end) { int run_count = 100; void *buf; struct abd *abd_data, *abd_meta; uint32_t size; int *ptr; int i; zio_cksum_t zc_ref; zio_cksum_t zc_ref_byteswap; size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); abd_data = abd_alloc(size, B_FALSE); abd_meta = abd_alloc(size, B_TRUE); for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) *ptr = ztest_random(UINT_MAX); abd_copy_from_buf_off(abd_data, buf, 0, size); abd_copy_from_buf_off(abd_meta, buf, 0, size); VERIFY0(fletcher_4_impl_set("scalar")); fletcher_4_native(buf, size, NULL, &zc_ref); fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); VERIFY0(fletcher_4_impl_set("cycle")); while (run_count-- > 0) { zio_cksum_t zc; zio_cksum_t zc_byteswap; fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); fletcher_4_native(buf, size, NULL, &zc); VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); /* Test ABD - data */ abd_fletcher_4_byteswap(abd_data, size, NULL, &zc_byteswap); abd_fletcher_4_native(abd_data, size, NULL, &zc); VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); /* Test ABD - metadata */ abd_fletcher_4_byteswap(abd_meta, size, NULL, &zc_byteswap); abd_fletcher_4_native(abd_meta, size, NULL, &zc); VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); } umem_free(buf, size); abd_free(abd_data); abd_free(abd_meta); } } void ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) { void *buf; size_t size; int *ptr; int i; zio_cksum_t zc_ref; zio_cksum_t zc_ref_bswap; hrtime_t end = gethrtime() + NANOSEC; while (gethrtime() <= end) { int run_count = 100; size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) *ptr = ztest_random(UINT_MAX); VERIFY0(fletcher_4_impl_set("scalar")); fletcher_4_native(buf, size, NULL, &zc_ref); fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); VERIFY0(fletcher_4_impl_set("cycle")); while (run_count-- > 0) { zio_cksum_t zc; zio_cksum_t zc_bswap; size_t pos = 0; ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); while (pos < size) { size_t inc = 64 * ztest_random(size / 67); /* sometimes add few bytes to test non-simd */ if (ztest_random(100) < 10) inc += P2ALIGN(ztest_random(64), sizeof (uint32_t)); if (inc > (size - pos)) inc = size - pos; fletcher_4_incremental_native(buf + pos, inc, &zc); fletcher_4_incremental_byteswap(buf + pos, inc, &zc_bswap); pos += inc; } VERIFY3U(pos, ==, size); VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); /* * verify if incremental on the whole buffer is * equivalent to non-incremental version */ ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); fletcher_4_incremental_native(buf, size, &zc); fletcher_4_incremental_byteswap(buf, size, &zc_bswap); VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); } umem_free(buf, size); } } static int ztest_set_global_vars(void) { for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { char *kv = ztest_opts.zo_gvars[i]; VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); VERIFY3U(strlen(kv), >, 0); int err = set_global_var(kv); if (ztest_opts.zo_verbose > 0) { (void) printf("setting global var %s ... %s\n", kv, err ? "failed" : "ok"); } if (err != 0) { (void) fprintf(stderr, "failed to set global var '%s'\n", kv); return (err); } } return (0); } static char ** ztest_global_vars_to_zdb_args(void) { char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); char **cur = args; for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { char *kv = ztest_opts.zo_gvars[i]; *cur = "-o"; cur++; *cur = strdup(kv); cur++; } ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); *cur = NULL; return (args); } /* The end of strings is indicated by a NULL element */ static char * join_strings(char **strings, const char *sep) { size_t totallen = 0; for (char **sp = strings; *sp != NULL; sp++) { totallen += strlen(*sp); totallen += strlen(sep); } if (totallen > 0) { ASSERT(totallen >= strlen(sep)); totallen -= strlen(sep); } size_t buflen = totallen + 1; char *o = malloc(buflen); /* trailing 0 byte */ o[0] = '\0'; for (char **sp = strings; *sp != NULL; sp++) { size_t would; would = strlcat(o, *sp, buflen); VERIFY3U(would, <, buflen); if (*(sp+1) == NULL) { break; } would = strlcat(o, sep, buflen); VERIFY3U(would, <, buflen); } ASSERT3S(strlen(o), ==, totallen); return (o); } static int ztest_check_path(char *path) { struct stat s; /* return true on success */ return (!stat(path, &s)); } static void ztest_get_zdb_bin(char *bin, int len) { char *zdb_path; /* * Try to use ZDB_PATH and in-tree zdb path. If not successful, just * let popen to search through PATH. */ if ((zdb_path = getenv("ZDB_PATH"))) { strlcpy(bin, zdb_path, len); /* In env */ if (!ztest_check_path(bin)) { ztest_dump_core = 0; fatal(1, "invalid ZDB_PATH '%s'", bin); } return; } VERIFY3P(realpath(getexecname(), bin), !=, NULL); if (strstr(bin, "/ztest/")) { strstr(bin, "/ztest/")[0] = '\0'; /* In-tree */ strcat(bin, "/zdb/zdb"); if (ztest_check_path(bin)) return; } strcpy(bin, "zdb"); } static vdev_t * ztest_random_concrete_vdev_leaf(vdev_t *vd) { if (vd == NULL) return (NULL); if (vd->vdev_children == 0) return (vd); vdev_t *eligible[vd->vdev_children]; int eligible_idx = 0, i; for (i = 0; i < vd->vdev_children; i++) { vdev_t *cvd = vd->vdev_child[i]; if (cvd->vdev_top->vdev_removing) continue; if (cvd->vdev_children > 0 || (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { eligible[eligible_idx++] = cvd; } } VERIFY3S(eligible_idx, >, 0); uint64_t child_no = ztest_random(eligible_idx); return (ztest_random_concrete_vdev_leaf(eligible[child_no])); } /* ARGSUSED */ void ztest_initialize(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; int error = 0; mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* Random leaf vdev */ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); if (rand_vd == NULL) { spa_config_exit(spa, SCL_VDEV, FTAG); mutex_exit(&ztest_vdev_lock); return; } /* * The random vdev we've selected may change as soon as we * drop the spa_config_lock. We create local copies of things * we're interested in. */ uint64_t guid = rand_vd->vdev_guid; char *path = strdup(rand_vd->vdev_path); boolean_t active = rand_vd->vdev_initialize_thread != NULL; zfs_dbgmsg("vd %px, guid %llu", rand_vd, guid); spa_config_exit(spa, SCL_VDEV, FTAG); uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *vdev_errlist = fnvlist_alloc(); fnvlist_add_uint64(vdev_guids, path, guid); error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); fnvlist_free(vdev_guids); fnvlist_free(vdev_errlist); switch (cmd) { case POOL_INITIALIZE_CANCEL: if (ztest_opts.zo_verbose >= 4) { (void) printf("Cancel initialize %s", path); if (!active) (void) printf(" failed (no initialize active)"); (void) printf("\n"); } break; case POOL_INITIALIZE_START: if (ztest_opts.zo_verbose >= 4) { (void) printf("Start initialize %s", path); if (active && error == 0) (void) printf(" failed (already active)"); else if (error != 0) (void) printf(" failed (error %d)", error); (void) printf("\n"); } break; case POOL_INITIALIZE_SUSPEND: if (ztest_opts.zo_verbose >= 4) { (void) printf("Suspend initialize %s", path); if (!active) (void) printf(" failed (no initialize active)"); (void) printf("\n"); } break; } free(path); mutex_exit(&ztest_vdev_lock); } /* ARGSUSED */ void ztest_trim(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; int error = 0; mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* Random leaf vdev */ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); if (rand_vd == NULL) { spa_config_exit(spa, SCL_VDEV, FTAG); mutex_exit(&ztest_vdev_lock); return; } /* * The random vdev we've selected may change as soon as we * drop the spa_config_lock. We create local copies of things * we're interested in. */ uint64_t guid = rand_vd->vdev_guid; char *path = strdup(rand_vd->vdev_path); boolean_t active = rand_vd->vdev_trim_thread != NULL; zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); spa_config_exit(spa, SCL_VDEV, FTAG); uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); uint64_t rate = 1 << ztest_random(30); boolean_t partial = (ztest_random(5) > 0); boolean_t secure = (ztest_random(5) > 0); nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *vdev_errlist = fnvlist_alloc(); fnvlist_add_uint64(vdev_guids, path, guid); error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, secure, vdev_errlist); fnvlist_free(vdev_guids); fnvlist_free(vdev_errlist); switch (cmd) { case POOL_TRIM_CANCEL: if (ztest_opts.zo_verbose >= 4) { (void) printf("Cancel TRIM %s", path); if (!active) (void) printf(" failed (no TRIM active)"); (void) printf("\n"); } break; case POOL_TRIM_START: if (ztest_opts.zo_verbose >= 4) { (void) printf("Start TRIM %s", path); if (active && error == 0) (void) printf(" failed (already active)"); else if (error != 0) (void) printf(" failed (error %d)", error); (void) printf("\n"); } break; case POOL_TRIM_SUSPEND: if (ztest_opts.zo_verbose >= 4) { (void) printf("Suspend TRIM %s", path); if (!active) (void) printf(" failed (no TRIM active)"); (void) printf("\n"); } break; } free(path); mutex_exit(&ztest_vdev_lock); } /* * Verify pool integrity by running zdb. */ static void ztest_run_zdb(char *pool) { int status; char *bin; char *zdb; char *zbuf; const int len = MAXPATHLEN + MAXNAMELEN + 20; FILE *fp; bin = umem_alloc(len, UMEM_NOFAIL); zdb = umem_alloc(len, UMEM_NOFAIL); zbuf = umem_alloc(1024, UMEM_NOFAIL); ztest_get_zdb_bin(bin, len); char **set_gvars_args = ztest_global_vars_to_zdb_args(); char *set_gvars_args_joined = join_strings(set_gvars_args, " "); free(set_gvars_args); size_t would = snprintf(zdb, len, "%s -bcc%s%s -G -d -Y -e -y %s -p %s %s", bin, ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 4 ? "v" : "", set_gvars_args_joined, ztest_opts.zo_dir, pool); ASSERT3U(would, <, len); free(set_gvars_args_joined); if (ztest_opts.zo_verbose >= 5) (void) printf("Executing %s\n", strstr(zdb, "zdb ")); fp = popen(zdb, "r"); while (fgets(zbuf, 1024, fp) != NULL) if (ztest_opts.zo_verbose >= 3) (void) printf("%s", zbuf); status = pclose(fp); if (status == 0) goto out; ztest_dump_core = 0; if (WIFEXITED(status)) fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); else fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); out: umem_free(bin, len); umem_free(zdb, len); umem_free(zbuf, 1024); } static void ztest_walk_pool_directory(char *header) { spa_t *spa = NULL; if (ztest_opts.zo_verbose >= 6) (void) printf("%s\n", header); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) if (ztest_opts.zo_verbose >= 6) (void) printf("\t%s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); } static void ztest_spa_import_export(char *oldname, char *newname) { nvlist_t *config, *newconfig; uint64_t pool_guid; spa_t *spa; int error; if (ztest_opts.zo_verbose >= 4) { (void) printf("import/export: old = %s, new = %s\n", oldname, newname); } /* * Clean up from previous runs. */ (void) spa_destroy(newname); /* * Get the pool's configuration and guid. */ VERIFY0(spa_open(oldname, &spa, FTAG)); /* * Kick off a scrub to tickle scrub/export races. */ if (ztest_random(2) == 0) (void) spa_scan(spa, POOL_SCAN_SCRUB); pool_guid = spa_guid(spa); spa_close(spa, FTAG); ztest_walk_pool_directory("pools before export"); /* * Export it. */ VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); ztest_walk_pool_directory("pools after export"); /* * Try to import it. */ newconfig = spa_tryimport(config); ASSERT3P(newconfig, !=, NULL); fnvlist_free(newconfig); /* * Import it under the new name. */ error = spa_import(newname, config, NULL, 0); if (error != 0) { dump_nvlist(config, 0); fatal(B_FALSE, "couldn't import pool %s as %s: error %u", oldname, newname, error); } ztest_walk_pool_directory("pools after import"); /* * Try to import it again -- should fail with EEXIST. */ VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); /* * Try to import it under a different name -- should fail with EEXIST. */ VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); /* * Verify that the pool is no longer visible under the old name. */ VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); /* * Verify that we can open and close the pool using the new name. */ VERIFY0(spa_open(newname, &spa, FTAG)); ASSERT3U(pool_guid, ==, spa_guid(spa)); spa_close(spa, FTAG); fnvlist_free(config); } static void ztest_resume(spa_t *spa) { if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) (void) printf("resuming from suspended state\n"); spa_vdev_state_enter(spa, SCL_NONE); vdev_clear(spa, NULL); (void) spa_vdev_state_exit(spa, NULL, 0); (void) zio_resume(spa); } static void ztest_resume_thread(void *arg) { spa_t *spa = arg; while (!ztest_exiting) { if (spa_suspended(spa)) ztest_resume(spa); (void) poll(NULL, 0, 100); /* * Periodically change the zfs_compressed_arc_enabled setting. */ if (ztest_random(10) == 0) zfs_compressed_arc_enabled = ztest_random(2); /* * Periodically change the zfs_abd_scatter_enabled setting. */ if (ztest_random(10) == 0) zfs_abd_scatter_enabled = ztest_random(2); } thread_exit(); } static void ztest_deadman_thread(void *arg) { ztest_shared_t *zs = arg; spa_t *spa = ztest_spa; hrtime_t delay, overdue, last_run = gethrtime(); delay = (zs->zs_thread_stop - zs->zs_thread_start) + MSEC2NSEC(zfs_deadman_synctime_ms); while (!ztest_exiting) { /* * Wait for the delay timer while checking occasionally * if we should stop. */ if (gethrtime() < last_run + delay) { (void) poll(NULL, 0, 1000); continue; } /* * If the pool is suspended then fail immediately. Otherwise, * check to see if the pool is making any progress. If * vdev_deadman() discovers that there hasn't been any recent * I/Os then it will end up aborting the tests. */ if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { fatal(0, "aborting test after %llu seconds because " "pool has transitioned to a suspended state.", zfs_deadman_synctime_ms / 1000); } vdev_deadman(spa->spa_root_vdev, FTAG); /* * If the process doesn't complete within a grace period of * zfs_deadman_synctime_ms over the expected finish time, * then it may be hung and is terminated. */ overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); if (gethrtime() > overdue) { fatal(0, "aborting test after %llu seconds because " "the process is overdue for termination.", (gethrtime() - zs->zs_proc_start) / NANOSEC); } (void) printf("ztest has been running for %lld seconds\n", (gethrtime() - zs->zs_proc_start) / NANOSEC); last_run = gethrtime(); delay = MSEC2NSEC(zfs_deadman_checktime_ms); } thread_exit(); } static void ztest_execute(int test, ztest_info_t *zi, uint64_t id) { ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); hrtime_t functime = gethrtime(); int i; for (i = 0; i < zi->zi_iters; i++) zi->zi_func(zd, id); functime = gethrtime() - functime; atomic_add_64(&zc->zc_count, 1); atomic_add_64(&zc->zc_time, functime); if (ztest_opts.zo_verbose >= 4) (void) printf("%6.2f sec in %s\n", (double)functime / NANOSEC, zi->zi_funcname); } static void ztest_thread(void *arg) { int rand; uint64_t id = (uintptr_t)arg; ztest_shared_t *zs = ztest_shared; uint64_t call_next; hrtime_t now; ztest_info_t *zi; ztest_shared_callstate_t *zc; while ((now = gethrtime()) < zs->zs_thread_stop) { /* * See if it's time to force a crash. */ if (now > zs->zs_thread_kill) ztest_kill(zs); /* * If we're getting ENOSPC with some regularity, stop. */ if (zs->zs_enospc_count > 10) break; /* * Pick a random function to execute. */ rand = ztest_random(ZTEST_FUNCS); zi = &ztest_info[rand]; zc = ZTEST_GET_SHARED_CALLSTATE(rand); call_next = zc->zc_next; if (now >= call_next && atomic_cas_64(&zc->zc_next, call_next, call_next + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { ztest_execute(rand, zi, id); } } thread_exit(); } static void ztest_dataset_name(char *dsname, char *pool, int d) { (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); } static void ztest_dataset_destroy(int d) { char name[ZFS_MAX_DATASET_NAME_LEN]; int t; ztest_dataset_name(name, ztest_opts.zo_pool, d); if (ztest_opts.zo_verbose >= 3) (void) printf("Destroying %s to free up space\n", name); /* * Cleanup any non-standard clones and snapshots. In general, * ztest thread t operates on dataset (t % zopt_datasets), * so there may be more than one thing to clean up. */ for (t = d; t < ztest_opts.zo_threads; t += ztest_opts.zo_datasets) ztest_dsl_dataset_cleanup(name, t); (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); } static void ztest_dataset_dirobj_verify(ztest_ds_t *zd) { uint64_t usedobjs, dirobjs, scratch; /* * ZTEST_DIROBJ is the object directory for the entire dataset. * Therefore, the number of objects in use should equal the * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. * If not, we have an object leak. * * Note that we can only check this in ztest_dataset_open(), * when the open-context and syncing-context values agree. * That's because zap_count() returns the open-context value, * while dmu_objset_space() returns the rootbp fill count. */ VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); ASSERT3U(dirobjs + 1, ==, usedobjs); } static int ztest_dataset_open(int d) { ztest_ds_t *zd = &ztest_ds[d]; uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; objset_t *os; zilog_t *zilog; char name[ZFS_MAX_DATASET_NAME_LEN]; int error; ztest_dataset_name(name, ztest_opts.zo_pool, d); (void) pthread_rwlock_rdlock(&ztest_name_lock); error = ztest_dataset_create(name); if (error == ENOSPC) { (void) pthread_rwlock_unlock(&ztest_name_lock); ztest_record_enospc(FTAG); return (error); } ASSERT(error == 0 || error == EEXIST); VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, zd, &os)); (void) pthread_rwlock_unlock(&ztest_name_lock); ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); zilog = zd->zd_zilog; if (zilog->zl_header->zh_claim_lr_seq != 0 && zilog->zl_header->zh_claim_lr_seq < committed_seq) fatal(0, "missing log records: claimed %llu < committed %llu", zilog->zl_header->zh_claim_lr_seq, committed_seq); ztest_dataset_dirobj_verify(zd); zil_replay(os, zd, ztest_replay_vector); ztest_dataset_dirobj_verify(zd); if (ztest_opts.zo_verbose >= 6) (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", zd->zd_name, (u_longlong_t)zilog->zl_parse_blk_count, (u_longlong_t)zilog->zl_parse_lr_count, (u_longlong_t)zilog->zl_replaying_seq); zilog = zil_open(os, ztest_get_data); if (zilog->zl_replaying_seq != 0 && zilog->zl_replaying_seq < committed_seq) fatal(0, "missing log records: replayed %llu < committed %llu", zilog->zl_replaying_seq, committed_seq); return (0); } static void ztest_dataset_close(int d) { ztest_ds_t *zd = &ztest_ds[d]; zil_close(zd->zd_zilog); dmu_objset_disown(zd->zd_os, B_TRUE, zd); ztest_zd_fini(zd); } /* ARGSUSED */ static int ztest_replay_zil_cb(const char *name, void *arg) { objset_t *os; ztest_ds_t *zdtmp; VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); ztest_zd_init(zdtmp, NULL, os); zil_replay(os, zdtmp, ztest_replay_vector); ztest_zd_fini(zdtmp); if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && ztest_opts.zo_verbose >= 6) { zilog_t *zilog = dmu_objset_zil(os); (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", name, (u_longlong_t)zilog->zl_parse_blk_count, (u_longlong_t)zilog->zl_parse_lr_count, (u_longlong_t)zilog->zl_replaying_seq); } umem_free(zdtmp, sizeof (ztest_ds_t)); dmu_objset_disown(os, B_TRUE, FTAG); return (0); } static void ztest_freeze(void) { ztest_ds_t *zd = &ztest_ds[0]; spa_t *spa; int numloops = 0; if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY0(ztest_dataset_open(0)); ztest_spa = spa; /* * Force the first log block to be transactionally allocated. * We have to do this before we freeze the pool -- otherwise * the log chain won't be anchored. */ while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { ztest_dmu_object_alloc_free(zd, 0); zil_commit(zd->zd_zilog, 0); } txg_wait_synced(spa_get_dsl(spa), 0); /* * Freeze the pool. This stops spa_sync() from doing anything, * so that the only way to record changes from now on is the ZIL. */ spa_freeze(spa); /* * Because it is hard to predict how much space a write will actually * require beforehand, we leave ourselves some fudge space to write over * capacity. */ uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; /* * Run tests that generate log records but don't alter the pool config * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). * We do a txg_wait_synced() after each iteration to force the txg * to increase well beyond the last synced value in the uberblock. * The ZIL should be OK with that. * * Run a random number of times less than zo_maxloops and ensure we do * not run out of space on the pool. */ while (ztest_random(10) != 0 && numloops++ < ztest_opts.zo_maxloops && metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { ztest_od_t od; ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); ztest_io(zd, od.od_object, ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); txg_wait_synced(spa_get_dsl(spa), 0); } /* * Commit all of the changes we just generated. */ zil_commit(zd->zd_zilog, 0); txg_wait_synced(spa_get_dsl(spa), 0); /* * Close our dataset and close the pool. */ ztest_dataset_close(0); spa_close(spa, FTAG); kernel_fini(); /* * Open and close the pool and dataset to induce log replay. */ kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); VERIFY0(ztest_dataset_open(0)); ztest_spa = spa; txg_wait_synced(spa_get_dsl(spa), 0); ztest_dataset_close(0); ztest_reguid(NULL, 0); spa_close(spa, FTAG); kernel_fini(); } static void ztest_import_impl(ztest_shared_t *zs) { importargs_t args = { 0 }; nvlist_t *cfg = NULL; int nsearch = 1; char *searchdirs[nsearch]; int flags = ZFS_IMPORT_MISSING_LOG; searchdirs[0] = ztest_opts.zo_dir; args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_FALSE; VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args, &libzpool_config_ops)); VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); fnvlist_free(cfg); } /* * Import a storage pool with the given name. */ static void ztest_import(ztest_shared_t *zs) { spa_t *spa; mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); ztest_import_impl(zs); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); kernel_fini(); if (!ztest_opts.zo_mmp_test) { ztest_run_zdb(ztest_opts.zo_pool); ztest_freeze(); ztest_run_zdb(ztest_opts.zo_pool); } (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } /* * Kick off threads to run tests on all datasets in parallel. */ static void ztest_run(ztest_shared_t *zs) { spa_t *spa; objset_t *os; kthread_t *resume_thread, *deadman_thread; kthread_t **run_threads; uint64_t object; int error; int t, d; ztest_exiting = B_FALSE; /* * Initialize parent/child shared state. */ mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); zs->zs_thread_start = gethrtime(); zs->zs_thread_stop = zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); zs->zs_thread_kill = zs->zs_thread_stop; if (ztest_random(100) < ztest_opts.zo_killrate) { zs->zs_thread_kill -= ztest_random(ztest_opts.zo_passtime * NANOSEC); } mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), offsetof(ztest_cb_data_t, zcd_node)); /* * Open our pool. It may need to be imported first depending on * what tests were running when the previous pass was terminated. */ kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); error = spa_open(ztest_opts.zo_pool, &spa, FTAG); if (error) { VERIFY3S(error, ==, ENOENT); ztest_import_impl(zs); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; } metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; VERIFY0(vdev_raidz_impl_set("cycle")); dmu_objset_stats_t dds; VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); zs->zs_guid = dds.dds_guid; dmu_objset_disown(os, B_TRUE, FTAG); /* * Create a thread to periodically resume suspended I/O. */ resume_thread = thread_create(NULL, 0, ztest_resume_thread, spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); /* * Create a deadman thread and set to panic if we hang. */ deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; /* * Verify that we can safely inquire about any object, * whether it's allocated or not. To make it interesting, * we probe a 5-wide window around each power of two. * This hits all edge cases, including zero and the max. */ for (t = 0; t < 64; t++) { for (d = -5; d <= 5; d++) { error = dmu_object_info(spa->spa_meta_objset, (1ULL << t) + d, NULL); ASSERT(error == 0 || error == ENOENT || error == EINVAL); } } /* * If we got any ENOSPC errors on the previous run, destroy something. */ if (zs->zs_enospc_count != 0) { int d = ztest_random(ztest_opts.zo_datasets); ztest_dataset_destroy(d); } zs->zs_enospc_count = 0; /* * If we were in the middle of ztest_device_removal() and were killed * we need to ensure the removal and scrub complete before running * any tests that check ztest_device_removal_active. The removal will * be restarted automatically when the spa is opened, but we need to * initiate the scrub manually if it is not already in progress. Note * that we always run the scrub whenever an indirect vdev exists * because we have no way of knowing for sure if ztest_device_removal() * fully completed its scrub before the pool was reimported. */ if (spa->spa_removing_phys.sr_state == DSS_SCANNING || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { while (spa->spa_removing_phys.sr_state == DSS_SCANNING) txg_wait_synced(spa_get_dsl(spa), 0); error = ztest_scrub_impl(spa); if (error == EBUSY) error = 0; ASSERT0(error); } run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), UMEM_NOFAIL); if (ztest_opts.zo_verbose >= 4) (void) printf("starting main threads...\n"); /* * Replay all logs of all datasets in the pool. This is primarily for * temporary datasets which wouldn't otherwise get replayed, which * can trigger failures when attempting to offline a SLOG in * ztest_fault_inject(). */ (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, NULL, DS_FIND_CHILDREN); /* * Kick off all the tests that run in parallel. */ for (t = 0; t < ztest_opts.zo_threads; t++) { if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); return; } run_threads[t] = thread_create(NULL, 0, ztest_thread, (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); } /* * Wait for all of the tests to complete. */ for (t = 0; t < ztest_opts.zo_threads; t++) VERIFY0(thread_join(run_threads[t])); /* * Close all datasets. This must be done after all the threads * are joined so we can be sure none of the datasets are in-use * by any of the threads. */ for (t = 0; t < ztest_opts.zo_threads; t++) { if (t < ztest_opts.zo_datasets) ztest_dataset_close(t); } txg_wait_synced(spa_get_dsl(spa), 0); zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); /* Kill the resume and deadman threads */ ztest_exiting = B_TRUE; VERIFY0(thread_join(resume_thread)); VERIFY0(thread_join(deadman_thread)); ztest_resume(spa); /* * Right before closing the pool, kick off a bunch of async I/O; * spa_close() should wait for it to complete. */ for (object = 1; object < 50; object++) { dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, ZIO_PRIORITY_SYNC_READ); } /* Verify that at least one commit cb was called in a timely fashion */ if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) VERIFY0(zc_min_txg_delay); spa_close(spa, FTAG); /* * Verify that we can loop over all pools. */ mutex_enter(&spa_namespace_lock); for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) if (ztest_opts.zo_verbose > 3) (void) printf("spa_next: found %s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); /* * Verify that we can export the pool and reimport it under a * different name. */ if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { char name[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s_import", ztest_opts.zo_pool); ztest_spa_import_export(ztest_opts.zo_pool, name); ztest_spa_import_export(name, ztest_opts.zo_pool); } kernel_fini(); list_destroy(&zcl.zcl_callbacks); mutex_destroy(&zcl.zcl_callbacks_lock); (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } static void print_time(hrtime_t t, char *timebuf) { hrtime_t s = t / NANOSEC; hrtime_t m = s / 60; hrtime_t h = m / 60; hrtime_t d = h / 24; s -= m * 60; m -= h * 60; h -= d * 24; timebuf[0] = '\0'; if (d) (void) sprintf(timebuf, "%llud%02lluh%02llum%02llus", d, h, m, s); else if (h) (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); else if (m) (void) sprintf(timebuf, "%llum%02llus", m, s); else (void) sprintf(timebuf, "%llus", s); } static nvlist_t * make_random_props(void) { nvlist_t *props; props = fnvlist_alloc(); if (ztest_random(2) == 0) return (props); fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); return (props); } /* * Create a storage pool with the given name and initial vdev size. * Then test spa_freeze() functionality. */ static void ztest_init(ztest_shared_t *zs) { spa_t *spa; nvlist_t *nvroot, *props; int i; mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); /* * Create the storage pool. */ (void) spa_destroy(ztest_opts.zo_pool); ztest_shared->zs_vdev_next_leaf = 0; zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); props = make_random_props(); /* * We don't expect the pool to suspend unless maxfaults == 0, * in which case ztest_fault_inject() temporarily takes away * the only valid replica. */ fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); for (i = 0; i < SPA_FEATURES; i++) { char *buf; + if (!spa_feature_table[i].fi_zfs_mod_supported) + continue; + /* * 75% chance of using the log space map feature. We want ztest * to exercise both the code paths that use the log space map * feature and the ones that don't. */ if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) continue; VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", spa_feature_table[i].fi_uname)); fnvlist_add_uint64(props, buf, 0); free(buf); } VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); fnvlist_free(nvroot); fnvlist_free(props); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); kernel_fini(); if (!ztest_opts.zo_mmp_test) { ztest_run_zdb(ztest_opts.zo_pool); ztest_freeze(); ztest_run_zdb(ztest_opts.zo_pool); } (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } static void setup_data_fd(void) { static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; ztest_fd_data = mkstemp(ztest_name_data); ASSERT3S(ztest_fd_data, >=, 0); (void) unlink(ztest_name_data); } static int shared_data_size(ztest_shared_hdr_t *hdr) { int size; size = hdr->zh_hdr_size; size += hdr->zh_opts_size; size += hdr->zh_size; size += hdr->zh_stats_size * hdr->zh_stats_count; size += hdr->zh_ds_size * hdr->zh_ds_count; return (size); } static void setup_hdr(void) { int size; ztest_shared_hdr_t *hdr; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); hdr->zh_opts_size = sizeof (ztest_shared_opts_t); hdr->zh_size = sizeof (ztest_shared_t); hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); hdr->zh_stats_count = ZTEST_FUNCS; hdr->zh_ds_size = sizeof (ztest_shared_ds_t); hdr->zh_ds_count = ztest_opts.zo_datasets; size = shared_data_size(hdr); VERIFY0(ftruncate(ztest_fd_data, size)); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); } static void setup_data(void) { int size, offset; ztest_shared_hdr_t *hdr; uint8_t *buf; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); size = shared_data_size(hdr); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); buf = (uint8_t *)hdr; offset = hdr->zh_hdr_size; ztest_shared_opts = (void *)&buf[offset]; offset += hdr->zh_opts_size; ztest_shared = (void *)&buf[offset]; offset += hdr->zh_size; ztest_shared_callstate = (void *)&buf[offset]; offset += hdr->zh_stats_size * hdr->zh_stats_count; ztest_shared_ds = (void *)&buf[offset]; } static boolean_t exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) { pid_t pid; int status; char *cmdbuf = NULL; pid = fork(); if (cmd == NULL) { cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); cmd = cmdbuf; } if (pid == -1) fatal(1, "fork failed"); if (pid == 0) { /* child */ char *emptyargv[2] = { cmd, NULL }; char fd_data_str[12]; struct rlimit rl = { 1024, 1024 }; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) close(ztest_fd_rand); VERIFY3S(11, >=, snprintf(fd_data_str, 12, "%d", ztest_fd_data)); VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); (void) enable_extended_FILE_stdio(-1, -1); if (libpath != NULL) VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); (void) execv(cmd, emptyargv); ztest_dump_core = B_FALSE; fatal(B_TRUE, "exec failed: %s", cmd); } if (cmdbuf != NULL) { umem_free(cmdbuf, MAXPATHLEN); cmd = NULL; } while (waitpid(pid, &status, 0) != pid) continue; if (statusp != NULL) *statusp = status; if (WIFEXITED(status)) { if (WEXITSTATUS(status) != 0) { (void) fprintf(stderr, "child exited with code %d\n", WEXITSTATUS(status)); exit(2); } return (B_FALSE); } else if (WIFSIGNALED(status)) { if (!ignorekill || WTERMSIG(status) != SIGKILL) { (void) fprintf(stderr, "child died with signal %d\n", WTERMSIG(status)); exit(3); } return (B_TRUE); } else { (void) fprintf(stderr, "something strange happened to child\n"); exit(4); /* NOTREACHED */ } } static void ztest_run_init(void) { int i; ztest_shared_t *zs = ztest_shared; /* * Blow away any existing copy of zpool.cache */ (void) remove(spa_config_path); if (ztest_opts.zo_init == 0) { if (ztest_opts.zo_verbose >= 1) (void) printf("Importing pool %s\n", ztest_opts.zo_pool); ztest_import(zs); return; } /* * Create and initialize our storage pool. */ for (i = 1; i <= ztest_opts.zo_init; i++) { bzero(zs, sizeof (ztest_shared_t)); if (ztest_opts.zo_verbose >= 3 && ztest_opts.zo_init != 1) { (void) printf("ztest_init(), pass %d\n", i); } ztest_init(zs); } } int main(int argc, char **argv) { int kills = 0; int iters = 0; int older = 0; int newer = 0; ztest_shared_t *zs; ztest_info_t *zi; ztest_shared_callstate_t *zc; char timebuf[100]; char numbuf[NN_NUMBUF_SZ]; char *cmd; boolean_t hasalt; int f, err; char *fd_data_str = getenv("ZTEST_FD_DATA"); struct sigaction action; (void) setvbuf(stdout, NULL, _IOLBF, 0); dprintf_setup(&argc, argv); zfs_deadman_synctime_ms = 300000; zfs_deadman_checktime_ms = 30000; /* * As two-word space map entries may not come up often (especially * if pool and vdev sizes are small) we want to force at least some * of them so the feature get tested. */ zfs_force_some_double_word_sm_entries = B_TRUE; /* * Verify that even extensively damaged split blocks with many * segments can be reconstructed in a reasonable amount of time * when reconstruction is known to be possible. * * Note: the lower this value is, the more damage we inflict, and * the more time ztest spends in recovering that damage. We chose * to induce damage 1/100th of the time so recovery is tested but * not so frequently that ztest doesn't get to test other code paths. */ zfs_reconstruct_indirect_damage_fraction = 100; action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); action.sa_flags = 0; if (sigaction(SIGSEGV, &action, NULL) < 0) { (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", strerror(errno)); exit(EXIT_FAILURE); } if (sigaction(SIGABRT, &action, NULL) < 0) { (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", strerror(errno)); exit(EXIT_FAILURE); } /* * Force random_get_bytes() to use /dev/urandom in order to prevent * ztest from needlessly depleting the system entropy pool. */ random_path = "/dev/urandom"; ztest_fd_rand = open(random_path, O_RDONLY); ASSERT3S(ztest_fd_rand, >=, 0); if (!fd_data_str) { process_options(argc, argv); setup_data_fd(); setup_hdr(); setup_data(); bcopy(&ztest_opts, ztest_shared_opts, sizeof (*ztest_shared_opts)); } else { ztest_fd_data = atoi(fd_data_str); setup_data(); bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); } ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); err = ztest_set_global_vars(); if (err != 0 && !fd_data_str) { /* error message done by ztest_set_global_vars */ exit(EXIT_FAILURE); } else { /* children should not be spawned if setting gvars fails */ VERIFY3S(err, ==, 0); } /* Override location of zpool.cache */ VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", ztest_opts.zo_dir), !=, -1); ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), UMEM_NOFAIL); zs = ztest_shared; if (fd_data_str) { metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; if (zs->zs_do_init) ztest_run_init(); else ztest_run(zs); exit(0); } hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); if (ztest_opts.zo_verbose >= 1) { (void) printf("%llu vdevs, %d datasets, %d threads," "%d %s disks, %llu seconds...\n\n", (u_longlong_t)ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, ztest_opts.zo_raid_children, ztest_opts.zo_raid_type, (u_longlong_t)ztest_opts.zo_time); } cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); (void) strlcpy(cmd, getexecname(), MAXNAMELEN); zs->zs_do_init = B_TRUE; if (strlen(ztest_opts.zo_alt_ztest) != 0) { if (ztest_opts.zo_verbose >= 1) { (void) printf("Executing older ztest for " "initialization: %s\n", ztest_opts.zo_alt_ztest); } VERIFY(!exec_child(ztest_opts.zo_alt_ztest, ztest_opts.zo_alt_libpath, B_FALSE, NULL)); } else { VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); } zs->zs_do_init = B_FALSE; zs->zs_proc_start = gethrtime(); zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; for (f = 0; f < ZTEST_FUNCS; f++) { zi = &ztest_info[f]; zc = ZTEST_GET_SHARED_CALLSTATE(f); if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) zc->zc_next = UINT64_MAX; else zc->zc_next = zs->zs_proc_start + ztest_random(2 * zi->zi_interval[0] + 1); } /* * Run the tests in a loop. These tests include fault injection * to verify that self-healing data works, and forced crashes * to verify that we never lose on-disk consistency. */ while (gethrtime() < zs->zs_proc_stop) { int status; boolean_t killed; /* * Initialize the workload counters for each function. */ for (f = 0; f < ZTEST_FUNCS; f++) { zc = ZTEST_GET_SHARED_CALLSTATE(f); zc->zc_count = 0; zc->zc_time = 0; } /* Set the allocation switch size */ zs->zs_metaslab_df_alloc_threshold = ztest_random(zs->zs_metaslab_sz / 4) + 1; if (!hasalt || ztest_random(2) == 0) { if (hasalt && ztest_opts.zo_verbose >= 1) { (void) printf("Executing newer ztest: %s\n", cmd); } newer++; killed = exec_child(cmd, NULL, B_TRUE, &status); } else { if (hasalt && ztest_opts.zo_verbose >= 1) { (void) printf("Executing older ztest: %s\n", ztest_opts.zo_alt_ztest); } older++; killed = exec_child(ztest_opts.zo_alt_ztest, ztest_opts.zo_alt_libpath, B_TRUE, &status); } if (killed) kills++; iters++; if (ztest_opts.zo_verbose >= 1) { hrtime_t now = gethrtime(); now = MIN(now, zs->zs_proc_stop); print_time(zs->zs_proc_stop - now, timebuf); nicenum(zs->zs_space, numbuf, sizeof (numbuf)); (void) printf("Pass %3d, %8s, %3llu ENOSPC, " "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", iters, WIFEXITED(status) ? "Complete" : "SIGKILL", (u_longlong_t)zs->zs_enospc_count, 100.0 * zs->zs_alloc / zs->zs_space, numbuf, 100.0 * (now - zs->zs_proc_start) / (ztest_opts.zo_time * NANOSEC), timebuf); } if (ztest_opts.zo_verbose >= 2) { (void) printf("\nWorkload summary:\n\n"); (void) printf("%7s %9s %s\n", "Calls", "Time", "Function"); (void) printf("%7s %9s %s\n", "-----", "----", "--------"); for (f = 0; f < ZTEST_FUNCS; f++) { zi = &ztest_info[f]; zc = ZTEST_GET_SHARED_CALLSTATE(f); print_time(zc->zc_time, timebuf); (void) printf("%7llu %9s %s\n", (u_longlong_t)zc->zc_count, timebuf, zi->zi_funcname); } (void) printf("\n"); } if (!ztest_opts.zo_mmp_test) ztest_run_zdb(ztest_opts.zo_pool); } if (ztest_opts.zo_verbose >= 1) { if (hasalt) { (void) printf("%d runs of older ztest: %s\n", older, ztest_opts.zo_alt_ztest); (void) printf("%d runs of newer ztest: %s\n", newer, cmd); } (void) printf("%d killed, %d completed, %.0f%% kill rate\n", kills, iters - kills, (100.0 * kills) / MAX(1, iters)); } umem_free(cmd, MAXNAMELEN); return (0); } diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_status.c b/sys/contrib/openzfs/lib/libzfs/libzfs_status.c index fadae9388ac1..5e5cb5f5d440 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_status.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_status.c @@ -1,529 +1,531 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2021, Colm Buckley */ /* * This file contains the functions which analyze the status of a pool. This * include both the status of an active pool, as well as the status exported * pools. Returns one of the ZPOOL_STATUS_* defines describing the status of * the pool. This status is independent (to a certain degree) from the state of * the pool. A pool's state describes only whether or not it is capable of * providing the necessary fault tolerance for data. The status describes the * overall status of devices. A pool that is online can still have a device * that is experiencing errors. * * Only a subset of the possible faults can be detected using 'zpool status', * and not all possible errors correspond to a FMA message ID. The explanation * is left up to the caller, depending on whether it is a live pool or an * import. */ #include #include #include #include #include #include #include "libzfs_impl.h" #include "zfeature_common.h" /* * Message ID table. This must be kept in sync with the ZPOOL_STATUS_* defines * in include/libzfs.h. Note that there are some status results which go past * the end of this table, and hence have no associated message ID. */ static char *zfs_msgid_table[] = { "ZFS-8000-14", /* ZPOOL_STATUS_CORRUPT_CACHE */ "ZFS-8000-2Q", /* ZPOOL_STATUS_MISSING_DEV_R */ "ZFS-8000-3C", /* ZPOOL_STATUS_MISSING_DEV_NR */ "ZFS-8000-4J", /* ZPOOL_STATUS_CORRUPT_LABEL_R */ "ZFS-8000-5E", /* ZPOOL_STATUS_CORRUPT_LABEL_NR */ "ZFS-8000-6X", /* ZPOOL_STATUS_BAD_GUID_SUM */ "ZFS-8000-72", /* ZPOOL_STATUS_CORRUPT_POOL */ "ZFS-8000-8A", /* ZPOOL_STATUS_CORRUPT_DATA */ "ZFS-8000-9P", /* ZPOOL_STATUS_FAILING_DEV */ "ZFS-8000-A5", /* ZPOOL_STATUS_VERSION_NEWER */ "ZFS-8000-EY", /* ZPOOL_STATUS_HOSTID_MISMATCH */ "ZFS-8000-EY", /* ZPOOL_STATUS_HOSTID_ACTIVE */ "ZFS-8000-EY", /* ZPOOL_STATUS_HOSTID_REQUIRED */ "ZFS-8000-HC", /* ZPOOL_STATUS_IO_FAILURE_WAIT */ "ZFS-8000-JQ", /* ZPOOL_STATUS_IO_FAILURE_CONTINUE */ "ZFS-8000-MM", /* ZPOOL_STATUS_IO_FAILURE_MMP */ "ZFS-8000-K4", /* ZPOOL_STATUS_BAD_LOG */ "ZFS-8000-ER", /* ZPOOL_STATUS_ERRATA */ /* * The following results have no message ID. * ZPOOL_STATUS_UNSUP_FEAT_READ * ZPOOL_STATUS_UNSUP_FEAT_WRITE * ZPOOL_STATUS_FAULTED_DEV_R * ZPOOL_STATUS_FAULTED_DEV_NR * ZPOOL_STATUS_VERSION_OLDER * ZPOOL_STATUS_FEAT_DISABLED * ZPOOL_STATUS_RESILVERING * ZPOOL_STATUS_OFFLINE_DEV * ZPOOL_STATUS_REMOVED_DEV * ZPOOL_STATUS_REBUILDING * ZPOOL_STATUS_REBUILD_SCRUB * ZPOOL_STATUS_COMPATIBILITY_ERR * ZPOOL_STATUS_OK */ }; #define NMSGID (sizeof (zfs_msgid_table) / sizeof (zfs_msgid_table[0])) /* ARGSUSED */ static int vdev_missing(vdev_stat_t *vs, uint_t vsc) { return (vs->vs_state == VDEV_STATE_CANT_OPEN && vs->vs_aux == VDEV_AUX_OPEN_FAILED); } /* ARGSUSED */ static int vdev_faulted(vdev_stat_t *vs, uint_t vsc) { return (vs->vs_state == VDEV_STATE_FAULTED); } /* ARGSUSED */ static int vdev_errors(vdev_stat_t *vs, uint_t vsc) { return (vs->vs_state == VDEV_STATE_DEGRADED || vs->vs_read_errors != 0 || vs->vs_write_errors != 0 || vs->vs_checksum_errors != 0); } /* ARGSUSED */ static int vdev_broken(vdev_stat_t *vs, uint_t vsc) { return (vs->vs_state == VDEV_STATE_CANT_OPEN); } /* ARGSUSED */ static int vdev_offlined(vdev_stat_t *vs, uint_t vsc) { return (vs->vs_state == VDEV_STATE_OFFLINE); } /* ARGSUSED */ static int vdev_removed(vdev_stat_t *vs, uint_t vsc) { return (vs->vs_state == VDEV_STATE_REMOVED); } static int vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc) { if (getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") != NULL) return (0); return (VDEV_STAT_VALID(vs_physical_ashift, vsc) && vs->vs_configured_ashift < vs->vs_physical_ashift); } /* * Detect if any leaf devices that have seen errors or could not be opened. */ static boolean_t find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t), boolean_t ignore_replacing) { nvlist_t **child; vdev_stat_t *vs; uint_t c, vsc, children; /* * Ignore problems within a 'replacing' vdev, since we're presumably in * the process of repairing any such errors, and don't want to call them * out again. We'll pick up the fact that a resilver is happening * later. */ if (ignore_replacing == B_TRUE) { char *type; verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_REPLACING) == 0) return (B_FALSE); } if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev_problem(child[c], func, ignore_replacing)) return (B_TRUE); } else { verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); if (func(vs, vsc) != 0) return (B_TRUE); } /* * Check any L2 cache devs */ if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev_problem(child[c], func, ignore_replacing)) return (B_TRUE); } return (B_FALSE); } /* * Active pool health status. * * To determine the status for a pool, we make several passes over the config, * picking the most egregious error we find. In order of importance, we do the * following: * * - Check for a complete and valid configuration * - Look for any faulted or missing devices in a non-replicated config * - Check for any data errors * - Check for any faulted or missing devices in a replicated config * - Look for any devices showing errors * - Check for any resilvering or rebuilding devices * * There can obviously be multiple errors within a single pool, so this routine * only picks the most damaging of all the current errors to report. */ static zpool_status_t check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap, const char *compat) { nvlist_t *nvroot; vdev_stat_t *vs; pool_scan_stat_t *ps = NULL; uint_t vsc, psc; uint64_t nerr; uint64_t version; uint64_t stateval; uint64_t suspended; uint64_t hostid = 0; uint64_t errata = 0; unsigned long system_hostid = get_system_hostid(); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &stateval) == 0); /* * Currently resilvering a vdev */ (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER && ps->pss_state == DSS_SCANNING) return (ZPOOL_STATUS_RESILVERING); /* * Currently rebuilding a vdev, check top-level vdevs. */ vdev_rebuild_stat_t *vrs = NULL; nvlist_t **child; uint_t c, i, children; uint64_t rebuild_end_time = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) && (vrs != NULL)) { uint64_t state = vrs->vrs_state; if (state == VDEV_REBUILD_ACTIVE) { return (ZPOOL_STATUS_REBUILDING); } else if (state == VDEV_REBUILD_COMPLETE && vrs->vrs_end_time > rebuild_end_time) { rebuild_end_time = vrs->vrs_end_time; } } } /* * If we can determine when the last scrub was run, and it * was before the last rebuild completed, then recommend * that the pool be scrubbed to verify all checksums. When * ps is NULL we can infer the pool has never been scrubbed. */ if (rebuild_end_time > 0) { if (ps != NULL) { if ((ps->pss_state == DSS_FINISHED && ps->pss_func == POOL_SCAN_SCRUB && rebuild_end_time > ps->pss_end_time) || ps->pss_state == DSS_NONE) return (ZPOOL_STATUS_REBUILD_SCRUB); } else { return (ZPOOL_STATUS_REBUILD_SCRUB); } } } /* * The multihost property is set and the pool may be active. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && vs->vs_aux == VDEV_AUX_ACTIVE) { mmp_state_t mmp_state; nvlist_t *nvinfo; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state == MMP_STATE_ACTIVE) return (ZPOOL_STATUS_HOSTID_ACTIVE); else if (mmp_state == MMP_STATE_NO_HOSTID) return (ZPOOL_STATUS_HOSTID_REQUIRED); else return (ZPOOL_STATUS_HOSTID_MISMATCH); } /* * Pool last accessed by another system. */ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (hostid != 0 && (unsigned long)hostid != system_hostid && stateval == POOL_STATE_ACTIVE) return (ZPOOL_STATUS_HOSTID_MISMATCH); /* * Newer on-disk version. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && vs->vs_aux == VDEV_AUX_VERSION_NEWER) return (ZPOOL_STATUS_VERSION_NEWER); /* * Unsupported feature(s). */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && vs->vs_aux == VDEV_AUX_UNSUP_FEAT) { nvlist_t *nvinfo; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY)) return (ZPOOL_STATUS_UNSUP_FEAT_WRITE); return (ZPOOL_STATUS_UNSUP_FEAT_READ); } /* * Check that the config is complete. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && vs->vs_aux == VDEV_AUX_BAD_GUID_SUM) return (ZPOOL_STATUS_BAD_GUID_SUM); /* * Check whether the pool has suspended. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_SUSPENDED, &suspended) == 0) { uint64_t reason; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_SUSPENDED_REASON, &reason) == 0 && reason == ZIO_SUSPEND_MMP) return (ZPOOL_STATUS_IO_FAILURE_MMP); if (suspended == ZIO_FAILURE_MODE_CONTINUE) return (ZPOOL_STATUS_IO_FAILURE_CONTINUE); return (ZPOOL_STATUS_IO_FAILURE_WAIT); } /* * Could not read a log. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && vs->vs_aux == VDEV_AUX_BAD_LOG) { return (ZPOOL_STATUS_BAD_LOG); } /* * Bad devices in non-replicated config. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && find_vdev_problem(nvroot, vdev_missing, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && find_vdev_problem(nvroot, vdev_broken, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_NR); /* * Corrupted pool metadata */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && vs->vs_aux == VDEV_AUX_CORRUPT_DATA) return (ZPOOL_STATUS_CORRUPT_POOL); /* * Persistent data errors. */ if (!isimport) { if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0 && nerr != 0) return (ZPOOL_STATUS_CORRUPT_DATA); } /* * Missing devices in a replicated config. */ if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_R); if (find_vdev_problem(nvroot, vdev_missing, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_R); if (find_vdev_problem(nvroot, vdev_broken, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_R); /* * Devices with errors */ if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE)) return (ZPOOL_STATUS_FAILING_DEV); /* * Offlined devices */ if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE)) return (ZPOOL_STATUS_OFFLINE_DEV); /* * Removed device */ if (find_vdev_problem(nvroot, vdev_removed, B_TRUE)) return (ZPOOL_STATUS_REMOVED_DEV); /* * Suboptimal, but usable, ashift configuration. */ if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE)) return (ZPOOL_STATUS_NON_NATIVE_ASHIFT); /* * Informational errata available. */ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRATA, &errata); if (errata) { *erratap = errata; return (ZPOOL_STATUS_ERRATA); } /* * Outdated, but usable, version */ if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION) return (ZPOOL_STATUS_VERSION_OLDER); /* * Usable pool with disabled features */ if (version >= SPA_VERSION_FEATURES) { int i; nvlist_t *feat; if (isimport) { feat = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(feat, ZPOOL_CONFIG_ENABLED_FEAT)) feat = fnvlist_lookup_nvlist(feat, ZPOOL_CONFIG_ENABLED_FEAT); } else { feat = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS); } /* check against all features, or limited set? */ boolean_t pool_features[SPA_FEATURES]; if (zpool_load_compat(compat, pool_features, NULL, NULL) != ZPOOL_COMPATIBILITY_OK) return (ZPOOL_STATUS_COMPATIBILITY_ERR); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; + if (!fi->fi_zfs_mod_supported) + continue; if (pool_features[i] && !nvlist_exists(feat, fi->fi_guid)) return (ZPOOL_STATUS_FEAT_DISABLED); } } return (ZPOOL_STATUS_OK); } zpool_status_t zpool_get_status(zpool_handle_t *zhp, char **msgid, zpool_errata_t *errata) { /* * pass in the desired feature set, as * it affects check for disabled features */ char compatibility[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compatibility, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compatibility[0] = '\0'; zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE, errata, compatibility); if (msgid != NULL) { if (ret >= NMSGID) *msgid = NULL; else *msgid = zfs_msgid_table[ret]; } return (ret); } zpool_status_t zpool_import_status(nvlist_t *config, char **msgid, zpool_errata_t *errata) { zpool_status_t ret = check_status(config, B_TRUE, errata, NULL); if (ret >= NMSGID) *msgid = NULL; else *msgid = zfs_msgid_table[ret]; return (ret); } diff --git a/sys/contrib/openzfs/module/zcommon/zfeature_common.c b/sys/contrib/openzfs/module/zcommon/zfeature_common.c index e95a85e89ba2..fc0e09605eef 100644 --- a/sys/contrib/openzfs/module/zcommon/zfeature_common.c +++ b/sys/contrib/openzfs/module/zcommon/zfeature_common.c @@ -1,609 +1,611 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ #ifndef _KERNEL #include #include #include #endif #include #include #include #include #include #include #include "zfeature_common.h" /* * Set to disable all feature checks while opening pools, allowing pools with * unsupported features to be opened. Set for testing only. */ boolean_t zfeature_checks_disable = B_FALSE; zfeature_info_t spa_feature_table[SPA_FEATURES]; /* * Valid characters for feature guids. This list is mainly for aesthetic * purposes and could be expanded in the future. There are different allowed * characters in the guids reverse dns portion (before the colon) and its * short name (after the colon). */ static int valid_char(char c, boolean_t after_colon) { return ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || (after_colon && c == '_') || (!after_colon && (c == '.' || c == '-'))); } /* * Every feature guid must contain exactly one colon which separates a reverse * dns organization name from the feature's "short" name (e.g. * "com.company:feature_name"). */ boolean_t zfeature_is_valid_guid(const char *name) { int i; boolean_t has_colon = B_FALSE; i = 0; while (name[i] != '\0') { char c = name[i++]; if (c == ':') { if (has_colon) return (B_FALSE); has_colon = B_TRUE; continue; } if (!valid_char(c, has_colon)) return (B_FALSE); } return (has_colon); } boolean_t zfeature_is_supported(const char *guid) { if (zfeature_checks_disable) return (B_TRUE); for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *feature = &spa_feature_table[i]; + if (!feature->fi_zfs_mod_supported) + continue; if (strcmp(guid, feature->fi_guid) == 0) return (B_TRUE); } return (B_FALSE); } int zfeature_lookup_guid(const char *guid, spa_feature_t *res) { for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *feature = &spa_feature_table[i]; if (!feature->fi_zfs_mod_supported) continue; if (strcmp(guid, feature->fi_guid) == 0) { if (res != NULL) *res = i; return (0); } } return (ENOENT); } int zfeature_lookup_name(const char *name, spa_feature_t *res) { for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *feature = &spa_feature_table[i]; if (!feature->fi_zfs_mod_supported) continue; if (strcmp(name, feature->fi_uname) == 0) { if (res != NULL) *res = i; return (0); } } return (ENOENT); } boolean_t zfeature_depends_on(spa_feature_t fid, spa_feature_t check) { zfeature_info_t *feature = &spa_feature_table[fid]; for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) { if (feature->fi_depends[i] == check) return (B_TRUE); } return (B_FALSE); } static boolean_t deps_contains_feature(const spa_feature_t *deps, const spa_feature_t feature) { for (int i = 0; deps[i] != SPA_FEATURE_NONE; i++) if (deps[i] == feature) return (B_TRUE); return (B_FALSE); } #if !defined(_KERNEL) && !defined(LIB_ZPOOL_BUILD) static boolean_t zfs_mod_supported_impl(const char *scope, const char *name, const char *sysfs) { boolean_t supported = B_FALSE; char *path; int len = asprintf(&path, "%s%s%s%s%s", sysfs, scope == NULL ? "" : "/", scope == NULL ? "" : scope, name == NULL ? "" : "/", name == NULL ? "" : name); if (len > 0) { struct stat64 statbuf; supported = !!(stat64(path, &statbuf) == 0); free(path); } return (supported); } boolean_t zfs_mod_supported(const char *scope, const char *name) { boolean_t supported; /* * Check both the primary and alternate sysfs locations to determine * if the required functionality is supported. */ supported = (zfs_mod_supported_impl(scope, name, ZFS_SYSFS_DIR) || zfs_mod_supported_impl(scope, name, ZFS_SYSFS_ALT_DIR)); /* * For backwards compatibility with kernel modules that predate * supported feature/property checking. Report the feature/property * as supported if the kernel module is loaded but the requested * scope directory does not exist. */ if (supported == B_FALSE) { struct stat64 statbuf; if ((stat64(ZFS_SYSFS_DIR, &statbuf) == 0) && !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_DIR) && !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_ALT_DIR)) { supported = B_TRUE; } } return (supported); } #endif static boolean_t zfs_mod_supported_feature(const char *name) { /* * The zfs module spa_feature_table[], whether in-kernel or in * libzpool, always supports all the features. libzfs needs to * query the running module, via sysfs, to determine which * features are supported. * * The equivalent _can_ be done on FreeBSD by way of the sysctl * tree, but this has not been done yet. Therefore, we return * that all features except edonr are supported. */ #if defined(__FreeBSD__) if (strcmp(name, "org.illumos:edonr") == 0) return (B_FALSE); else return (B_TRUE); #elif defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) return (B_TRUE); #else return (zfs_mod_supported(ZFS_SYSFS_POOL_FEATURES, name)); #endif } static void zfeature_register(spa_feature_t fid, const char *guid, const char *name, const char *desc, zfeature_flags_t flags, zfeature_type_t type, const spa_feature_t *deps) { zfeature_info_t *feature = &spa_feature_table[fid]; static spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; ASSERT(name != NULL); ASSERT(desc != NULL); ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 || (flags & ZFEATURE_FLAG_MOS) == 0); ASSERT3U(fid, <, SPA_FEATURES); ASSERT(zfeature_is_valid_guid(guid)); if (deps == NULL) deps = nodeps; VERIFY(((flags & ZFEATURE_FLAG_PER_DATASET) == 0) || (deps_contains_feature(deps, SPA_FEATURE_EXTENSIBLE_DATASET))); feature->fi_feature = fid; feature->fi_guid = guid; feature->fi_uname = name; feature->fi_desc = desc; feature->fi_flags = flags; feature->fi_type = type; feature->fi_depends = deps; feature->fi_zfs_mod_supported = zfs_mod_supported_feature(guid); } /* * Every feature has a GUID of the form com.example:feature_name. The * reversed DNS name ensures that the feature's GUID is unique across all ZFS * implementations. This allows companies to independently develop and * release features. Examples include org.delphix and org.datto. Previously, * features developed on one implementation have used that implementation's * domain name (e.g. org.illumos and org.zfsonlinux). Use of the org.openzfs * domain name is recommended for new features which are developed by the * OpenZFS community and its platforms. This domain may optionally be used by * companies developing features for initial release through an OpenZFS * implementation. Use of the org.openzfs domain requires reserving the * feature name in advance with the OpenZFS project. */ void zpool_feature_init(void) { zfeature_register(SPA_FEATURE_ASYNC_DESTROY, "com.delphix:async_destroy", "async_destroy", "Destroy filesystems asynchronously.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, "com.delphix:empty_bpobj", "empty_bpobj", "Snapshots use less space.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_LZ4_COMPRESS, "org.illumos:lz4_compress", "lz4_compress", "LZ4 compression algorithm support.", ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump", "Crash dumps to multiple vdev pools.", 0, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, "com.delphix:spacemap_histogram", "spacemap_histogram", "Spacemaps maintain space histograms.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_ENABLED_TXG, "com.delphix:enabled_txg", "enabled_txg", "Record txg at which a feature is enabled", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); { static const spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_HOLE_BIRTH, "com.delphix:hole_birth", "hole_birth", "Retain hole birth txg for more precise zfs send", ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, hole_birth_deps); } zfeature_register(SPA_FEATURE_POOL_CHECKPOINT, "com.delphix:zpool_checkpoint", "zpool_checkpoint", "Pool state can be checkpointed, allowing rewind later.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_SPACEMAP_V2, "com.delphix:spacemap_v2", "spacemap_v2", "Space maps representing large segments are more efficient.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, "com.delphix:extensible_dataset", "extensible_dataset", "Enhanced dataset functionality, used by other features.", 0, ZFEATURE_TYPE_BOOLEAN, NULL); { static const spa_feature_t bookmarks_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_BOOKMARKS, "com.delphix:bookmarks", "bookmarks", "\"zfs bookmark\" command", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, bookmarks_deps); } { static const spa_feature_t filesystem_limits_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_FS_SS_LIMIT, "com.joyent:filesystem_limits", "filesystem_limits", "Filesystem and snapshot limits.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, filesystem_limits_deps); } zfeature_register(SPA_FEATURE_EMBEDDED_DATA, "com.delphix:embedded_data", "embedded_data", "Blocks which compress very well use even less space.", ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL); { static const spa_feature_t livelist_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LIVELIST, "com.delphix:livelist", "livelist", "Improved clone deletion performance.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, livelist_deps); } { static const spa_feature_t log_spacemap_deps[] = { SPA_FEATURE_SPACEMAP_V2, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LOG_SPACEMAP, "com.delphix:log_spacemap", "log_spacemap", "Log metaslab changes on a single spacemap and " "flush them periodically.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, log_spacemap_deps); } { static const spa_feature_t large_blocks_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LARGE_BLOCKS, "org.open-zfs:large_blocks", "large_blocks", "Support for blocks larger than 128KB.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, large_blocks_deps); } { static const spa_feature_t large_dnode_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LARGE_DNODE, "org.zfsonlinux:large_dnode", "large_dnode", "Variable on-disk size of dnodes.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, large_dnode_deps); } { static const spa_feature_t sha512_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_SHA512, "org.illumos:sha512", "sha512", "SHA-512/256 hash algorithm.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, sha512_deps); } { static const spa_feature_t skein_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_SKEIN, "org.illumos:skein", "skein", "Skein hash algorithm.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, skein_deps); } { static const spa_feature_t edonr_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_EDONR, "org.illumos:edonr", "edonr", "Edon-R hash algorithm.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, edonr_deps); } { static const spa_feature_t redact_books_deps[] = { SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_REDACTION_BOOKMARKS, "com.delphix:redaction_bookmarks", "redaction_bookmarks", "Support for bookmarks which store redaction lists for zfs " "redacted send/recv.", 0, ZFEATURE_TYPE_BOOLEAN, redact_books_deps); } { static const spa_feature_t redact_datasets_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_REDACTED_DATASETS, "com.delphix:redacted_datasets", "redacted_datasets", "Support for " "redacted datasets, produced by receiving a redacted zfs send " "stream.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_UINT64_ARRAY, redact_datasets_deps); } { static const spa_feature_t bookmark_written_deps[] = { SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_BOOKMARK_WRITTEN, "com.delphix:bookmark_written", "bookmark_written", "Additional accounting, enabling the written# property" "(space written since a bookmark), and estimates of send stream " "sizes for incrementals from bookmarks.", 0, ZFEATURE_TYPE_BOOLEAN, bookmark_written_deps); } zfeature_register(SPA_FEATURE_DEVICE_REMOVAL, "com.delphix:device_removal", "device_removal", "Top-level vdevs can be removed, reducing logical pool size.", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); { static const spa_feature_t obsolete_counts_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_DEVICE_REMOVAL, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS, "com.delphix:obsolete_counts", "obsolete_counts", "Reduce memory used by removed devices when their blocks are " "freed or remapped.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, obsolete_counts_deps); } { static const spa_feature_t userobj_accounting_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_USEROBJ_ACCOUNTING, "org.zfsonlinux:userobj_accounting", "userobj_accounting", "User/Group object accounting.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, userobj_accounting_deps); } { static const spa_feature_t bookmark_v2_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_BOOKMARK_V2, "com.datto:bookmark_v2", "bookmark_v2", "Support for larger bookmarks", 0, ZFEATURE_TYPE_BOOLEAN, bookmark_v2_deps); } { static const spa_feature_t encryption_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_ENCRYPTION, "com.datto:encryption", "encryption", "Support for dataset level encryption", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, encryption_deps); } { static const spa_feature_t project_quota_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_PROJECT_QUOTA, "org.zfsonlinux:project_quota", "project_quota", "space/object accounting based on project ID.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, project_quota_deps); } zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES, "org.zfsonlinux:allocation_classes", "allocation_classes", "Support for separate allocation classes.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_RESILVER_DEFER, "com.datto:resilver_defer", "resilver_defer", "Support for deferring new resilvers when one is already running.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); zfeature_register(SPA_FEATURE_DEVICE_REBUILD, "org.openzfs:device_rebuild", "device_rebuild", "Support for sequential mirror/dRAID device rebuilds", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); { static const spa_feature_t zstd_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_ZSTD_COMPRESS, "org.freebsd:zstd_compress", "zstd_compress", "zstd compression algorithm support.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps); } zfeature_register(SPA_FEATURE_DRAID, "org.openzfs:draid", "draid", "Support for distributed spare RAID", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); } #if defined(_KERNEL) EXPORT_SYMBOL(zfeature_lookup_guid); EXPORT_SYMBOL(zfeature_lookup_name); EXPORT_SYMBOL(zfeature_is_supported); EXPORT_SYMBOL(zfeature_is_valid_guid); EXPORT_SYMBOL(zfeature_depends_on); EXPORT_SYMBOL(zpool_feature_init); EXPORT_SYMBOL(spa_feature_table); #endif