diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 67ec23d471f4..0b820a39a2e8 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -1,8793 +1,8830 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. * Copyright 2016 Igor Kozhukhov . * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" #include "statcommon.h" static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); static int zpool_do_add(int, char **); static int zpool_do_remove(int, char **); static int zpool_do_labelclear(int, char **); static int zpool_do_checkpoint(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); static int zpool_do_status(int, char **); static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); static int zpool_do_reopen(int, char **); static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_resilver(int, char **); static int zpool_do_import(int, char **); static int zpool_do_export(int, char **); static int zpool_do_upgrade(int, char **); static int zpool_do_history(int, char **); static int zpool_do_events(int, char **); static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); static int zpool_do_sync(int, char **); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. 
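 *
 * A usage note, stated as an assumption about the runtime environment
 * rather than anything this file configures: when the binary runs with
 * libumem, the library consults these hooks for the defaults that would
 * otherwise come from the UMEM_DEBUG and UMEM_LOGGING environment
 * variables, so a DEBUG build gets allocator auditing with no extra
 * setup.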
*/ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_ADD, HELP_ATTACH, HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, HELP_HISTORY, HELP_IMPORT, HELP_IOSTAT, HELP_LABELCLEAR, HELP_LIST, HELP_OFFLINE, HELP_ONLINE, HELP_REPLACE, HELP_REMOVE, HELP_SCRUB, HELP_RESILVER, HELP_STATUS, HELP_UPGRADE, HELP_EVENTS, HELP_GET, HELP_SET, HELP_SPLIT, HELP_SYNC, HELP_REGUID, HELP_REOPEN } zpool_help_t; /* * Flags for stats to display with "zpool iostats" */ enum iostat_type { IOS_DEFAULT = 0, IOS_LATENCY = 1, IOS_QUEUES = 2, IOS_L_HISTO = 3, IOS_RQ_HISTO = 4, IOS_COUNT, /* always last element */ }; /* iostat_type entries as bitmasks */ #define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) #define IOS_LATENCY_M (1ULL << IOS_LATENCY) #define IOS_QUEUES_M (1ULL << IOS_QUEUES) #define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) #define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) /* Mask of all the histo bits */ #define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) /* * Lookup table for iostat flags to nvlist names. Basically a list * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. */ static const char *vsx_type_to_nvlist[IOS_COUNT][11] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, NULL}, [IOS_LATENCY] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, NULL}, [IOS_QUEUES] = { ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, NULL}, [IOS_RQ_HISTO] = { ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, NULL}, }; /* * Given a cb->cb_flags with a histogram bit set, return the iostat_type. * Right now, only one histo bit is ever set at one time, so we can * just do a highbit64(a) */ #define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) typedef struct zpool_command { const char *name; int (*func)(int, char **); zpool_help_t usage; } zpool_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. 
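 *
 * A minimal dispatch sketch, not the lookup this file performs later;
 * the helper name is hypothetical, and it assumes the command_table,
 * NCOMMAND, and current_command definitions below are visible. It shows
 * how the table is meant to be used: match the verb, remember it so
 * usage() can print command-specific help, then hand off to the handler.
 */
static int
example_dispatch(int argc, char **argv)
{
	int i;

	for (i = 0; i < NCOMMAND; i++) {
		if (command_table[i].name != NULL &&
		    strcmp(argv[0], command_table[i].name) == 0) {
			current_command = &command_table[i];
			return (command_table[i].func(argc, argv));
		}
	}
	return (2);	/* unknown command: treat as a usage error */
}
/*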
*/ static zpool_command_t command_table[] = { { "create", zpool_do_create, HELP_CREATE }, { "destroy", zpool_do_destroy, HELP_DESTROY }, { NULL }, { "add", zpool_do_add, HELP_ADD }, { "remove", zpool_do_remove, HELP_REMOVE }, { NULL }, { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, { NULL }, { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, { "status", zpool_do_status, HELP_STATUS }, { NULL }, { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { "resilver", zpool_do_resilver, HELP_RESILVER }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "events", zpool_do_events, HELP_EVENTS }, { NULL }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, { "sync", zpool_do_sync, HELP_SYNC }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) #define VDEV_ALLOC_CLASS_LOGS "logs" static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: return (gettext("\tadd [-fgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-f] [-o property=value] " " \n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_CHECKPOINT: return (gettext("\tcheckpoint [--discard] ...\n")); case HELP_DESTROY: return (gettext("\tdestroy [-f] \n")); case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: return (gettext("\texport [-af] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]]\n" "\t [--rewind-to-checkpoint] [newpool]\n")); case HELP_IOSTAT: return (gettext("\tiostat [[[-c [script1,script2,...]" "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n" "\t [[pool ...]|[pool vdev ...]|[vdev ...]]" " [interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: return (gettext("\tlist [-gHLpPv] [-o property[,...]] " "[-T d|u] [pool] ... 
\n" "\t [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [-f] [-t] ...\n")); case HELP_ONLINE: return (gettext("\tonline [-e] ...\n")); case HELP_REPLACE: return (gettext("\treplace [-f] [-o property=value] " " [new-device]\n")); case HELP_REMOVE: return (gettext("\tremove [-nps] ...\n")); case HELP_REOPEN: return (gettext("\treopen [-n] \n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_STATUS: - return (gettext("\tstatus [-c [script1,script2,...]] [-gLPvxD]" - "[-T d|u] [pool] ... \n" + return (gettext("\tstatus [-c [script1,script2,...]] " + "[-gLpPsvxD] [-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" "\tupgrade [-V version] <-a | pool ...>\n")); case HELP_EVENTS: return (gettext("\tevents [-vHf [pool] | -c]\n")); case HELP_GET: return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " "<\"all\" | property[,...]> ...\n")); case HELP_SET: return (gettext("\tset \n")); case HELP_SPLIT: return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: return (gettext("\treguid \n")); case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); } abort(); /* NOTREACHED */ } /* * Callback routine that will print out a pool property value. */ static int print_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ void usage(boolean_t requested) { FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { int i; (void) fprintf(fp, gettext("usage: zpool command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && ((strcmp(current_command->name, "set") == 0) || (strcmp(current_command->name, "get") == 0) || (strcmp(current_command->name, "list") == 0))) { (void) fprintf(fp, gettext("\nthe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-19s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); (void) fprintf(fp, "\t%-19s ", "feature@..."); (void) fprintf(fp, "YES disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties must be " "appended with a feature name.\nSee zpool-features(5).\n")); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 
0 : 2); } /* * print a pool vdev config for dry runs */ static void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, const char *match, int name_flags) { nvlist_t **child; uint_t c, children; char *vname; boolean_t printed = B_FALSE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (name != NULL) (void) printf("\t%*s%s\n", indent, "", name); return; } for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; char *class = ""; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) class = VDEV_ALLOC_BIAS_LOG; (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &class); if (strcmp(match, class) != 0) continue; if (!printed && name != NULL) { (void) printf("\t%*s%s\n", indent, "", name); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); print_vdev_tree(zhp, vname, child[c], indent + 2, "", name_flags); free(vname); } } static boolean_t prop_list_contains_feature(nvlist_t *proplist) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; nvp = nvlist_next_nvpair(proplist, nvp)) { if (zpool_prop_feature(nvpair_name(nvp))) return (B_TRUE); } return (B_FALSE); } /* * Add a property pair (name, string-value) into a property nvlist. */ static int add_prop_list(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { zpool_prop_t prop = ZPOOL_PROP_INVAL; nvlist_t *proplist; const char *normnm; char *strval; if (*props == NULL && nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); return (1); } proplist = *props; if (poolprop) { const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL && !zpool_prop_feature(propname)) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool property\n"), propname); return (2); } /* * feature@ properties and version should not be specified * at the same time. */ if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && prop_list_contains_feature(proplist))) { (void) fprintf(stderr, gettext("'feature@' and " "'version' properties cannot be specified " "together\n")); return (2); } if (zpool_prop_feature(propname)) normnm = propname; else normnm = zpool_prop_to_name(prop); } else { zfs_prop_t fsprop = zfs_name_to_prop(propname); if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM, B_FALSE)) { normnm = zfs_prop_to_name(fsprop); } else if (zfs_prop_user(propname) || zfs_prop_userquota(propname)) { normnm = propname; } else { (void) fprintf(stderr, gettext("property '%s' is " "not a valid filesystem property\n"), propname); return (2); } } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && prop != ZPOOL_PROP_CACHEFILE) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (2); } if (nvlist_add_string(proplist, normnm, propval) != 0) { (void) fprintf(stderr, gettext("internal " "error: out of memory\n")); return (1); } return (0); } /* * Set a default property pair (name, string-value) in a property nvlist */ static int add_prop_list_default(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { char *pval; if (nvlist_lookup_string(*props, propname, &pval) == 0) return (0); return (add_prop_list(propname, propval, props, B_TRUE)); } /* * zpool add [-fgLnP] [-o property=value] ... 
* * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not add the devices, but display the resulting layout if * they were to be added. * -o Set property=value. * -P Display full path for vdev name. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by get_vdev_spec(), which constructs the nvlist needed to pass to * libzfs. */ int zpool_do_add(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; char *poolname; int ret; zpool_handle_t *zhp; nvlist_t *config; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, "fgLno:P")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'g': name_flags |= VDEV_NAME_GUID; break; case 'L': name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'n': dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 'P': name_flags |= VDEV_NAME_PATH; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); usage(B_FALSE); } poolname = argv[0]; argc--; argv++; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } /* pass off to get_vdev_spec for processing */ nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); } if (dryrun) { nvlist_t *poolnvroot; nvlist_t **l2child; uint_t l2children, c; char *vname; boolean_t hadcache = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " "configuration:\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, "", name_flags | VDEV_NAME_TYPE_ID); print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); /* print other classes: 'dedup', 'special', and 'log' */ print_vdev_tree(zhp, "dedup", poolnvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); print_vdev_tree(zhp, "special", poolnvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); /* Do the same for the 
caches */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { hadcache = B_TRUE; (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { if (!hadcache) (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } ret = 0; } else { ret = (zpool_add(zhp, nvroot) != 0); } nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool remove ... * * Removes the given vdev from the pool. */ int zpool_do_remove(int argc, char **argv) { char *poolname; int i, ret = 0; zpool_handle_t *zhp = NULL; boolean_t stop = B_FALSE; char c; boolean_t noop = B_FALSE; boolean_t parsable = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "nps")) != -1) { switch (c) { case 'n': noop = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 's': stop = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if (stop && noop) { (void) fprintf(stderr, gettext("stop request ignored\n")); return (0); } if (stop) { if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (zpool_vdev_remove_cancel(zhp) != 0) ret = 1; } else { if (argc < 2) { (void) fprintf(stderr, gettext("missing device\n")); usage(B_FALSE); } for (i = 1; i < argc; i++) { if (noop) { uint64_t size; if (zpool_vdev_indirect_size(zhp, argv[i], &size) != 0) { ret = 1; break; } if (parsable) { (void) printf("%s %llu\n", argv[i], (unsigned long long)size); } else { char valstr[32]; zfs_nicenum(size, valstr, sizeof (valstr)); (void) printf("Memory that will be " "used after removing %s: %s\n", argv[i], valstr); } } else { if (zpool_vdev_remove(zhp, argv[i]) != 0) ret = 1; } } } zpool_close(zhp); return (ret); } /* * zpool labelclear [-f] * * -f Force clearing the label for the vdevs which are members of * the exported or foreign pools. * * Verifies that the vdev is not active and zeros out the label information * on the device. */ int zpool_do_labelclear(int argc, char **argv) { char vdev[MAXPATHLEN]; char *name = NULL; struct stat st; int c, fd = -1, ret = 0; nvlist_t *config; pool_state_t state; boolean_t inuse = B_FALSE; boolean_t force = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get vdev name */ if (argc < 1) { (void) fprintf(stderr, gettext("missing vdev name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * Check if we were given absolute path and use it as is. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. 
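 *
 * For example (illustrative names only): a bare "sda" that fails
 * stat(2) may resolve to "/dev/sda", and because that names a whole
 * disk the first partition suffix is appended, so "/dev/sda1" is the
 * device whose label actually gets cleared.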
*/ (void) strlcpy(vdev, argv[0], sizeof (vdev)); if (vdev[0] != '/' && stat(vdev, &st) != 0) { int error; error = zfs_resolve_shortname(argv[0], vdev, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(vdev)) { if (zfs_append_partition(vdev, MAXPATHLEN) == -1) error = ENOENT; } if (error || (stat(vdev, &st) != 0)) { (void) fprintf(stderr, gettext( "failed to find device %s, try specifying absolute " "path instead\n"), argv[0]); return (1); } } if ((fd = open(vdev, O_RDWR)) < 0) { (void) fprintf(stderr, gettext("failed to open %s: %s\n"), vdev, strerror(errno)); return (1); } if (ioctl(fd, BLKFLSBUF) != 0) (void) fprintf(stderr, gettext("failed to invalidate " "cache for %s: %s\n"), vdev, strerror(errno)); if (zpool_read_label(fd, &config, NULL) != 0 || config == NULL) { (void) fprintf(stderr, gettext("failed to check state for %s\n"), vdev); ret = 1; goto errout; } nvlist_free(config); ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); if (ret != 0) { (void) fprintf(stderr, gettext("failed to check state for %s\n"), vdev); ret = 1; goto errout; } if (!inuse) goto wipe_label; switch (state) { default: case POOL_STATE_ACTIVE: case POOL_STATE_SPARE: case POOL_STATE_L2CACHE: (void) fprintf(stderr, gettext( "%s is a member (%s) of pool \"%s\"\n"), vdev, zpool_pool_state_to_name(state), name); ret = 1; goto errout; case POOL_STATE_EXPORTED: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of exported pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_POTENTIALLY_ACTIVE: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of potentially active pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_DESTROYED: /* inuse should never be set for a destroyed pool */ assert(0); break; } wipe_label: ret = zpool_clear_label(fd); if (ret != 0) { (void) fprintf(stderr, gettext("failed to clear label for %s\n"), vdev); } errout: free(name); (void) close(fd); return (ret); } /* * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] ... * * -f Force creation, even if devices appear in use * -n Do not create the pool, but display the resulting layout if it * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. By default it's * '/' * -o Set property=value. * -o Set feature@feature=enabled|disabled. * -d Don't automatically enable all supported pool features * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The * bulk of the vdev processing is done in get_vdev_spec() in zpool_vdev.c. Once * we get the nvlist back from get_vdev_spec(), we either print out the contents * (if '-n' was specified), or pass it to libzfs to do the creation. 
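 *
 * A minimal sketch of the same flow done directly against libzfs,
 * assuming an initialized handle; the function name, pool name, and
 * device path are placeholders, and make_root_vdev() fills in more
 * detail (ashift, whole-disk handling) than shown here. The tree handed
 * to zpool_create() is a "root" nvlist whose ZPOOL_CONFIG_CHILDREN are
 * the leaf vdevs.
 */
static int
example_create_single_disk_pool(libzfs_handle_t *hdl)
{
	nvlist_t *nvroot, *disk;
	nvlist_t *child[1];
	int ret;

	/* describe one leaf disk vdev */
	verify(nvlist_alloc(&disk, NV_UNIQUE_NAME, 0) == 0);
	verify(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_DISK) == 0);
	verify(nvlist_add_string(disk, ZPOOL_CONFIG_PATH,
	    "/dev/sdX") == 0);	/* placeholder device */

	/* wrap it in the root vdev that zpool_create() expects */
	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	child[0] = disk;
	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    child, 1) == 0);

	/* no pool properties or root dataset properties in this sketch */
	ret = (zpool_create(hdl, "examplepool", nvroot, NULL, NULL) != 0);

	nvlist_free(disk);
	nvlist_free(nvroot);
	return (ret);
}
/*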
*/ int zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t enable_all_pool_feat = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; char *tname = NULL; int ret = 1; char *altroot = NULL; char *mountpoint = NULL; nvlist_t *fsprops = NULL; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'd': enable_all_pool_feat = B_FALSE; break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; break; case 'm': /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); goto errout; } *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; /* * If the user is creating a pool that doesn't support * feature flags, don't enable any features. */ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { char *end; u_longlong_t ver; ver = strtoull(propval, &end, 10); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_all_pool_feat = B_FALSE; } } if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) altroot = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -O option\n")); goto errout; } *propval = '\0'; propval++; /* * Mountpoints are checked and then added later. * Uniquely among properties, they can be specified * more than once, to avoid conflict with -m. */ if (0 == strcmp(optarg, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { mountpoint = propval; } else if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) { goto errout; } break; case 't': /* * Sanity check temporary pool name. */ if (strchr(optarg, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create " "'%s': invalid character '/' in temporary " "name\n"), optarg); (void) fprintf(stderr, gettext("use 'zfs " "create' to create a dataset\n")); goto errout; } if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; tname = optarg; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); goto badusage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); goto badusage; } poolname = argv[0]; /* * As a special case, check for use of '/' in the name, and direct the * user to use 'zfs create' instead. 
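 *
 * For example, a hypothetical 'zpool create tank/data sdb' trips the
 * check below and is redirected to 'zfs create tank/data'.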
*/ if (strchr(poolname, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create '%s': invalid " "character '/' in pool name\n"), poolname); (void) fprintf(stderr, gettext("use 'zfs create' to " "create a dataset\n")); goto errout; } /* pass off to get_vdev_spec for bulk processing */ nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, argc - 1, argv + 1); if (nvroot == NULL) goto errout; /* make_root_vdev() allows 0 toplevel children if there are spares */ if (!zfs_allocatable_devs(nvroot)) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto errout; } if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); goto errout; } /* * Check the validity of the mountpoint and direct the user to use the * '-m' mountpoint option if it looks like it's in use. */ if (mountpoint == NULL || (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; DIR *dirp; if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " "'%s': must be an absolute path, 'legacy', or " "'none'\n"), mountpoint); goto errout; } if (mountpoint == NULL) { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s/%s", altroot, poolname); else (void) snprintf(buf, sizeof (buf), "/%s", poolname); } else { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s%s", altroot, mountpoint); else (void) snprintf(buf, sizeof (buf), "%s", mountpoint); } if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); (void) fprintf(stderr, gettext("use '-m' " "option to provide a different default\n")); goto errout; } else if (dirp) { int count = 0; while (count < 3 && readdir(dirp) != NULL) count++; (void) closedir(dirp); if (count > 2) { (void) fprintf(stderr, gettext("mountpoint " "'%s' exists and is not empty\n"), buf); (void) fprintf(stderr, gettext("use '-m' " "option to provide a " "different default\n")); goto errout; } } } /* * Now that the mountpoint's validity has been checked, ensure that * the property is set appropriately prior to creating the pool. */ if (mountpoint != NULL) { ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), mountpoint, &fsprops, B_FALSE); if (ret != 0) goto errout; } ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run * through all the vdevs in the list and print out in an * appropriate hierarchy. */ (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); print_vdev_tree(NULL, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); print_vdev_tree(NULL, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, 0); ret = 0; } else { /* * Hand off to libzfs. */ spa_feature_t i; for (i = 0; i < SPA_FEATURES; i++) { char propname[MAXPATHLEN]; char *propval; zfeature_info_t *feat = &spa_feature_table[i]; (void) snprintf(propname, sizeof (propname), "feature@%s", feat->fi_uname); /* * Only features contained in props will be enabled: * remove from the nvlist every ZFS_FEATURE_DISABLED * value and add every missing ZFS_FEATURE_ENABLED if * enable_all_pool_feat is set.
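 *
 * For example (hypothetical invocation): with 'zpool create -d -o
 * feature@async_destroy=enabled tank sda', props keeps only
 * feature@async_destroy=enabled and every other feature starts out
 * disabled; with no -d and no feature@ options, every entry in
 * spa_feature_table is added as "enabled".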
*/ if (!nvlist_lookup_string(props, propname, &propval)) { if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) (void) nvlist_remove_all(props, propname); } else if (enable_all_pool_feat) { ret = add_prop_list(propname, ZFS_FEATURE_ENABLED, &props, B_TRUE); if (ret != 0) goto errout; } } ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { if (zfs_mount(pool, NULL, 0) == 0) ret = zfs_shareall(pool); zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { (void) fprintf(stderr, gettext("pool name may have " "been omitted\n")); } } errout: nvlist_free(nvroot); nvlist_free(fsprops); nvlist_free(props); return (ret); badusage: nvlist_free(fsprops); nvlist_free(props); usage(B_FALSE); return (2); } /* * zpool destroy * * -f Forcefully unmount any datasets * * Destroy the given pool. Automatically unmounts any datasets in the pool. */ int zpool_do_destroy(int argc, char **argv) { boolean_t force = B_FALSE; int c; char *pool; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { /* * As a special case, check for use of '/' in the name, and * direct the user to use 'zfs destroy' instead. */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("use 'zfs destroy' to " "destroy a dataset\n")); return (1); } if (zpool_disable_datasets(zhp, force) != 0) { (void) fprintf(stderr, gettext("could not destroy '%s': " "could not unmount datasets\n"), zpool_get_name(zhp)); zpool_close(zhp); return (1); } /* The history must be logged as part of the export */ log_history = B_FALSE; ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); return (ret); } typedef struct export_cbdata { boolean_t force; boolean_t hardforce; } export_cbdata_t; /* * Export one pool */ int zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; if (zpool_disable_datasets(zhp, cb->force) != 0) return (1); /* The history must be logged as part of the export */ log_history = B_FALSE; if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) return (1); } else if (zpool_export(zhp, cb->force, history_str) != 0) { return (1); } return (0); } /* * zpool export [-f] ... * * -a Export all pools * -f Forcefully unmount datasets * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, * then the datasets will be forcefully unmounted. 
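 *
 * A minimal sketch of the same operation done directly against libzfs,
 * assuming an initialized handle; the function and pool names are
 * placeholders. Datasets must be unmounted first, which is exactly what
 * zpool_export_one() below does before calling zpool_export().
 */
static int
example_export_pool(libzfs_handle_t *hdl)
{
	zpool_handle_t *zhp;
	int ret;

	if ((zhp = zpool_open(hdl, "examplepool")) == NULL)
		return (1);
	if (zpool_disable_datasets(zhp, B_FALSE) != 0) {	/* unmount */
		zpool_close(zhp);
		return (1);
	}
	ret = (zpool_export(zhp, B_FALSE, NULL) != 0);	/* no history msg */
	zpool_close(zhp);
	return (ret);
}
/*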
*/ int zpool_do_export(int argc, char **argv) { export_cbdata_t cb; boolean_t do_all = B_FALSE; boolean_t force = B_FALSE; boolean_t hardforce = B_FALSE; int c, ret; /* check options */ while ((c = getopt(argc, argv, "afF")) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'f': force = B_TRUE; break; case 'F': hardforce = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.force = force; cb.hardforce = hardforce; argc -= optind; argv += optind; if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb)); } /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb); return (ret); } /* * Given a vdev configuration, determine the maximum width needed for the device * name column. */ static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, int name_flags) { char *name; nvlist_t **child; uint_t c, children; int ret; name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); if (strlen(name) + depth > max) max = strlen(name) + depth; free(name); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } return (max); } typedef struct spare_cbdata { uint64_t cb_guid; zpool_handle_t *cb_zhp; } spare_cbdata_t; static boolean_t find_vdev(nvlist_t *nv, uint64_t search) { uint64_t guid; nvlist_t **child; uint_t c, children; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && search == guid) return (B_TRUE); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev(child[c], search)) return (B_TRUE); } return (B_FALSE); } static int find_spare(zpool_handle_t *zhp, void *data) { spare_cbdata_t *cbp = data; nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (find_vdev(nvroot, cbp->cb_guid)) { cbp->cb_zhp = zhp; return (1); } zpool_close(zhp); return (0); } typedef struct status_cbdata { int cb_count; int cb_name_flags; int cb_namewidth; boolean_t cb_allpools; boolean_t cb_verbose; + boolean_t cb_literal; boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; boolean_t cb_print_status; + boolean_t cb_print_slow_ios; vdev_cmd_data_list_t *vcdl; } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. 
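 *
 * For example: is_blank_str(NULL), is_blank_str(""), and
 * is_blank_str(" \t") all return 1, while is_blank_str("ONLINE")
 * returns 0; zpool_print_cmd() below uses this to swap a "-" in for
 * empty column values.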
*/ static int is_blank_str(char *str) { while (str != NULL && *str != '\0') { if (!isblank(*str)) return (0); str++; } return (1); } /* Print command output lines for specific vdev in a specific pool */ static void zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path) { vdev_cmd_data_t *data; int i, j; char *val; for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) != 0) || (strcmp(vcdl->data[i].pool, pool) != 0)) { /* Not the vdev we're looking for */ continue; } data = &vcdl->data[i]; /* Print out all the output values for this vdev */ for (j = 0; j < vcdl->uniq_cols_cnt; j++) { val = NULL; /* Does this vdev have values for this column? */ for (int k = 0; k < data->cols_cnt; k++) { if (strcmp(data->cols[k], vcdl->uniq_cols[j]) == 0) { /* yes it does, record the value */ val = data->lines[k]; break; } } /* * Mark empty values with dashes to make output * awk-able. */ if (is_blank_str(val)) val = "-"; printf("%*s", vcdl->uniq_cols_width[j], val); if (j < vcdl->uniq_cols_cnt - 1) printf(" "); } /* Print out any values that aren't in a column at the end */ for (j = data->cols_cnt; j < data->lines_cnt; j++) { /* Did we have any columns? If so print a spacer. */ if (vcdl->uniq_cols_cnt > 0) printf(" "); val = data->lines[j]; printf("%s", val ? val : ""); } break; } } /* * Print out configuration state as requested by status_callback. */ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth, boolean_t isspare) { nvlist_t **child, *root; uint_t c, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t spare_cb; const char *state; char *type; char *path = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) return; state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for * online drives. 
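 *
 * For example, an idle hot spare shows a state of "AVAIL", while one
 * that has taken over for a failed device shows "INUSE" plus the
 * "currently in use" note printed by the VDEV_AUX_SPARED case below.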
*/ if (vs->vs_aux == VDEV_AUX_SPARED) state = "INUSE"; else if (vs->vs_state == VDEV_STATE_HEALTHY) state = "AVAIL"; } (void) printf("\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); if (!isspare) { - zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); - zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); - zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); - (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); + if (cb->cb_literal) { + printf(" %5llu %5llu %5llu", + (u_longlong_t)vs->vs_read_errors, + (u_longlong_t)vs->vs_write_errors, + (u_longlong_t)vs->vs_checksum_errors); + } else { + zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); + zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); + zfs_nicenum(vs->vs_checksum_errors, cbuf, + sizeof (cbuf)); + printf(" %5s %5s %5s", rbuf, wbuf, cbuf); + } + + if (cb->cb_print_slow_ios) { + if (children == 0) { + /* Only leaf vdevs have slow IOs */ + zfs_nicenum(vs->vs_slow_ios, rbuf, + sizeof (rbuf)); + } else { + snprintf(rbuf, sizeof (rbuf), "-"); + } + + if (cb->cb_literal) + printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); + else + printf(" %5s", rbuf); + } + } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &notpresent) == 0) { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); (void) printf(" was %s", path); } else if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &spare_cb.cb_guid) == 0); if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { if (strcmp(zpool_get_name(spare_cb.cb_zhp), zpool_get_name(zhp)) == 0) (void) printf(gettext("currently in " "use")); else (void) printf(gettext("in use by " "pool '%s'"), zpool_get_name(spare_cb.cb_zhp)); zpool_close(spare_cb.cb_zhp); } else { (void) printf(gettext("currently in use")); } break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_IO_FAILURE: (void) printf(gettext("experienced I/O failures")); break; case VDEV_AUX_BAD_LOG: (void) printf(gettext("bad intent log")); break; case VDEV_AUX_EXTERNAL: (void) printf(gettext("external device fault")); break; case VDEV_AUX_SPLIT_POOL: (void) printf(gettext("split into new pool")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; default: (void) printf(gettext("corrupted data")); break; } } /* The root vdev has the scrub/resilver stats */ root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) { if (vs->vs_scan_processed != 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ?
"resilvering" : "repairing"); } else if (vs->vs_resilver_deferred) { (void) printf(gettext(" (awaiting resilver)")); } } if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } (void) printf("\n"); for (c = 0; c < children; c++) { uint64_t islog = B_FALSE, ishole = B_FALSE; /* Don't print logs or holes here */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole); if (islog || ishole) continue; /* Only print normal classes here */ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_status_config(zhp, cb, vname, child[c], depth + 2, isspare); free(vname); } } /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ static void print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth) { nvlist_t **child; uint_t c, children; vdev_stat_t *vs; char *type, *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MISSING) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; default: (void) printf(gettext("corrupted data")); break; } } (void) printf("\n"); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_import_config(cb, vname, child[c], depth + 2); free(vname); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } } /* * Print specialized class vdevs. * * These are recorded as top level vdevs in the main pool child array * but with "is_log" set to 1 or an "alloc_bias" string. 
We use either * print_status_config() or print_import_config() to print the top level * class vdevs then any of their children (eg mirrored slogs) are printed * recursively - which works because only the top level vdev is marked. */ static void print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, const char *class) { uint_t c, children; nvlist_t **child; boolean_t printed = B_FALSE; assert(zhp != NULL || !cb->cb_verbose); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; char *bias = NULL; char *type = NULL; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class) != 0) continue; if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { (void) printf("\t%s\t\n", gettext(class)); printed = B_TRUE; } char *name = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, B_FALSE); else print_import_config(cb, name, child[c], 2); free(name); } } /* * Display the status for the given pool. */ static void show_import(nvlist_t *config) { uint64_t pool_state; vdev_stat_t *vs; char *name; uint64_t guid; uint64_t hostid = 0; char *msgid; char *hostname = "unknown"; nvlist_t *nvroot, *nvinfo; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t vsc; char *comment; status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); reason = zpool_import_status(config, &msgid, &errata); (void) printf(gettext(" pool: %s\n"), name); (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); (void) printf(gettext(" state: %s"), health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" status: One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: (void) printf(gettext(" status: One or more devices contains " "corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf( gettext(" status: The pool data is corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: (void) printf(gettext(" status: One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: (void) printf(gettext(" status: The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: (void) printf(gettext(" status: The pool is formatted using a " "legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" status: The pool is formatted using an " "incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: (void) printf(gettext(" status: Some supported features are " "not 
enabled on the pool.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("status: The pool uses the following " "feature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: (void) printf(gettext("status: The pool can only be accessed " "in read-only mode on this system. It\n\tcannot be " "accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); break; case ZPOOL_STATUS_HOSTID_ACTIVE: (void) printf(gettext(" status: The pool is currently " "imported by another system.\n")); break; case ZPOOL_STATUS_HOSTID_REQUIRED: (void) printf(gettext(" status: The pool has the " "multihost property on. It cannot\n\tbe safely imported " "when the system hostid is not set.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: (void) printf(gettext(" status: The pool was last accessed by " "another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: (void) printf(gettext(" status: One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: (void) printf(gettext(" status: An intent log record cannot be " "read.\n")); break; case ZPOOL_STATUS_RESILVERING: (void) printf(gettext(" status: One or more devices were being " "resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: (void) printf(gettext(" status: Errata #%d detected.\n"), errata); break; default: /* * No other status can be seen when importing pools. */ assert(reason == ZPOOL_STATUS_OK); } /* * Print out an action according to the overall state of the pool. */ if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); } else if (reason == ZPOOL_STATUS_ERRATA) { switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: (void) printf(gettext(" action: The pool can " "be imported using its name or numeric " "identifier,\n\thowever there is a compat" "ibility issue which should be corrected" "\n\tby running 'zpool scrub'\n")); break; case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: (void) printf(gettext(" action: The pool can" "not be imported with this version of ZFS " "due to\n\tan active asynchronous destroy. " "Revert to an earlier version\n\tand " "allow the destroy to complete before " "updating.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted datasets contain an on-disk " "incompatibility, which\n\tneeds to be " "corrected. Backup these datasets to new " "encrypted datasets\n\tand destroy the " "old ones.\n")); break; default: /* * All errata must contain an action message. */ assert(0); } } else { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. 
The\n\tfault " "tolerance of the pool may be compromised if imported.\n")); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" action: The pool cannot be " "imported. Access the pool on a system running " "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("action: The pool cannot be " "imported. Access the pool on a system that " "supports\n\tthe required feature(s), or recreate " "the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: (void) printf(gettext("action: The pool cannot be " "imported in read-write mode. Import the pool " "with\n" "\t\"-o readonly=on\", access the pool on a system " "that supports the\n\trequired feature(s), or " "recreate the pool from backup.\n")); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" action: The pool cannot be " "imported. Attach the missing\n\tdevices and try " "again.\n")); break; case ZPOOL_STATUS_HOSTID_ACTIVE: VERIFY0(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo)); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) printf(gettext(" action: The pool must be " "exported from %s (hostid=%lx)\n\tbefore it " "can be safely imported.\n"), hostname, (unsigned long) hostid); break; case ZPOOL_STATUS_HOSTID_REQUIRED: (void) printf(gettext(" action: Set a unique system " "hostid with the zgenhostid(8) command.\n")); break; default: (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); } } /* Print the comment attached to the pool. 
*/ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) (void) printf(gettext("comment: %s\n"), comment); /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": */ if (((vs->vs_state == VDEV_STATE_CLOSED) || (vs->vs_state == VDEV_STATE_CANT_OPEN)) && (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext("\tThe pool was destroyed, " "but can be imported using the '-Df' flags.\n")); else if (pool_state != POOL_STATE_EXPORTED) (void) printf(gettext("\tThe pool may be active on " "another system, but can be imported using\n\t" "the '-f' flag.\n")); } if (msgid != NULL) (void) printf(gettext(" see: http://zfsonlinux.org/msg/%s\n"), msgid); (void) printf(gettext(" config:\n\n")); cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), VDEV_NAME_TYPE_ID); if (cb.cb_namewidth < 10) cb.cb_namewidth = 10; print_import_config(&cb, name, nvroot, 0); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " "be part of this pool, though their\n\texact " "configuration cannot be determined.\n")); } } static boolean_t zfs_force_import_required(nvlist_t *config) { uint64_t state; uint64_t hostid = 0; nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state != MMP_STATE_INACTIVE) return (B_TRUE); } return (B_FALSE); } /* * Perform the import for the given configuration. This passes the heavy * lifting off to zpool_import_props(), and then mounts the datasets contained * within the pool. 
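 *
 * A minimal caller sketch; the helper is hypothetical (it would need to
 * follow the definition below, or a prototype, to compile) but shows
 * the shape of the call the real import paths make: take a config
 * nvlist discovered during pool search and import it under its original
 * name with default mount options, properties, and flags.
 */
static int
example_do_import(nvlist_t *config)
{
	return (do_import(config, NULL, NULL, NULL, ZFS_IMPORT_NORMAL));
}
/*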
*/ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, nvlist_t *props, int flags) { int ret = 0; zpool_handle_t *zhp; char *name; uint64_t version; name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " "is formatted using an unsupported ZFS version\n"), name); return (1); } else if (zfs_force_import_required(config) && !(flags & ZFS_IMPORT_ANY_HOST)) { mmp_state_t mmp_state = MMP_STATE_INACTIVE; nvlist_t *nvinfo; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state == MMP_STATE_ACTIVE) { char *hostname = ""; uint64_t hostid = 0; if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool is imported on %s (hostid: " "0x%lx)\nExport the pool on the other system, " "then run 'zpool import'.\n"), name, hostname, (unsigned long) hostid); } else if (mmp_state == MMP_STATE_NO_HOSTID) { (void) fprintf(stderr, gettext("Cannot import '%s': " "pool has the multihost property on and the\n" "system's hostid is not set. Set a unique hostid " "with the zgenhostid(8) command.\n"), name); } else { char *hostname = ""; uint64_t timestamp = 0; uint64_t hostid = 0; if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP)) timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool was previously in use from another system.\n" "Last accessed by %s (hostid=%lx) at %s" "The pool can be imported, use 'zpool import -f' " "to import the pool.\n"), name, hostname, (unsigned long)hostid, ctime((time_t *)&timestamp)); } return (1); } if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) return (1); if (newname != NULL) name = (char *)newname; if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) return (1); /* * Loading keys is best effort. We don't want to return immediately * if it fails but we do want to give the error to the caller.
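* For example, a dataset whose key material is unavailable still imports, but stays locked until a later 'zfs load-key' succeeds.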
*/ if (flags & ZFS_IMPORT_LOAD_KEYS) { ret = zfs_crypto_attempt_load_keys(g_zfs, name); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY) && zpool_enable_datasets(zhp, mntopts, 0) != 0) { zpool_close(zhp); return (1); } zpool_close(zhp); return (ret); } typedef struct target_exists_args { const char *poolname; uint64_t poolguid; } target_exists_args_t; static int name_or_guid_exists(zpool_handle_t *zhp, void *data) { target_exists_args_t *args = data; nvlist_t *config = zpool_get_config(zhp, NULL); int found = 0; if (config == NULL) return (0); if (args->poolname != NULL) { char *pool_name; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); if (strcmp(pool_name, args->poolname) == 0) found = 1; } else { uint64_t pool_guid; verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); if (pool_guid == args->poolguid) found = 1; } zpool_close(zhp); return (found); } /* * zpool checkpoint <pool> * checkpoint --discard <pool> * * -d Discard the checkpoint from a checkpointed * --discard pool. * * Checkpoints the specified pool, by taking a "snapshot" of its * current state. A pool can only have one checkpoint at a time. */ int zpool_do_checkpoint(int argc, char **argv) { boolean_t discard; char *pool; zpool_handle_t *zhp; int c, err; struct option long_options[] = { {"discard", no_argument, NULL, 'd'}, {0, 0, 0, 0} }; discard = B_FALSE; while ((c = getopt_long(argc, argv, ":d", long_options, NULL)) != -1) { switch (c) { case 'd': discard = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open(g_zfs, pool)) == NULL) { /* As a special case, check for use of '/' in the name */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("'zpool checkpoint' " "doesn't work on datasets. To save the state " "of a dataset from a specific point in time " "please use 'zfs snapshot'\n")); return (1); } if (discard) err = (zpool_discard_checkpoint(zhp) != 0); else err = (zpool_checkpoint(zhp) != 0); zpool_close(zhp); return (err); } #define CHECKPOINT_OPT 1024 /* * zpool import [-d dir] [-D] * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile] [-f] [-n] [-F] <pool | id> [newpool] * * -c Read pool information from a cachefile instead of searching * devices. * * -d Scan in a specific directory, other than /dev/. More than * one directory can be specified using multiple '-d' options. * * -D Scan for previously destroyed pools or import all or only * specified destroyed pools. * * -R Temporarily import the pool, with all mountpoints relative to * the given root. The pool will remain exported when the machine * is rebooted. * * -V Import even in the presence of faulted vdevs. This is an * intentionally undocumented option for testing purposes, and * treats the pool configuration as complete, leaving any bad * vdevs in the FAULTED state. In other words, it does verbatim * import. * * -f Force import, even if it appears that the pool is active. * * -F Attempt rewind if necessary. * * -n See if rewind would work, but don't actually rewind. * * -N Import the pool but don't mount datasets.
* * -T Specify a starting txg to use for import. This is an * intentionally undocumented option for testing purposes. * * -a Import all pools found. * * -l Load encryption keys while importing. * * -o Set property=value and/or temporary mount options (without '='). * * -s Scan using the default search path, the libblkid cache will * not be consulted. * * --rewind-to-checkpoint * Import the pool and revert back to the checkpoint. * * The import command scans for pools to import, and imports pools based on pool * name or GUID. The pool can also be renamed as part of the import process. */ int zpool_do_import(int argc, char **argv) { char **searchdirs = NULL; char *env, *envdup = NULL; int nsearch = 0; int c; int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; nvpair_t *elem; nvlist_t *config; uint64_t searchguid = 0; char *searchname = NULL; char *propval; nvlist_t *found_config; nvlist_t *policy = NULL; nvlist_t *props = NULL; boolean_t first; int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; uint64_t pool_state, txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; char *endptr; struct option long_options[] = { {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX", long_options, NULL)) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'c': cachefile = optarg; break; case 'd': if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); } else { char **tmp = safe_malloc((nsearch + 1) * sizeof (char *)); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); free(searchdirs); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 'D': do_destroyed = B_TRUE; break; case 'f': flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; case 'l': flags |= ZFS_IMPORT_LOAD_KEYS; break; case 'm': flags |= ZFS_IMPORT_MISSING_LOG; break; case 'n': dryrun = B_TRUE; break; case 'N': flags |= ZFS_IMPORT_ONLY; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto error; } else { mntopts = optarg; } break; case 'R': if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto error; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 's': do_scan = B_TRUE; break; case 't': flags |= ZFS_IMPORT_TEMP_NAME; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 'T': errno = 0; txg = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("invalid txg value\n")); usage(B_FALSE); } rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; break; case 'V': flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind = B_TRUE; break; case CHECKPOINT_OPT: flags |= ZFS_IMPORT_CHECKPOINT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (cachefile && nsearch != 0) { (void) fprintf(stderr, gettext("-c is incompatible with -d\n"));
usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) { (void) fprintf(stderr, gettext("-l is incompatible with -N\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) { (void) fprintf(stderr, gettext("-l is only meaningful during " "an import\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In the future, we can capture further policy and include it here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) goto error; /* check argument count */ if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } /* * Check for the effective uid. We do this explicitly here because * otherwise any attempt to discover pools will silently fail. */ if (argc == 0 && geteuid() != 0) { (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); if (searchdirs != NULL) free(searchdirs); nvlist_free(props); nvlist_free(policy); return (1); } /* * Depending on the arguments given, we do one of the following: * * <none> Iterate through all pools and display information about * each one. * * -a Iterate through all pools and try to import each one. * * <id> Find the pool that corresponds to the given GUID/pool * name and import that one. * * -D Above options apply only to destroyed pools. */ if (argc != 0) { char *endptr; errno = 0; searchguid = strtoull(argv[0], &endptr, 10); if (errno != 0 || *endptr != '\0') { searchname = argv[0]; searchguid = 0; } found_config = NULL; /* * User specified a name or guid. Ensure it's unique. */ target_exists_args_t search = {searchname, searchguid}; pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search); } /* * Check the environment for the preferred search path.
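* ZPOOL_IMPORT_PATH is a colon-separated directory list, e.g. * ZPOOL_IMPORT_PATH="/dev/disk/by-vdev:/dev/disk/by-id"; each component is * treated like an extra '-d' directory.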
*/ if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) { char *dir; envdup = strdup(env); dir = strtok(envdup, ":"); while (dir != NULL) { if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); } else { char **tmp = safe_malloc((nsearch + 1) * sizeof (char *)); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); free(searchdirs); searchdirs = tmp; } searchdirs[nsearch++] = dir; dir = strtok(NULL, ":"); } } idata.path = searchdirs; idata.paths = nsearch; idata.poolname = searchname; idata.guid = searchguid; idata.cachefile = cachefile; idata.scan = do_scan; idata.policy = policy; pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); if (pools != NULL && pool_exists && (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name already exists\n"), argv[0]); (void) fprintf(stderr, gettext("use the form '%s " "<pool | id> <newpool>' to give it a new name\n"), "zpool import"); err = 1; } else if (pools == NULL && pool_exists) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name is already created/imported,\n"), argv[0]); (void) fprintf(stderr, gettext("and no additional pools " "with that name were found\n")); err = 1; } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } err = 1; } if (err == 1) { if (searchdirs != NULL) free(searchdirs); if (envdup != NULL) free(envdup); nvlist_free(policy); nvlist_free(pools); nvlist_free(props); return (1); } /* * At this point we have a list of import candidate configs. Even if * we were searching by pool name or guid, we still need to * post-process the list to deal with pool state and possible * duplicate names. */ err = 0; elem = NULL; first = B_TRUE; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, &config) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) continue; if (do_destroyed && pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, policy) == 0); if (argc == 0) { if (first) first = B_FALSE; else if (!do_all) (void) printf("\n"); if (do_all) { err |= do_import(config, NULL, mntopts, props, flags); } else { show_import(config); } } else if (searchname != NULL) { char *name; /* * We are searching for a pool based on name. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); if (strcmp(name, searchname) == 0) { if (found_config != NULL) { (void) fprintf(stderr, gettext( "cannot import '%s': more than " "one matching pool\n"), searchname); (void) fprintf(stderr, gettext( "import by numeric ID instead\n")); err = B_TRUE; } found_config = config; } } else { uint64_t guid; /* * Search for a pool by guid. */ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); if (guid == searchguid) found_config = config; } } /* * If we were searching for a specific pool, verify that we found a * pool, and then do the import. */ if (argc != 0 && err == 0) { if (found_config == NULL) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); err = B_TRUE; } else { err |= do_import(found_config, argc == 1 ? NULL : argv[1], mntopts, props, flags); } } /* * If we were just looking for pools, report an error if none were * found.
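* ("Just looking" is the argc == 0 case above, where candidates are only * displayed via show_import() and nothing is actually imported.)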
*/ if (argc == 0 && first) (void) fprintf(stderr, gettext("no pools available to import\n")); error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); if (searchdirs != NULL) free(searchdirs); if (envdup != NULL) free(envdup); return (err ? 1 : 0); } /* * zpool sync [-f] [pool] ... * * -f (undocumented) force uberblock (and config including zpool cache file) * update. * * Sync the specified pool(s). * Without arguments "zpool sync" will sync all pools. * This command initiates TXG sync(s) and will return after the TXG(s) commit. * */ static int zpool_do_sync(int argc, char **argv) { int ret; boolean_t force = B_FALSE; /* check options */ while ((ret = getopt(argc, argv, "f")) != -1) { switch (ret) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_sync_one on all pools */ ret = for_each_pool(argc, argv, B_FALSE, NULL, zpool_sync_one, &force); return (ret); } typedef struct iostat_cbdata { uint64_t cb_flags; int cb_name_flags; int cb_namewidth; int cb_iteration; char **cb_vdev_names; /* Only show these vdevs */ unsigned int cb_vdev_names_count; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_scripted; zpool_list_t *cb_list; vdev_cmd_data_list_t *vcdl; } iostat_cbdata_t; /* iostat labels */ typedef struct name_and_columns { const char *name; /* Column name */ unsigned int columns; /* Center name to this number of columns */ } name_and_columns_t; #define IOSTAT_MAX_LABELS 11 /* Max number of labels on one line */ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}, [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {"scrub"}}, [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, {NULL}}, [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"sync_queue", 2}, {"async_queue", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, {NULL}}, }; /* Shorthand - if "columns" field not set, default to 1 column */ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, {"write"}, {NULL}}, [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"wait"}, {NULL}}, [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"scrub"}, {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, }; static const char *histo_to_title[] = { [IOS_L_HISTO] = "latency", [IOS_RQ_HISTO] = "req_size", }; /* * Return the number of labels in a null-terminated name_and_columns_t * array. * */ static unsigned int label_array_len(const name_and_columns_t *labels) { int i = 0; while (labels[i].name) i++; return (i); } /* * Return the number of strings in a null-terminated string array. 
* For example: * * const char *foo[] = {"bar", "baz", NULL} * * returns 2 */ static uint64_t str_array_len(const char *array[]) { uint64_t i = 0; while (array[i]) i++; return (i); } /* * Return a default column width for default/latency/queue columns. This does * not include histograms, which have their columns autosized. */ static unsigned int default_column_width(iostat_cbdata_t *cb, enum iostat_type type) { unsigned long column_width = 5; /* Normal niceprint */ static unsigned long widths[] = { /* * Choose some sane default column sizes for printing the * raw numbers. */ [IOS_DEFAULT] = 15, /* 1PB capacity */ [IOS_LATENCY] = 10, /* 1B ns = 10sec */ [IOS_QUEUES] = 6, /* 1M queue entries */ }; if (cb->cb_literal) column_width = widths[type]; return (column_width); } /* * Print the column labels, i.e.: * * capacity operations bandwidth * alloc free read write read write ... * * If force_column_width is set, use it for the column width. If not set, use * the default column width. */ void print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) { int i, idx, s; unsigned int text_start, rw_column_width, spaces_to_end; uint64_t flags = cb->cb_flags; uint64_t f; unsigned int column_width = force_column_width; /* For each bit set in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; if (!force_column_width) column_width = default_column_width(cb, idx); /* Print our top labels centered over "read write" label. */ for (i = 0; i < label_array_len(labels[idx]); i++) { const char *name = labels[idx][i].name; /* * We treat labels[][].columns == 0 as shorthand * for one column. It makes writing out the label * tables more concise. */ unsigned int columns = MAX(1, labels[idx][i].columns); unsigned int slen = strlen(name); rw_column_width = (column_width * columns) + (2 * (columns - 1)); text_start = (int)((rw_column_width)/columns - slen/columns); printf(" "); /* Two spaces between columns */ /* Space from beginning of column to label */ for (s = 0; s < text_start; s++) printf(" "); printf("%s", name); /* Print space after label to end of column */ spaces_to_end = rw_column_width - text_start - slen; for (s = 0; s < spaces_to_end; s++) printf(" "); } } } /* * print_cmd_columns - Print custom column titles from -c * * If the user specified "zpool status|iostat -c", then print their custom * column titles in the header. For example, print_cmd_columns() would print * the " col1 col2" part of this: * * $ zpool iostat -vc 'echo col1=val1; echo col2=val2' * ... * capacity operations bandwidth * pool alloc free read write read write col1 col2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- * mypool 269K 1008M 0 0 107 946 * mirror 269K 1008M 0 0 107 946 * sdb - - 0 0 102 473 val1 val2 * sdc - - 0 0 5 473 val1 val2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- */ void print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) { int i, j; vdev_cmd_data_t *data = &vcdl->data[0]; if (vcdl->count == 0 || data == NULL) return; /* * Each vdev cmd should have the same column names unless the user did * something weird with their cmd. Just take the column names from the * first vdev and assume it works for all of them.
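* (In practice the loop below prints the precomputed unique column set in * vcdl->uniq_cols[], padded to vcdl->uniq_cols_width[].)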
*/ for (i = 0; i < vcdl->uniq_cols_cnt; i++) { printf(" "); if (use_dashes) { for (j = 0; j < vcdl->uniq_cols_width[i]; j++) printf("-"); } else { printf("%*s", vcdl->uniq_cols_width[i], vcdl->uniq_cols[i]); } } } /* * Utility function to print out a line of dashes like: * * -------------------------------- ----- ----- ----- ----- ----- * * ...or a dashed named-row line like: * * logs - - - - - * * @cb: iostat data * * @force_column_width If non-zero, use the value as the column width. * Otherwise use the default column widths. * * @name: Print a dashed named-row line starting * with @name. Otherwise, print a regular * dashed line. */ static void print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, const char *name) { int i; unsigned int namewidth; uint64_t flags = cb->cb_flags; uint64_t f; int idx; const name_and_columns_t *labels; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), name ? strlen(name) : 0); if (name) { printf("%-*s", namewidth, name); } else { for (i = 0; i < namewidth; i++) (void) printf("-"); } /* For each bit in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { unsigned int column_width; idx = lowbit64(f) - 1; if (force_column_width) column_width = force_column_width; else column_width = default_column_width(cb, idx); labels = iostat_bottom_labels[idx]; for (i = 0; i < label_array_len(labels); i++) { if (name) printf(" %*s-", column_width - 1, " "); else printf(" %.*s", column_width, "--------------------"); } } } static void print_iostat_separator_impl(iostat_cbdata_t *cb, unsigned int force_column_width) { print_iostat_dashes(cb, force_column_width, NULL); } static void print_iostat_separator(iostat_cbdata_t *cb) { print_iostat_separator_impl(cb, 0); } static void print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, const char *histo_vdev_name) { unsigned int namewidth; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), histo_vdev_name ? strlen(histo_vdev_name) : 0); if (histo_vdev_name) printf("%-*s", namewidth, histo_vdev_name); else printf("%*s", namewidth, ""); print_iostat_labels(cb, force_column_width, iostat_top_labels); printf("\n"); printf("%-*s", namewidth, title); print_iostat_labels(cb, force_column_width, iostat_bottom_labels); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 0); printf("\n"); print_iostat_separator_impl(cb, force_column_width); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 1); printf("\n"); } static void print_iostat_header(iostat_cbdata_t *cb) { print_iostat_header_impl(cb, 0, NULL); } /* * Display a single statistic. */ static void print_one_stat(uint64_t value, enum zfs_nicenum_format format, unsigned int column_size, boolean_t scripted) { char buf[64]; zfs_nicenum_format(value, buf, sizeof (buf), format); if (scripted) printf("\t%s", buf); else printf(" %*s", column_size, buf); } /* * Calculate the default vdev stats * * Subtract oldvs from newvs, apply a scaling factor, and save the resulting * stats into calcvs. 
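* The scale factor itself is applied later, in print_iostat_default(): e.g. * over a 5 second interval tdelta is 5e9 ns, so scale = NANOSEC / tdelta = * 0.2 and a delta of 100 ops prints as 20 ops/sec.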
*/ static void calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, vdev_stat_t *calcvs) { int i; memcpy(calcvs, newvs, sizeof (*calcvs)); for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); } /* * Internal representation of the extended iostats data. * * The extended iostat stats are exported in nvlists as either uint64_t arrays * or single uint64_t's. We make both look like arrays to make them easier * to process. In order to make single uint64_t's look like arrays, we set * __data to the stat data, and then set *data = &__data with count = 1. Then, * we can just use *data and count. */ struct stat_array { uint64_t *data; uint_t count; /* Number of entries in data[] */ uint64_t __data; /* Only used when data is a single uint64_t */ }; static uint64_t stat_histo_max(struct stat_array *nva, unsigned int len) { uint64_t max = 0; int i; for (i = 0; i < len; i++) max = MAX(max, array64_max(nva[i].data, nva[i].count)); return (max); } /* * Helper function to lookup a uint64_t array or uint64_t value and store its * data as a stat_array. If the nvpair is a single uint64_t value, then we make * it look like a one element array to make it easier to process. */ static int nvpair64_to_stat_array(nvlist_t *nvl, const char *name, struct stat_array *nva) { nvpair_t *tmp; int ret; verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); switch (nvpair_type(tmp)) { case DATA_TYPE_UINT64_ARRAY: ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); break; case DATA_TYPE_UINT64: ret = nvpair_value_uint64(tmp, &nva->__data); nva->data = &nva->__data; nva->count = 1; break; default: /* Not a uint64_t */ ret = EINVAL; break; } return (ret); } /* * Given a list of nvlist names, look up the extended stats in newnv and oldnv, * subtract them, and return the results in a newly allocated stat_array. * You must free the returned array after you are done with it with * free_calc_stats(). * * Additionally, you can set "oldnv" to NULL if you simply want the newnv * values. 
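* Typical usage, mirroring print_iostat_latency() below: * * nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); * ...read nva[i].data / nva[i].count... * free_calc_stats(nva, ARRAY_SIZE(names));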
*/ static struct stat_array * calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, nvlist_t *newnv) { nvlist_t *oldnvx = NULL, *newnvx; struct stat_array *oldnva, *newnva, *calcnva; int i, j; unsigned int alloc_size = (sizeof (struct stat_array)) * len; /* Extract our extended stats nvlist from the main list */ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, &newnvx) == 0); if (oldnv) { verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, &oldnvx) == 0); } newnva = safe_malloc(alloc_size); oldnva = safe_malloc(alloc_size); calcnva = safe_malloc(alloc_size); for (j = 0; j < len; j++) { verify(nvpair64_to_stat_array(newnvx, names[j], &newnva[j]) == 0); calcnva[j].count = newnva[j].count; alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); calcnva[j].data = safe_malloc(alloc_size); memcpy(calcnva[j].data, newnva[j].data, alloc_size); if (oldnvx) { verify(nvpair64_to_stat_array(oldnvx, names[j], &oldnva[j]) == 0); for (i = 0; i < oldnva[j].count; i++) calcnva[j].data[i] -= oldnva[j].data[i]; } } free(newnva); free(oldnva); return (calcnva); } static void free_calc_stats(struct stat_array *nva, unsigned int len) { int i; for (i = 0; i < len; i++) free(nva[i].data); free(nva); } static void print_iostat_histo(struct stat_array *nva, unsigned int len, iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, double scale) { int i, j; char buf[6]; uint64_t val; enum zfs_nicenum_format format; unsigned int buckets; unsigned int start_bucket; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; /* All these histos are the same size, so just use nva[0].count */ buckets = nva[0].count; if (cb->cb_flags & IOS_RQ_HISTO_M) { /* Start at 512 - req size should never be lower than this */ start_bucket = 9; } else { start_bucket = 0; } for (j = start_bucket; j < buckets; j++) { /* Print histogram bucket label */ if (cb->cb_flags & IOS_L_HISTO_M) { /* Ending range of this bucket */ val = (1UL << (j + 1)) - 1; zfs_nicetime(val, buf, sizeof (buf)); } else { /* Request size (starting range of bucket) */ val = (1UL << j); zfs_nicenum(val, buf, sizeof (buf)); } if (cb->cb_scripted) printf("%llu", (u_longlong_t)val); else printf("%-*s", namewidth, buf); /* Print the values on the line */ for (i = 0; i < len; i++) { print_one_stat(nva[i].data[j] * scale, format, column_width, cb->cb_scripted); } printf("\n"); } } static void print_solid_separator(unsigned int length) { while (length--) printf("-"); printf("\n"); } static void print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv, double scale, const char *name) { unsigned int column_width; unsigned int namewidth; unsigned int entire_width; enum iostat_type type; struct stat_array *nva; const char **names; unsigned int names_len; /* What type of histo are we? */ type = IOS_HISTO_IDX(cb->cb_flags); /* Get NULL-terminated array of nvlist names for our histo */ names = vsx_type_to_nvlist[type]; names_len = str_array_len(names); /* num of names */ nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); if (cb->cb_literal) { column_width = MAX(5, (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); } else { column_width = 5; } namewidth = MAX(cb->cb_namewidth, strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); /* * Calculate the entire line width of what we're printing. 
The * +2 is for the two spaces between columns: */ /* read write */ /* ----- ----- */ /* |___| <---------- column_width */ /* */ /* |__________| <--- entire_width */ /* */ entire_width = namewidth + (column_width + 2) * label_array_len(iostat_bottom_labels[type]); if (cb->cb_scripted) printf("%s\n", name); else print_iostat_header_impl(cb, column_width, name); print_iostat_histo(nva, names_len, cb, column_width, namewidth, scale); free_calc_stats(nva, names_len); if (!cb->cb_scripted) print_solid_separator(entire_width); } /* * Calculate the average latency of a power-of-two latency histogram */ static uint64_t single_histo_average(uint64_t *histo, unsigned int buckets) { int i; uint64_t count = 0, total = 0; for (i = 0; i < buckets; i++) { /* * Our buckets are power-of-two latency ranges. Use the * midpoint latency of each bucket to calculate the average. * For example: * * Bucket Midpoint * 8ns-15ns: 12ns * 16ns-31ns: 24ns * ... */ if (histo[i] != 0) { total += histo[i] * (((1UL << i) + ((1UL << i)/2))); count += histo[i]; } } /* Prevent divide by zero */ return (count == 0 ? 0 : total / count); } static void print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_QUEUES); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; for (i = 0; i < ARRAY_SIZE(names); i++) { val = nva[i].data[0]; print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } static void print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_LATENCY); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAWTIME; else format = ZFS_NICENUM_TIME; /* Print our avg latencies on the line */ for (i = 0; i < ARRAY_SIZE(names); i++) { /* Compute average latency for a latency histo */ val = single_histo_average(nva[i].data, nva[i].count); print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } /* * Print default statistics (capacity/operations/bandwidth) */ static void print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) { unsigned int column_width = default_column_width(cb, IOS_DEFAULT); enum zfs_nicenum_format format; char na; /* char to print for "not applicable" values */ if (cb->cb_literal) { format = ZFS_NICENUM_RAW; na = '0'; } else { format = ZFS_NICENUM_1024; na = '-'; } /* only toplevel vdevs have 
capacity stats */ if (vs->vs_space == 0) { if (cb->cb_scripted) printf("\t%c\t%c", na, na); else printf(" %*c %*c", column_width, na, column_width, na); } else { print_one_stat(vs->vs_alloc, format, column_width, cb->cb_scripted); print_one_stat(vs->vs_space - vs->vs_alloc, format, column_width, cb->cb_scripted); } print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); } static const char *class_name[] = { VDEV_ALLOC_BIAS_DEDUP, VDEV_ALLOC_BIAS_SPECIAL, VDEV_ALLOC_CLASS_LOGS }; /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. * * Returns the number of stat lines printed. */ static unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; uint_t c, children, oldchildren; vdev_stat_t *oldvs, *newvs, *calcvs; vdev_stat_t zerovs = { 0 }; char *vname; int i; int ret = 0; uint64_t tdelta; double scale; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return (ret); /* Allocate after the early return above so nothing leaks */ calcvs = safe_malloc(sizeof (*calcvs)); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } /* Do we only want to see a specific vdev? */ for (i = 0; i < cb->cb_vdev_names_count; i++) { /* Yes we do. Is this the vdev? */ if (strcmp(name, cb->cb_vdev_names[i]) == 0) { /* * This is our vdev. Since it is the only vdev we * will be displaying, make depth = 0 so that it * doesn't get indented. */ depth = 0; break; } } if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) { /* Couldn't match the name */ goto children; } verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); /* * Print the vdev name unless it is a histogram. Histograms * display the vdev name in the header itself. */ if (!(cb->cb_flags & IOS_ANYHISTO_M)) { if (cb->cb_scripted) { printf("%s", name); } else { if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); } } /* Calculate our scaling factor */ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { /* * If we specify printing histograms with no time interval, then * print the histogram numbers over the entire lifetime of the * vdev.
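* (e.g. a bare 'zpool iostat -w' run with no interval reports cumulative * bucket counts rather than a per-second rate).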
*/ scale = 1; } else { if (tdelta == 0) scale = 1.0; else scale = (double)NANOSEC / tdelta; } if (cb->cb_flags & IOS_DEFAULT_M) { calc_default_iostats(oldvs, newvs, calcvs); print_iostat_default(calcvs, cb, scale); } if (cb->cb_flags & IOS_LATENCY_M) print_iostat_latency(cb, oldnv, newnv); if (cb->cb_flags & IOS_QUEUES_M) print_iostat_queues(cb, oldnv, newnv); if (cb->cb_flags & IOS_ANYHISTO_M) { printf("\n"); print_iostat_histos(cb, oldnv, newnv, scale, name); } if (cb->vcdl != NULL) { char *path; if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } if (!(cb->cb_flags & IOS_ANYHISTO_M)) printf("\n"); ret++; children: free(calcvs); if (!cb->cb_verbose) return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } /* * print normal top-level devices */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, &ishole); (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (ishole || islog) continue; if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } /* * print all other top-level devices */ for (uint_t n = 0; n < 3; n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; char *bias = NULL; char *type = NULL; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, class_name[n]); } printf("\n"); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } if (children > 0) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, "cache"); } printf("\n"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } return (ret); } static int refresh_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; boolean_t missing; /* * If the pool has disappeared, remove it from the list and continue. 
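* zpool_refresh_stats() sets 'missing' when the pool can no longer be * opened, and pool_list_remove() then drops it from cb->cb_list.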
*/ if (zpool_refresh_stats(zhp, &missing) != 0) return (-1); if (missing) pool_list_remove(cb->cb_list, zhp); return (0); } /* * Callback to print out the iostats for the given pool. */ int print_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; int ret; newconfig = zpool_get_config(zhp, &oldconfig); if (cb->cb_iteration == 1) oldconfig = NULL; verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, &newnvroot) == 0); if (oldconfig == NULL) oldnvroot = NULL; else verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) { print_iostat_separator(cb); if (cb->vcdl != NULL) { print_cmd_columns(cb->vcdl, 1); } printf("\n"); } return (ret); } static int get_columns(void) { struct winsize ws; int columns = 80; int error; if (isatty(STDOUT_FILENO)) { error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); if (error == 0) columns = ws.ws_col; } else { columns = 999; } return (columns); } int get_namewidth(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *config, *nvroot; int columns; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); unsigned int poolname_len = strlen(zpool_get_name(zhp)); if (!cb->cb_verbose) cb->cb_namewidth = MAX(poolname_len, cb->cb_namewidth); else cb->cb_namewidth = MAX(poolname_len, max_width(zhp, nvroot, 0, cb->cb_namewidth, cb->cb_name_flags)); } /* * The width must be at least 10, but may be as large as the * column width - 42 so that we can still fit in one line. */ columns = get_columns(); if (cb->cb_namewidth < 10) cb->cb_namewidth = 10; if (cb->cb_namewidth > columns - 42) cb->cb_namewidth = columns - 42; return (0); } /* * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { float interval = 0; unsigned long count = 0; int argc = *argcp; /* * Determine if the last argument is an integer or a pool name */ if (argc > 0 && isnumber(argv[argc - 1])) { char *end; errno = 0; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { /* * If this is not a valid number, just plow on. The * user will get a more informative error message later * on. */ interval = 0; } } /* * If the last argument is also an integer, then we have both a count * and an interval. */ if (argc > 0 && isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { interval = 0; } } *iv = interval; *cnt = count; *argcp = argc; } static void get_timestamp_arg(char c) { if (c == 'u') timestamp_fmt = UDATE; else if (c == 'd') timestamp_fmt = DDATE; else usage(B_FALSE); } /* * Return stat flags that are supported by all pools by both the module and * zpool iostat. "*data" should be initialized to all 0xFFs before running. 
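* (get_stat_flags() below does this by seeding the mask with -1, i.e. all ones.)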
* It will get ANDed down until only the flags that are supported on all pools * remain. */ static int get_stat_flags_cb(zpool_handle_t *zhp, void *data) { uint64_t *mask = data; nvlist_t *config, *nvroot, *nvx; uint64_t flags = 0; int i, j; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); /* Default stats are always supported, but for completeness... */ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) flags |= IOS_DEFAULT_M; /* Get our extended stats nvlist from the main list */ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) != 0) { /* * No extended stats; they're probably running an older * module. No big deal, we support that too. */ goto end; } /* For each extended stat, make sure all its nvpairs are supported */ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { if (!vsx_type_to_nvlist[j][0]) continue; /* Start off by assuming the flag is supported, then check */ flags |= (1ULL << j); for (i = 0; vsx_type_to_nvlist[j][i]; i++) { if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { /* flag isn't supported */ flags = flags & ~(1ULL << j); break; } } } end: *mask = *mask & flags; return (0); } /* * Return a bitmask of stats that are supported on all pools by both the module * and zpool iostat. */ static uint64_t get_stat_flags(zpool_list_t *list) { uint64_t mask = -1; /* * get_stat_flags_cb() will lop off bits from "mask" until only the * flags that are supported on all pools remain. */ pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); return (mask); } /* * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. */ static int is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data) { iostat_cbdata_t *cb = cb_data; char *name = NULL; int ret = 0; name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags); if (strcmp(name, cb->cb_vdev_names[0]) == 0) ret = 1; /* match */ free(name); return (ret); } /* * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise. */ static int is_vdev(zpool_handle_t *zhp, void *cb_data) { return (for_each_vdev(zhp, is_vdev_cb, cb_data)); } /* * Check if vdevs are in a pool * * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise * return 0. If pool_name is NULL, then search all pools. */ static int are_vdevs_in_pool(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { char **tmp_name; int ret = 0; int i; int pool_count = 0; if ((argc == 0) || !*argv) return (0); if (pool_name) pool_count = 1; /* Temporarily hijack cb_vdev_names for a second... */ tmp_name = cb->cb_vdev_names; /* Go through our list of prospective vdev names */ for (i = 0; i < argc; i++) { cb->cb_vdev_names = argv + i; /* Is this name a vdev in our pools? */ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, is_vdev, cb); if (!ret) { /* No match */ break; } } cb->cb_vdev_names = tmp_name; return (ret); } static int is_pool_cb(zpool_handle_t *zhp, void *data) { char *name = data; if (strcmp(name, zpool_get_name(zhp)) == 0) return (1); return (0); } /* * Do we have a pool named *name? If so, return 1, otherwise 0. */ static int is_pool(char *name) { return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ static int are_all_pools(int argc, char **argv) { if ((argc == 0) || !*argv) return (0); while (--argc >= 0) if (!is_pool(argv[argc])) return (0); return (1); } /* * Helper function to print out vdev/pool names we can't resolve.
Used for an * error message. */ static void error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { int i; char *name; char *str; for (i = 0; i < argc; i++) { name = argv[i]; if (is_pool(name)) str = gettext("pool"); else if (are_vdevs_in_pool(1, &name, pool_name, cb)) str = gettext("vdev in this pool"); else if (are_vdevs_in_pool(1, &name, NULL, cb)) str = gettext("vdev in another pool"); else str = gettext("unknown"); fprintf(stderr, "\t%s (%s)\n", name, str); } } /* * Same as get_interval_count(), but with additional checks to not misinterpret * guids as interval/count values. Assumes VDEV_NAME_GUID is set in * cb.cb_name_flags. */ static void get_interval_count_filter_guids(int *argc, char **argv, float *interval, unsigned long *count, iostat_cbdata_t *cb) { char **tmpargv = argv; int argc_for_interval = 0; /* Is the last arg an interval value? Or a guid? */ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) { /* * The last arg is not a guid, so it's probably an * interval value. */ argc_for_interval++; if (*argc >= 2 && !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) { /* * The 2nd to last arg is not a guid, so it's probably * an interval value. */ argc_for_interval++; } } /* Point to our list of possible intervals */ tmpargv = &argv[*argc - argc_for_interval]; *argc = *argc - argc_for_interval; get_interval_count(&argc_for_interval, tmpargv, interval, count); } /* * Floating point sleep(). Allows you to pass in a floating point value for * seconds. */ static void fsleep(float sec) { struct timespec req; req.tv_sec = floor(sec); req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; nanosleep(&req, NULL); } /* * Run one of the zpool status/iostat -c scripts with the help (-h) option and * print the result. * * name: Short name of the script ('iostat'). * path: Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat'). */ static void print_zpool_script_help(char *name, char *path) { char *argv[] = {path, "-h", NULL}; char **lines = NULL; int lines_cnt = 0; int rc; rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &lines, &lines_cnt); if (rc != 0 || lines == NULL || lines_cnt <= 0) { if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); return; } for (int i = 0; i < lines_cnt; i++) if (!is_blank_str(lines[i])) printf(" %-14s %s\n", name, lines[i]); libzfs_free_str_array(lines, lines_cnt); } /* * Go through the zpool status/iostat -c scripts in the user's path, run their * help option (-h), and print out the results. */ static void print_zpool_dir_scripts(char *dirpath) { DIR *dir; struct dirent *ent; char fullpath[MAXPATHLEN]; struct stat dir_stat; if ((dir = opendir(dirpath)) != NULL) { /* print all the files and directories within directory */ while ((ent = readdir(dir)) != NULL) { snprintf(fullpath, sizeof (fullpath), "%s/%s", dirpath, ent->d_name); /* Print the scripts */ if (stat(fullpath, &dir_stat) == 0) if (dir_stat.st_mode & S_IXUSR && S_ISREG(dir_stat.st_mode)) print_zpool_script_help(ent->d_name, fullpath); } closedir(dir); } } /* * Print out help text for all zpool status/iostat -c scripts.
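* The list is printed when -c is given without a script name; see the '?' * handling in zpool_do_iostat() below.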
*/ static void print_zpool_script_list(char *subcommand) { char *dir, *sp; printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand); sp = zpool_get_cmd_search_path(); if (sp == NULL) return; dir = strtok(sp, ":"); while (dir != NULL) { print_zpool_dir_scripts(dir); dir = strtok(NULL, ":"); } free(sp); } /* * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLpPvy] * [-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]] * [interval [count]] * * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display statistics for individual vdevs * -h Display help * -p Display values in parsable (exact) format. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -l Display average latency * -q Display queue depths * -w Display latency histograms * -r Display request size histogram * -T Display a timestamp in date(1) or Unix format * -y Omit statistics since boot * * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely * on pool_list_update() to detect the addition of new pools. Configuration * changes are all handled within libzfs. */ int zpool_do_iostat(int argc, char **argv) { int c; int ret; int npools; float interval = 0; unsigned long count = 0; zpool_list_t *list; boolean_t verbose = B_FALSE; boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE; boolean_t omit_since_boot = B_FALSE; boolean_t guid = B_FALSE; boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; iostat_cbdata_t cb = { 0 }; char *cmd = NULL; /* Used for printing error message */ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; uint64_t unsupported_flags; /* check options */ while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwH")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() == 0 || geteuid() == 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; verbose = B_TRUE; break; case 'g': guid = B_TRUE; break; case 'L': follow_links = B_TRUE; break; case 'P': full_name = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': verbose = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'l': latency = B_TRUE; break; case 'q': queues = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'w': l_histo = B_TRUE; break; case 'r': rq_histo = B_TRUE; break; case 'y': omit_since_boot = B_TRUE; break; case 'h': usage(B_FALSE); break; case '?': if (optopt == 'c') { print_zpool_script_list("iostat"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; cb.cb_literal = parsable; cb.cb_scripted = scripted; if (guid) cb.cb_name_flags |= VDEV_NAME_GUID; if (follow_links) cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; if (full_name) cb.cb_name_flags |= VDEV_NAME_PATH; cb.cb_iteration = 0;
cb.cb_namewidth = 0; cb.cb_verbose = verbose; /* Get our interval and count values (if any) */ if (guid) { get_interval_count_filter_guids(&argc, argv, &interval, &count, &cb); } else { get_interval_count(&argc, argv, &interval, &count); } if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) { /* All the args are vdevs */ cb.cb_vdev_names = argv; cb.cb_vdev_names_count = argc; argc = 0; /* No pools to process */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) { /* ...and the rest are vdev names */ cb.cb_vdev_names = argv + 1; cb.cb_vdev_names_count = argc - 1; argc = 1; /* One pool to process */ } else { fprintf(stderr, gettext("Expected either a list of ")); fprintf(stderr, gettext("pools, or a list of vdevs in")); fprintf(stderr, " \"%s\", ", argv[0]); fprintf(stderr, gettext("but got:\n")); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { /* * The args don't make sense. The first arg isn't a pool name, * nor are all the args vdevs. */ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); fprintf(stderr, "\n"); return (1); } if (cb.cb_vdev_names_count != 0) { /* * If user specified vdevs, it implies verbose. */ cb.cb_verbose = B_TRUE; } /* * Construct the list of all interesting pools. */ ret = 0; if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { pool_list_free(list); return (1); } if (pool_list_count(list) == 0 && interval == 0) { pool_list_free(list); (void) fprintf(stderr, gettext("no pools available\n")); return (1); } if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) { pool_list_free(list); (void) fprintf(stderr, gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n")); usage(B_FALSE); return (1); } if (l_histo && rq_histo) { pool_list_free(list); (void) fprintf(stderr, gettext("Only one of [-r|-w] can be passed at a time\n")); usage(B_FALSE); return (1); } /* * Enter the main iostat loop. */ cb.cb_list = list; if (l_histo) { /* * Histogram tables look out of place when you try to display * them with the other stats, so make a rule that you can only * print histograms by themselves. */ cb.cb_flags = IOS_L_HISTO_M; } else if (rq_histo) { cb.cb_flags = IOS_RQ_HISTO_M; } else { cb.cb_flags = IOS_DEFAULT_M; if (latency) cb.cb_flags |= IOS_LATENCY_M; if (queues) cb.cb_flags |= IOS_QUEUES_M; } /* * See if the module supports all the stats we want to display. */ unsupported_flags = cb.cb_flags & ~get_stat_flags(list); if (unsupported_flags) { uint64_t f; int idx; fprintf(stderr, gettext("The loaded zfs module doesn't support:")); /* for each bit set in unsupported_flags */ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; fprintf(stderr, " -%c", flag_to_arg[idx]); } fprintf(stderr, ". Try running a newer module.\n"); pool_list_free(list); return (1); } for (;;) { if ((npools = pool_list_count(list)) == 0) (void) fprintf(stderr, gettext("no pools available\n")); else { /* * If this is the first iteration and -y was supplied * we skip any printing. */ boolean_t skip = (omit_since_boot && cb.cb_iteration == 0); /* * Refresh all statistics. This is done as an * explicit step before calculating the maximum name * width, so that any configuration changes are * properly accounted for.
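* (e.g. a vdev added between intervals can change the longest * name and thus the column layout).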
*/ (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL && cb.cb_verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) { cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, g_zfs, cb.cb_vdev_names, cb.cb_vdev_names_count, cb.cb_name_flags); } else { cb.vcdl = NULL; } /* * If it's the first time and we're not skipping it, * or either skip or verbose mode, print the header. * * The histogram code explicitly prints its header on * every vdev, so skip this for histograms. */ if (((++cb.cb_iteration == 1 && !skip) || (skip != verbose)) && (!(cb.cb_flags & IOS_ANYHISTO_M)) && !cb.cb_scripted) print_iostat_header(&cb); if (skip) { (void) fsleep(interval); continue; } pool_list_iter(list, B_FALSE, print_iostat, &cb); /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), * then print a separator. * * In addition, if we're printing specific vdevs then * we also want an ending separator. */ if (((npools > 1 && !verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) || (!(cb.cb_flags & IOS_ANYHISTO_M) && cb.cb_vdev_names_count)) && !cb.cb_scripted) { print_iostat_separator(&cb); if (cb.vcdl != NULL) print_cmd_columns(cb.vcdl, 1); printf("\n"); } if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); } /* * Flush the output so that redirection to a file isn't buffered * indefinitely. */ (void) fflush(stdout); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } pool_list_free(list); return (ret); } typedef struct list_cbdata { boolean_t cb_verbose; int cb_name_flags; int cb_namewidth; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_literal; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) (void) printf(" "); else first = B_FALSE; right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { header = zpool_prop_column_name(pl->pl_prop); right_justify = zpool_prop_align_right(pl->pl_prop); } else { int i; for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", (int)width, header); else (void) printf("%-*s", (int)width, header); } (void) printf("\n"); } /* * Given a pool and a list of properties, print out all the properties according * to the described layout. Used by zpool_do_list(). */ static void print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; char *propstr; boolean_t right_justify; size_t width; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. 
*/ width = cb->cb_namewidth; } if (!first) { if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { if (zpool_get_prop(zhp, pl->pl_prop, property, sizeof (property), NULL, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); } else if ((zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop)) && zpool_prop_get_feature(zhp, pl->pl_user_prop, property, sizeof (property)) == 0) { propstr = property; } else { propstr = "-"; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", (int)width, propstr); else (void) printf("%-*s", (int)width, propstr); } (void) printf("\n"); } static void print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format) { char propval[64]; boolean_t fixed; size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else zfs_nicenum_format(value, propval, sizeof (propval), format); break; case ZPOOL_PROP_FRAGMENTATION: if (value == ZFS_FRAG_INVALID) { (void) strlcpy(propval, "-", sizeof (propval)); } else if (format == ZFS_NICENUM_RAW) { (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value); } else { (void) snprintf(propval, sizeof (propval), "%llu%%", (unsigned long long)value); } break; case ZPOOL_PROP_CAPACITY: /* capacity value is in parts-per-10,000 (aka permyriad) */ if (format == ZFS_NICENUM_RAW) (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value / 100); else (void) snprintf(propval, sizeof (propval), value < 1000 ? "%1.2f%%" : value < 10000 ? "%2.1f%%" : "%3.0f%%", value / 100.0); break; default: zfs_nicenum_format(value, propval, sizeof (propval), format); } if (!valid) (void) strlcpy(propval, "-", sizeof (propval)); if (scripted) (void) printf("\t%s", propval); else (void) printf(" %*s", (int)width, propval); } /* * print static default line per vdev * not compatible with '-o' option */ void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth) { nvlist_t **child; vdev_stat_t *vs; uint_t c, children; char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; char *dashes = "%-*s - - - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (name != NULL) { boolean_t toplevel = (vs->vs_space != 0); uint64_t cap; enum zfs_nicenum_format format; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return; if (scripted) (void) printf("\t%s", name); else if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); /* * Print the properties for the individual vdevs. Some * properties are only applicable to toplevel vdevs. The * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. 
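* * As a worked example (illustrative numbers): capacity is computed below in parts-per-10,000 (permyriad), so vs_alloc = 456G and vs_space = 1T yield cap = 456 * 10000 / 1024 = 4453, which print_one_column() formats as "44.5%".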
*/ print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, scripted, toplevel, format); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, scripted, toplevel, format); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, scripted, toplevel, format); print_one_column(ZPOOL_PROP_CHECKPOINT, vs->vs_checkpoint_space, scripted, toplevel, format); print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted, B_TRUE, format); print_one_column(ZPOOL_PROP_FRAGMENTATION, vs->vs_fragmentation, scripted, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), format); cap = (vs->vs_space == 0) ? 0 : (vs->vs_alloc * 10000 / vs->vs_space); print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel, format); (void) printf("\n"); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; /* list the normal vdevs first */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) continue; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } /* list the classes: 'logs', 'dedup', and 'special' */ for (uint_t n = 0; n < 3; n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { char *bias = NULL; char *type = NULL; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, class_name[n]); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "spare"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } } } /* * Generic callback function to list a pool. */ int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; nvlist_t *config; nvlist_t *nvroot; config = zpool_get_config(zhp, NULL); if (cbp->cb_verbose) { config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); } if (cbp->cb_verbose) cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags); print_pool(zhp, cbp); if (cbp->cb_verbose) print_list_stats(zhp, NULL, nvroot, cbp, 0); return (0); } /* * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -g Display guid for individual vdev name. * -H Scripted mode. 
Don't display headers, and separate properties * by a single tab. * -L Follow links when resolving vdev path name. * -o List of properties to display. Defaults to * "name,size,allocated,free,expandsize,fragmentation,capacity," * "dedupratio,health,altroot" * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. */ int zpool_do_list(int argc, char **argv) { int c; int ret = 0; list_cbdata_t cb = { 0 }; static char default_props[] = "name,size,allocated,free,checkpoint,expandsize,fragmentation," "capacity,dedupratio,health,altroot"; char *props = default_props; float interval = 0; unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { switch (c) { case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'o': props = optarg; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 'p': cb.cb_literal = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': cb.cb_verbose = B_TRUE; cb.cb_namewidth = 8; /* 8 until precalc is avail */ break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) break; if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (!cb.cb_scripted && (first || cb.cb_verbose)) { print_header(&cb); first = B_FALSE; } ret = pool_list_iter(list, B_TRUE, list_callback, &cb); if (interval == 0) break; if (count != 0 && --count == 0) break; pool_list_free(list); (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { (void) printf(gettext("no pools available\n")); ret = 0; } pool_list_free(list); zprop_free_list(cb.cb_proplist); return (ret); } static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; int c; nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; nvlist_t *props = NULL; char *propval; int ret; /* check options */ while ((c = getopt(argc, argv, "fo:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if (argc < 2) { (void) fprintf(stderr, gettext("missing <device> specification\n")); usage(B_FALSE); } old_disk = argv[1]; if (argc < 3) { if (!replacing) { (void) fprintf(stderr, gettext("missing
specification\n")); usage(B_FALSE); } new_disk = old_disk; argc -= 1; argv += 1; } else { new_disk = argv[2]; argc -= 2; argv += 2; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { nvlist_free(props); return (1); } if (zpool_get_config(zhp, NULL) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); nvlist_free(props); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp); nvlist_free(props); return (1); } ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing); nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool replace [-f] * * -f Force attach, even if appears to be in use. * * Replace with . */ /* ARGSUSED */ int zpool_do_replace(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); } /* * zpool attach [-f] [-o property=value] * * -f Force attach, even if appears to be in use. * -o Set property=value. * * Attach to the mirror containing . If is not * part of a mirror, then will be transformed into a mirror of * and . In either case, will begin life * with a DTL of [0, now], and will immediately begin to resilver itself. */ int zpool_do_attach(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); } /* * zpool detach [-f] * * -f Force detach of , even if DTLs argue against it * (not supported yet) * * Detach a device from a mirror. The operation will be refused if * is the last device in the mirror, or if the DTLs indicate that this device * has the only valid copy of some data. */ /* ARGSUSED */ int zpool_do_detach(int argc, char **argv) { int c; char *poolname, *path; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } poolname = argv[0]; path = argv[1]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_vdev_detach(zhp, path); zpool_close(zhp); return (ret); } /* * zpool split [-gLnP] [-o prop=val] ... * [-o mntopt] ... * [-R altroot] [ ...] * * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not split the pool, but display the resulting layout if * it were to be split. * -o Set property=value, or set mount options. * -P Display full path for vdev name. * -R Mount the split-off pool under an alternate root. * -l Load encryption keys while importing. * * Splits the named pool and gives it the new pool name. Devices to be split * off may be listed, provided that no more than one device is specified * per top-level vdev mirror. 
The newly split pool is left in an exported * state unless -R is specified. * * Restrictions: the top-level of the pool must only be made up of * mirrors; all devices in the pool must be healthy; no device may be * undergoing a resilvering operation. */ int zpool_do_split(int argc, char **argv) { char *srcpool, *newpool, *propval; char *mntopts = NULL; splitflags_t flags; int c, ret = 0; boolean_t loadkeys = B_FALSE; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; flags.dryrun = B_FALSE; flags.import = B_FALSE; flags.name_flags = 0; /* check options */ while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) { switch (c) { case 'g': flags.name_flags |= VDEV_NAME_GUID; break; case 'L': flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'R': flags.import = B_TRUE; if (add_prop_list( zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } break; case 'l': loadkeys = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } } else { mntopts = optarg; } break; case 'P': flags.name_flags |= VDEV_NAME_PATH; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); break; } } if (!flags.import && mntopts != NULL) { (void) fprintf(stderr, gettext("setting mntopts is only " "valid when importing the pool\n")); usage(B_FALSE); } if (!flags.import && loadkeys) { (void) fprintf(stderr, gettext("loading keys is only " "valid when importing the pool\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("Missing new pool name\n")); usage(B_FALSE); } srcpool = argv[0]; newpool = argv[1]; argc -= 2; argv += 2; if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) { nvlist_free(props); return (1); } config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); if (config == NULL) { ret = 1; } else { if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); } } zpool_close(zhp); if (ret != 0 || flags.dryrun || !flags.import) { nvlist_free(config); nvlist_free(props); return (ret); } /* * The split was successful. Now we need to open the new * pool and import it. */ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) { nvlist_free(config); nvlist_free(props); return (1); } if (loadkeys) { ret = zfs_crypto_attempt_load_keys(g_zfs, newpool); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && zpool_enable_datasets(zhp, mntopts, 0) != 0) { ret = 1; (void) fprintf(stderr, gettext("Split was successful, but " "the datasets could not all be mounted\n")); (void) fprintf(stderr, gettext("Try doing '%s' with a " "different altroot\n"), "zpool import"); } zpool_close(zhp); nvlist_free(config); nvlist_free(props); return (ret); } /* * zpool online <pool> <device> ...
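* * -e Expand the device to use all available space; this sets the * ZFS_ONLINE_EXPAND flag handled below.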
*/ int zpool_do_online(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; vdev_state_t newstate; int flags = 0; /* check options */ while ((c = getopt(argc, argv, "et")) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; case 't': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), argv[i]); if (newstate == VDEV_STATE_FAULTED) (void) printf(gettext("use 'zpool " "clear' to restore a faulted " "device\n")); else (void) printf(gettext("use 'zpool " "replace' to replace devices " "that are no longer present\n")); } } else { ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool offline [-ft] <pool> <device> ... * * -f Force the device into a faulted state. * * -t Only take the device off-line temporarily. The offline/faulted * state will not be persistent across reboots. */ /* ARGSUSED */ int zpool_do_offline(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; boolean_t istmp = B_FALSE; boolean_t fault = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "ft")) != -1) { switch (c) { case 'f': fault = B_TRUE; break; case 't': istmp = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (fault) { uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); vdev_aux_t aux; if (istmp == B_FALSE) { /* Force the fault to persist across imports */ aux = VDEV_AUX_EXTERNAL_PERSIST; } else { aux = VDEV_AUX_EXTERNAL; } if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0) ret = 1; } else { if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool clear <pool> [device] * * Clear all errors associated with a pool or a particular device.
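* * For example (illustrative): "zpool clear tank" clears the error counters of every device in the pool, "zpool clear tank sda" clears a single device, and "zpool clear -F -n tank" only checks whether a rewind would succeed (ZPOOL_TRY_REWIND) without performing it.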
*/ int zpool_do_clear(int argc, char **argv) { int c; int ret = 0; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; /* check options */ while ((c = getopt(argc, argv, "FnX")) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'X': xtreme_rewind = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In future, further rewind policy choices can be passed along here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) { return (1); } pool = argv[0]; device = argc == 2 ? argv[1] : NULL; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { nvlist_free(policy); return (1); } if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); nvlist_free(policy); return (ret); } /* * zpool reguid <pool> */ int zpool_do_reguid(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; int ret = 0; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_reguid(zhp); zpool_close(zhp); return (ret); } /* * zpool reopen <pool> * * Reopen the pool so that the kernel can update the sizes of all vdevs.
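* * -n Do not restart an in-progress scrub after the reopen; this is passed * down to zpool_reopen_one() as the scrub_restart boolean.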
*/ int zpool_do_reopen(int argc, char **argv) { int c; int ret = 0; boolean_t scrub_restart = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "n")) != -1) { switch (c) { case 'n': scrub_restart = B_FALSE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_reopen_one on all pools */ ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_reopen_one, &scrub_restart); return (ret); } typedef struct scrub_cbdata { int cb_type; int cb_argc; char **cb_argv; pool_scrub_cmd_t cb_scrub_cmd; } scrub_cbdata_t; static boolean_t zpool_has_checkpoint(zpool_handle_t *zhp) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); if (config != NULL) { pool_checkpoint_stat_t *pcs = NULL; uint_t c; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs == NULL || pcs->pcs_state == CS_NONE) return (B_FALSE); assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS || pcs->pcs_state == CS_CHECKPOINT_DISCARDING); return (B_TRUE); } return (B_FALSE); } int scrub_callback(zpool_handle_t *zhp, void *data) { scrub_cbdata_t *cb = data; int err; /* * Ignore faulted pools. */ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { (void) fprintf(stderr, gettext("cannot scan '%s': pool is " "currently unavailable\n"), zpool_get_name(zhp)); return (1); } err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); if (err == 0 && zpool_has_checkpoint(zhp) && cb->cb_type == POOL_SCAN_SCRUB) { (void) printf(gettext("warning: will not scrub state that " "belongs to the checkpoint of pool '%s'\n"), zpool_get_name(zhp)); } return (err != 0); } /* * zpool scrub [-s | -p] <pool> ... * * -s Stop. Stops any in-progress scrub. * -p Pause. Pauses an in-progress scrub. */ int zpool_do_scrub(int argc, char **argv) { int c; scrub_cbdata_t cb; cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "sp")) != -1) { switch (c) { case 's': cb.cb_type = POOL_SCAN_NONE; break; case 'p': cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (cb.cb_type == POOL_SCAN_NONE && cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) { (void) fprintf(stderr, gettext("invalid option combination: " "-s and -p are mutually exclusive\n")); usage(B_FALSE); } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); } /* * zpool resilver <pool> ... * * Restarts any in-progress resilver. */ int zpool_do_resilver(int argc, char **argv) { int c; scrub_cbdata_t cb; cb.cb_type = POOL_SCAN_RESILVER; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; cb.cb_argc = argc; cb.cb_argv = argv; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); } /* * Print out detailed scrub status.
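* * Example output for an in-progress scrub (illustrative numbers, built from * the format strings below): * * scan: scrub in progress since Sat Nov 10 10:11:12 2018 * 1.68T scanned at 1.40G/s, 1.01T issued at 1.20G/s, 3.50T total * 102M repaired, 28.86% done, 0 days 00:35:24 to go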
*/ static void print_scan_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t total_secs_left; uint64_t elapsed, secs_left, mins_left, hours_left, days_left; uint64_t pass_scanned, scanned, pass_issued, issued, total; uint64_t scan_rate, issue_rate; double fraction_done; char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; char srate_buf[7], irate_buf[7]; (void) printf(gettext(" scan: ")); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } start = ps->pss_start_time; end = ps->pss_end_time; pause = ps->pss_pass_scrub_pause; zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || ps->pss_func == POOL_SCAN_RESILVER); /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { total_secs_left = end - start; days_left = total_secs_left / 60 / 60 / 24; hours_left = (total_secs_left / 60 / 60) % 24; mins_left = (total_secs_left / 60) % 60; secs_left = (total_secs_left % 60); if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub repaired %s " "in %llu days %02llu:%02llu:%02llu " "with %llu errors on %s"), processed_buf, (u_longlong_t)days_left, (u_longlong_t)hours_left, (u_longlong_t)mins_left, (u_longlong_t)secs_left, (u_longlong_t)ps->pss_errors, ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilvered %s " "in %llu days %02llu:%02llu:%02llu " "with %llu errors on %s"), processed_buf, (u_longlong_t)days_left, (u_longlong_t)hours_left, (u_longlong_t)mins_left, (u_longlong_t)secs_left, (u_longlong_t)ps->pss_errors, ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub canceled on %s"), ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); } return; } assert(ps->pss_state == DSS_SCANNING); /* Scan is in progress. Resilvers can't be paused. */ if (ps->pss_func == POOL_SCAN_SCRUB) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else { (void) printf(gettext("scrub paused since %s"), ctime(&pause)); (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } scanned = ps->pss_examined; pass_scanned = ps->pss_pass_exam; issued = ps->pss_issued; pass_issued = ps->pss_pass_issued; total = ps->pss_to_examine; /* we are only done with a block once we have issued the IO for it */ fraction_done = (double)issued / total; /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; elapsed -= ps->pss_pass_scrub_spent_paused; elapsed = (elapsed != 0) ? elapsed : 1; scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; total_secs_left = (issue_rate != 0) ? 
((total - issued) / issue_rate) : UINT64_MAX; days_left = total_secs_left / 60 / 60 / 24; hours_left = (total_secs_left / 60 / 60) % 24; mins_left = (total_secs_left / 60) % 60; secs_left = (total_secs_left % 60); /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); zfs_nicebytes(total, total_buf, sizeof (total_buf)); zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); /* do not print estimated time if we have a paused scrub */ if (pause == 0) { (void) printf(gettext("\t%s scanned at %s/s, " "%s issued at %s/s, %s total\n"), scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); } else { (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), scanned_buf, issued_buf, total_buf); } if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); } else if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } if (pause == 0) { if (issue_rate >= 10 * 1024 * 1024) { (void) printf(gettext(", %llu days " "%02llu:%02llu:%02llu to go\n"), (u_longlong_t)days_left, (u_longlong_t)hours_left, (u_longlong_t)mins_left, (u_longlong_t)secs_left); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } /* * As we don't scrub checkpointed blocks, we want to warn the * user that we skipped scanning some blocks if a checkpoint exists * or existed at any time during the scan. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) { if (ps == NULL || pcs == NULL) return; if (pcs->pcs_state == CS_NONE || pcs->pcs_state == CS_CHECKPOINT_DISCARDING) return; assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); if (ps->pss_state == DSS_NONE) return; if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && ps->pss_end_time < pcs->pcs_start_time) return; if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { (void) printf(gettext(" scan warning: skipped blocks " "that are only referenced by the checkpoint.\n")); } else { assert(ps->pss_state == DSS_SCANNING); (void) printf(gettext(" scan warning: skipping blocks " "that are only referenced by the checkpoint.\n")); } } /* * Print out detailed removal status. */ static void print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) { char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; time_t start, end; nvlist_t *config, *nvroot; nvlist_t **child; uint_t children; char *vdev_name; if (prs == NULL || prs->prs_state == DSS_NONE) return; /* * Determine name of vdev. */ config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(prs->prs_removing_vdev < children); vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); (void) printf(gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); /* * Removal is finished or canceled. 
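* * Example for the finished case (illustrative): * * remove: Removal of vdev 2 copied 37.5G in 0h3m, completed on Sat Nov 10 10:11:12 2018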
*/ if (prs->prs_state == DSS_FINISHED) { uint64_t minutes_taken = (end - start) / 60; (void) printf(gettext("Removal of vdev %llu copied %s " "in %lluh%um, completed on %s"), (longlong_t)prs->prs_removing_vdev, copied_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), ctime((time_t *)&end)); } else if (prs->prs_state == DSS_CANCELED) { (void) printf(gettext("Removal of %s canceled on %s"), vdev_name, ctime(&end)); } else { uint64_t copied, total, elapsed, mins_left, hours_left; double fraction_done; uint_t rate; assert(prs->prs_state == DSS_SCANNING); /* * Removal is in progress. */ (void) printf(gettext( "Evacuation of %s in progress since %s"), vdev_name, ctime(&start)); copied = prs->prs_copied > 0 ? prs->prs_copied : 1; total = prs->prs_to_copy; fraction_done = (double)copied / total; /* elapsed time for this pass */ elapsed = time(NULL) - prs->prs_start_time; elapsed = elapsed > 0 ? elapsed : 1; rate = copied / elapsed; rate = rate > 0 ? rate : 1; mins_left = ((total - copied) / rate) / 60; hours_left = mins_left / 60; zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than * 30 days */ (void) printf(gettext(" %s copied out of %s at %s/s, " "%.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); } else { (void) printf(gettext( ", (copy is slow, no estimated time)\n")); } } if (prs->prs_mapping_memory > 0) { char mem_buf[7]; zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); (void) printf(gettext(" %s memory used for " "removed device mappings\n"), mem_buf); } } static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { time_t start; char space_buf[7]; if (pcs == NULL || pcs->pcs_state == CS_NONE) return; (void) printf(gettext("checkpoint: ")); start = pcs->pcs_start_time; zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { char *date = ctime(&start); /* * ctime() adds a newline at the end of the generated * string, thus the weird format specifier and the * strlen() call used to chop it off from the output. 
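* * For example (illustrative), ctime() returns "Thu Nov  8 13:09:15 2018\n"; printing it with "%.*s" and a precision of strlen(date) - 1 drops the trailing newline without modifying the buffer.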
*/ (void) printf(gettext("created %.*s, consumes %s\n"), (int)(strlen(date) - 1), date, space_buf); return; } assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); (void) printf(gettext("discarding, %s remaining.\n"), space_buf); } static void print_error_log(zpool_handle_t *zhp) { nvlist_t *nverrlist = NULL; nvpair_t *elem; char *pathname; size_t len = MAXPATHLEN * 2; if (zpool_get_errlog(zhp, &nverrlist) != 0) return; (void) printf("errors: Permanent errors have been " "detected in the following files:\n\n"); pathname = safe_malloc(len); elem = NULL; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nvlist_t *nv; uint64_t dsobj, obj; verify(nvpair_value_nvlist(elem, &nv) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, &dsobj) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); zpool_obj_to_path(zhp, dsobj, obj, pathname, len); (void) printf("%7s %s\n", "", pathname); } free(pathname); nvlist_free(nverrlist); } static void print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, uint_t nspares) { uint_t i; char *name; if (nspares == 0) return; (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); print_status_config(zhp, cb, name, spares[i], 2, B_TRUE); free(name); } } static void print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, uint_t nl2cache) { uint_t i; char *name; if (nl2cache == 0) return; (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE); free(name); } } static void print_dedup_stats(nvlist_t *config) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; char dspace[6], mspace[6]; /* * If the pool was faulted then we may not have been able to * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); (void) printf(gettext(" dedup: ")); if (ddo->ddo_count == 0) { (void) printf(gettext("no DDT entries\n")); return; } zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace)); zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace)); (void) printf("DDT entries %llu, size %s on disk, %s in core\n", (u_longlong_t)ddo->ddo_count, dspace, mspace); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t **)&ddh, &c) == 0); zpool_dump_ddt(dds, ddh); } /* * Display a summary of pool status. Displays a summary such as: * * pool: tank * status: DEGRADED * reason: One or more devices ... * see: http://zfsonlinux.org/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK * c2t0d0 UNAVAIL * * When given the '-v' option, we print out the complete config. If the '-e' * option is specified, then we print out error rate information as well. */ int status_callback(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; nvlist_t *config, *nvroot; char *msgid; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t c; vdev_stat_t *vs; config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid, &errata); cbp->cb_count++; /* * If we were given 'zpool status -x', only report those pools with * problems. 
*/ if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); if (cbp->cb_first) cbp->cb_first = B_FALSE; } return (0); } if (cbp->cb_first) cbp->cb_first = B_FALSE; else (void) printf("\n"); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_get_state_str(zhp); (void) printf(gettext(" pool: %s\n"), zpool_get_name(zhp)); (void) printf(gettext(" state: %s\n"), health); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: (void) printf(gettext("status: One or more devices could not " "be opened. Sufficient replicas exist for\n\tthe pool to " "continue functioning in a degraded state.\n")); (void) printf(gettext("action: Attach the missing device and " "online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_MISSING_DEV_NR: (void) printf(gettext("status: One or more devices could not " "be opened. There are insufficient\n\treplicas for the " "pool to continue functioning.\n")); (void) printf(gettext("action: Attach the missing device and " "online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: (void) printf(gettext("status: One or more devices could not " "be used because the label is missing or\n\tinvalid. " "Sufficient replicas exist for the pool to continue\n\t" "functioning in a degraded state.\n")); (void) printf(gettext("action: Replace the device using " "'zpool replace'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_NR: (void) printf(gettext("status: One or more devices could not " "be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_FAILING_DEV: (void) printf(gettext("status: One or more devices has " "experienced an unrecoverable error. An\n\tattempt was " "made to correct the error. Applications are " "unaffected.\n")); (void) printf(gettext("action: Determine if the device needs " "to be replaced, and clear the errors\n\tusing " "'zpool clear' or replace the device with 'zpool " "replace'.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: (void) printf(gettext("status: One or more devices has " "been taken offline by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); (void) printf(gettext("action: Online the device using " "'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_REMOVED_DEV: (void) printf(gettext("status: One or more devices has " "been removed by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); (void) printf(gettext("action: Online the device using " "'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_RESILVERING: (void) printf(gettext("status: One or more devices is " "currently being resilvered. The pool will\n\tcontinue " "to function, possibly in a degraded state.\n")); (void) printf(gettext("action: Wait for the resilver to " "complete.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf(gettext("status: One or more devices has " "experienced an error resulting in data\n\tcorruption. 
" "Applications may be affected.\n")); (void) printf(gettext("action: Restore the file in question " "if possible. Otherwise restore the\n\tentire pool from " "backup.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: (void) printf(gettext("status: The pool metadata is corrupted " "and the pool cannot be opened.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: (void) printf(gettext("status: The pool is formatted using a " "legacy on-disk format. The pool can\n\tstill be used, " "but some features are unavailable.\n")); (void) printf(gettext("action: Upgrade the pool using 'zpool " "upgrade'. Once this is done, the\n\tpool will no longer " "be accessible on software that does not support\n\t" "feature flags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext("status: The pool has been upgraded to a " "newer, incompatible on-disk version.\n\tThe pool cannot " "be accessed on this system.\n")); (void) printf(gettext("action: Access the pool from a system " "running more recent software, or\n\trestore the pool from " "backup.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: (void) printf(gettext("status: Some supported features are not " "enabled on the pool. The pool can\n\tstill be used, but " "some features are unavailable.\n")); (void) printf(gettext("action: Enable all features using " "'zpool upgrade'. Once this is done,\n\tthe pool may no " "longer be accessible by software that does not support\n\t" "the features. See zpool-features(5) for details.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("status: The pool cannot be accessed on " "this system because it uses the\n\tfollowing feature(s) " "not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); (void) printf(gettext("action: Access the pool from a system " "that supports the required feature(s),\n\tor restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: (void) printf(gettext("status: The pool can only be accessed " "in read-only mode on this system. It\n\tcannot be " "accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); (void) printf(gettext("action: The pool cannot be accessed in " "read-write mode. Import the pool with\n" "\t\"-o readonly=on\", access the pool from a system that " "supports the\n\trequired feature(s), or restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: (void) printf(gettext("status: One or more devices are " "faulted in response to persistent errors.\n\tSufficient " "replicas exist for the pool to continue functioning " "in a\n\tdegraded state.\n")); (void) printf(gettext("action: Replace the faulted device, " "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_NR: (void) printf(gettext("status: One or more devices are " "faulted in response to persistent errors. There are " "insufficient replicas for the pool to\n\tcontinue " "functioning.\n")); (void) printf(gettext("action: Destroy and re-create the pool " "from a backup source. 
Manually marking the device\n" "\trepaired using 'zpool clear' may allow some data " "to be recovered.\n")); break; case ZPOOL_STATUS_IO_FAILURE_MMP: (void) printf(gettext("status: The pool is suspended because " "multihost writes failed or were delayed;\n\tanother " "system could import the pool undetected.\n")); (void) printf(gettext("action: Make sure the pool's devices " "are connected, then reboot your system and\n\timport the " "pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: case ZPOOL_STATUS_IO_FAILURE_CONTINUE: (void) printf(gettext("status: One or more devices are " "faulted in response to IO failures.\n")); (void) printf(gettext("action: Make sure the affected devices " "are connected, then run 'zpool clear'.\n")); break; case ZPOOL_STATUS_BAD_LOG: (void) printf(gettext("status: An intent log record " "could not be read.\n" "\tWaiting for administrator intervention to fix the " "faulted pool.\n")); (void) printf(gettext("action: Either restore the affected " "device(s) and run 'zpool online',\n" "\tor ignore the intent log records by running " "'zpool clear'.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: (void) printf(gettext("status: Mismatch between pool hostid " "and system hostid on imported pool.\n\tThis pool was " "previously imported into a system with a different " "hostid,\n\tand then was verbatim imported into this " "system.\n")); (void) printf(gettext("action: Export this pool on all systems " "on which it is imported.\n" "\tThen import it to correct the mismatch.\n")); break; case ZPOOL_STATUS_ERRATA: (void) printf(gettext("status: Errata #%d detected.\n"), errata); switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: (void) printf(gettext("action: To correct the issue " "run 'zpool scrub'.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext("\tExisting encrypted datasets " "contain an on-disk incompatibility\n\twhich " "needs to be corrected.\n")); (void) printf(gettext("action: To correct the issue " "backup existing encrypted datasets to new\n\t" "encrypted datasets and destroy the old ones. " "'zfs mount -o ro' can\n\tbe used to temporarily " "mount existing encrypted datasets readonly.\n")); break; default: /* * All errata which allow the pool to be imported * must contain an action message. */ assert(0); } break; default: /* * The remaining errors can't actually be generated, yet. 
*/ assert(reason == ZPOOL_STATUS_OK); } if (msgid != NULL) (void) printf(gettext(" see: http://zfsonlinux.org/msg/%s\n"), msgid); if (config != NULL) { uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; pool_removal_stat_t *prs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_scan_status(ps); print_checkpoint_scan_warning(ps, pcs); print_removal_status(zhp, prs); print_checkpoint_status(pcs); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); + if (cbp->cb_print_slow_ios) + (void) printf(" %5s", gettext("SLOW")); + if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); printf("\n"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, B_FALSE); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, cbp, l2cache, nl2cache); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) print_spares(zhp, cbp, spares, nspares); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { nvlist_t *nverrlist = NULL; /* * If the approximate error count is small, get a * precise count by fetching the entire log and * uniquifying the results. */ if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && zpool_get_errlog(zhp, &nverrlist) == 0) { nvpair_t *elem; elem = NULL; nerr = 0; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nerr++; } } nvlist_free(nverrlist); (void) printf("\n"); if (nerr == 0) (void) printf(gettext("errors: No known data " "errors\n")); else if (!cbp->cb_verbose) (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); else print_error_log(zhp); } if (cbp->cb_dedup_stats) print_dedup_stats(config); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); } return (0); } /* - * zpool status [-c [script1,script2,...]] [-gLPvx] [-T d|u] [pool] ... + * zpool status [-c [script1,script2,...]] [-gLpPsvx] [-T d|u] [pool] ... * [interval [count]] * * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. + * -p Display values in parsable (exact) format. * -P Display full path for vdev name. + * -s Display slow IOs column. * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) * -T Display a timestamp in date(1) or Unix format * * Describes the health status of all pools or some subset. 
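* * For example (illustrative), "zpool status -ps tank" prints exact (parsable) values and includes the SLOW column of slow IO counts enabled by the -s flag documented above.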
*/ int zpool_do_status(int argc, char **argv) { int c; int ret; float interval = 0; unsigned long count = 0; status_cbdata_t cb = { 0 }; char *cmd = NULL; /* check options */ - while ((c = getopt(argc, argv, "c:gLPvxDT:")) != -1) { + while ((c = getopt(argc, argv, "c:gLpPsvxDT:")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; + case 'p': + cb.cb_literal = B_TRUE; + break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; + case 's': + cb.cb_print_slow_ios = B_TRUE; + break; case 'v': cb.cb_verbose = B_TRUE; break; case 'x': cb.cb_explain = B_TRUE; break; case 'D': cb.cb_dedup_stats = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case '?': if (optopt == 'c') { print_zpool_script_list("status"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; cb.cb_first = B_TRUE; cb.cb_print_status = B_TRUE; for (;;) { if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb); if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) (void) printf(gettext("all pools are healthy\n")); if (ret != 0) return (ret); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } return (0); } typedef struct upgrade_cbdata { int cb_first; int cb_argc; uint64_t cb_version; char **cb_argv; } upgrade_cbdata_t; static int check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) { int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int *count = (int *)unsupp_fs; if (zfs_version > ZPL_VERSION) { (void) printf(gettext("%s (v%d) is not supported by this " "implementation of ZFS.\n"), zfs_get_name(zhp), zfs_version); (*count)++; } zfs_iter_filesystems(zhp, check_unsupp_fs, unsupp_fs); zfs_close(zhp); return (0); } static int upgrade_version(zpool_handle_t *zhp, uint64_t version) { int ret; nvlist_t *config; uint64_t oldversion; int unsupp_fs = 0; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &oldversion) == 0); assert(SPA_VERSION_IS_SUPPORTED(oldversion)); assert(oldversion < version); ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs); if (ret != 0) return (ret); if (unsupp_fs) { (void) fprintf(stderr, gettext("Upgrade not performed due " "to %d unsupported filesystems (max v%d).\n"), unsupp_fs, (int)ZPL_VERSION); return (1); } ret = zpool_upgrade(zhp, version); if (ret != 0) return (ret); if (version >= SPA_VERSION_FEATURES) { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to feature flags.\n"), 
zpool_get_name(zhp), (u_longlong_t)oldversion); } else { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to version %llu.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion, (u_longlong_t)version); } return (0); } static int upgrade_enable_all(zpool_handle_t *zhp, int *countp) { int i, ret, count; boolean_t firstff = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); count = 0; for (i = 0; i < SPA_FEATURES; i++) { const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; if (!nvlist_exists(enabled, fguid)) { char *propname; verify(-1 != asprintf(&propname, "feature@%s", fname)); ret = zpool_set_prop(zhp, propname, ZFS_FEATURE_ENABLED); if (ret != 0) { free(propname); return (ret); } count++; if (firstff) { (void) printf(gettext("Enabled the " "following features on '%s':\n"), zpool_get_name(zhp)); firstff = B_FALSE; } (void) printf(gettext(" %s\n"), fname); free(propname); } } if (countp != NULL) *countp = count; return (0); } static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; boolean_t printnl = B_FALSE; int ret; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < cbp->cb_version) { cbp->cb_first = B_FALSE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); printnl = B_TRUE; /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count > 0) { cbp->cb_first = B_FALSE; printnl = B_TRUE; } } if (printnl) { (void) printf(gettext("\n")); } return (0); } static int upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " "formatted with legacy version numbers and can\n" "be upgraded to use feature flags. After " "being upgraded, these pools\nwill no " "longer be accessible by software that does not " "support feature\nflags.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; } (void) printf("%2llu %s\n", (u_longlong_t)version, zpool_get_name(zhp)); } return (0); } static int upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if (version >= SPA_VERSION_FEATURES) { int i; boolean_t poolfirst = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; const char *fname = spa_feature_table[i].fi_uname; if (!nvlist_exists(enabled, fguid)) { if (cbp->cb_first) { (void) printf(gettext("\nSome " "supported features are not " "enabled on the following pools. 
" "Once a\nfeature is enabled the " "pool may become incompatible with " "software\nthat does not support " "the feature. See " "zpool-features(5) for " "details.\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" "---------\n")); cbp->cb_first = B_FALSE; } if (poolfirst) { (void) printf(gettext("%s\n"), zpool_get_name(zhp)); poolfirst = B_FALSE; } (void) printf(gettext(" %s\n"), fname); } /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } } return (0); } /* ARGSUSED */ static int upgrade_one(zpool_handle_t *zhp, void *data) { boolean_t printnl = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; if (strcmp("log", zpool_get_name(zhp)) == 0) { (void) fprintf(stderr, gettext("'log' is now a reserved word\n" "Pool 'log' must be renamed using export and import" " to upgrade.\n")); return (1); } cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using more current version '%llu'.\n\n"), zpool_get_name(zhp), (u_longlong_t)cur_version); return (0); } if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using version %llu.\n\n"), zpool_get_name(zhp), (u_longlong_t)cbp->cb_version); return (0); } if (cur_version != cbp->cb_version) { printnl = B_TRUE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count = 0; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count != 0) { printnl = B_TRUE; } else if (cur_version == SPA_VERSION) { (void) printf(gettext("Pool '%s' already has all " "supported features enabled.\n"), zpool_get_name(zhp)); } } if (printnl) { (void) printf(gettext("\n")); } return (0); } /* * zpool upgrade * zpool upgrade -v * zpool upgrade [-V version] <-a | pool ...> * * With no arguments, display downrev'd ZFS pool available for upgrade. * Individual pools can be upgraded by specifying the pool, and '-a' will * upgrade all pools. 
*/ int zpool_do_upgrade(int argc, char **argv) { int c; upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; boolean_t upgradeall = B_FALSE; char *end; /* check options */ while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); if (*end != '\0' || !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; } else if (!upgradeall && argc == 0) { (void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); usage(B_FALSE); } } (void) printf(gettext("This system supports ZFS pool feature " "flags.\n\n")); if (showversions) { int i; (void) printf(gettext("The following features are " "supported:\n\n")); (void) printf(gettext("FEAT DESCRIPTION\n")); (void) printf("----------------------------------------------" "---------------\n"); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; const char *ro = (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
" (read-only compatible)" : ""; (void) printf("%-37s%s\n", fi->fi_uname, ro); (void) printf(" %s\n", fi->fi_desc); } (void) printf("\n"); (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS version\n")); (void) printf(gettext(" 2 Ditto blocks " "(replicated metadata)\n")); (void) printf(gettext(" 3 Hot spares and double parity " "RAID-Z\n")); (void) printf(gettext(" 4 zpool history\n")); (void) printf(gettext(" 5 Compression using the gzip " "algorithm\n")); (void) printf(gettext(" 6 bootfs pool property\n")); (void) printf(gettext(" 7 Separate intent log devices\n")); (void) printf(gettext(" 8 Delegated administration\n")); (void) printf(gettext(" 9 refquota and refreservation " "properties\n")); (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); (void) printf(gettext(" 14 passthrough-x aclinherit\n")); (void) printf(gettext(" 15 user/group space accounting\n")); (void) printf(gettext(" 16 stmf property support\n")); (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); (void) printf(gettext(" 18 Snapshot user holds\n")); (void) printf(gettext(" 19 Log device removal\n")); (void) printf(gettext(" 20 Compression using zle " "(zero-length encoding)\n")); (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext(" 25 Improved scrub stats\n")); (void) printf(gettext(" 26 Improved snapshot deletion " "performance\n")); (void) printf(gettext(" 27 Improved snapshot creation " "performance\n")); (void) printf(gettext(" 28 Multiple vdev replacements\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); } else if (argc == 0 && upgradeall) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); if (ret == 0 && cb.cb_first) { if (cb.cb_version == SPA_VERSION) { (void) printf(gettext("All pools are already " "formatted using feature flags.\n\n")); (void) printf(gettext("Every feature flags " "pool already has all supported features " "enabled.\n")); } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), (u_longlong_t)cb.cb_version); } } } else if (argc == 0) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("All pools are formatted " "using feature flags.\n\n")); } else { (void) printf(gettext("\nUse 'zpool upgrade -v' " "for a list of available legacy versions.\n")); } cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("Every feature flags pool has " "all supported features enabled.\n")); } else { (void) printf(gettext("\n")); } } else { ret = for_each_pool(argc, argv, B_FALSE, NULL, upgrade_one, &cb); } return (ret); } typedef struct hist_cbdata { boolean_t first; boolean_t longfmt; boolean_t internal; } hist_cbdata_t; /* * Print out the command history for a specific pool. 
*/ static int get_history_one(zpool_handle_t *zhp, void *data) { nvlist_t *nvhis; nvlist_t **records; uint_t numrecords; int ret, i; hist_cbdata_t *cb = (hist_cbdata_t *)data; cb->first = B_FALSE; (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); if ((ret = zpool_get_history(zhp, &nvhis)) != 0) return (ret); verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { nvlist_t *rec = records[i]; char tbuf[30] = ""; if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { (void) printf("%s %s", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { int ievent = fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); if (!cb->internal) continue; if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); continue; } (void) printf("%s [internal %s txg:%lld] %s", tbuf, zfs_history_event_names[ievent], (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { if (!cb->internal) continue; (void) printf("%s [txg:%lld] %s", tbuf, (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(rec, ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64(rec, ZPOOL_HIST_DSID)); } (void) printf(" %s", fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { if (!cb->internal) continue; (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_ERRNO)); } } else { if (!cb->internal) continue; (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); } if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); struct passwd *pwd = getpwuid(who); (void) printf("user %d ", (int)who); if (pwd != NULL) (void) printf("(%s) ", pwd->pw_name); } if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { (void) printf("on %s", fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); } if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { (void) printf(":%s", fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } (void) printf("]"); (void) printf("\n"); } (void) printf("\n"); nvlist_free(nvhis); return (ret); } /* * zpool history * * Displays the history of commands that modified pools. 
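 *
 * For example (pool name hypothetical), 'zpool history -il tank' also
 * prints internal events and appends the [user ... on host:zone]
 * annotation to each record.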
*/ int zpool_do_history(int argc, char **argv) { hist_cbdata_t cbdata = { 0 }; int ret; int c; cbdata.first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': cbdata.longfmt = B_TRUE; break; case 'i': cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { (void) fprintf(stderr, gettext("no pools available\n")); return (0); } return (ret); } typedef struct ev_opts { int verbose; int scripted; int follow; int clear; char poolname[ZFS_MAX_DATASET_NAME_LEN]; } ev_opts_t; static void zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts) { char ctime_str[26], str[32], *ptr; int64_t *tv; uint_t n; verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); memset(str, ' ', 32); (void) ctime_r((const time_t *)&tv[0], ctime_str); (void) strncpy(str, ctime_str+4, 6); /* 'Jun 30' */ (void) strncpy(str+7, ctime_str+20, 4); /* '1993' */ (void) strncpy(str+12, ctime_str+11, 8); /* '21:49:08' */ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ if (opts->scripted) (void) printf(gettext("%s\t"), str); else (void) printf(gettext("%s "), str); verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); (void) printf(gettext("%s\n"), ptr); } static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { data_type_t type = nvpair_type(nvp); const char *name = nvpair_name(nvp); boolean_t b; uint8_t i8; uint16_t i16; uint32_t i32; uint64_t i64; char *str; nvlist_t *cnv; printf(gettext("%*s%s = "), depth, "", name); switch (type) { case DATA_TYPE_BOOLEAN: printf(gettext("%s"), "1"); break; case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(nvp, &b); printf(gettext("%s"), b ? "1" : "0"); break; case DATA_TYPE_BYTE: (void) nvpair_value_byte(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT8: (void) nvpair_value_int8(nvp, (void *)&i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_UINT8: (void) nvpair_value_uint8(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT16: (void) nvpair_value_int16(nvp, (void *)&i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_UINT16: (void) nvpair_value_uint16(nvp, &i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_INT32: (void) nvpair_value_int32(nvp, (void *)&i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_INT64: (void) nvpair_value_int64(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); /* * translate vdev state values to readable * strings to aid zpool events consumers */ if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } break; case DATA_TYPE_HRTIME: (void) nvpair_value_hrtime(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &str); printf(gettext("\"%s\""), str ?
str : "<NULL>"); break; case DATA_TYPE_NVLIST: printf(gettext("(embedded nvlist)\n")); (void) nvpair_value_nvlist(nvp, &cnv); zpool_do_events_nvprint(cnv, depth + 8); printf(gettext("%*s(end %s)"), depth, "", name); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **val; uint_t i, nelem; (void) nvpair_value_nvlist_array(nvp, &val, &nelem); printf(gettext("(%d embedded nvlists)\n"), nelem); for (i = 0; i < nelem; i++) { printf(gettext("%*s%s[%d] = %s\n"), depth, "", name, i, "(embedded nvlist)"); zpool_do_events_nvprint(val[i], depth + 8); printf(gettext("%*s(end %s[%i])\n"), depth, "", name, i); } printf(gettext("%*s(end %s)\n"), depth, "", name); } break; case DATA_TYPE_INT8_ARRAY: { int8_t *val; uint_t i, nelem; (void) nvpair_value_int8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val; uint_t i, nelem; (void) nvpair_value_uint8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val; uint_t i, nelem; (void) nvpair_value_int16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val; uint_t i, nelem; (void) nvpair_value_uint16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val; uint_t i, nelem; (void) nvpair_value_int32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val; uint_t i, nelem; (void) nvpair_value_uint32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val; uint_t i, nelem; (void) nvpair_value_int64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val; uint_t i, nelem; (void) nvpair_value_uint64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_STRING_ARRAY: { char **str; uint_t i, nelem; (void) nvpair_value_string_array(nvp, &str, &nelem); for (i = 0; i < nelem; i++) printf(gettext("\"%s\" "), str[i] ? str[i] : "<NULL>"); break; } case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_DOUBLE: case DATA_TYPE_DONTCARE: case DATA_TYPE_UNKNOWN: printf(gettext("<unknown>")); break; } printf(gettext("\n")); } } static int zpool_do_events_next(ev_opts_t *opts) { nvlist_t *nvl; int zevent_fd, ret, dropped; char *pool; zevent_fd = open(ZFS_DEV, O_RDWR); VERIFY(zevent_fd >= 0); if (!opts->scripted) (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS"); while (1) { ret = zpool_events_next(g_zfs, &nvl, &dropped, (opts->follow ?
ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); if (ret || nvl == NULL) break; if (dropped > 0) (void) printf(gettext("dropped %d events\n"), dropped); if (strlen(opts->poolname) > 0 && nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 && strcmp(opts->poolname, pool) != 0) continue; zpool_do_events_short(nvl, opts); if (opts->verbose) { zpool_do_events_nvprint(nvl, 8); printf(gettext("\n")); } (void) fflush(stdout); nvlist_free(nvl); } VERIFY(0 == close(zevent_fd)); return (ret); } static int zpool_do_events_clear(ev_opts_t *opts) { int count, ret; ret = zpool_events_clear(g_zfs, &count); if (!ret) (void) printf(gettext("cleared %d events\n"), count); return (ret); } /* * zpool events [-vHf [pool] | -c] * * Displays events generated by ZFS. */ int zpool_do_events(int argc, char **argv) { ev_opts_t opts = { 0 }; int ret; int c; /* check options */ while ((c = getopt(argc, argv, "vHfc")) != -1) { switch (c) { case 'v': opts.verbose = 1; break; case 'H': opts.scripted = 1; break; case 'f': opts.follow = 1; break; case 'c': opts.clear = 1; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } else if (argc == 1) { (void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname)); if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) { (void) fprintf(stderr, gettext("invalid pool name '%s'\n"), opts.poolname); usage(B_FALSE); } } if ((argc == 1 || opts.verbose || opts.scripted || opts.follow) && opts.clear) { (void) fprintf(stderr, gettext("invalid options combined with -c\n")); usage(B_FALSE); } if (opts.clear) ret = zpool_do_events_clear(&opts); else ret = zpool_do_events_next(&opts); return (ret); } static int get_callback(zpool_handle_t *zhp, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[MAXNAMELEN]; zprop_source_t srctype; zprop_list_t *pl; for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * Skip the special fake placeholder. This will also skip * over the name property when 'all' is specified. */ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL && (zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop))) { srctype = ZPROP_SRC_LOCAL; if (zpool_prop_get_feature(zhp, pl->pl_user_prop, value, sizeof (value)) == 0) { zprop_print_one_property(zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL); } } else { if (zpool_get_prop(zhp, pl->pl_prop, value, sizeof (value), &srctype, cbp->cb_literal) != 0) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, NULL); } } return (0); } /* * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of columns to display. Defaults to * "name,property,value,source". * -p Display values in parsable (exact) format. * * Get properties of pools in the system. Output space statistics * for each one as well as other attributes. */ int zpool_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; zprop_list_t fake_name = { 0 }; int ret; int c, i; char *value; cb.cb_first = B_TRUE; /* * Set up default columns and sources.
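 * The -o option below may override these; specifying "all" restores this
 * default name,property,value,source layout.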
*/ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; /* check options */ while ((c = getopt(argc, argv, ":Hpo:")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'o': bzero(&cb.cb_columns, sizeof (cb.cb_columns)); i = 0; while (*optarg != '\0') { static char *col_subopts[] = { "name", "property", "value", "source", "all", NULL }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } switch (getsubopt(&optarg, col_subopts, &value)) { case 0: cb.cb_columns[i++] = GET_COL_NAME; break; case 1: cb.cb_columns[i++] = GET_COL_PROPERTY; break; case 2: cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: cb.cb_columns[i++] = GET_COL_SOURCE; break; case 4: if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; i = ZFS_GET_NCOLS; break; default: (void) fprintf(stderr, gettext("invalid column name " "'%s'\n"), value); usage(B_FALSE); } } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); argc--; argv++; if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZPOOL_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } typedef struct set_cbdata { char *cb_propname; char *cb_value; boolean_t cb_any_successful; } set_cbdata_t; int set_callback(zpool_handle_t *zhp, void *data) { int error; set_cbdata_t *cb = (set_cbdata_t *)data; error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); if (!error) cb->cb_any_successful = B_TRUE; return (error); } int zpool_do_set(int argc, char **argv) { set_cbdata_t cb = { 0 }; int error; if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing property=value " "argument\n")); usage(B_FALSE); } if (argc < 3) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many pool names\n")); usage(B_FALSE); } cb.cb_propname = argv[1]; cb.cb_value = strchr(cb.cb_propname, '='); if (cb.cb_value == NULL) { (void) fprintf(stderr, gettext("missing value in " "property=value argument\n")); usage(B_FALSE); } *(cb.cb_value) = '\0'; cb.cb_value++; error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, set_callback, &cb); return (error); } static int find_command_idx(char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } int main(int argc, char **argv) { int ret = 0; int i = 0; char *cmdname; char **newargv; (void) setlocale(LC_ALL, 
""); (void) textdomain(TEXT_DOMAIN); srand(time(NULL)); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) usage(B_TRUE); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s", libzfs_error_init(errno)); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. */ if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=')) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { /* * 'freeze' is a vile debugging abomination, so we treat * it as such. */ zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc); if (ret != 0) { (void) fprintf(stderr, gettext("failed to freeze pool: %d\n"), errno); ret = 1; } log_history = 0; } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index 513d0cf5ec1c..9bfb123c76fe 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -1,120 +1,121 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_FM_FS_ZFS_H #define _SYS_FM_FS_ZFS_H #ifdef __cplusplus extern "C" { #endif #define ZFS_ERROR_CLASS "fs.zfs" #define FM_EREPORT_ZFS_CHECKSUM "checksum" #define FM_EREPORT_ZFS_AUTHENTICATION "authentication" #define FM_EREPORT_ZFS_IO "io" #define FM_EREPORT_ZFS_DATA "data" #define FM_EREPORT_ZFS_DELAY "delay" #define FM_EREPORT_ZFS_DEADMAN "deadman" #define FM_EREPORT_ZFS_POOL "zpool" #define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" #define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" #define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data" #define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas" #define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" #define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" #define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" #define FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT "vdev.bad_ashift" #define FM_EREPORT_ZFS_IO_FAILURE "io_failure" #define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure" #define FM_EREPORT_ZFS_LOG_REPLAY "log_replay" #define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write" #define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" #define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode" #define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid" #define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context" #define FM_EREPORT_PAYLOAD_ZFS_POOL_STATE "pool_state" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH "vdev_physpath" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE "vdev_laststate" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS "vdev_complete_ts" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS "vdev_delta_ts" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS "vdev_spare_paths" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS "vdev_spare_guids" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL 
"cksum_actual" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" #define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" #define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap" #define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets" #define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" #define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" #define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" #define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" #define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" #define FM_EREPORT_FAILMODE_WAIT "wait" #define FM_EREPORT_FAILMODE_CONTINUE "continue" #define FM_EREPORT_FAILMODE_PANIC "panic" #define FM_RESOURCE_REMOVED "removed" #define FM_RESOURCE_AUTOREPLACE "autoreplace" #define FM_RESOURCE_STATECHANGE "statechange" #ifdef __cplusplus } #endif #endif /* _SYS_FM_FS_ZFS_H */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 4f0e13dfdc37..05b7685f509c 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1,1380 +1,1384 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H #include #include #ifdef __cplusplus extern "C" { #endif /* * Types and constants shared between userland and the kernel. */ /* * Each dataset can be one of the following types. These constants can be * combined into masks that can be passed to various functions. */ typedef enum { ZFS_TYPE_FILESYSTEM = (1 << 0), ZFS_TYPE_SNAPSHOT = (1 << 1), ZFS_TYPE_VOLUME = (1 << 2), ZFS_TYPE_POOL = (1 << 3), ZFS_TYPE_BOOKMARK = (1 << 4) } zfs_type_t; /* * NB: lzc_dataset_type should be updated whenever a new objset type is added, * if it represents a real type of a dataset that can be created from userland. */ typedef enum dmu_objset_type { DMU_OST_NONE, DMU_OST_META, DMU_OST_ZFS, DMU_OST_ZVOL, DMU_OST_OTHER, /* For testing only! */ DMU_OST_ANY, /* Be careful! */ DMU_OST_NUMTYPES } dmu_objset_type_t; #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) /* * All of these include the terminating NUL byte. 
*/ #define ZAP_MAXNAMELEN 256 #define ZAP_MAXVALUELEN (1024 * 8) #define ZAP_OLDMAXVALUELEN 1024 #define ZFS_MAX_DATASET_NAME_LEN 256 /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in module/zcommon/zfs_prop.c. */ typedef enum { ZPROP_CONT = -2, ZPROP_INVAL = -1, ZFS_PROP_TYPE = 0, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, ZFS_PROP_REFERENCED, ZFS_PROP_COMPRESSRATIO, ZFS_PROP_MOUNTED, ZFS_PROP_ORIGIN, ZFS_PROP_QUOTA, ZFS_PROP_RESERVATION, ZFS_PROP_VOLSIZE, ZFS_PROP_VOLBLOCKSIZE, ZFS_PROP_RECORDSIZE, ZFS_PROP_MOUNTPOINT, ZFS_PROP_SHARENFS, ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_ATIME, ZFS_PROP_DEVICES, ZFS_PROP_EXEC, ZFS_PROP_SETUID, ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, ZFS_PROP_PRIVATE, /* not exposed to user, temporary */ ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, ZFS_PROP_NORMALIZE, ZFS_PROP_CASE, ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, ZFS_PROP_REFQUOTA, ZFS_PROP_REFRESERVATION, ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, ZFS_PROP_USEDSNAP, ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, ZFS_PROP_UNIQUE, /* not exposed to the user */ ZFS_PROP_OBJSETID, ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, ZFS_PROP_DNODESIZE, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ ZFS_PROP_VOLMODE, ZFS_PROP_FILESYSTEM_LIMIT, ZFS_PROP_SNAPSHOT_LIMIT, ZFS_PROP_FILESYSTEM_COUNT, ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_SNAPDEV, ZFS_PROP_ACLTYPE, ZFS_PROP_SELINUX_CONTEXT, ZFS_PROP_SELINUX_FSCONTEXT, ZFS_PROP_SELINUX_DEFCONTEXT, ZFS_PROP_SELINUX_ROOTCONTEXT, ZFS_PROP_RELATIME, ZFS_PROP_REDUNDANT_METADATA, ZFS_PROP_OVERLAY, ZFS_PROP_PREV_SNAP, ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_PROP_ENCRYPTION, ZFS_PROP_KEYLOCATION, ZFS_PROP_KEYFORMAT, ZFS_PROP_PBKDF2_SALT, ZFS_PROP_PBKDF2_ITERS, ZFS_PROP_ENCRYPTION_ROOT, ZFS_PROP_KEY_GUID, ZFS_PROP_KEYSTATUS, ZFS_PROP_REMAPTXG, /* not exposed to the user */ ZFS_PROP_SPECIAL_SMALL_BLOCKS, ZFS_NUM_PROPS } zfs_prop_t; typedef enum { ZFS_PROP_USERUSED, ZFS_PROP_USERQUOTA, ZFS_PROP_GROUPUSED, ZFS_PROP_GROUPQUOTA, ZFS_PROP_USEROBJUSED, ZFS_PROP_USEROBJQUOTA, ZFS_PROP_GROUPOBJUSED, ZFS_PROP_GROUPOBJQUOTA, ZFS_PROP_PROJECTUSED, ZFS_PROP_PROJECTQUOTA, ZFS_PROP_PROJECTOBJUSED, ZFS_PROP_PROJECTOBJQUOTA, ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in module/zcommon/zpool_prop.c. 
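 * In practice this means a new pool property is inserted immediately
 * before ZPOOL_NUM_PROPS, never in the middle of the enum.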
*/ typedef enum { ZPOOL_PROP_INVAL = -1, ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, ZPOOL_PROP_GUID, ZPOOL_PROP_VERSION, ZPOOL_PROP_BOOTFS, ZPOOL_PROP_DELEGATION, ZPOOL_PROP_AUTOREPLACE, ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_AUTOEXPAND, ZPOOL_PROP_DEDUPDITTO, ZPOOL_PROP_DEDUPRATIO, ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, ZPOOL_PROP_ASHIFT, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, ZPOOL_PROP_FRAGMENTATION, ZPOOL_PROP_LEAKED, ZPOOL_PROP_MAXBLOCKSIZE, ZPOOL_PROP_TNAME, ZPOOL_PROP_MAXDNODESIZE, ZPOOL_PROP_MULTIHOST, ZPOOL_PROP_CHECKPOINT, ZPOOL_PROP_LOAD_GUID, ZPOOL_NUM_PROPS } zpool_prop_t; /* Small enough to not hog a whole line of printout in zpool(1M). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" typedef enum { ZPROP_SRC_NONE = 0x1, ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, ZPROP_SRC_INHERITED = 0x10, ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; #define ZPROP_SRC_ALL 0x3f #define ZPROP_SOURCE_VAL_RECVD "$recvd" #define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" /* * Dataset flag implemented as a special entry in the props zap object * indicating that the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties * just as it did in earlier versions, and thereafter, local properties are * preserved. */ #define ZPROP_HAS_RECVD "$hasrecvd" typedef enum { ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ } zprop_errflags_t; typedef int (*zprop_func)(int, void *); /* * Properties to be set on the root file system of a new pool * are stuffed into their own nvlist, which is then included in * the properties nvlist with the pool properties. */ #define ZPOOL_ROOTFS_PROPS "root-props-nvl" /* * Length of 'written@' and 'written#' */ #define ZFS_WRITTEN_PROP_PREFIX_LEN 8 /* * Dataset property functions shared between libzfs and kernel. */ const char *zfs_prop_default_string(zfs_prop_t); uint64_t zfs_prop_default_numeric(zfs_prop_t); boolean_t zfs_prop_readonly(zfs_prop_t); boolean_t zfs_prop_visible(zfs_prop_t prop); boolean_t zfs_prop_inheritable(zfs_prop_t); boolean_t zfs_prop_setonce(zfs_prop_t); boolean_t zfs_prop_encryption_key_param(zfs_prop_t); boolean_t zfs_prop_valid_keylocation(const char *, boolean_t); const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); boolean_t zfs_prop_userquota(const char *); boolean_t zfs_prop_written(const char *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t); /* * Pool property functions shared between libzfs and kernel. 
*/ zpool_prop_t zpool_name_to_prop(const char *); const char *zpool_prop_to_name(zpool_prop_t); const char *zpool_prop_default_string(zpool_prop_t); uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); boolean_t zpool_prop_setonce(zpool_prop_t); boolean_t zpool_prop_feature(const char *); boolean_t zpool_prop_unsupported(const char *); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * Definitions for the Delegation. */ typedef enum { ZFS_DELEG_WHO_UNKNOWN = 0, ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U', ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G', ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E', ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C', ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S' } zfs_deleg_who_type_t; typedef enum { ZFS_DELEG_NONE = 0, ZFS_DELEG_PERM_LOCAL = 1, ZFS_DELEG_PERM_DESCENDENT = 2, ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ZFS_DELEG_PERM_CREATE = 4 } zfs_deleg_inherit_t; #define ZFS_DELEG_PERM_UID "uid" #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_MLSLABEL_DEFAULT "none" #define ZFS_SMB_ACL_SRC "src" #define ZFS_SMB_ACL_TARGET "target" typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; typedef enum { ZFS_LOGBIAS_LATENCY = 0, ZFS_LOGBIAS_THROUGHPUT = 1 } zfs_logbias_op_t; typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, ZFS_SHARE_SMB = 2, ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; typedef enum zfs_smb_acl_op { ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE, ZFS_SMB_ACL_RENAME, ZFS_SMB_ACL_PURGE } zfs_smb_acl_op_t; typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, ZFS_CACHE_ALL = 2 } zfs_cache_type_t; typedef enum { ZFS_SYNC_STANDARD = 0, ZFS_SYNC_ALWAYS = 1, ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; typedef enum { ZFS_XATTR_OFF = 0, ZFS_XATTR_DIR = 1, ZFS_XATTR_SA = 2 } zfs_xattr_type_t; typedef enum { ZFS_DNSIZE_LEGACY = 0, ZFS_DNSIZE_AUTO = 1, ZFS_DNSIZE_1K = 1024, ZFS_DNSIZE_2K = 2048, ZFS_DNSIZE_4K = 4096, ZFS_DNSIZE_8K = 8192, ZFS_DNSIZE_16K = 16384 } zfs_dnsize_type_t; typedef enum { ZFS_REDUNDANT_METADATA_ALL, ZFS_REDUNDANT_METADATA_MOST } zfs_redundant_metadata_type_t; typedef enum { ZFS_VOLMODE_DEFAULT = 0, ZFS_VOLMODE_GEOM = 1, ZFS_VOLMODE_DEV = 2, ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; typedef enum zfs_keystatus { ZFS_KEYSTATUS_NONE = 0, ZFS_KEYSTATUS_UNAVAILABLE, ZFS_KEYSTATUS_AVAILABLE, } zfs_keystatus_t; typedef enum zfs_keyformat { ZFS_KEYFORMAT_NONE = 0, ZFS_KEYFORMAT_RAW, ZFS_KEYFORMAT_HEX, ZFS_KEYFORMAT_PASSPHRASE, ZFS_KEYFORMAT_FORMATS } zfs_keyformat_t; typedef enum zfs_key_location { ZFS_KEYLOCATION_NONE = 0, ZFS_KEYLOCATION_PROMPT, ZFS_KEYLOCATION_URI, ZFS_KEYLOCATION_LOCATIONS } zfs_keylocation_t; #define DEFAULT_PBKDF2_ITERATIONS 350000 #define MIN_PBKDF2_ITERATIONS 100000 /* * On-disk version number. 
*/ #define SPA_VERSION_1 1ULL #define SPA_VERSION_2 2ULL #define SPA_VERSION_3 3ULL #define SPA_VERSION_4 4ULL #define SPA_VERSION_5 5ULL #define SPA_VERSION_6 6ULL #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL #define SPA_VERSION_17 17ULL #define SPA_VERSION_18 18ULL #define SPA_VERSION_19 19ULL #define SPA_VERSION_20 20ULL #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL #define SPA_VERSION_5000 5000ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ #define SPA_VERSION SPA_VERSION_5000 #define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. * Used in the code when checking for presence or absence of a feature. * Feel free to define multiple symbolic names for each version if there * were multiple changes to on-disk structures during that version. * * NOTE: When checking the current SPA_VERSION in your code, be sure * to use spa_version() since it reports the version of the * last synced uberblock. Checking the in-flight version can * be dangerous in some cases. */ #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 #define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 #define SPA_VERSION_BOOTFS SPA_VERSION_6 #define SPA_VERSION_SLOGS SPA_VERSION_7 #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 #define SPA_VERSION_L2CACHE SPA_VERSION_10 #define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 #define SPA_VERSION_RAIDZ3 SPA_VERSION_17 #define SPA_VERSION_USERREFS SPA_VERSION_18 #define SPA_VERSION_HOLES SPA_VERSION_19 #define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 #define SPA_VERSION_DIR_CLONES SPA_VERSION_26 #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 #define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 #define SPA_VERSION_FEATURES SPA_VERSION_5000 #define SPA_VERSION_IS_SUPPORTED(v) \ (((v) >= SPA_VERSION_INITIAL && (v) <= 
SPA_VERSION_BEFORE_FEATURES) || \ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk format change * occurs. This is independent of SPA/DMU/ZAP versioning. You must * also update the version_table[] and help message in zfs_prop.c. * * When changing, be sure to teach GRUB how to read the new format! * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL #define ZPL_VERSION_5 5ULL #define ZPL_VERSION ZPL_VERSION_5 #define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 /* Rewind policy information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ #define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ #define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ #define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ #define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ #define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ typedef struct zpool_load_policy { uint32_t zlp_rewind; /* rewind policy requested */ uint64_t zlp_maxmeta; /* max acceptable meta-data errors */ uint64_t zlp_maxdata; /* max acceptable data errors */ uint64_t zlp_txg; /* specific txg to load */ } zpool_load_policy_t; /* * The following are configuration names used in the nvlist describing a pool's * configuration. New on-disk names should be prefixed with "<reversed-DNS>:" * (e.g. "org.open-zfs:") to avoid conflicting names being developed * independently.
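 *
 * For example, a hypothetical new on-disk stat would be named
 * "org.open-zfs:example_stat" rather than a bare "example_stat".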
*/ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" #define ZPOOL_CONFIG_POOL_STATE "state" #define ZPOOL_CONFIG_POOL_TXG "txg" #define ZPOOL_CONFIG_POOL_GUID "pool_guid" #define ZPOOL_CONFIG_CREATE_TXG "create_txg" #define ZPOOL_CONFIG_TOP_GUID "top_guid" #define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" #define ZPOOL_CONFIG_TYPE "type" #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" #define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" #define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ /* container nvlist of extended stats */ #define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex" /* Active queue read/write stats */ #define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" /* Queue sizes */ #define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" /* Latency read/write histogram stats */ #define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" #define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo" /* Request size histograms */ #define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo" #define ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo" #define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" #define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" +/* Number of slow IOs */ 
+#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" + /* vdev enclosure sysfs path */ #define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" #define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" #define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" #define ZPOOL_CONFIG_IS_HOLE "is_hole" #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" #define ZPOOL_CONFIG_SPLIT "splitcfg" #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ #define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" #define ZPOOL_CONFIG_RESILVER_DEFER "com.datto:resilver_defer" #define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such * as offline and degraded. 
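 *
 * For example, a device can carry both ZPOOL_CONFIG_DEGRADED and
 * ZPOOL_CONFIG_OFFLINE at the same time, which a single 'vdev_state'
 * value could not express.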
*/ #define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" /* Pool load policy parameters */ #define ZPOOL_LOAD_POLICY "load-policy" #define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy" #define ZPOOL_LOAD_REQUEST_TXG "load-request-txg" #define ZPOOL_LOAD_META_THRESH "load-meta-thresh" #define ZPOOL_LOAD_DATA_THRESH "load-data-thresh" /* Rewind data discovered */ #define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" #define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ "com.delphix:obsolete_counts_are_precise" #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" /* vdev metaslab allocation bias */ #define VDEV_ALLOC_BIAS_LOG "log" #define VDEV_ALLOC_BIAS_SPECIAL "special" #define VDEV_ALLOC_BIAS_DEDUP "dedup" /* * This is needed in userland to report the minimum necessary device size. */ #define SPA_MINDEVSIZE (64ULL << 20) /* * Set if the fragmentation has not yet been calculated. This can happen * because the space maps have not been upgraded or the histogram feature * is not enabled. */ #define ZFS_FRAG_INVALID UINT64_MAX /* * The location of the pool configuration repository, shared between kernel and * userland. */ #define ZPOOL_CACHE "/etc/zfs/zpool.cache" /* * vdev states are ordered from least to most healthy. * A vdev that's CANT_OPEN or below is considered unusable. */ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; #define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. 
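*/

/*
 * Illustrative sketch: because vdev_state_t above is ordered from least to
 * most healthy, "is this vdev usable?" reduces to a single comparison,
 * exactly as the ordering comment promises.
 */
static inline boolean_t
vdev_state_usable(vdev_state_t state)
{
        /* CANT_OPEN or below is considered unusable */
        return (state > VDEV_STATE_CANT_OPEN);
}

/* The aux state codes: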
*/ typedef enum vdev_aux { VDEV_AUX_NONE, /* no error */ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis or forced fault */ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */ VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ VDEV_AUX_ACTIVE, /* vdev active on a different host */ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ } vdev_aux_t; /* * pool state. The following states are written to disk as part of the normal * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining * states are software abstractions used at various levels to communicate * pool state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ } pool_state_t; /* * mmp state. The following states provide additional detail describing * why a pool couldn't be safely imported. */ typedef enum mmp_state { MMP_STATE_ACTIVE = 0, /* In active use */ MMP_STATE_INACTIVE, /* Inactive and safe to import */ MMP_STATE_NO_HOSTID /* System hostid is not set */ } mmp_state_t; /* * Scan Functions. */ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, POOL_SCAN_FUNCS } pool_scan_func_t; /* * Used to control scrub pause and resume. */ typedef enum pool_scrub_cmd { POOL_SCRUB_NORMAL = 0, POOL_SCRUB_PAUSE, POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; typedef enum { CS_NONE, CS_CHECKPOINT_EXISTS, CS_CHECKPOINT_DISCARDING, CS_NUM_STATES } checkpoint_state_t; typedef struct pool_checkpoint_stat { uint64_t pcs_state; /* checkpoint_state_t */ uint64_t pcs_start_time; /* time checkpoint/discard started */ uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; /* * ZIO types. Needed to interpret vdev statistics below. */ typedef enum zio_type { ZIO_TYPE_NULL = 0, ZIO_TYPE_READ, ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, ZIO_TYPE_IOCTL, ZIO_TYPES } zio_type_t; /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. 
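*/

/*
 * Illustrative sketch: because every field is 64 bits wide, the whole
 * structure round-trips as a flat nvlist uint64 array; userland looks the
 * array up and overlays the struct defined just below on the result.
 * Assumes libnvpair; error handling is elided.
 */
static inline uint64_t *
scan_stats_array(nvlist_t *nvroot, uint_t *count)
{
        uint64_t *ps = NULL;

        /* callers cast the result to pool_scan_stat_t * (defined below) */
        (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
            &ps, count);
        return (ps);    /* NULL if no scan has ever run */
}

/* The scan statistics structure: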
*/ typedef struct pool_scan_stat { /* values stored on disk */ uint64_t pss_func; /* pool_scan_func_t */ uint64_t pss_state; /* dsl_scan_state_t */ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ uint64_t pss_to_process; /* total bytes to process */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; uint64_t pss_pass_issued; /* issued bytes per scan pass */ uint64_t pss_issued; /* total bytes checked by scanner */ } pool_scan_stat_t; typedef struct pool_removal_stat { uint64_t prs_state; /* dsl_scan_state_t */ uint64_t prs_removing_vdev; uint64_t prs_start_time; uint64_t prs_end_time; uint64_t prs_to_copy; /* bytes that need to be copied */ uint64_t prs_copied; /* bytes copied so far */ /* * bytes of memory used for indirect mappings. * This includes all removed vdevs. */ uint64_t prs_mapping_memory; } pool_removal_stat_t; typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, DSS_NUM_STATES } dsl_scan_state_t; /* * Errata described by http://zfsonlinux.org/msg/ZFS-8000-ER. The ordering * of this enum must be maintained to ensure the errata identifiers map to * the correct documentation. New errata may only be appended to the list * and must contain corresponding documentation at the above link. */ typedef enum zpool_errata { ZPOOL_ERRATA_NONE, ZPOOL_ERRATA_ZOL_2094_SCRUB, ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY, ZPOOL_ERRATA_ZOL_6845_ENCRYPTION, } zpool_errata_t; /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct vdev_stat { hrtime_t vs_timestamp; /* time since vdev load */ uint64_t vs_state; /* vdev state */ uint64_t vs_aux; /* see vdev_aux_t */ uint64_t vs_alloc; /* space allocated */ uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ uint64_t vs_resilver_deferred; /* resilver deferred */ + uint64_t vs_slow_ios; /* slow IOs */ } vdev_stat_t; /* * Extended stats * * These are stats which aren't included in the original iostat output. For * convenience, they are grouped together in vdev_stat_ex, although each stat * is individually exported as an nvlist.
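*/

/*
 * Illustrative sketch: the new vs_slow_ios counter reaches userland inside
 * the ZPOOL_CONFIG_VDEV_STATS uint64 array, so a consumer overlays
 * vdev_stat_t (defined above) on that array. Assumes libnvpair; checking
 * the returned element count guards against older senders that lack the
 * trailing fields.
 */
static inline uint64_t
vdev_slow_ios(nvlist_t *nv /* one vdev's config nvlist */)
{
        vdev_stat_t *vs;
        uint_t c;

        if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
            (uint64_t **)&vs, &c) == 0 &&
            c >= sizeof (*vs) / sizeof (uint64_t))
                return (vs->vs_slow_ios);
        return (0);
}

/* The extended stats container: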
*/ typedef struct vdev_stat_ex { /* Number of ZIOs issued to disk and waiting to finish */ uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* Number of ZIOs pending to be issued to disk */ uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* * Below are the histograms for various latencies. Buckets are in * units of nanoseconds. */ /* * 2^37 nanoseconds = about 137s. Timeouts will probably start kicking in * before this. */ #define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */ #define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */ /* Amount of time in ZIO queue (ns) */ uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_L_HISTO_BUCKETS]; /* Total ZIO latency (ns). Includes queuing and disk access time */ uint64_t vsx_total_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* Amount of time to read/write the disk (ns) */ uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* "lookup the bucket for a value" histogram macros */ #define HISTO(val, buckets) (val != 0 ? MIN(highbit64(val) - 1, \ buckets - 1) : 0) #define L_HISTO(a) HISTO(a, VDEV_L_HISTO_BUCKETS) #define RQ_HISTO(a) HISTO(a, VDEV_RQ_HISTO_BUCKETS) /* Physical IO histogram */ uint64_t vsx_ind_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; /* Delegated (aggregated) physical IO histogram */ uint64_t vsx_agg_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; } vdev_stat_ex_t; /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct ddt_object { uint64_t ddo_count; /* number of elements in ddt */ uint64_t ddo_dspace; /* size of ddt on disk */ uint64_t ddo_mspace; /* size of ddt in-core */ } ddt_object_t; typedef struct ddt_stat { uint64_t dds_blocks; /* blocks */ uint64_t dds_lsize; /* logical size */ uint64_t dds_psize; /* physical size */ uint64_t dds_dsize; /* deflated allocated size */ uint64_t dds_ref_blocks; /* referenced blocks */ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ uint64_t dds_ref_psize; /* referenced psize * refcnt */ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ } ddt_stat_t; typedef struct ddt_histogram { ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ } ddt_histogram_t; #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" #define ZFS_SHARETAB "/etc/dfs/sharetab" #define ZFS_SUPER_MAGIC 0x2fc12fc1 /* general zvol path */ #define ZVOL_DIR "/dev" #define ZVOL_MAJOR 230 #define ZVOL_MINOR_BITS 4 #define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) #define ZVOL_MINORS (1 << 4) #define ZVOL_DEV_NAME "zd" #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 8192 /* * /dev/zfs ioctl numbers. * * These numbers cannot change over time. New ioctl numbers must be appended. */ typedef enum zfs_ioc { /* * Illumos - 71/128 numbers reserved.
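*/

/*
 * Illustrative arithmetic: 'Z' is ASCII 0x5a, so the table starts at
 * ('Z' << 8) == 0x5a00, the Linux range at ('Z' << 8) + 0x80 == 0x5a80,
 * and the FreeBSD range at ('Z' << 8) + 0xC0 == 0x5ac0, matching the
 * hex values annotated on each entry below.
 */

/* The Illumos range begins here: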
*/ ZFS_IOC_FIRST = ('Z' << 8), ZFS_IOC = ZFS_IOC_FIRST, ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, /* 0x5a00 */ ZFS_IOC_POOL_DESTROY, /* 0x5a01 */ ZFS_IOC_POOL_IMPORT, /* 0x5a02 */ ZFS_IOC_POOL_EXPORT, /* 0x5a03 */ ZFS_IOC_POOL_CONFIGS, /* 0x5a04 */ ZFS_IOC_POOL_STATS, /* 0x5a05 */ ZFS_IOC_POOL_TRYIMPORT, /* 0x5a06 */ ZFS_IOC_POOL_SCAN, /* 0x5a07 */ ZFS_IOC_POOL_FREEZE, /* 0x5a08 */ ZFS_IOC_POOL_UPGRADE, /* 0x5a09 */ ZFS_IOC_POOL_GET_HISTORY, /* 0x5a0a */ ZFS_IOC_VDEV_ADD, /* 0x5a0b */ ZFS_IOC_VDEV_REMOVE, /* 0x5a0c */ ZFS_IOC_VDEV_SET_STATE, /* 0x5a0d */ ZFS_IOC_VDEV_ATTACH, /* 0x5a0e */ ZFS_IOC_VDEV_DETACH, /* 0x5a0f */ ZFS_IOC_VDEV_SETPATH, /* 0x5a10 */ ZFS_IOC_VDEV_SETFRU, /* 0x5a11 */ ZFS_IOC_OBJSET_STATS, /* 0x5a12 */ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x5a13 */ ZFS_IOC_DATASET_LIST_NEXT, /* 0x5a14 */ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x5a15 */ ZFS_IOC_SET_PROP, /* 0x5a16 */ ZFS_IOC_CREATE, /* 0x5a17 */ ZFS_IOC_DESTROY, /* 0x5a18 */ ZFS_IOC_ROLLBACK, /* 0x5a19 */ ZFS_IOC_RENAME, /* 0x5a1a */ ZFS_IOC_RECV, /* 0x5a1b */ ZFS_IOC_SEND, /* 0x5a1c */ ZFS_IOC_INJECT_FAULT, /* 0x5a1d */ ZFS_IOC_CLEAR_FAULT, /* 0x5a1e */ ZFS_IOC_INJECT_LIST_NEXT, /* 0x5a1f */ ZFS_IOC_ERROR_LOG, /* 0x5a20 */ ZFS_IOC_CLEAR, /* 0x5a21 */ ZFS_IOC_PROMOTE, /* 0x5a22 */ ZFS_IOC_SNAPSHOT, /* 0x5a23 */ ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x5a24 */ ZFS_IOC_OBJ_TO_PATH, /* 0x5a25 */ ZFS_IOC_POOL_SET_PROPS, /* 0x5a26 */ ZFS_IOC_POOL_GET_PROPS, /* 0x5a27 */ ZFS_IOC_SET_FSACL, /* 0x5a28 */ ZFS_IOC_GET_FSACL, /* 0x5a29 */ ZFS_IOC_SHARE, /* 0x5a2a */ ZFS_IOC_INHERIT_PROP, /* 0x5a2b */ ZFS_IOC_SMB_ACL, /* 0x5a2c */ ZFS_IOC_USERSPACE_ONE, /* 0x5a2d */ ZFS_IOC_USERSPACE_MANY, /* 0x5a2e */ ZFS_IOC_USERSPACE_UPGRADE, /* 0x5a2f */ ZFS_IOC_HOLD, /* 0x5a30 */ ZFS_IOC_RELEASE, /* 0x5a31 */ ZFS_IOC_GET_HOLDS, /* 0x5a32 */ ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x5a33 */ ZFS_IOC_VDEV_SPLIT, /* 0x5a34 */ ZFS_IOC_NEXT_OBJ, /* 0x5a35 */ ZFS_IOC_DIFF, /* 0x5a36 */ ZFS_IOC_TMP_SNAPSHOT, /* 0x5a37 */ ZFS_IOC_OBJ_TO_STATS, /* 0x5a38 */ ZFS_IOC_SPACE_WRITTEN, /* 0x5a39 */ ZFS_IOC_SPACE_SNAPS, /* 0x5a3a */ ZFS_IOC_DESTROY_SNAPS, /* 0x5a3b */ ZFS_IOC_POOL_REGUID, /* 0x5a3c */ ZFS_IOC_POOL_REOPEN, /* 0x5a3d */ ZFS_IOC_SEND_PROGRESS, /* 0x5a3e */ ZFS_IOC_LOG_HISTORY, /* 0x5a3f */ ZFS_IOC_SEND_NEW, /* 0x5a40 */ ZFS_IOC_SEND_SPACE, /* 0x5a41 */ ZFS_IOC_CLONE, /* 0x5a42 */ ZFS_IOC_BOOKMARK, /* 0x5a43 */ ZFS_IOC_GET_BOOKMARKS, /* 0x5a44 */ ZFS_IOC_DESTROY_BOOKMARKS, /* 0x5a45 */ ZFS_IOC_CHANNEL_PROGRAM, /* 0x5a46 */ ZFS_IOC_RECV_NEW, /* 0x5a47 */ ZFS_IOC_POOL_SYNC, /* 0x5a48 */ ZFS_IOC_LOAD_KEY, /* 0x5a49 */ ZFS_IOC_UNLOAD_KEY, /* 0x5a4a */ ZFS_IOC_CHANGE_KEY, /* 0x5a4b */ ZFS_IOC_REMAP, /* 0x5a4c */ ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */ ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */ /* * Linux - 3/64 numbers reserved. */ ZFS_IOC_LINUX = ('Z' << 8) + 0x80, ZFS_IOC_EVENTS_NEXT, /* 0x5a81 */ ZFS_IOC_EVENTS_CLEAR, /* 0x5a82 */ ZFS_IOC_EVENTS_SEEK, /* 0x5a83 */ /* * FreeBSD - 1/64 numbers reserved. */ ZFS_IOC_FREEBSD = ('Z' << 8) + 0xC0, ZFS_IOC_LAST } zfs_ioc_t; /* * zvol ioctl to get dataset name */ #define BLKZNAME _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN]) /* * ZFS-specific error codes used for returning descriptive errors * to the userland through zfs ioctls. * * The enum implicitly includes all the error codes from errno.h. * New code should use and extend this enum for errors that are * not described precisely by generic errno codes. * * These numbers should not change over time. New entries should be appended. 
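*/

/*
 * Illustrative sketch: since the ZFS-specific codes below start at 1024,
 * far above any generic errno value, callers can separate the two error
 * spaces with a single comparison.
 */
static inline boolean_t
zfs_errno_is_private(int err)
{
        return (err >= 1024);   /* ZFS_ERR_CHECKPOINT_EXISTS and up */
}

/* The ZFS-specific codes: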
*/ typedef enum { ZFS_ERR_CHECKPOINT_EXISTS = 1024, ZFS_ERR_DISCARDING_CHECKPOINT, ZFS_ERR_NO_CHECKPOINT, ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_IOC_CMD_UNAVAIL, ZFS_ERR_IOC_ARG_UNAVAIL, ZFS_ERR_IOC_ARG_REQUIRED, ZFS_ERR_IOC_ARG_BADTYPE, } zfs_errno_t; /* * Internal SPA load state. Used by FMA diagnosis engine. */ typedef enum { SPA_LOAD_NONE, /* no load in progress */ SPA_LOAD_OPEN, /* normal open */ SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ SPA_LOAD_ERROR, /* load failed */ SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; /* * Bookmark name values. */ #define ZPOOL_ERR_LIST "error list" #define ZPOOL_ERR_DATASET "dataset" #define ZPOOL_ERR_OBJECT "object" #define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) /* * The following are names used in the nvlist describing * the pool's history log. */ #define ZPOOL_HIST_RECORD "history record" #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" #define ZPOOL_HIST_WHO "history who" #define ZPOOL_HIST_ZONE "history zone" #define ZPOOL_HIST_HOST "history hostname" #define ZPOOL_HIST_TXG "history txg" #define ZPOOL_HIST_INT_EVENT "history internal event" #define ZPOOL_HIST_INT_STR "history internal str" #define ZPOOL_HIST_INT_NAME "internal_name" #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" #define ZPOOL_HIST_ERRNO "errno" /* * Special nvlist name that will not have its args recorded in the pool's * history log. */ #define ZPOOL_HIDDEN_ARGS "hidden_args" /* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_EXPAND 0x8 #define ZFS_OFFLINE_TEMPORARY 0x1 /* * Flags for ZFS_IOC_POOL_IMPORT */ #define ZFS_IMPORT_NORMAL 0x0 #define ZFS_IMPORT_VERBATIM 0x1 #define ZFS_IMPORT_ANY_HOST 0x2 #define ZFS_IMPORT_MISSING_LOG 0x4 #define ZFS_IMPORT_ONLY 0x8 #define ZFS_IMPORT_TEMP_NAME 0x10 #define ZFS_IMPORT_SKIP_MMP 0x20 #define ZFS_IMPORT_LOAD_KEYS 0x40 #define ZFS_IMPORT_CHECKPOINT 0x80 /* * Channel program argument/return nvlist keys and defaults. */ #define ZCP_ARG_PROGRAM "program" #define ZCP_ARG_ARGLIST "arg" #define ZCP_ARG_SYNC "sync" #define ZCP_ARG_INSTRLIMIT "instrlimit" #define ZCP_ARG_MEMLIMIT "memlimit" #define ZCP_ARG_CLIARGV "argv" #define ZCP_RET_ERROR "error" #define ZCP_RET_RETURN "return" #define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) #define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) #define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) #define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) /* * Sysevent payload members. 
ZFS will generate the following sysevents with the * given payloads: * * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_END * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * * ESC_ZFS_VDEV_REMOVE * ESC_ZFS_VDEV_CLEAR * ESC_ZFS_VDEV_CHECK * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 * * ESC_ZFS_HISTORY_EVENT * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) * * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the * history log nvlist. The keynames will be free of any spaces or other * characters that could be potentially unexpected to consumers of the * sysevents. */ #define ZFS_EV_POOL_NAME "pool_name" #define ZFS_EV_POOL_GUID "pool_guid" #define ZFS_EV_VDEV_PATH "vdev_path" #define ZFS_EV_VDEV_GUID "vdev_guid" #define ZFS_EV_HIST_TIME "history_time" #define ZFS_EV_HIST_CMD "history_command" #define ZFS_EV_HIST_WHO "history_who" #define ZFS_EV_HIST_ZONE "history_zone" #define ZFS_EV_HIST_HOST "history_hostname" #define ZFS_EV_HIST_TXG "history_txg" #define ZFS_EV_HIST_INT_EVENT "history_internal_event" #define ZFS_EV_HIST_INT_STR "history_internal_str" #define ZFS_EV_HIST_INT_NAME "history_internal_name" #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_H */ diff --git a/include/sys/spa.h b/include/sys/spa.h index ca657ad70fbb..5dc27e3349be 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1,1140 +1,1142 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. 
*/ #ifndef _SYS_SPA_H #define _SYS_SPA_H #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Forward references that lots of things need. */ typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct metaslab_group metaslab_group_t; typedef struct metaslab_class metaslab_class_t; typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; typedef struct zbookmark_phys zbookmark_phys_t; struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; /* * General-purpose 32-bit and 64-bit bitfield encodings. */ #define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) #define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) #define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) #define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) #define BF32_GET(x, low, len) BF32_DECODE(x, low, len) #define BF64_GET(x, low, len) BF64_DECODE(x, low, len) #define BF32_SET(x, low, len, val) do { \ ASSERT3U(val, <, 1U << (len)); \ ASSERT3U(low + len, <=, 32); \ (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ _NOTE(CONSTCOND) } while (0) #define BF64_SET(x, low, len, val) do { \ ASSERT3U(val, <, 1ULL << (len)); \ ASSERT3U(low + len, <=, 64); \ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ _NOTE(CONSTCOND) } while (0) #define BF32_GET_SB(x, low, len, shift, bias) \ ((BF32_GET(x, low, len) + (bias)) << (shift)) #define BF64_GET_SB(x, low, len, shift, bias) \ ((BF64_GET(x, low, len) + (bias)) << (shift)) #define BF32_SET_SB(x, low, len, shift, bias, val) do { \ ASSERT(IS_P2ALIGNED(val, 1U << shift)); \ ASSERT3S((val) >> (shift), >=, bias); \ BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ _NOTE(CONSTCOND) } while (0) #define BF64_SET_SB(x, low, len, shift, bias, val) do { \ ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ ASSERT3S((val) >> (shift), >=, bias); \ BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ _NOTE(CONSTCOND) } while (0) /* * We currently support block sizes from 512 bytes to 16MB. * The benefits of larger blocks, and thus larger IO, need to be weighed * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * * Note that although blocks up to 16MB are supported, the recordsize * property can not be set larger than zfs_max_recordsize (default 1MB). * See the comment near zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. */ #define SPA_MINBLOCKSHIFT 9 #define SPA_OLD_MAXBLOCKSHIFT 17 #define SPA_MAXBLOCKSHIFT 24 #define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) #define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) #define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) /* * Alignment Shift (ashift) is an immutable, internal top-level vdev property * which can only be set at vdev creation time. Physical writes are always done * according to it, which makes 2^ashift the smallest possible IO on a vdev. * * We currently allow values ranging from 512 bytes (2^9 = 512) to 64 KiB * (2^16 = 65,536). 
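*/

/*
 * Illustrative sketch: since physical writes are always ashift-aligned,
 * 2^ashift is both the logical sector size and the smallest IO a vdev
 * will ever issue; ASHIFT_MIN and ASHIFT_MAX (just below) bound the
 * accepted values.
 */
static inline uint64_t
vdev_min_io_size(uint64_t ashift)
{
        return (1ULL << ashift);        /* e.g. ashift=12 -> 4096-byte IOs */
}

/* The accepted bounds: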
*/ #define ASHIFT_MIN 9 #define ASHIFT_MAX 16 /* * Size of block to hold the configuration data (a packed nvlist) */ #define SPA_CONFIG_BLOCKSIZE (1ULL << 14) /* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. * The ASIZE encoding should be at least 64 times larger (6 more bits) * to support up to 4-way RAID-Z mirror mode with worst-case gang block * overhead, three DVAs per bp, plus one more bit in case we do anything * else that expands the ASIZE. */ #define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ #define SPA_COMPRESSBITS 7 #define SPA_VDEVBITS 24 /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. */ typedef struct dva { uint64_t dva_word[2]; } dva_t; /* * Some checksums/hashes need a 256-bit initialization salt. This salt is kept * secret and is suitable for use in MAC algorithms as the key. */ typedef struct zio_cksum_salt { uint8_t zcs_bytes[32]; } zio_cksum_salt_t; /* * Each block is described by its DVAs, time of birth, checksum, etc. * The word-by-word, bit-by-bit layout of the blkptr is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | pad | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | pad | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | pad | vdev3 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | checksum[2] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | checksum[3] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * vdev virtual device ID * offset offset into virtual device * LSIZE logical size * PSIZE physical size (after compression) * ASIZE allocated size (including RAID-Z parity and gang block headers) * GRID RAID-Z layout information (reserved for future use) * cksum checksum function * comp compression function * G gang block indicator * B byteorder (endianness) * D dedup * X encryption * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type * phys birth txg when dva[0] was written; zero if same as logical birth txg * note that typically all the dva's would be written in this * txg, but they could be different if they were moved by * 
device removal. * log. birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ /* * The blkptr_t's of encrypted blocks also need to store the encryption * parameters so that the block can be decrypted. This layout is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | salt | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 | IV1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | IV2 | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | MAC[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | MAC[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * salt Salt for generating encryption keys * IV1 First 64 bits of encryption IV * X Block requires encryption handling (set to 1) * E blkptr_t contains embedded data (set to 0, see below) * fill count number of non-zero blocks under this bp (truncated to 32 bits) * IV2 Last 32 bits of encryption IV * checksum[2] 128-bit checksum of the data this bp describes * MAC[2] 128-bit message authentication code for this data * * The X bit being set indicates that this block is one of 3 types. If this is * a level 0 block with an encrypted object type, the block is encrypted * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted * object type, this block is authenticated with an HMAC (see * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC * words to store a checksum-of-MACs from the level below (see * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED() * refers to both encrypted and authenticated blocks and BP_USES_CRYPT() * refers to any of these 3 kinds of blocks. * * The additional encryption parameters are the salt, IV, and MAC which are * explained in greater detail in the block comment at the top of zio_crypt.c. * The MAC occupies half of the checksum space since it serves a very similar * purpose: to prevent data corruption on disk. The only functional difference * is that the checksum is used to detect on-disk corruption whether or not the * encryption key is loaded and the MAC provides additional protection against * malicious disk tampering. We use the 3rd DVA to store the salt and first * 64 bits of the IV. 
As a result encrypted blocks can only have 2 copies * maximum instead of the normal 3. The last 32 bits of the IV are stored in * the upper bits of what is usually the fill count. Note that only blocks at * level 0 or -2 are ever encrypted, which allows us to guarantee that these * 32 bits are not trampled over by other code (see zio_crypt.c for details). * The salt and IV are not used for authenticated bps or bps with an indirect * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits * for the fill count. */ /* * "Embedded" blkptr_t's don't actually point to a block, instead they * have a data payload embedded in the blkptr_t itself. See the comment * in blkptr.c for more details. * * The blkptr_t is laid out as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | payload | * 1 | payload | * 2 | payload | * 3 | payload | * 4 | payload | * 5 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | payload | * 8 | payload | * 9 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | payload | * c | payload | * d | payload | * e | payload | * f | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * payload contains the embedded data * B (byteorder) byteorder (endianness) * D (dedup) padding (set to zero) * X encryption (set to zero) * E (embedded) set to one * lvl indirection level * type DMU object type * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) * comp compression function of payload * PSIZE size of payload after compression, in bytes * LSIZE logical size of payload, in bytes * note that 25 bits is enough to store the largest * "normal" BP's LSIZE (2^16 * 2^9) in bytes * log. birth transaction group in which the block was logically born * * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded * bp's they are stored in units of SPA_MINBLOCKSHIFT. * Generally, the generic BP_GET_*() macros can be used on embedded BP's. * The B, D, X, lvl, type, and comp fields are stored the same as with normal * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before * other macros, as they assert that they are only used on BP's of the correct * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use * the payload space for encryption parameters (see the comment above on * how encryption parameters are stored). 
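*/

/*
 * Illustrative sketch: decoding two embedded-bp fields by hand, using the
 * bit positions from the diagram above; the BPE_GET_* macros just below do
 * the same through BF64_GET_SB (LSIZE is stored biased by one).
 */
static inline uint64_t
bpe_raw_etype(uint64_t blk_prop)
{
        return ((blk_prop >> 40) & 0xff);       /* etype: bits 40-47 */
}

static inline uint64_t
bpe_raw_lsize(uint64_t blk_prop)
{
        return ((blk_prop & 0x1ffffff) + 1);    /* LSIZE: bits 0-24, bias 1 */
}

/* The embedded-bp accessors: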
*/ #define BPE_GET_ETYPE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET((bp)->blk_prop, 40, 8)) #define BPE_SET_ETYPE(bp, t) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, t); \ _NOTE(CONSTCOND) } while (0) #define BPE_GET_LSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) #define BPE_SET_LSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BPE_GET_PSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) #define BPE_SET_PSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ _NOTE(CONSTCOND) } while (0) typedef enum bp_embedded_type { BP_EMBEDDED_TYPE_DATA, BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */ NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED } bp_embedded_type_t; #define BPE_NUM_WORDS 14 #define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) #define BPE_IS_PAYLOADWORD(bp, wp) \ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ #define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ /* * A block is a hole when it has either 1) never been written to, or * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads * without physically allocating disk space. Holes are represented in the * blkptr_t structure by zeroed blk_dva. Correct checking for holes is * done through the BP_IS_HOLE macro. For holes, the logical size, level, * DMU object type, and birth times are all also stored for holes that * were written to at some point (i.e. were punched after having been filled). */ typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ uint64_t blk_pad[2]; /* Extra space for the future */ uint64_t blk_phys_birth; /* txg when block was allocated */ uint64_t blk_birth; /* transaction group at birth */ uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ } blkptr_t; /* * Macros to get and set fields in a bp or DVA. */ #define DVA_GET_ASIZE(dva) \ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_ASIZE(dva, x) \ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) #define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) #define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) #define DVA_SET_VDEV(dva, x) \ BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) #define DVA_GET_OFFSET(dva) \ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_OFFSET(dva, x) \ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? \ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_LSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_PSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 
0 : \ BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_PSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_COMPRESS(bp) \ BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) #define BP_SET_COMPRESS(bp, x) \ BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) #define BP_GET_CHECKSUM(bp) \ (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \ BF64_GET((bp)->blk_prop, 40, 8)) #define BP_SET_CHECKSUM(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) #define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) #define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) #define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) /* encrypted, authenticated, and MAC cksum bps use the same bit */ #define BP_USES_CRYPT(bp) BF64_GET((bp)->blk_prop, 61, 1) #define BP_SET_CRYPT(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) #define BP_IS_ENCRYPTED(bp) \ (BP_USES_CRYPT(bp) && \ BP_GET_LEVEL(bp) <= 0 && \ DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) #define BP_IS_AUTHENTICATED(bp) \ (BP_USES_CRYPT(bp) && \ BP_GET_LEVEL(bp) <= 0 && \ !DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) #define BP_HAS_INDIRECT_MAC_CKSUM(bp) \ (BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0) #define BP_IS_PROTECTED(bp) \ (BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp)) #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) #define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) #define BP_PHYSICAL_BIRTH(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) #define BP_SET_BIRTH(bp, logical, physical) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ (bp)->blk_birth = (logical); \ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ } #define BP_GET_FILL(bp) \ ((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \ ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)) #define BP_SET_FILL(bp, fill) \ { \ if (BP_IS_ENCRYPTED(bp)) \ BF64_SET((bp)->blk_fill, 0, 32, fill); \ else \ (bp)->blk_fill = fill; \ } #define BP_GET_IV2(bp) \ (ASSERT(BP_IS_ENCRYPTED(bp)), \ BF64_GET((bp)->blk_fill, 32, 32)) #define BP_SET_IV2(bp, iv2) \ { \ ASSERT(BP_IS_ENCRYPTED(bp)); \ BF64_SET((bp)->blk_fill, 32, 32, iv2); \ } #define BP_IS_METADATA(bp) \ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) #define BP_GET_ASIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ (DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_GET_UCSIZE(bp) \ (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ (!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_COUNT_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? 
0 : \ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ DVA_GET_GANG(&(bp)->blk_dva[1]) + \ (DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))) #define DVA_EQUAL(dva1, dva2) \ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ (bp1)->blk_birth == (bp2)->blk_birth && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) #define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) #define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) #define BP_IS_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) #define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ (dva)->dva_word[1] == 0ULL) #define BP_IS_HOLE(bp) \ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) /* BP_IS_RAIDZ(bp) assumes no block compression */ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) #define BP_ZERO(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ (bp)->blk_dva[1].dva_word[0] = 0; \ (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ (bp)->blk_phys_birth = 0; \ (bp)->blk_birth = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } #ifdef _BIG_ENDIAN #define ZFS_HOST_BYTEORDER (0ULL) #else #define ZFS_HOST_BYTEORDER (1ULL) #endif #define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) #define BP_SPRINTF_LEN 400 /* * This macro allows code sharing between zfs, libzpool, and mdb. * 'func' is either snprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
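*/

/*
 * Illustrative usage (sketch): most callers go through snprintf_blkptr()
 * (declared near the end of this header), which expands the macro below;
 * BP_SPRINTF_LEN sizes the buffer. Assumes a userland context that has
 * pulled in <stdio.h> for the final print.
 */
static inline void
print_blkptr(const blkptr_t *bp)
{
        char buf[BP_SPRINTF_LEN];

        snprintf_blkptr(buf, sizeof (buf), bp);
        (void) printf("%s\n", buf);
}

/* The shared formatting macro: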
*/ #define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *copyname[] = \ { "zero", "single", "double", "triple" }; \ int len = 0; \ int copies = 0; \ const char *crypt_type; \ if (bp != NULL) { \ if (BP_IS_ENCRYPTED(bp)) { \ crypt_type = "encrypted"; \ /* LINTED E_SUSPICIOUS_COMPARISON */ \ } else if (BP_IS_AUTHENTICATED(bp)) { \ crypt_type = "authenticated"; \ } else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { \ crypt_type = "indirect-MAC"; \ } else { \ crypt_type = "unencrypted"; \ } \ } \ if (bp == NULL) { \ len += func(buf + len, size - len, ""); \ } else if (BP_IS_HOLE(bp)) { \ len += func(buf + len, size - len, \ "HOLE [L%llu %s] " \ "size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else if (BP_IS_EMBEDDED(bp)) { \ len = func(buf + len, size - len, \ "EMBEDDED [L%llu %s] et=%u %s " \ "size=%llxL/%llxP birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (int)BPE_GET_ETYPE(bp), \ compress, \ (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ if (DVA_IS_VALID(dva)) \ copies++; \ len += func(buf + len, size - len, \ "DVA[%d]=<%llu:%llx:%llx>%c", d, \ (u_longlong_t)DVA_GET_VDEV(dva), \ (u_longlong_t)DVA_GET_OFFSET(dva), \ (u_longlong_t)DVA_GET_ASIZE(dva), \ ws); \ } \ if (BP_IS_ENCRYPTED(bp)) { \ len += func(buf + len, size - len, \ "salt=%llx iv=%llx:%llx%c", \ (u_longlong_t)bp->blk_dva[2].dva_word[0], \ (u_longlong_t)bp->blk_dva[2].dva_word[1], \ (u_longlong_t)BP_GET_IV2(bp), \ ws); \ } \ if (BP_IS_GANG(bp) && \ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ copies--; \ len += func(buf + len, size - len, \ "[L%llu %s] %s %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ "cksum=%llx:%llx:%llx:%llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \ compress, \ crypt_type, \ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ BP_IS_GANG(bp) ? "gang" : "contiguous", \ BP_GET_DEDUP(bp) ? "dedup" : "unique", \ copyname[copies], \ ws, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth, \ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ (u_longlong_t)bp->blk_cksum.zc_word[1], \ (u_longlong_t)bp->blk_cksum.zc_word[2], \ (u_longlong_t)bp->blk_cksum.zc_word[3]); \ } \ ASSERT(len < size); \ } #define BP_GET_BUFC_TYPE(bp) \ (BP_IS_METADATA(bp) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, SPA_IMPORT_ASSEMBLE } spa_import_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_open_rewind(const char *pool, spa_t **, void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, struct dsl_crypto_params *dcp); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); extern int spa_checkpoint(const char *pool); extern int spa_checkpoint_discard(const char *pool); extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce); extern int spa_reset(char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 #define SPA_ASYNC_PROBE 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 /* * Controls the behavior of spa_vdev_remove(). */ #define SPA_REMOVE_UNSPARE 0x01 #define SPA_REMOVE_DONE 0x02 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); extern void spa_spare_remove(vdev_t *vd); extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); extern void spa_spare_activate(vdev_t *vd); /* L2ARC state (which is global across all pools) */ extern void spa_l2cache_add(vdev_t *vd); extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); /* scanning */ extern int spa_scan(spa_t *spa, pool_scan_func_t func); extern int spa_scan_stop(spa_t *spa); extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); extern int zfs_sync_pass_deferred_free; /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; /* * SPA configuration functions in spa_config.c */ #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); extern void spa_config_load(void); 
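
/*
 * Illustrative usage (sketch): spa_open() and spa_close() implement
 * tag-based reference counting, so the same opaque tag must be passed to
 * both calls; in-tree code conventionally uses FTAG (the current function
 * name) as the tag. spa_close() is declared just below.
 */
static inline int
with_pool_ref(const char *name, void *tag)
{
        spa_t *spa;
        int err;

        if ((err = spa_open(name, &spa, tag)) != 0)
                return (err);
        /* ... operate on the pool while holding the reference ... */
        spa_close(spa, tag);
        return (0);
}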
extern nvlist_t *spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); /* * Miscellaneous SPA routines in spa_misc.c */ /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); extern void spa_async_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ #define SCL_ALLOC 0x08 #define SCL_ZIO 0x10 #define SCL_FREE 0x20 #define SCL_VDEV 0x40 #define SCL_LOCKS 7 #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) /* Historical pool statistics */ typedef struct spa_history_kstat { kmutex_t lock; uint64_t count; uint64_t size; kstat_t *kstat; void *private; list_t list; } spa_history_kstat_t; typedef struct spa_history_list { uint64_t size; procfs_list_t procfs_list; } spa_history_list_t; typedef struct spa_stats { spa_history_list_t read_history; spa_history_list_t txg_history; spa_history_kstat_t tx_assign_histogram; spa_history_kstat_t io_history; spa_history_list_t mmp_history; spa_history_kstat_t state; /* pool state */ } spa_stats_t; typedef enum txg_state { TXG_STATE_BIRTH = 0, TXG_STATE_OPEN = 1, TXG_STATE_QUIESCED = 2, TXG_STATE_WAIT_FOR_SYNC = 3, TXG_STATE_SYNCED = 4, TXG_STATE_COMMITTED = 5, } txg_state_t; typedef struct txg_stat { vdev_stat_t vs1; vdev_stat_t vs2; uint64_t txg; uint64_t ndirty; } txg_stat_t; extern void spa_stats_init(spa_t *spa); extern void spa_stats_destroy(spa_t *spa); extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags); extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); extern int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time); extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t, struct dsl_pool *); extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *); extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id); extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, hrtime_t duration); extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id, int error); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); /* Log 
state */ typedef enum spa_log_state { SPA_LOG_UNKNOWN = 0, /* unknown log state */ SPA_LOG_MISSING, /* missing log(s) */ SPA_LOG_CLEAR, /* clear the log(s) */ SPA_LOG_GOOD, /* log(s) are good */ } spa_log_state_t; extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); extern int spa_reset_logs(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); extern void spa_deadman(void *); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern boolean_t spa_is_initializing(spa_t *spa); extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); extern int spa_sync_pass(spa_t *spa); extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_load_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); extern void spa_evicting_os_register(spa_t *, objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern uint64_t spa_get_failmode(spa_t *spa); extern uint64_t spa_get_deadman_failmode(spa_t *spa); extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode); extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); extern uint64_t spa_deadman_ziotime(spa_t *spa); extern uint64_t spa_dirty_data(spa_t *spa); /* Miscellaneous support routines */ extern void spa_load_failed(spa_t *spa, const char *fmt, ...); extern void spa_load_note(spa_t *spa, const char *fmt, ...); extern void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx); extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); extern uint64_t spa_generate_guid(spa_t *spa); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void 
spa_freeze(spa_t *spa); extern int spa_change_guid(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); extern boolean_t spa_has_checkpoint(spa_t *spa); extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); extern boolean_t spa_suspend_async_destroy(spa_t *spa); extern uint64_t spa_min_claim_txg(spa_t *spa); extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp); typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, void *arg); extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg); extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); extern unsigned long spa_get_hostid(void); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern int spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); extern char *spa_his_ievent_table[]; extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf); extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); extern void spa_history_log_version(spa_t *spa, const char *operation, dmu_tx_t *tx); extern void spa_history_log_internal(spa_t *spa, const char *operation, dmu_tx_t *tx, const char *fmt, ...); extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, dmu_tx_t *tx, const char *fmt, ...); extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...); extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); -extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, +extern int zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t length); +extern boolean_t zfs_ereport_is_valid(const char *class, spa_t *spa, vdev_t *vd, + zio_t *zio); extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); 
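/*
 * Note on the interface change above: zfs_ereport_post() now returns an
 * int so callers can tell whether an ereport was actually posted (it may
 * be dropped, for example by rate limiting), and zfs_ereport_is_valid()
 * lets a caller skip building the ereport entirely when nothing would be
 * emitted.  A hypothetical caller sketch (not the in-tree call site; the
 * return-value convention is defined with the implementation in zfs_fm.c,
 * and FM_EREPORT_ZFS_DELAY comes from sys/fm/fs/zfs.h):
 *
 *	if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, spa, vd, zio)) {
 *		posted = zfs_ereport_post(FM_EREPORT_ZFS_DELAY, spa, vd,
 *		    &zio->io_bookmark, zio, 0, 0);
 *	}
 */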
extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); /* vdev cache */ extern void vdev_cache_stat_init(void); extern void vdev_cache_stat_fini(void); /* vdev mirror */ extern void vdev_mirror_stat_init(void); extern void vdev_mirror_stat_fini(void); /* Initialization and termination */ extern void spa_init(int flags); extern void spa_fini(void); extern void spa_boot_init(void); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, const char *name); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ _NOTE(CONSTCOND) } while (0) #else #define dprintf_bp(bp, fmt, ...) #endif extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ extern int zfs_deadman_enabled; extern unsigned long zfs_deadman_synctime_ms; extern unsigned long zfs_deadman_ziotime_ms; extern unsigned long zfs_deadman_checktime_ms; #ifdef __cplusplus } #endif #endif /* _SYS_SPA_H */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 3220066494db..4b7ad3e227e3 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -1,685 +1,680 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome */ #ifndef _ZIO_H #define _ZIO_H #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Embedded checksum */ #define ZEC_MAGIC 0x210da7ab10c7a11ULL typedef struct zio_eck { uint64_t zec_magic; /* for validation, endianness */ zio_cksum_t zec_cksum; /* 256-bit checksum */ } zio_eck_t; /* * Gang block headers are self-checksumming and contain an array * of block pointers. 
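 *
 * As a worked check of the macros that follow (assuming the usual
 * 512-byte SPA_MINBLOCKSIZE, a 128-byte blkptr_t, and a 40-byte
 * zio_eck_t -- an 8-byte zec_magic plus a 4 x 8-byte zec_cksum):
 *
 *	SPA_GBH_NBLKPTRS = (512 - 40) / 128       = 3 block pointers
 *	SPA_GBH_FILLER   = (512 - 40 - 3*128) / 8 = 11 filler words
 *
 * so zg_blkptr[3] + zg_filler[11] + zg_tail exactly fill the 512-byte
 * gang header.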
*/ #define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE #define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t)) / sizeof (blkptr_t)) #define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t) - \ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) / \ sizeof (uint64_t)) typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; zio_eck_t zg_tail; } zio_gbh_phys_t; enum zio_checksum { ZIO_CHECKSUM_INHERIT = 0, ZIO_CHECKSUM_ON, ZIO_CHECKSUM_OFF, ZIO_CHECKSUM_LABEL, ZIO_CHECKSUM_GANG_HEADER, ZIO_CHECKSUM_ZILOG, ZIO_CHECKSUM_FLETCHER_2, ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_NOPARITY, ZIO_CHECKSUM_SHA512, ZIO_CHECKSUM_SKEIN, ZIO_CHECKSUM_EDONR, ZIO_CHECKSUM_FUNCTIONS }; /* * The number of "legacy" checksum functions which can be set on individual * objects. */ #define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2 #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON #define ZIO_CHECKSUM_MASK 0xffULL #define ZIO_CHECKSUM_VERIFY (1 << 8) #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 #define ZIO_DEDUPDITTO_MIN 100 /* supported encryption algorithms */ enum zio_encrypt { ZIO_CRYPT_INHERIT = 0, ZIO_CRYPT_ON, ZIO_CRYPT_OFF, ZIO_CRYPT_AES_128_CCM, ZIO_CRYPT_AES_192_CCM, ZIO_CRYPT_AES_256_CCM, ZIO_CRYPT_AES_128_GCM, ZIO_CRYPT_AES_192_GCM, ZIO_CRYPT_AES_256_GCM, ZIO_CRYPT_FUNCTIONS }; #define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_CCM #define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF /* macros defining encryption lengths */ #define ZIO_OBJSET_MAC_LEN 32 #define ZIO_DATA_IV_LEN 12 #define ZIO_DATA_SALT_LEN 8 #define ZIO_DATA_MAC_LEN 16 /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4 /* * The meaning of "compress = on" is selected by the compression features * enabled on a given pool. */ #define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4 #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF #define BOOTFS_COMPRESS_VALID(compress) \ ((compress) == ZIO_COMPRESS_LZJB || \ (compress) == ZIO_COMPRESS_LZ4 || \ (compress) == ZIO_COMPRESS_GZIP_1 || \ (compress) == ZIO_COMPRESS_GZIP_2 || \ (compress) == ZIO_COMPRESS_GZIP_3 || \ (compress) == ZIO_COMPRESS_GZIP_4 || \ (compress) == ZIO_COMPRESS_GZIP_5 || \ (compress) == ZIO_COMPRESS_GZIP_6 || \ (compress) == ZIO_COMPRESS_GZIP_7 || \ (compress) == ZIO_COMPRESS_GZIP_8 || \ (compress) == ZIO_COMPRESS_GZIP_9 || \ (compress) == ZIO_COMPRESS_ZLE || \ (compress) == ZIO_COMPRESS_ON || \ (compress) == ZIO_COMPRESS_OFF) -/* - * Default Linux timeout for a sd device. - */ -#define ZIO_DELAY_MAX (30 * MILLISEC) - #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 typedef enum zio_suspend_reason { ZIO_SUSPEND_NONE = 0, ZIO_SUSPEND_IOERR, ZIO_SUSPEND_MMP, } zio_suspend_reason_t; enum zio_flag { /* * Flags inherited by gang, ddt, and vdev children, * and that must be equal for two zios to aggregate */ ZIO_FLAG_DONT_AGGREGATE = 1 << 0, ZIO_FLAG_IO_REPAIR = 1 << 1, ZIO_FLAG_SELF_HEAL = 1 << 2, ZIO_FLAG_RESILVER = 1 << 3, ZIO_FLAG_SCRUB = 1 << 4, ZIO_FLAG_SCAN_THREAD = 1 << 5, ZIO_FLAG_PHYSICAL = 1 << 6, #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children.
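 *
 * The "must be first for INHERIT" markers below matter because each
 * INHERIT mask is defined as (first-non-inherited-flag - 1).  For
 * example, with ZIO_FLAG_CANFAIL at 1 << 7, ZIO_FLAG_AGG_INHERIT is
 * (1 << 7) - 1 = 0x7f, covering exactly the seven flags declared above.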
*/ ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */ ZIO_FLAG_SPECULATIVE = 1 << 8, ZIO_FLAG_CONFIG_WRITER = 1 << 9, ZIO_FLAG_DONT_RETRY = 1 << 10, ZIO_FLAG_DONT_CACHE = 1 << 11, ZIO_FLAG_NODATA = 1 << 12, ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, ZIO_FLAG_IO_ALLOCATING = 1 << 14, #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) /* * Flags inherited by vdev children. */ ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */ ZIO_FLAG_PROBE = 1 << 16, ZIO_FLAG_TRYHARD = 1 << 17, ZIO_FLAG_OPTIONAL = 1 << 18, #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */ ZIO_FLAG_DONT_PROPAGATE = 1 << 20, ZIO_FLAG_IO_BYPASS = 1 << 21, ZIO_FLAG_IO_REWRITE = 1 << 22, ZIO_FLAG_RAW_COMPRESS = 1 << 23, ZIO_FLAG_RAW_ENCRYPT = 1 << 24, ZIO_FLAG_GANG_CHILD = 1 << 25, ZIO_FLAG_DDT_CHILD = 1 << 26, ZIO_FLAG_GODFATHER = 1 << 27, ZIO_FLAG_NOPWRITE = 1 << 28, ZIO_FLAG_REEXECUTED = 1 << 29, ZIO_FLAG_DELEGATED = 1 << 30, ZIO_FLAG_FASTWRITE = 1 << 31, }; #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) #define ZIO_DDT_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_GANG_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_VDEV_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL) #define ZIO_CHILD_BIT(x) (1 << (x)) #define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x))) enum zio_child { ZIO_CHILD_VDEV = 0, ZIO_CHILD_GANG, ZIO_CHILD_DDT, ZIO_CHILD_LOGICAL, ZIO_CHILD_TYPES }; #define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV) #define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG) #define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT) #define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL) #define ZIO_CHILD_ALL_BITS \ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT) enum zio_wait_type { ZIO_WAIT_READY = 0, ZIO_WAIT_DONE, ZIO_WAIT_TYPES }; /* * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent * graveyard) to indicate checksum errors and fragmentation. */ #define ECKSUM EBADE #define EFRAGS EBADR /* Similar for ENOACTIVE */ #define ENOTACTIVE ENOANO typedef void zio_done_func_t(zio_t *zio); extern int zio_dva_throttle_enabled; extern const char *zio_type_name[ZIO_TYPES]; /* * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely * identifies any block in the pool. By convention, the meta-objset (MOS) * is objset 0, and the meta-dnode is object 0. This covers all blocks * except root blocks and ZIL blocks, which are defined as follows: * * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>. * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>. * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>. * dnode visit bookmarks are <objset, object id of dnode, -3, slot id>. * * Note: this structure is called a bookmark because its original purpose * was to remember where to resume a pool-wide traverse. * * Note: this structure is passed between userland and the kernel, and is * stored on disk (by virtue of being incorporated into other on-disk * structures, e.g. dsl_scan_phys_t).
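 *
 * For example, using the SET_BOOKMARK() macro defined below, the level-0
 * block with blkid 37 of object 4 in objset 21 -- the tuple
 * <21, 4, 0, 37> -- would be captured as:
 *
 *	zbookmark_phys_t zb;
 *	SET_BOOKMARK(&zb, 21, 4, 0, 37);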
*/ struct zbookmark_phys { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; }; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ (zb)->zb_objset = objset; \ (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ } #define ZB_DESTROYED_OBJSET (-1ULL) #define ZB_ROOT_OBJECT (0ULL) #define ZB_ROOT_LEVEL (-1LL) #define ZB_ROOT_BLKID (0ULL) #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) #define ZB_DNODE_LEVEL (-3LL) #define ZB_DNODE_BLKID (0ULL) #define ZB_IS_ZERO(zb) \ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ (zb)->zb_level == 0 && (zb)->zb_blkid == 0) #define ZB_IS_ROOT(zb) \ ((zb)->zb_object == ZB_ROOT_OBJECT && \ (zb)->zb_level == ZB_ROOT_LEVEL && \ (zb)->zb_blkid == ZB_ROOT_BLKID) typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; boolean_t zp_encrypt; boolean_t zp_byteorder; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; uint32_t zp_zpl_smallblk; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, const abd_t *good_data); typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ struct dnode_phys; struct abd; struct zio_cksum_report { struct zio_cksum_report *zcr_next; nvlist_t *zcr_ereport; nvlist_t *zcr_detector; void *zcr_cbdata; size_t zcr_cbinfo; /* passed to zcr_free() */ uint64_t zcr_align; uint64_t zcr_length; zio_cksum_finish_f *zcr_finish; zio_cksum_free_f *zcr_free; /* internal use only */ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ }; typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, void *arg); zio_vsd_cksum_report_f zio_vsd_default_cksum_report; typedef struct zio_vsd_ops { zio_done_func_t *vsd_free; zio_vsd_cksum_report_f *vsd_cksum_report; } zio_vsd_ops_t; typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, zio_gang_node_t *gn, struct abd *data, uint64_t offset); typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); typedef struct zio_transform { struct abd *zt_orig_abd; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; struct zio_transform *zt_next; } zio_transform_t; typedef zio_t *zio_pipe_stage_t(zio_t *zio); /* * The io_reexecute flags are distinct from io_flags because the child must * be able to propagate them to the parent. The normal io_flags are local * to the zio, not protected by any lock, and not modifiable by children; * the reexecute flags are protected by io_lock, modifiable by children, * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. 
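 *
 * Concretely (a hedged sketch of the contract, not the exact zio.c
 * code): when a child completes, its reexecute bits are merged into the
 * parent under the parent's io_lock, so they survive even when ordinary
 * flag propagation is suppressed:
 *
 *	mutex_enter(&pio->io_lock);
 *	pio->io_reexecute |= zio->io_reexecute;
 *	mutex_exit(&pio->io_lock);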
*/ #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 typedef struct zio_alloc_list { list_t zal_list; uint64_t zal_size; } zio_alloc_list_t; typedef struct zio_link { zio_t *zl_parent; zio_t *zl_child; list_node_t zl_parent_node; list_node_t zl_child_node; } zio_link_t; struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; zio_prop_t io_prop; zio_type_t io_type; enum zio_child io_child_type; int io_cmd; zio_priority_t io_priority; uint8_t io_reexecute; uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; blkptr_t *io_bp_override; blkptr_t io_bp_copy; list_t io_parent_list; list_t io_child_list; zio_t *io_logical; zio_transform_t *io_transform_stack; /* Callback info */ zio_done_func_t *io_ready; zio_done_func_t *io_children_ready; zio_done_func_t *io_physdone; zio_done_func_t *io_done; void *io_private; int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; /* io_lsize != io_orig_size iff this is a raw write */ uint64_t io_lsize; /* Data represented by this I/O */ struct abd *io_abd; struct abd *io_orig_abd; uint64_t io_size; uint64_t io_orig_size; /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; hrtime_t io_target_timestamp; hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). */ avl_node_t io_queue_node; avl_node_t io_offset_node; avl_node_t io_alloc_node; zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ enum zio_flag io_flags; enum zio_stage io_stage; enum zio_stage io_pipeline; enum zio_flag io_orig_flags; enum zio_stage io_orig_stage; enum zio_stage io_orig_pipeline; enum zio_stage io_pipeline_trace; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; uint64_t io_child_count; uint64_t io_phys_children; uint64_t io_parent_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; kmutex_t io_lock; kcondvar_t io_cv; int io_allocator; /* FMA state */ zio_cksum_report_t *io_cksum_report; uint64_t io_ena; /* Taskq dispatching state */ taskq_ent_t io_tqent; }; extern int zio_bookmark_compare(const void *, const void *); extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite); extern void zio_free(spa_t *spa, uint64_t txg, const 
blkptr_t *bp); extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, enum zio_flag flags); extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); extern void zio_execute(zio_t *zio); extern void zio_interrupt(zio_t *zio); extern void zio_delay_init(zio_t *zio); extern void zio_delay_interrupt(zio_t *zio); extern void zio_deadman(zio_t *zio, char *tag); extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform); extern void zio_pop_transforms(zio_t *zio); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); extern void zio_change_priority(zio_t *pio, zio_priority_t priority); extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); extern enum zio_checksum zio_checksum_select(enum zio_checksum child, enum zio_checksum parent); extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, enum zio_checksum parent); extern enum zio_compress zio_compress_select(spa_t *spa, enum zio_compress child, enum zio_compress parent); extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); /* * Initial setup and teardown. 
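 *
 * zio_init() is invoked from spa_init() at module load to build the
 * kmem caches behind zio_buf_alloc()/zio_data_buf_alloc(); zio_fini()
 * is its teardown counterpart.  For context, a minimal synchronous read
 * through the interfaces above might look like this (a hedged sketch;
 * real callers such as the ARC pass done callbacks and more flags):
 *
 *	zio_t *zio = zio_read(NULL, spa, bp, abd, size, NULL, NULL,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb);
 *	int error = zio_wait(zio);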
*/ extern void zio_init(void); extern void zio_fini(void); /* * Fault injection */ struct zinject_record; extern uint32_t zio_injection_enabled; extern int zio_inject_fault(char *name, int flags, int *id, struct zinject_record *record); extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); extern int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, uint64_t type, int error); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions */ extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical); extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); /* If we have the good data in hand, this function can be used */ -extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, +extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, struct zio_bad_cksum *info); /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); /* zbookmark_phys functions */ boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); #ifdef __cplusplus } #endif #endif /* _ZIO_H */ diff --git a/man/man5/zfs-events.5 b/man/man5/zfs-events.5 index fc2a391f83fc..7e9bbedafdad 100644 --- a/man/man5/zfs-events.5 +++ b/man/man5/zfs-events.5 @@ -1,965 +1,965 @@ '\" te .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Portions Copyright 2018 by Richard Elling .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .TH ZFS-EVENTS 5 "Oct 24, 2018" .SH NAME zfs\-events \- Events created by the ZFS filesystem. .SH DESCRIPTION .sp .LP Description of the different events generated by the ZFS stack. .sp Most of these don't have any description. 
The events generated by ZFS have never been publicly documented. What is here is intended as a starting point to provide documentation for all possible events. .sp To view all events created since the loading of the ZFS infrastructure (i.e., "the module"), run .P .nf \fBzpool events\fR .fi .P to get a short list, and .P .nf \fBzpool events -v\fR .fi .P to get full details of the events and what information is available about them. .sp This man page lists the different subclasses that are issued in the case of an event. The full event name would be \fIereport.fs.zfs.SUBCLASS\fR, but we only list the last part here. .SS "EVENTS (SUBCLASS)" .sp .LP .sp .ne 2 .na \fBchecksum\fR .ad .RS 12n Issued when a checksum error has been detected. .RE .sp .ne 2 .na \fBio\fR .ad .RS 12n Issued when there is an I/O error in a vdev in the pool. .RE .sp .ne 2 .na \fBdata\fR .ad .RS 12n Issued when there have been data errors in the pool. .RE .sp .ne 2 .na \fBdeadman\fR .ad .RS 12n Issued when an I/O is determined to be "hung". This can be caused by lost completion events due to flaky hardware or drivers. See the \fBzfs_deadman_failmode\fR module option description for additional information regarding "hung" I/O detection and configuration. .RE .sp .ne 2 .na \fBdelay\fR .ad .RS 12n Issued when a completed I/O exceeds the maximum allowed time specified -by the \fBzio_delay_max\fR module option. This can be an indicator of -problems with the underlying storage device. +by the \fBzio_slow_io_ms\fR module option. This can be an indicator of +problems with the underlying storage device. The number of delay events is +ratelimited by the \fBzfs_slow_io_events_per_second\fR module parameter. .RE .sp .ne 2 .na \fBconfig.sync\fR .ad .RS 12n Issued every time a vdev change has been made to the pool. .RE .sp .ne 2 .na \fBzpool\fR .ad .RS 12n Issued when a pool cannot be imported. .RE .sp .ne 2 .na \fBzpool.destroy\fR .ad .RS 12n Issued when a pool is destroyed. .RE .sp .ne 2 .na \fBzpool.export\fR .ad .RS 12n Issued when a pool is exported. .RE .sp .ne 2 .na \fBzpool.import\fR .ad .RS 12n Issued when a pool is imported. .RE .sp .ne 2 .na \fBzpool.reguid\fR .ad .RS 12n Issued when a REGUID (a new unique identifier for the pool) has been generated. .RE .sp .ne 2 .na \fBvdev.unknown\fR .ad .RS 12n Issued when the vdev is unknown, such as when trying to clear device errors on a vdev that has failed or been removed from the system/pool and is no longer available. .RE .sp .ne 2 .na \fBvdev.open_failed\fR .ad .RS 12n Issued when a vdev could not be opened (for example, because it didn't exist). .RE .sp .ne 2 .na \fBvdev.corrupt_data\fR .ad .RS 12n Issued when corrupt data has been detected on a vdev. .RE .sp .ne 2 .na \fBvdev.no_replicas\fR .ad .RS 12n Issued when there are no more replicas to sustain the pool. This would lead to the pool being \fIDEGRADED\fR. .RE .sp .ne 2 .na \fBvdev.bad_guid_sum\fR .ad .RS 12n Issued when a missing device in the pool has been detected. .RE .sp .ne 2 .na \fBvdev.too_small\fR .ad .RS 12n Issued when the system (kernel) has removed a device, and ZFS notices that the device isn't there any more. This is usually followed by a \fBprobe_failure\fR event. .RE .sp .ne 2 .na \fBvdev.bad_label\fR .ad .RS 12n Issued when the label is readable but invalid. .RE .sp .ne 2 .na \fBvdev.bad_ashift\fR .ad .RS 12n Issued when the ashift alignment requirement has increased.
.RE .sp .ne 2 .na \fBvdev.remove\fR .ad .RS 12n Issued when a vdev is detached from a mirror (or a spare detached from a vdev where it has been used to replace a failed drive - this only works if the original drive has been re-added). .RE .sp .ne 2 .na \fBvdev.clear\fR .ad .RS 12n Issued when clearing device errors in a pool, such as running \fBzpool clear\fR on a device in the pool. .RE .sp .ne 2 .na \fBvdev.check\fR .ad .RS 12n Issued when a check to see if a given vdev could be opened is started. .RE .sp .ne 2 .na \fBvdev.spare\fR .ad .RS 12n Issued when a spare has kicked in to replace a failed device. .RE .sp .ne 2 .na \fBvdev.autoexpand\fR .ad .RS 12n Issued when a vdev can be automatically expanded. .RE .sp .ne 2 .na \fBio_failure\fR .ad .RS 12n Issued when there is an I/O failure in a vdev in the pool. .RE .sp .ne 2 .na \fBprobe_failure\fR .ad .RS 12n Issued when a probe fails on a vdev. This would occur if a vdev has been removed from the system outside of ZFS (such as when the kernel has removed the device). .RE .sp .ne 2 .na \fBlog_replay\fR .ad .RS 12n Issued when the intent log cannot be replayed. This can occur in the case of a missing or damaged log device. .RE .sp .ne 2 .na \fBresilver.start\fR .ad .RS 12n Issued when a resilver is started. .RE .sp .ne 2 .na \fBresilver.finish\fR .ad .RS 12n Issued when the running resilver has finished. .RE .sp .ne 2 .na \fBscrub.start\fR .ad .RS 12n Issued when a scrub is started on a pool. .RE .sp .ne 2 .na \fBscrub.finish\fR .ad .RS 12n Issued when a pool has finished scrubbing. .RE .sp .ne 2 .na \fBscrub.abort\fR .ad .RS 12n Issued when a scrub is aborted on a pool. .RE .sp .ne 2 .na \fBscrub.resume\fR .ad .RS 12n Issued when a scrub is resumed on a pool. .RE .sp .ne 2 .na \fBscrub.paused\fR .ad .RS 12n Issued when a scrub is paused on a pool. .RE .sp .ne 2 .na \fBbootfs.vdev.attach\fR .ad .RS 12n .RE .SS "PAYLOADS" .sp .LP This is the payload (data, information) that accompanies an event. .sp For .BR zed (8), these are set to uppercase and prefixed with \fBZEVENT_\fR. .sp .ne 2 .na \fBpool\fR .ad .RS 12n Pool name. .RE .sp .ne 2 .na \fBpool_failmode\fR .ad .RS 12n Failmode - \fBwait\fR, \fBcontinue\fR or \fBpanic\fR. See .BR zpool (8) (\fIfailmode\fR property) for more information. .RE .sp .ne 2 .na \fBpool_guid\fR .ad .RS 12n The GUID of the pool. .RE .sp .ne 2 .na \fBpool_context\fR .ad .RS 12n The load state for the pool (0=none, 1=open, 2=import, 3=tryimport, 4=recover, 5=error). .RE .sp .ne 2 .na \fBvdev_guid\fR .ad .RS 12n The GUID of the vdev in question (the vdev failing or operated upon with \fBzpool clear\fR, etc.). .RE .sp .ne 2 .na \fBvdev_type\fR .ad .RS 12n Type of vdev - \fBdisk\fR, \fBfile\fR, \fBmirror\fR, etc. See .BR zpool (8) under \fBVirtual Devices\fR for more information on possible values. .RE .sp .ne 2 .na \fBvdev_path\fR .ad .RS 12n Full path of the vdev, including any \fI-partX\fR. .RE .sp .ne 2 .na \fBvdev_devid\fR .ad .RS 12n ID of vdev (if any). .RE .sp .ne 2 .na \fBvdev_fru\fR .ad .RS 12n Physical FRU location. .RE .sp .ne 2 .na \fBvdev_state\fR .ad .RS 12n State of vdev (0=uninitialized, 1=closed, 2=offline, 3=removed, 4=failed to open, 5=faulted, 6=degraded, 7=healthy). .RE .sp .ne 2 .na \fBvdev_ashift\fR .ad .RS 12n The ashift value of the vdev. .RE .sp .ne 2 .na \fBvdev_complete_ts\fR .ad .RS 12n The time the last I/O completed for the specified vdev. .RE .sp .ne 2 .na \fBvdev_delta_ts\fR .ad .RS 12n The time since the last I/O completed for the specified vdev.
.RE .sp .ne 2 .na \fBvdev_spare_paths\fR .ad .RS 12n List of spares, including full path and any \fI-partX\fR. .RE .sp .ne 2 .na \fBvdev_spare_guids\fR .ad .RS 12n GUID(s) of spares. .RE .sp .ne 2 .na \fBvdev_read_errors\fR .ad .RS 12n The number of read errors that have been detected on the vdev. .RE .sp .ne 2 .na \fBvdev_write_errors\fR .ad .RS 12n The number of write errors that have been detected on the vdev. .RE .sp .ne 2 .na \fBvdev_cksum_errors\fR .ad .RS 12n The number of checksum errors that have been detected on the vdev. .RE .sp .ne 2 .na \fBparent_guid\fR .ad .RS 12n GUID of the vdev parent. .RE .sp .ne 2 .na \fBparent_type\fR .ad .RS 12n Type of parent. See \fBvdev_type\fR. .RE .sp .ne 2 .na \fBparent_path\fR .ad .RS 12n Path of the vdev parent (if any). .RE .sp .ne 2 .na \fBparent_devid\fR .ad .RS 12n ID of the vdev parent (if any). .RE .sp .ne 2 .na \fBzio_objset\fR .ad .RS 12n The object set number for a given I/O. .RE .sp .ne 2 .na \fBzio_object\fR .ad .RS 12n The object number for a given I/O. .RE .sp .ne 2 .na \fBzio_level\fR .ad .RS 12n The indirect level for the block. Level 0 is the lowest level and includes data blocks. Values > 0 indicate metadata blocks at the appropriate level. .RE .sp .ne 2 .na \fBzio_blkid\fR .ad .RS 12n The block ID for a given I/O. .RE .sp .ne 2 .na \fBzio_err\fR .ad .RS 12n The errno for a failure when handling a given I/O. The errno is compatible with \fBerrno\fR(3) with the value for EBADE (0x34) used to indicate a ZFS checksum error. .RE .sp .ne 2 .na \fBzio_offset\fR .ad .RS 12n The offset in bytes of where to write the I/O for the specified vdev. .RE .sp .ne 2 .na \fBzio_size\fR .ad .RS 12n The size in bytes of the I/O. .RE .sp .ne 2 .na \fBzio_flags\fR .ad .RS 12n The current flags describing how the I/O should be handled. See the \fBI/O FLAGS\fR section for the full list of I/O flags. .RE .sp .ne 2 .na \fBzio_stage\fR .ad .RS 12n The current stage of the I/O in the pipeline. See the \fBI/O STAGES\fR section for a full list of all the I/O stages. .RE .sp .ne 2 .na \fBzio_pipeline\fR .ad .RS 12n The valid pipeline stages for the I/O. See the \fBI/O STAGES\fR section for a full list of all the I/O stages. .RE .sp .ne 2 .na \fBzio_delay\fR .ad .RS 12n -The time in ticks (HZ) required for the block layer to service the I/O. Unlike -\fBzio_delta\fR this does not include any vdev queuing time and is therefore -solely a measure of the block layer performance. On most modern Linux systems -HZ is defined as 1000 making a tick equivalent to 1 millisecond. +The time elapsed (in nanoseconds) waiting for the block layer to complete the +I/O. Unlike \fBzio_delta\fR this does not include any vdev queuing time and is +therefore solely a measure of the block layer performance. .RE .sp .ne 2 .na \fBzio_timestamp\fR .ad .RS 12n The time when a given I/O was submitted. .RE .sp .ne 2 .na \fBzio_delta\fR .ad .RS 12n The time required to service a given I/O. .RE .sp .ne 2 .na \fBprev_state\fR .ad .RS 12n The previous state of the vdev. .RE .sp .ne 2 .na \fBcksum_expected\fR .ad .RS 12n The expected checksum value for the block. .RE .sp .ne 2 .na \fBcksum_actual\fR .ad .RS 12n The actual checksum value for an errant block. .RE .sp .ne 2 .na \fBcksum_algorithm\fR .ad .RS 12n Checksum algorithm used. See \fBzfs\fR(8) for more information on the available checksum algorithms. .RE .sp .ne 2 .na \fBcksum_byteswap\fR .ad .RS 12n Whether or not the data is byteswapped. .RE .sp .ne 2 .na \fBbad_ranges\fR .ad .RS 12n [start, end) pairs of corruption offsets.
Offsets are always aligned on a 64-bit boundary, and can include some gaps of non-corruption. (See \fBbad_ranges_min_gap\fR) .RE .sp .ne 2 .na \fBbad_ranges_min_gap\fR .ad .RS 12n In order to bound the size of the \fBbad_ranges\fR array, gaps of non-corruption less than or equal to \fBbad_ranges_min_gap\fR bytes have been merged with adjacent corruption. Always at least 8 bytes, since corruption is detected on a 64-bit word basis. .RE .sp .ne 2 .na \fBbad_range_sets\fR .ad .RS 12n This array has one element per range in \fBbad_ranges\fR. Each element contains the count of bits in that range which were clear in the good data and set in the bad data. .RE .sp .ne 2 .na \fBbad_range_clears\fR .ad .RS 12n This array has one element per range in \fBbad_ranges\fR. Each element contains the count of bits for that range which were set in the good data and clear in the bad data. .RE .sp .ne 2 .na \fBbad_set_bits\fR .ad .RS 12n If this field exists, it is an array of: (bad data & ~(good data)); that is, the bits set in the bad data which are cleared in the good data. Each element corresponds to a byte whose offset is in a range in \fBbad_ranges\fR, and the array is ordered by offset. Thus, the first element is the first byte in the first \fBbad_ranges\fR range, and the last element is the last byte in the last \fBbad_ranges\fR range. .RE .sp .ne 2 .na \fBbad_cleared_bits\fR .ad .RS 12n Like \fBbad_set_bits\fR, but contains: (good data & ~(bad data)); that is, the bits set in the good data which are cleared in the bad data. .RE .sp .ne 2 .na \fBbad_set_histogram\fR .ad .RS 12n If this field exists, it is an array of counters. Each entry counts bits set in a particular bit of a big-endian uint64 type. The first entry counts bits set in the high-order bit of the first byte, the 9th byte, etc., and the last entry counts bits set in the low-order bit of the 8th byte, the 16th byte, etc. This information is useful for observing a stuck bit in a parallel data path, such as IDE or parallel SCSI. .RE .sp .ne 2 .na \fBbad_cleared_histogram\fR .ad .RS 12n If this field exists, it is an array of counters. Each entry counts bit clears in a particular bit of a big-endian uint64 type. The first entry counts bit clears of the high-order bit of the first byte, the 9th byte, etc., and the last entry counts clears of the low-order bit of the 8th byte, the 16th byte, etc. This information is useful for observing a stuck bit in a parallel data path, such as IDE or parallel SCSI. .RE .SS "I/O STAGES" .sp .LP The ZFS I/O pipeline comprises various stages which are defined below. The individual stages are used to construct these basic I/O operations: Read, Write, Free, Claim, and Ioctl. These stages may be set on an event to describe the life cycle of a given I/O. .TS tab(:); l l l .
Stage:Bit Mask:Operations _:_:_ ZIO_STAGE_OPEN:0x00000001:RWFCI ZIO_STAGE_READ_BP_INIT:0x00000002:R---- ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W--- ZIO_STAGE_FREE_BP_INIT:0x00000008:--F-- ZIO_STAGE_ISSUE_ASYNC:0x00000010:RWF-- ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W--- ZIO_STAGE_ENCRYPT:0x00000040:-W--- ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W--- ZIO_STAGE_NOP_WRITE:0x00000100:-W--- ZIO_STAGE_DDT_READ_START:0x00000200:R---- ZIO_STAGE_DDT_READ_DONE:0x00000400:R---- ZIO_STAGE_DDT_WRITE:0x00000800:-W--- ZIO_STAGE_DDT_FREE:0x00001000:--F-- ZIO_STAGE_GANG_ASSEMBLE:0x00002000:RWFC- ZIO_STAGE_GANG_ISSUE:0x00004000:RWFC- ZIO_STAGE_DVA_THROTTLE:0x00008000:-W--- ZIO_STAGE_DVA_ALLOCATE:0x00010000:-W--- ZIO_STAGE_DVA_FREE:0x00020000:--F-- ZIO_STAGE_DVA_CLAIM:0x00040000:---C- ZIO_STAGE_READY:0x00080000:RWFCI ZIO_STAGE_VDEV_IO_START:0x00100000:RW--I ZIO_STAGE_VDEV_IO_DONE:0x00200000:RW--I ZIO_STAGE_VDEV_IO_ASSESS:0x00400000:RW--I ZIO_STAGE_CHECKSUM_VERIFY:0x00800000:R---- ZIO_STAGE_DONE:0x01000000:RWFCI .TE .SS "I/O FLAGS" .sp .LP Every I/O in the pipeline contains a set of flags which describe its function and are used to govern its behavior. These flags will be set in an event as a \fBzio_flags\fR payload entry. .TS tab(:); l l . Flag:Bit Mask _:_ ZIO_FLAG_DONT_AGGREGATE:0x00000001 ZIO_FLAG_IO_REPAIR:0x00000002 ZIO_FLAG_SELF_HEAL:0x00000004 ZIO_FLAG_RESILVER:0x00000008 ZIO_FLAG_SCRUB:0x00000010 ZIO_FLAG_SCAN_THREAD:0x00000020 ZIO_FLAG_PHYSICAL:0x00000040 ZIO_FLAG_CANFAIL:0x00000080 ZIO_FLAG_SPECULATIVE:0x00000100 ZIO_FLAG_CONFIG_WRITER:0x00000200 ZIO_FLAG_DONT_RETRY:0x00000400 ZIO_FLAG_DONT_CACHE:0x00000800 ZIO_FLAG_NODATA:0x00001000 ZIO_FLAG_INDUCE_DAMAGE:0x00002000 ZIO_FLAG_IO_ALLOCATING:0x00004000 ZIO_FLAG_IO_RETRY:0x00008000 ZIO_FLAG_PROBE:0x00010000 ZIO_FLAG_TRYHARD:0x00020000 ZIO_FLAG_OPTIONAL:0x00040000 ZIO_FLAG_DONT_QUEUE:0x00080000 ZIO_FLAG_DONT_PROPAGATE:0x00100000 ZIO_FLAG_IO_BYPASS:0x00200000 ZIO_FLAG_IO_REWRITE:0x00400000 ZIO_FLAG_RAW_COMPRESS:0x00800000 ZIO_FLAG_RAW_ENCRYPT:0x01000000 ZIO_FLAG_GANG_CHILD:0x02000000 ZIO_FLAG_DDT_CHILD:0x04000000 ZIO_FLAG_GODFATHER:0x08000000 ZIO_FLAG_NOPWRITE:0x10000000 ZIO_FLAG_REEXECUTED:0x20000000 ZIO_FLAG_DELEGATED:0x40000000 ZIO_FLAG_FASTWRITE:0x80000000 .TE diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index d7b53d161789..229484afc34b 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1,2890 +1,2891 @@ '\" te .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 by Delphix. All rights reserved. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE.
If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .TH ZFS-MODULE-PARAMETERS 5 "Oct 28, 2017" .SH NAME zfs\-module\-parameters \- ZFS module parameters .SH DESCRIPTION .sp .LP Description of the different parameters to the ZFS module. .SS "Module parameters" .sp .LP .sp .ne 2 .na \fBdbuf_cache_max_bytes\fR (ulong) .ad .RS 12n Maximum size in bytes of the dbuf cache. When \fB0\fR this value will default to \fB1/2^dbuf_cache_shift\fR (1/32) of the target ARC size, otherwise the provided value in bytes will be used. The behavior of the dbuf cache and its associated settings can be observed via the \fB/proc/spl/kstat/zfs/dbufstats\fR kstat. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBdbuf_metadata_cache_max_bytes\fR (ulong) .ad .RS 12n Maximum size in bytes of the metadata dbuf cache. When \fB0\fR this value will default to \fB1/2^dbuf_metadata_cache_shift\fR (1/64) of the target ARC size, otherwise the provided value in bytes will be used. The behavior of the metadata dbuf cache and its associated settings can be observed via the \fB/proc/spl/kstat/zfs/dbufstats\fR kstat. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBdbuf_cache_hiwater_pct\fR (uint) .ad .RS 12n The percentage over \fBdbuf_cache_max_bytes\fR when dbufs must be evicted directly. .sp Default value: \fB10\fR%. .RE .sp .ne 2 .na \fBdbuf_cache_lowater_pct\fR (uint) .ad .RS 12n The percentage below \fBdbuf_cache_max_bytes\fR when the evict thread stops evicting dbufs. .sp Default value: \fB10\fR%. .RE .sp .ne 2 .na \fBdbuf_cache_shift\fR (int) .ad .RS 12n Set the size of the dbuf cache, \fBdbuf_cache_max_bytes\fR, to a log2 fraction of the target arc size. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBdbuf_metadata_cache_shift\fR (int) .ad .RS 12n Set the size of the dbuf metadata cache, \fBdbuf_metadata_cache_max_bytes\fR, to a log2 fraction of the target arc size. .sp Default value: \fB6\fR. .RE .sp .ne 2 .na \fBignore_hole_birth\fR (int) .ad .RS 12n When set, the hole_birth optimization will not be used, and all holes will always be sent by \fBzfs send\fR. Useful if you suspect your datasets are affected by a bug in hole_birth. .sp Use \fB1\fR for on (default) and \fB0\fR for off. .RE .sp .ne 2 .na \fBl2arc_feed_again\fR (int) .ad .RS 12n Turbo L2ARC warm-up. When the L2ARC is cold the fill interval will be set as fast as possible. .sp Use \fB1\fR for yes (default) and \fB0\fR to disable. .RE .sp .ne 2 .na \fBl2arc_feed_min_ms\fR (ulong) .ad .RS 12n Minimum feed interval in milliseconds. Requires \fBl2arc_feed_again=1\fR and only applies when it is enabled. .sp Default value: \fB200\fR. .RE .sp .ne 2 .na \fBl2arc_feed_secs\fR (ulong) .ad .RS 12n Seconds between L2ARC writing. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBl2arc_headroom\fR (ulong) .ad .RS 12n How far through the ARC lists to search for L2ARC cacheable content, expressed as a multiplier of \fBl2arc_write_max\fR. .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBl2arc_headroom_boost\fR (ulong) .ad .RS 12n Scales \fBl2arc_headroom\fR by this percentage when L2ARC contents are being successfully compressed before writing. A value of 100 disables this feature. .sp Default value: \fB200\fR%. .RE .sp .ne 2 .na \fBl2arc_noprefetch\fR (int) .ad .RS 12n Do not write buffers to L2ARC if they were prefetched but not used by applications. .sp Use \fB1\fR for yes (default) and \fB0\fR to disable.
.RE .sp .ne 2 .na \fBl2arc_norw\fR (int) .ad .RS 12n No reads during writes. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBl2arc_write_boost\fR (ulong) .ad .RS 12n Cold L2ARC devices will have \fBl2arc_write_max\fR increased by this amount while they remain cold. .sp Default value: \fB8,388,608\fR. .RE .sp .ne 2 .na \fBl2arc_write_max\fR (ulong) .ad .RS 12n Max write bytes per interval. .sp Default value: \fB8,388,608\fR. .RE .sp .ne 2 .na \fBmetaslab_aliquot\fR (ulong) .ad .RS 12n Metaslab granularity, in bytes. This is roughly similar to what would be referred to as the "stripe size" in traditional RAID arrays. In normal operation, ZFS will try to write this amount of data to a top-level vdev before moving on to the next one. .sp Default value: \fB524,288\fR. .RE .sp .ne 2 .na \fBmetaslab_bias_enabled\fR (int) .ad .RS 12n Enable metaslab group biasing based on its vdev's over- or under-utilization relative to the pool. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBmetaslab_force_ganging\fR (ulong) .ad .RS 12n Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. .sp Default value: \fB16,777,217\fR. .RE .sp .ne 2 .na \fBzfs_metaslab_segment_weight_enabled\fR (int) .ad .RS 12n Enable/disable segment-based metaslab selection. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBzfs_metaslab_switch_threshold\fR (int) .ad .RS 12n When using segment-based metaslab selection, continue allocating from the active metaslab until \fBzfs_metaslab_switch_threshold\fR worth of buckets have been exhausted. .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBmetaslab_debug_load\fR (int) .ad .RS 12n Load all metaslabs during pool import. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBmetaslab_debug_unload\fR (int) .ad .RS 12n Prevent metaslabs from being unloaded. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBmetaslab_fragmentation_factor_enabled\fR (int) .ad .RS 12n Enable use of the fragmentation metric in computing metaslab weights. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBvdev_max_ms_count\fR (int) .ad .RS 12n When a vdev is added, target this number of metaslabs per top-level vdev. .sp Default value: \fB200\fR. .RE .sp .ne 2 .na \fBvdev_min_ms_count\fR (int) .ad .RS 12n Minimum number of metaslabs to create in a top-level vdev. .sp Default value: \fB16\fR. .RE .sp .ne 2 .na \fBvdev_ms_count_limit\fR (int) .ad .RS 12n Practical upper limit of total metaslabs per top-level vdev. .sp Default value: \fB131,072\fR. .RE .sp .ne 2 .na \fBmetaslab_preload_enabled\fR (int) .ad .RS 12n Enable metaslab group preloading. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBmetaslab_lba_weighting_enabled\fR (int) .ad .RS 12n Give more weight to metaslabs with lower LBAs, assuming they have greater bandwidth as is typically the case on a modern constant angular velocity disk drive. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBspa_config_path\fR (charp) .ad .RS 12n SPA config file. .sp Default value: \fB/etc/zfs/zpool.cache\fR. .RE .sp .ne 2 .na \fBspa_asize_inflation\fR (int) .ad .RS 12n Multiplication factor used to estimate actual disk consumption from the size of data being written. The default value is a worst case estimate, but lower values may be valid for a given pool depending on its configuration.
Pool administrators who understand the factors involved may wish to specify a more realistic inflation factor, particularly if they operate close to quota or capacity limits. .sp Default value: \fB24\fR. .RE .sp .ne 2 .na \fBspa_load_print_vdev_tree\fR (int) .ad .RS 12n Whether to print the vdev tree in the debugging message buffer during pool import. Use 0 to disable and 1 to enable. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBspa_load_verify_data\fR (int) .ad .RS 12n Whether to traverse data blocks during an "extreme rewind" (\fB-X\fR) import. Use 0 to disable and 1 to enable. An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is set to 0, the traversal skips non-metadata blocks. It can be toggled once the import has started to stop or start the traversal of non-metadata blocks. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBspa_load_verify_metadata\fR (int) .ad .RS 12n Whether to traverse blocks during an "extreme rewind" (\fB-X\fR) pool import. Use 0 to disable and 1 to enable. An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is set to 0, the traversal is not performed. It can be toggled once the import has started to stop or start the traversal. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBspa_load_verify_maxinflight\fR (int) .ad .RS 12n Maximum concurrent I/Os during the traversal performed during an "extreme rewind" (\fB-X\fR) pool import. .sp Default value: \fB10000\fR. .RE .sp .ne 2 .na \fBspa_slop_shift\fR (int) .ad .RS 12n Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in the pool to be consumed. This ensures that we don't run the pool completely out of space due to unaccounted changes (e.g. to the MOS). It also limits the worst-case time to allocate space. If we have less than this amount of free space, most ZPL operations (e.g. write, create) will return ENOSPC. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBvdev_removal_max_span\fR (int) .ad .RS 12n During top-level vdev removal, chunks of data are copied from the vdev which may include free space in order to trade bandwidth for IOPS. This parameter determines the maximum span of free space (in bytes) which will be included as "unnecessary" data in a chunk of copied data. The default value here was chosen to align with \fBzfs_vdev_read_gap_limit\fR, which is a similar concept when doing regular reads (but there's no reason it has to be the same). .sp Default value: \fB32,768\fR. .RE .sp .ne 2 .na \fBzfetch_array_rd_sz\fR (ulong) .ad .RS 12n If prefetching is enabled, disable prefetching for reads larger than this size. .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfetch_max_distance\fR (uint) .ad .RS 12n Max bytes to prefetch per stream (default 8MB). .sp Default value: \fB8,388,608\fR. .RE .sp .ne 2 .na \fBzfetch_max_streams\fR (uint) .ad .RS 12n Max number of streams per zfetch (prefetch streams per file). .sp Default value: \fB8\fR. .RE .sp .ne 2 .na \fBzfetch_min_sec_reap\fR (uint) .ad .RS 12n Minimum time before an active prefetch stream can be reclaimed. .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_arc_dnode_limit\fR (ulong) .ad .RS 12n When the number of bytes consumed by dnodes in the ARC exceeds this number of bytes, try to unpin some of it in response to demand for non-metadata.
This value acts as a ceiling to the amount of dnode metadata, and defaults to 0, which indicates that a percentage of the ARC meta buffers, based on \fBzfs_arc_dnode_limit_percent\fR, may be used for dnodes. See also \fBzfs_arc_meta_prune\fR which serves a similar purpose but is used when the amount of metadata in the ARC exceeds \fBzfs_arc_meta_limit\fR rather than in response to overall demand for non-metadata. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_dnode_limit_percent\fR (ulong) .ad .RS 12n Percentage that can be consumed by dnodes of ARC meta buffers. .sp See also \fBzfs_arc_dnode_limit\fR which serves a similar purpose but has a higher priority if set to nonzero value. .sp Default value: \fB10\fR%. .RE .sp .ne 2 .na \fBzfs_arc_dnode_reduce_percent\fR (ulong) .ad .RS 12n Percentage of ARC dnodes to try to scan in response to demand for non-metadata when the number of bytes consumed by dnodes exceeds \fBzfs_arc_dnode_limit\fR. .sp Default value: \fB10\fR% of the number of dnodes in the ARC. .RE .sp .ne 2 .na \fBzfs_arc_average_blocksize\fR (int) .ad .RS 12n The ARC's buffer hash table is sized based on the assumption of an average block size of \fBzfs_arc_average_blocksize\fR (default 8K). This works out to roughly 1MB of hash table per 1GB of physical memory with 8-byte pointers. For configurations with a known larger average block size this value can be increased to reduce the memory footprint. .sp Default value: \fB8192\fR. .RE .sp .ne 2 .na \fBzfs_arc_evict_batch_limit\fR (int) .ad .RS 12n Number of ARC headers to evict per sub-list before proceeding to another sub-list. This batch-style operation prevents entire sub-lists from being evicted at once but comes at a cost of additional unlocking and locking. .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_arc_grow_retry\fR (int) .ad .RS 12n If set to a non-zero value, it will replace the arc_grow_retry value with this value. The arc_grow_retry value (default 5) is the number of seconds the ARC will wait before trying to resume growth after a memory pressure event. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_lotsfree_percent\fR (int) .ad .RS 12n Throttle I/O when free system memory drops below this percentage of total system memory. Setting this value to 0 will disable the throttle. .sp Default value: \fB10\fR%. .RE .sp .ne 2 .na \fBzfs_arc_max\fR (ulong) .ad .RS 12n Maximum size of the ARC in bytes. If set to 0 then it will consume 1/2 of system RAM. This value must be at least 67108864 (64 megabytes). .sp This value can be changed dynamically with some caveats. It cannot be set back to 0 while running, and reducing it below the current ARC size will not cause the ARC to shrink without memory pressure to induce shrinking. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_meta_adjust_restarts\fR (ulong) .ad .RS 12n The number of restart passes to make while scanning the ARC attempting to free buffers in order to stay below the \fBzfs_arc_meta_limit\fR. This value should not need to be tuned but is available to facilitate performance analysis. .sp Default value: \fB4096\fR. .RE .sp .ne 2 .na \fBzfs_arc_meta_limit\fR (ulong) .ad .RS 12n The maximum size in bytes that meta data buffers are allowed to consume in the ARC. When this limit is reached meta data buffers will be reclaimed even if the overall arc_c_max has not been reached. This value defaults to 0, which indicates that a percentage of the ARC, based on \fBzfs_arc_meta_limit_percent\fR, may be used for meta data.
.sp This value may be changed dynamically, except that it cannot be set back to 0 for a specific percent of the ARC; it must be set to an explicit value. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_meta_limit_percent\fR (ulong) .ad .RS 12n Percentage of ARC buffers that can be used for meta data. See also \fBzfs_arc_meta_limit\fR which serves a similar purpose but has a higher priority if set to a nonzero value. .sp Default value: \fB75\fR%. .RE .sp .ne 2 .na \fBzfs_arc_meta_min\fR (ulong) .ad .RS 12n The minimum allowed size in bytes that meta data buffers may consume in the ARC. This value defaults to 0 which disables a floor on the amount of the ARC devoted to meta data. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_meta_prune\fR (int) .ad .RS 12n The number of dentries and inodes to be scanned looking for entries which can be dropped. This may be required when the ARC reaches the \fBzfs_arc_meta_limit\fR because dentries and inodes can pin buffers in the ARC. Increasing this value will cause the dentry and inode caches to be pruned more aggressively. Setting this value to 0 will disable pruning the inode and dentry caches. .sp Default value: \fB10,000\fR. .RE .sp .ne 2 .na \fBzfs_arc_meta_strategy\fR (int) .ad .RS 12n Define the strategy for ARC meta data buffer eviction (meta reclaim strategy). A value of 0 (META_ONLY) will evict only the ARC meta data buffers. A value of 1 (BALANCED) indicates that additional data buffers may be evicted if required in order to evict the required number of meta data buffers. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_arc_min\fR (ulong) .ad .RS 12n Minimum size of the ARC in bytes. If set to 0, arc_c_min will default to consuming the larger of 32M or 1/32 of total system memory. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_min_prefetch_ms\fR (int) .ad .RS 12n Minimum time prefetched blocks are locked in the ARC, specified in ms. A value of \fB0\fR will default to 1000 ms. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_min_prescient_prefetch_ms\fR (int) .ad .RS 12n Minimum time "prescient prefetched" blocks are locked in the ARC, specified in ms. These blocks are meant to be prefetched fairly aggressively ahead of the code that may use them. A value of \fB0\fR will default to 6000 ms. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_max_missing_tvds\fR (int) .ad .RS 12n Number of missing top-level vdevs which will be allowed during pool import (only in read-only mode). .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_multilist_num_sublists\fR (int) .ad .RS 12n To allow more fine-grained locking, each ARC state contains a series of lists for both data and meta data objects. Locking is performed at the level of these "sub-lists". This parameter controls the number of sub-lists per ARC state, and also applies to other uses of the multilist data structure. .sp Default value: \fB4\fR or the number of online CPUs, whichever is greater. .RE .sp .ne 2 .na \fBzfs_arc_overflow_shift\fR (int) .ad .RS 12n The ARC size is considered to be overflowing if it exceeds the current ARC target size (arc_c) by a threshold determined by this parameter. The threshold is calculated as a fraction of arc_c using the formula "arc_c >> \fBzfs_arc_overflow_shift\fR". The default value of 8 causes the ARC to be considered to be overflowing if it exceeds the target size by 1/256th (about 0.4%) of the target size.
When the ARC is overflowing, new buffer allocations are stalled until the reclaim thread catches up and the overflow condition no longer exists. .sp Default value: \fB8\fR. .RE .sp .ne 2 .na \fBzfs_arc_p_min_shift\fR (int) .ad .RS 12n If set to a non-zero value, this will update arc_p_min_shift (default 4) with the new value. arc_p_min_shift is used as a shift of arc_c when calculating both the minimum and maximum arc_p. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_p_dampener_disable\fR (int) .ad .RS 12n Disable the arc_p adapt dampener. .sp Use \fB1\fR for yes (default) and \fB0\fR to disable. .RE .sp .ne 2 .na \fBzfs_arc_shrink_shift\fR (int) .ad .RS 12n If set to a non-zero value, this will update arc_shrink_shift (default 7) with the new value. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_pc_percent\fR (uint) .ad .RS 12n Percent of pagecache to reclaim the ARC to. This tunable allows the ZFS ARC to play more nicely with the kernel's LRU pagecache. It can guarantee that the ARC size won't collapse under scanning pressure on the pagecache, yet still allows the ARC to be reclaimed down to zfs_arc_min if necessary. This value is specified as percent of pagecache size (as measured by NR_FILE_PAGES) where that percent may exceed 100. This only operates during memory pressure/reclaim. .sp Default value: \fB0\fR% (disabled). .RE .sp .ne 2 .na \fBzfs_arc_sys_free\fR (ulong) .ad .RS 12n The target number of bytes the ARC should leave as free memory on the system. Defaults to the larger of 1/64 of physical memory or 512K. Setting this option to a non-zero value will override the default. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_autoimport_disable\fR (int) .ad .RS 12n Disable pool import at module load by ignoring the cache file (typically \fB/etc/zfs/zpool.cache\fR). .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBzfs_checksums_per_second\fR (int) .ad .RS 12n Rate limit checksum events to this many per second. Note that this should not be set below the zed thresholds (currently 10 checksums over 10 sec) or else zed may not trigger any action. .sp Default value: \fB20\fR. .RE .sp .ne 2 .na \fBzfs_commit_timeout_pct\fR (int) .ad .RS 12n This controls the amount of time that a ZIL block (lwb) will remain "open" when it isn't "full", and it has a thread waiting for it to be committed to stable storage. The timeout is scaled based on a percentage of the last lwb latency to avoid significantly impacting the latency of each individual transaction record (itx). .sp Default value: \fB5\fR%. .RE .sp .ne 2 .na \fBzfs_condense_indirect_vdevs_enable\fR (int) .ad .RS 12n Enable condensing indirect vdev mappings. When set to a non-zero value, attempt to condense indirect vdev mappings if the mapping uses more than \fBzfs_condense_min_mapping_bytes\fR bytes of memory and if the obsolete space map object uses more than \fBzfs_condense_max_obsolete_bytes\fR bytes on-disk. The condensing process is an attempt to save memory by removing obsolete mappings. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_condense_max_obsolete_bytes\fR (ulong) .ad .RS 12n Only attempt to condense indirect vdev mappings if the on-disk size of the obsolete space map object is greater than this number of bytes (see \fBzfs_condense_indirect_vdevs_enable\fR). .sp Default value: \fB1,073,741,824\fR. .RE .sp .ne 2 .na \fBzfs_condense_min_mapping_bytes\fR (ulong) .ad .RS 12n Minimum size vdev mapping to attempt to condense (see \fBzfs_condense_indirect_vdevs_enable\fR). .sp Default value: \fB131,072\fR.
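.sp
Like the other tunables described in this page, the condense settings are exposed as module parameter files and on most systems can be read and changed at runtime. A minimal sketch, assuming the standard Linux sysfs layout and a parameter that is writable on the running build:
.sp
.nf
# cat /sys/module/zfs/parameters/zfs_condense_min_mapping_bytes
131072
# echo 262144 > /sys/module/zfs/parameters/zfs_condense_min_mapping_bytes
.fi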
.RE .sp .ne 2 .na \fBzfs_dbgmsg_enable\fR (int) .ad .RS 12n Internally ZFS keeps a small log to facilitate debugging. By default the log is disabled; to enable it, set this option to 1. The contents of the log can be accessed by reading the /proc/spl/kstat/zfs/dbgmsg file. Writing 0 to this proc file clears the log. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_dbgmsg_maxsize\fR (int) .ad .RS 12n The maximum size in bytes of the internal ZFS debug log. .sp Default value: \fB4M\fR. .RE .sp .ne 2 .na \fBzfs_dbuf_state_index\fR (int) .ad .RS 12n This feature is currently unused. It is normally used for controlling what reporting is available under /proc/spl/kstat/zfs. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_deadman_enabled\fR (int) .ad .RS 12n When a pool sync operation takes longer than \fBzfs_deadman_synctime_ms\fR milliseconds, or when an individual I/O takes longer than \fBzfs_deadman_ziotime_ms\fR milliseconds, then the operation is considered to be "hung". If \fBzfs_deadman_enabled\fR is set then the deadman behavior is invoked as described by the \fBzfs_deadman_failmode\fR module option. By default the deadman is enabled and configured to \fBwait\fR which results in "hung" I/Os only being logged. The deadman is automatically disabled when a pool gets suspended. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_deadman_failmode\fR (charp) .ad .RS 12n Controls the failure behavior when the deadman detects a "hung" I/O. Valid values are \fBwait\fR, \fBcontinue\fR, and \fBpanic\fR. .sp \fBwait\fR - Wait for a "hung" I/O to complete. For each "hung" I/O a "deadman" event will be posted describing that I/O. .sp \fBcontinue\fR - Attempt to recover from a "hung" I/O by re-dispatching it to the I/O pipeline if possible. .sp \fBpanic\fR - Panic the system. This can be used to facilitate an automatic fail-over to a properly configured fail-over partner. .sp Default value: \fBwait\fR. .RE .sp .ne 2 .na \fBzfs_deadman_checktime_ms\fR (int) .ad .RS 12n Check time in milliseconds. This defines the frequency at which we check for hung I/O and potentially invoke the \fBzfs_deadman_failmode\fR behavior. .sp Default value: \fB60,000\fR. .RE .sp .ne 2 .na \fBzfs_deadman_synctime_ms\fR (ulong) .ad .RS 12n Interval in milliseconds after which the deadman is triggered and also the interval after which a pool sync operation is considered to be "hung". Once this limit is exceeded the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR milliseconds until the pool sync completes. .sp Default value: \fB600,000\fR. .RE .sp .ne 2 .na \fBzfs_deadman_ziotime_ms\fR (ulong) .ad .RS 12n Interval in milliseconds after which the deadman is triggered and an
-individual IO operation is considered to be "hung". As long as the I/O
+individual I/O operation is considered to be "hung". As long as the I/O
remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR milliseconds until the I/O completes. .sp Default value: \fB300,000\fR. .RE .sp .ne 2 .na \fBzfs_dedup_prefetch\fR (int) .ad .RS 12n Enable prefetching of deduplicated blocks. .sp Use \fB1\fR for yes and \fB0\fR to disable (default). .RE .sp .ne 2 .na \fBzfs_delay_min_dirty_percent\fR (int) .ad .RS 12n Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of \fBzfs_dirty_data_max\fR. This value should be >= zfs_vdev_async_write_active_max_dirty_percent. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB60\fR%.
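.sp
As a worked example with illustrative numbers: if \fBzfs_dirty_data_max\fR were 4 GiB, the default of 60% means transactions begin to be delayed once dirty data exceeds 0.60 * 4 GiB = 2.4 GiB.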
.RE .sp .ne 2 .na \fBzfs_delay_scale\fR (int) .ad .RS 12n This controls how quickly the transaction delay approaches infinity. Larger values cause longer delays for a given amount of dirty data. .sp For the smoothest delay, this value should be about 1 billion divided by the maximum number of operations per second. This will smoothly handle between 10x and 1/10th this number. .sp See the section "ZFS TRANSACTION DELAY". .sp Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64. .sp Default value: \fB500,000\fR. .RE .sp .ne 2 .na
-\fBzfs_delays_per_second\fR (int)
+\fBzfs_low_ios_per_second\fR (int)
.ad .RS 12n
-Rate limit IO delay events to this many per second.
+Rate limit delay zevents (which report slow I/Os) to this many per second.
.sp Default value: \fB20\fR. .RE .sp .ne 2 .na \fBzfs_delete_blocks\fR (ulong) .ad .RS 12n This is used to define a large file for the purposes of delete. Files containing more than \fBzfs_delete_blocks\fR blocks will be deleted asynchronously while smaller files are deleted synchronously. Decreasing this value will reduce the time spent in an unlink(2) system call at the expense of a longer delay before the freed space is available. .sp Default value: \fB20,480\fR. .RE .sp .ne 2 .na \fBzfs_dirty_data_max\fR (int) .ad .RS 12n Determines the dirty space limit in bytes. Once this limit is exceeded, new writes are halted until space frees up. This parameter takes precedence over \fBzfs_dirty_data_max_percent\fR. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB10\fR% of physical RAM, capped at \fBzfs_dirty_data_max_max\fR. .RE .sp .ne 2 .na \fBzfs_dirty_data_max_max\fR (int) .ad .RS 12n Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes. This limit is only enforced at module load time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed. This parameter takes precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB25\fR% of physical RAM. .RE .sp .ne 2 .na \fBzfs_dirty_data_max_max_percent\fR (int) .ad .RS 12n Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a percentage of physical RAM. This limit is only enforced at module load time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed. The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this one. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB25\fR%. .RE .sp .ne 2 .na \fBzfs_dirty_data_max_percent\fR (int) .ad .RS 12n Determines the dirty space limit, expressed as a percentage of all memory. Once this limit is exceeded, new writes are halted until space frees up. The parameter \fBzfs_dirty_data_max\fR takes precedence over this one. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB10\fR%, subject to \fBzfs_dirty_data_max_max\fR. .RE .sp .ne 2 .na \fBzfs_dirty_data_sync_percent\fR (int) .ad .RS 12n Start syncing out a transaction group if there's at least this much dirty data as a percentage of \fBzfs_dirty_data_max\fR. This should be less than \fBzfs_vdev_async_write_active_min_dirty_percent\fR. .sp Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR. .RE .sp .ne 2 .na \fBzfs_fletcher_4_impl\fR (string) .ad .RS 12n Select a fletcher 4 implementation. .sp Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR, \fBavx2\fR, \fBavx512f\fR, and \fBaarch64_neon\fR.
All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction set extensions to be available and will only appear if ZFS detects that they are present at runtime. If multiple implementations of fletcher 4 are available, the \fBfastest\fR will be chosen using a micro-benchmark. Selecting \fBscalar\fR results in the original CPU-based calculation being used. Selecting any option other than \fBfastest\fR or \fBscalar\fR results in vector instructions from the respective CPU instruction set being used. .sp Default value: \fBfastest\fR. .RE .sp .ne 2 .na \fBzfs_free_bpobj_enabled\fR (int) .ad .RS 12n Enable/disable the processing of the free_bpobj object. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_async_block_max_blocks\fR (ulong) .ad .RS 12n Maximum number of blocks freed in a single txg. .sp Default value: \fB100,000\fR. .RE .sp .ne 2 .na \fBzfs_override_estimate_recordsize\fR (ulong) .ad .RS 12n Record size calculation override for \fBzfs send\fR estimates. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_read_max_active\fR (int) .ad .RS 12n Maximum asynchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB3\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_read_min_active\fR (int) .ad .RS 12n Minimum asynchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_active_max_dirty_percent\fR (int) .ad .RS 12n When the pool has more than \fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use \fBzfs_vdev_async_write_max_active\fR to limit active async writes. If the dirty data is between min and max, the active I/O limit is linearly interpolated. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB60\fR%. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_active_min_dirty_percent\fR (int) .ad .RS 12n When the pool has less than \fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use \fBzfs_vdev_async_write_min_active\fR to limit active async writes. If the dirty data is between min and max, the active I/O limit is linearly interpolated. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB30\fR%. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_max_active\fR (int) .ad .RS 12n Maximum asynchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_min_active\fR (int) .ad .RS 12n Minimum asynchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Lower values are associated with better latency on rotational media but poorer resilver performance. The default value of 2 was chosen as a compromise. A value of 3 has been shown to improve resilver performance further at a cost of further increasing latency. .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_vdev_max_active\fR (int) .ad .RS 12n The maximum number of I/Os active to each device. Ideally, this will be >= the sum of each queue's max_active. It must be at least the sum of each queue's min_active. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1,000\fR. .RE .sp .ne 2 .na \fBzfs_vdev_scrub_max_active\fR (int) .ad .RS 12n Maximum scrub I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_vdev_scrub_min_active\fR (int) .ad .RS 12n Minimum scrub I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR.
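.sp
These scrub queue limits can be raised on a live system to favor an in-progress scrub or resilver at the cost of other I/O. A minimal sketch, assuming the standard Linux sysfs layout for the module parameters:
.sp
.nf
# echo 3 > /sys/module/zfs/parameters/zfs_vdev_scrub_min_active
# echo 5 > /sys/module/zfs/parameters/zfs_vdev_scrub_max_active
.fi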
.RE .sp .ne 2 .na \fBzfs_vdev_sync_read_max_active\fR (int) .ad .RS 12n Maximum synchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_sync_read_min_active\fR (int) .ad .RS 12n Minimum synchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_sync_write_max_active\fR (int) .ad .RS 12n Maximum synchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_sync_write_min_active\fR (int) .ad .RS 12n Minimum synchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_queue_depth_pct\fR (int) .ad .RS 12n Maximum number of queued allocations per top-level vdev expressed as a percentage of \fBzfs_vdev_async_write_max_active\fR which allows the system to detect devices that are more capable of handling allocations and to allocate more blocks to those devices. It allows for dynamic allocation distribution when devices are imbalanced as fuller devices will tend to be slower than empty devices. See also \fBzio_dva_throttle_enabled\fR. .sp Default value: \fB1000\fR%. .RE .sp .ne 2 .na \fBzfs_expire_snapshot\fR (int) .ad .RS 12n Seconds to expire .zfs/snapshot mounts. .sp Default value: \fB300\fR. .RE .sp .ne 2 .na \fBzfs_admin_snapshot\fR (int) .ad .RS 12n Allow the creation, removal, or renaming of entries in the .zfs/snapshot directory to cause the creation, destruction, or renaming of snapshots. When enabled this functionality works both locally and over NFS exports which have the 'no_root_squash' option set. This functionality is disabled by default. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_flags\fR (int) .ad .RS 12n Set additional debugging flags. The following flags may be bitwise-or'd together. .sp
.TS
box;
rB lB lB
lB r l.
Value	Symbolic Name	Description
_
1	ZFS_DEBUG_DPRINTF	Enable dprintf entries in the debug log.
_
2	ZFS_DEBUG_DBUF_VERIFY *	Enable extra dbuf verifications.
_
4	ZFS_DEBUG_DNODE_VERIFY *	Enable extra dnode verifications.
_
8	ZFS_DEBUG_SNAPNAMES	Enable snapshot name verification.
_
16	ZFS_DEBUG_MODIFY	Check for illegally modified ARC buffers.
_
64	ZFS_DEBUG_ZIO_FREE	Enable verification of block frees.
_
128	ZFS_DEBUG_HISTOGRAM_VERIFY	Enable extra spacemap histogram verifications.
_
256	ZFS_DEBUG_METASLAB_VERIFY	Verify space accounting on disk matches in-core range_trees.
_
512	ZFS_DEBUG_SET_ERROR	Enable SET_ERROR and dprintf entries in the debug log.
.TE
.sp * Requires debug build. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_free_leak_on_eio\fR (int) .ad .RS 12n If destroy encounters an EIO while reading metadata (e.g. indirect blocks), space referenced by the missing metadata can not be freed. Normally this causes the background destroy to become "stalled", as it is unable to make forward progress. While in this stalled state, all remaining space to free from the error-encountering filesystem is "temporarily leaked". Set this flag to cause it to ignore the EIO, permanently leak the space from indirect blocks that can not be read, and continue to free everything else that it can. The default, "stalling" behavior is useful if the storage partially fails (i.e. some but not all i/os fail), and then later recovers.
In this case, we will be able to continue pool operations while it is partially failed, and when it recovers, we can continue to free the space, with no leaks. However, note that this case is actually fairly rare. Typically pools either (a) fail completely (but perhaps temporarily, e.g. a top-level vdev going offline), or (b) have localized, permanent errors (e.g. disk returns the wrong data due to bit flip or firmware bug). In case (a), this setting does not matter because the pool will be suspended and the sync thread will not be able to make forward progress regardless. In case (b), because the error is permanent, the best we can do is leak the minimum amount of space, which is what setting this flag will do. Therefore, it is reasonable for this flag to normally be set, but we chose the more conservative approach of not setting it, so that there is no possibility of leaking space in the "partial temporary" failure case. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_free_min_time_ms\fR (int) .ad .RS 12n During a \fBzfs destroy\fR operation using \fBfeature@async_destroy\fR a minimum of this much time will be spent working on freeing blocks per txg. .sp Default value: \fB1,000\fR. .RE .sp .ne 2 .na \fBzfs_immediate_write_sz\fR (long) .ad .RS 12n Largest data block to write to the ZIL. Larger blocks will be treated as if the dataset being written to had the property setting \fBlogbias=throughput\fR. .sp Default value: \fB32,768\fR. .RE .sp .ne 2 .na \fBzfs_lua_max_instrlimit\fR (ulong) .ad .RS 12n The maximum execution time limit that can be set for a ZFS channel program, specified as a number of Lua instructions. .sp Default value: \fB100,000,000\fR. .RE .sp .ne 2 .na \fBzfs_lua_max_memlimit\fR (ulong) .ad .RS 12n The maximum memory limit that can be set for a ZFS channel program, specified in bytes. .sp Default value: \fB104,857,600\fR. .RE .sp .ne 2 .na \fBzfs_max_dataset_nesting\fR (int) .ad .RS 12n The maximum depth of nested datasets. This value can be tuned temporarily to fix existing datasets that exceed the predefined limit. .sp Default value: \fB50\fR. .RE .sp .ne 2 .na \fBzfs_max_recordsize\fR (int) .ad .RS 12n We currently support block sizes from 512 bytes to 16MB. The benefits of
-larger blocks, and thus larger IO, need to be weighed against the cost of
+larger blocks, and thus larger I/O, need to be weighed against the cost of
COWing a giant block to modify one byte. Additionally, very large blocks can have an impact on i/o latency, and also potentially on the memory allocator. Therefore, we do not allow the recordsize to be set larger than zfs_max_recordsize (default 1MB). Larger blocks can be created by changing this tunable, and pools with larger blocks can always be imported and used, regardless of this setting. .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfs_metaslab_fragmentation_threshold\fR (int) .ad .RS 12n Allow metaslabs to keep their active state as long as their fragmentation percentage is less than or equal to this value. An active metaslab that exceeds this threshold will no longer keep its active status, allowing better metaslabs to be selected. .sp Default value: \fB70\fR. .RE .sp .ne 2 .na \fBzfs_mg_fragmentation_threshold\fR (int) .ad .RS 12n Metaslab groups are considered eligible for allocations if their fragmentation metric (measured as a percentage) is less than or equal to this value. If a metaslab group exceeds this threshold then it will be skipped unless all metaslab groups within the metaslab class have also crossed this threshold.
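.sp
Whether a pool is anywhere near this threshold can be judged from the per-vdev fragmentation reported in the FRAG column of \fBzpool list -v\fR; a minimal sketch (the pool name is illustrative):
.sp
.nf
# zpool list -v tank
.fi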
.sp Default value: \fB85\fR. .RE .sp .ne 2 .na \fBzfs_mg_noalloc_threshold\fR (int) .ad .RS 12n Defines a threshold at which metaslab groups should be eligible for allocations. The value is expressed as a percentage of free space beyond which a metaslab group is always eligible for allocations. If a metaslab group's free space is less than or equal to the threshold, the allocator will avoid allocating to that group unless all groups in the pool have reached the threshold. Once all groups have reached the threshold, all groups are allowed to accept allocations. The default value of 0 disables the feature and causes all metaslab groups to be eligible for allocations. This parameter allows one to deal with pools having heavily imbalanced vdevs such as would be the case when a new vdev has been added. Setting the threshold to a non-zero percentage will stop allocations from being made to vdevs that aren't filled to the specified percentage and allow lesser filled vdevs to acquire more allocations than they otherwise would under the old \fBzfs_mg_alloc_failures\fR facility. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_ddt_data_is_special\fR (int) .ad .RS 12n If enabled, ZFS will place DDT data into the special allocation class. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_user_indirect_is_special\fR (int) .ad .RS 12n If enabled, ZFS will place user data (both file and zvol) indirect blocks into the special allocation class. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_multihost_history\fR (int) .ad .RS 12n Historical statistics for the last N multihost updates will be available in \fB/proc/spl/kstat/zfs/<pool>/multihost\fR .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_multihost_interval\fR (ulong) .ad .RS 12n Used to control the frequency of multihost writes which are performed when the \fBmultihost\fR pool property is on. This is one factor used to determine the length of the activity check during import. .sp The multihost write period is \fBzfs_multihost_interval / leaf-vdevs\fR milliseconds. This means that on average a multihost write will be issued for each leaf vdev every \fBzfs_multihost_interval\fR milliseconds. In practice, the observed period can vary with the I/O load and this observed value is the delay which is stored in the uberblock. .sp On import the activity check waits a minimum amount of time determined by \fBzfs_multihost_interval * zfs_multihost_import_intervals\fR (10 seconds with the default values). The activity check time may be further extended if the value of mmp delay found in the best uberblock indicates actual multihost updates happened at longer intervals than \fBzfs_multihost_interval\fR. A minimum value of \fB100ms\fR is enforced. .sp Default value: \fB1000\fR. .RE .sp .ne 2 .na \fBzfs_multihost_import_intervals\fR (uint) .ad .RS 12n Used to control the duration of the activity test on import. Smaller values of \fBzfs_multihost_import_intervals\fR will reduce the import time but increase the risk of failing to detect an active pool. The total activity check time is never allowed to drop below one second. A value of 0 is ignored and treated as if it was set to 1. .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_multihost_fail_intervals\fR (uint) .ad .RS 12n Controls the behavior of the pool when multihost write failures are detected. .sp When \fBzfs_multihost_fail_intervals = 0\fR then multihost write failures are ignored. The failures will still be reported to the ZED which depending on its configuration may take action such as suspending the pool or offlining a device.
.sp When \fBzfs_multihost_fail_intervals > 0\fR then sequential multihost write failures will cause the pool to be suspended. This occurs when \fBzfs_multihost_fail_intervals * zfs_multihost_interval\fR milliseconds have passed since the last successful multihost write. This guarantees the activity test will see multihost writes if the pool is imported. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBzfs_no_scrub_io\fR (int) .ad .RS 12n Set for no scrub I/O. This results in scrubs not actually scrubbing data and simply doing a metadata crawl of the pool instead. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_no_scrub_prefetch\fR (int) .ad .RS 12n Set to disable block prefetching for scrubs. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_nocacheflush\fR (int) .ad .RS 12n Disable cache flush operations on disks when writing. Beware, this may cause corruption if disks re-order writes. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_nopwrite_enabled\fR (int) .ad .RS 12n Enable NOP writes. .sp Use \fB1\fR for yes (default) and \fB0\fR to disable. .RE .sp .ne 2 .na \fBzfs_dmu_offset_next_sync\fR (int) .ad .RS 12n Enable forcing txg sync to find holes. When enabled, this forces ZFS to act like prior versions when SEEK_HOLE or SEEK_DATA flags are used: when a dnode is dirty, txgs are synced so that the data can be found. .sp Use \fB1\fR for yes and \fB0\fR to disable (default). .RE .sp .ne 2 .na \fBzfs_pd_bytes_max\fR (int) .ad .RS 12n The number of bytes which should be prefetched during a pool traversal (e.g. \fBzfs send\fR or other data crawling operations). .sp Default value: \fB52,428,800\fR. .RE .sp .ne 2 .na \fBzfs_per_txg_dirty_frees_percent\fR (ulong) .ad .RS 12n Tunable to control percentage of dirtied blocks from frees in one TXG. After this threshold is crossed, additional dirty blocks from frees wait until the next TXG. A value of zero will disable this throttle. .sp Default value: \fB30\fR. .RE .sp .ne 2 .na \fBzfs_prefetch_disable\fR (int) .ad .RS 12n This tunable disables predictive prefetch. Note that it leaves "prescient" prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, prescient prefetch never issues i/os that end up not being needed, so it can't hurt performance. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_read_chunk_size\fR (long) .ad .RS 12n Bytes to read per chunk. .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfs_read_history\fR (int) .ad .RS 12n Historical statistics for the last N reads will be available in \fB/proc/spl/kstat/zfs/<pool>/reads\fR .sp Default value: \fB0\fR (no data is kept). .RE .sp .ne 2 .na \fBzfs_read_history_hits\fR (int) .ad .RS 12n Include cache hits in read history. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_reconstruct_indirect_combinations_max\fR (int) .ad .RS 12n If an indirect split block contains more than this many possible unique combinations when being reconstructed, consider it too computationally expensive to check them all. Instead, try at most \fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected combinations each time the block is accessed. This allows all segment copies to participate fairly in the reconstruction when all combinations cannot be checked and prevents repeated use of one bad copy. .sp Default value: \fB256\fR.
.RE .sp .ne 2 .na \fBzfs_recover\fR (int) .ad .RS 12n Set to attempt to recover from fatal errors. This should only be used as a last resort, as it typically results in leaked space, or worse. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_resilver_min_time_ms\fR (int) .ad .RS 12n Resilvers are processed by the sync thread. While resilvering it will spend at least this much time working on a resilver between txg flushes. .sp Default value: \fB3,000\fR. .RE .sp .ne 2 .na \fBzfs_scan_ignore_errors\fR (int) .ad .RS 12n If set to a nonzero value, remove the DTL (dirty time list) upon completion of a pool scan (scrub) even if there were unrepairable errors. It is intended to be used during pool repair or recovery to stop resilvering when the pool is next imported. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_scrub_min_time_ms\fR (int) .ad .RS 12n Scrubs are processed by the sync thread. While scrubbing it will spend at least this much time working on a scrub between txg flushes. .sp Default value: \fB1,000\fR. .RE .sp .ne 2 .na \fBzfs_scan_checkpoint_intval\fR (int) .ad .RS 12n To preserve progress across reboots the sequential scan algorithm periodically needs to stop metadata scanning and issue all the verification I/Os to disk. The frequency of this flushing is determined by the \fBzfs_scan_checkpoint_intval\fR tunable. .sp Default value: \fB7200\fR seconds (every 2 hours). .RE .sp .ne 2 .na \fBzfs_scan_fill_weight\fR (int) .ad .RS 12n This tunable affects how scrub and resilver I/O segments are ordered. A higher number indicates that we care more about how filled in a segment is, while a lower number indicates we care more about the size of the extent without considering the gaps within a segment. This value is only tunable upon module insertion. Changing the value afterwards will have no effect on scrub or resilver performance. .sp Default value: \fB3\fR. .RE .sp .ne 2 .na \fBzfs_scan_issue_strategy\fR (int) .ad .RS 12n Determines the order that data will be verified while scrubbing or resilvering. If set to \fB1\fR, data will be verified as sequentially as possible, given the amount of memory reserved for scrubbing (see \fBzfs_scan_mem_lim_fact\fR). This may improve scrub performance if the pool's data is very fragmented. If set to \fB2\fR, the largest mostly-contiguous chunk of found data will be verified first. By deferring scrubbing of small segments, we may later find adjacent data to coalesce and increase the segment size. If set to \fB0\fR, ZFS will use strategy \fB1\fR during normal verification and strategy \fB2\fR while taking a checkpoint. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_scan_legacy\fR (int) .ad .RS 12n A value of 0 indicates that scrubs and resilvers will gather metadata in memory before issuing sequential I/O. A value of 1 indicates that the legacy algorithm will be used where I/O is initiated as soon as it is discovered. Changing this value to 0 will not affect scrubs or resilvers that are already in progress. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_scan_max_ext_gap\fR (int) .ad .RS 12n Indicates the largest gap in bytes between scrub / resilver I/Os that will still be considered sequential for sorting purposes. Changing this value will not affect scrubs or resilvers that are already in progress. .sp Default value: \fB2097152 (2 MB)\fR. .RE .sp .ne 2 .na \fBzfs_scan_mem_lim_fact\fR (int) .ad .RS 12n Maximum fraction of RAM used for I/O sorting by sequential scan algorithm.
This tunable determines the hard limit for I/O sorting memory usage. When the hard limit is reached we stop scanning metadata and start issuing data verification I/O. This is done until we get below the soft limit. .sp Default value: \fB20\fR which is 5% of RAM (1/20). .RE .sp .ne 2 .na \fBzfs_scan_mem_lim_soft_fact\fR (int) .ad .RS 12n The fraction of the hard limit used to determine the soft limit for I/O sorting by the sequential scan algorithm. When we cross this limit from below no action is taken. When we cross this limit from above it is because we are issuing verification I/O. In this case (unless the metadata scan is done) we stop issuing verification I/O and start scanning metadata again until we get to the hard limit. .sp Default value: \fB20\fR which is 5% of the hard limit (1/20). .RE .sp .ne 2 .na \fBzfs_scan_vdev_limit\fR (int) .ad .RS 12n Maximum amount of data that can be concurrently issued for scrubs and resilvers per leaf device, given in bytes. .sp Default value: \fB41943040\fR. .RE .sp .ne 2 .na \fBzfs_send_corrupt_data\fR (int) .ad .RS 12n Allow sending of corrupt data (ignore read/checksum errors when sending data). .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_send_queue_length\fR (int) .ad .RS 12n The maximum number of bytes allowed in the \fBzfs send\fR queue. This value must be at least twice the maximum block size in use. .sp Default value: \fB16,777,216\fR. .RE .sp .ne 2 .na \fBzfs_recv_queue_length\fR (int) .ad .RS 12n The maximum number of bytes allowed in the \fBzfs receive\fR queue. This value must be at least twice the maximum block size in use. .sp Default value: \fB16,777,216\fR. .RE .sp .ne 2 .na \fBzfs_sync_pass_deferred_free\fR (int) .ad .RS 12n Flushing of data to disk is done in passes. Defer frees starting in this pass. .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_spa_discard_memory_limit\fR (int) .ad .RS 12n Maximum memory used for prefetching a checkpoint's space map on each vdev while discarding the checkpoint. .sp Default value: \fB16,777,216\fR. .RE .sp .ne 2 .na \fBzfs_sync_pass_dont_compress\fR (int) .ad .RS 12n Don't compress starting in this pass. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBzfs_sync_pass_rewrite\fR (int) .ad .RS 12n Rewrite new block pointers starting in this pass. .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_sync_taskq_batch_pct\fR (int) .ad .RS 12n This controls the number of threads used by the dp_sync_taskq. The default value of 75% will create a maximum of one thread per CPU. .sp Default value: \fB75\fR%. .RE .sp .ne 2 .na \fBzfs_txg_history\fR (int) .ad .RS 12n Historical statistics for the last N txgs will be available in \fB/proc/spl/kstat/zfs/<pool>/txgs\fR .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_txg_timeout\fR (int) .ad .RS 12n Flush dirty data to disk at least every N seconds (maximum txg duration). .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBzfs_vdev_aggregation_limit\fR (int) .ad .RS 12n Maximum vdev I/O aggregation size. .sp Default value: \fB131,072\fR. .RE .sp .ne 2 .na \fBzfs_vdev_cache_bshift\fR (int) .ad .RS 12n Shift size to inflate reads to. .sp Default value: \fB16\fR (effectively 65536). .RE .sp .ne 2 .na \fBzfs_vdev_cache_max\fR (int) .ad .RS 12n Inflate reads smaller than this value to meet the \fBzfs_vdev_cache_bshift\fR size (default 64k). .sp Default value: \fB16384\fR. .RE .sp .ne 2 .na \fBzfs_vdev_cache_size\fR (int) .ad .RS 12n Total size of the per-disk cache in bytes.
.sp Currently this feature is disabled as it has been found to not be helpful for performance and in some cases harmful. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_rotating_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O immediately follows its predecessor on rotational vdevs. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_rotating_seek_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O lacks locality as defined by zfs_vdev_mirror_rotating_seek_offset. I/Os within this window that do not immediately follow the previous I/O are incremented by half. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_rotating_seek_offset\fR (int) .ad .RS 12n The maximum distance for the last queued I/O in which the balancing algorithm considers an I/O to have locality. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1048576\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_non_rotating_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member on non-rotational vdevs when I/Os do not immediately follow one another. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_non_rotating_seek_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O lacks locality as defined by zfs_vdev_mirror_rotating_seek_offset. I/Os within this window that do not immediately follow the previous I/O are incremented by half. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_read_gap_limit\fR (int) .ad .RS 12n Aggregate read I/O operations if the gap on-disk between them is within this threshold. .sp Default value: \fB32,768\fR. .RE .sp .ne 2 .na \fBzfs_vdev_scheduler\fR (charp) .ad .RS 12n Set the Linux I/O scheduler on whole disk vdevs to this scheduler. Valid options are noop, cfq, bfq, and deadline. .sp Default value: \fBnoop\fR. .RE .sp .ne 2 .na \fBzfs_vdev_write_gap_limit\fR (int) .ad .RS 12n Aggregate write I/O operations if the gap on-disk between them is within this threshold. .sp Default value: \fB4,096\fR. .RE .sp .ne 2 .na \fBzfs_vdev_raidz_impl\fR (string) .ad .RS 12n Parameter for selecting raidz parity implementation to use. Options marked (always) below may be selected on module load as they are supported on all systems. The remaining options may only be set after the module is loaded, as they are available only if the implementations are compiled in and supported on the running system. Once the module is loaded, the content of /sys/module/zfs/parameters/zfs_vdev_raidz_impl will show available options with the currently selected one enclosed in [].
Possible options are:
.sp
.nf
fastest        - (always) implementation selected using built-in benchmark
original       - (always) original raidz implementation
scalar         - (always) scalar raidz implementation
sse2           - implementation using SSE2 instruction set (64bit x86 only)
ssse3          - implementation using SSSE3 instruction set (64bit x86 only)
avx2           - implementation using AVX2 instruction set (64bit x86 only)
avx512f        - implementation using AVX512F instruction set (64bit x86 only)
avx512bw       - implementation using AVX512F & AVX512BW instruction sets
                 (64bit x86 only)
aarch64_neon   - implementation using NEON (Aarch64/64 bit ARMv8 only)
aarch64_neonx2 - implementation using NEON with more unrolling
                 (Aarch64/64 bit ARMv8 only)
.fi
.sp Default value: \fBfastest\fR. .RE .sp .ne 2 .na \fBzfs_zevent_cols\fR (int) .ad .RS 12n When zevents are logged to the console use this as the word wrap width. .sp Default value: \fB80\fR. .RE .sp .ne 2 .na \fBzfs_zevent_console\fR (int) .ad .RS 12n Log events to the console. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_zevent_len_max\fR (int) .ad .RS 12n Max event queue length. A value of 0 will result in a calculated value which increases with the number of CPUs in the system (minimum 64 events). Events in the queue can be viewed with the \fBzpool events\fR command. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_zil_clean_taskq_maxalloc\fR (int) .ad .RS 12n The maximum number of taskq entries that are allowed to be cached. When this limit is exceeded transaction records (itxs) will be cleaned synchronously. .sp Default value: \fB1048576\fR. .RE .sp .ne 2 .na \fBzfs_zil_clean_taskq_minalloc\fR (int) .ad .RS 12n The number of taskq entries that are pre-populated when the taskq is first created and are immediately available for use. .sp Default value: \fB1024\fR. .RE .sp .ne 2 .na \fBzfs_zil_clean_taskq_nthr_pct\fR (int) .ad .RS 12n This controls the number of threads used by the dp_zil_clean_taskq. The default value of 100% will create a maximum of one thread per CPU. .sp Default value: \fB100\fR%. .RE .sp .ne 2 .na \fBzil_replay_disable\fR (int) .ad .RS 12n Disable intent logging replay. Replay can be disabled in this way to allow recovery from a corrupted ZIL. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzil_slog_bulk\fR (ulong) .ad .RS 12n Limit SLOG write size per commit executed with synchronous priority. Any writes above that will be executed with lower (asynchronous) priority to limit potential SLOG device abuse by a single active ZIL writer. .sp Default value: \fB786,432\fR. .RE .sp .ne 2 .na \fBzio_decompress_fail_fraction\fR (int) .ad .RS 12n If non-zero, this value represents the denominator of the probability that zfs should induce a decompression failure. For instance, for a 5% decompression failure rate, this value should be set to 20. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na
-\fBzio_delay_max\fR (int)
+\fBzio_slow_io_ms\fR (int)
.ad .RS 12n
-A zevent will be logged if a ZIO operation takes more than N milliseconds to
-complete. Note that this is only a logging facility, not a timeout on
-operations.
+When an I/O operation takes more than \fBzio_slow_io_ms\fR milliseconds to
+complete, it is marked as a slow I/O. Each slow I/O causes a delay zevent.
+Slow I/O counters can be seen with "zpool status -s".
+
.sp Default value: \fB30,000\fR. .RE .sp .ne 2 .na \fBzio_dva_throttle_enabled\fR (int) .ad .RS 12n
-Throttle block allocations in the ZIO pipeline. This allows for
+Throttle block allocations in the I/O pipeline.
This allows for dynamic allocation distribution when devices are imbalanced. When enabled, the maximum number of pending allocations per top-level vdev is limited by \fBzfs_vdev_queue_depth_pct\fR. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzio_requeue_io_start_cut_in_line\fR (int) .ad .RS 12n Prioritize requeued I/O. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzio_taskq_batch_pct\fR (uint) .ad .RS 12n Percentage of online CPUs (or CPU cores, etc) which will run a worker thread
-for IO. These workers are responsible for IO work such as compression and
+for I/O. These workers are responsible for I/O work such as compression and
checksum calculations. Fractional number of CPUs will be rounded down. .sp The default value of 75 was chosen to avoid using all CPUs which can result in latency issues and inconsistent application performance, especially when high compression is enabled. .sp Default value: \fB75\fR. .RE .sp .ne 2 .na \fBzvol_inhibit_dev\fR (uint) .ad .RS 12n Do not create zvol device nodes. This may slightly improve startup time on systems with a very large number of zvols. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzvol_major\fR (uint) .ad .RS 12n Major number for zvol block devices. .sp Default value: \fB230\fR. .RE .sp .ne 2 .na \fBzvol_max_discard_blocks\fR (ulong) .ad .RS 12n Discard (aka TRIM) operations done on zvols will be done in batches of this many blocks, where block size is determined by the \fBvolblocksize\fR property of a zvol. .sp Default value: \fB16,384\fR. .RE .sp .ne 2 .na \fBzvol_prefetch_bytes\fR (uint) .ad .RS 12n When adding a zvol to the system, prefetch \fBzvol_prefetch_bytes\fR from the start and end of the volume. Prefetching these regions of the volume is desirable because they are likely to be accessed immediately by \fBblkid(8)\fR or by the kernel scanning for a partition table. .sp Default value: \fB131,072\fR. .RE .sp .ne 2 .na \fBzvol_request_sync\fR (uint) .ad .RS 12n When processing I/O requests for a zvol submit them synchronously. This effectively limits the queue depth to 1 for each I/O submitter. When set to 0 requests are handled asynchronously by a thread pool. The number of requests which can be handled concurrently is controlled by \fBzvol_threads\fR. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzvol_threads\fR (uint) .ad .RS 12n Maximum number of threads which can handle zvol I/O requests concurrently. .sp Default value: \fB32\fR. .RE .sp .ne 2 .na \fBzvol_volmode\fR (uint) .ad .RS 12n Defines the behavior of zvol block devices when \fBvolmode\fR is set to \fBdefault\fR. Valid values are \fB1\fR (full), \fB2\fR (dev) and \fB3\fR (none). .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_qat_disable\fR (int) .ad .RS 12n This tunable disables qat hardware acceleration for gzip compression and AES-GCM encryption. It is available only if qat acceleration is compiled in and the qat driver is present. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .SH ZFS I/O SCHEDULER ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os. The I/O scheduler determines when and in what order those operations are issued. The I/O scheduler divides operations into five I/O classes prioritized in the following order: sync read, sync write, async read, async write, and scrub/resilver. Each queue defines the minimum and maximum number of concurrent operations that may be issued to the device. In addition, the device has an aggregate maximum, \fBzfs_vdev_max_active\fR.
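.sp
Each of the per-class minimums and maximums referenced in this section is a module parameter described earlier in this page. A minimal sketch for inspecting the current limits on a running system, assuming the standard Linux sysfs layout:
.sp
.nf
# grep . /sys/module/zfs/parameters/zfs_vdev_*_active
# cat /sys/module/zfs/parameters/zfs_vdev_max_active
.fi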
Note that the sum of the per-queue minimums must not exceed the aggregate maximum. If the sum of the per-queue maximums exceeds the aggregate maximum, then the number of active I/Os may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will be issued regardless of whether all per-queue minimums have been met. .sp For many physical devices, throughput increases with the number of concurrent operations, but latency typically suffers. Further, physical devices typically have a limit at which more concurrent operations have no effect on throughput or can actually cause it to decrease. .sp The scheduler selects the next operation to issue by first looking for an I/O class whose minimum has not been satisfied. Once all are satisfied and the aggregate maximum has not been hit, the scheduler looks for classes whose maximum has not been satisfied. Iteration through the I/O classes is done in the order specified above. No further operations are issued if the aggregate maximum number of concurrent operations has been hit or if there are no operations queued for an I/O class that has not hit its maximum. Every time an I/O is queued or an operation completes, the I/O scheduler looks for new operations to issue. .sp In general, smaller max_active's will lead to lower latency of synchronous operations. Larger max_active's may lead to higher overall throughput, depending on underlying storage. .sp The ratio of the queues' max_actives determines the balance of performance between reads, writes, and scrubs. E.g., increasing \fBzfs_vdev_scrub_max_active\fR will cause the scrub or resilver to complete more quickly, but will cause reads and writes to have higher latency and lower throughput. .sp All I/O classes have a fixed maximum number of outstanding operations except for the async write class. Asynchronous writes represent the data that is committed to stable storage during the syncing stage for transaction groups. Transaction groups enter the syncing state periodically so the number of queued async writes will quickly burst up and then bleed down to zero. Rather than servicing them as quickly as possible, the I/O scheduler changes the maximum number of active async write I/Os according to the amount of dirty data in the pool. Since both throughput and latency typically increase with the number of concurrent operations issued to physical devices, reducing the burstiness in the number of concurrent operations also stabilizes the response time of operations from other -- and in particular synchronous -- queues. In broad strokes, the I/O scheduler will issue more concurrent operations from the async write queue as there's more dirty data in the pool. .sp Async Writes .sp The number of concurrent operations issued for the async write I/O class follows a piece-wise linear function defined by a few adjustable points.
.nf
       |              o---------| <-- zfs_vdev_async_write_max_active
  ^    |             /^         |
  |    |            / |         |
active |           /  |         |
 I/O   |          /   |         |
count  |         /    |         |
       |        /     |         |
       |-------o      |         | <-- zfs_vdev_async_write_min_active
      0|_______^______|_________|
       0%      |      |       100% of zfs_dirty_data_max
               |      |
               |      `-- zfs_vdev_async_write_active_max_dirty_percent
               `--------- zfs_vdev_async_write_active_min_dirty_percent
.fi
Until the amount of dirty data exceeds a minimum percentage of the dirty data allowed in the pool, the I/O scheduler will limit the number of concurrent operations to the minimum.
As that threshold is crossed, the number of concurrent operations issued increases linearly to the maximum at the specified maximum percentage of the dirty data allowed in the pool. .sp Ideally, the amount of dirty data on a busy pool will stay in the sloped part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. If it exceeds the maximum percentage, this indicates that the rate of incoming data is greater than the rate that the backend storage can handle. In this case, we must further throttle incoming writes, as described in the next section. .SH ZFS TRANSACTION DELAY We delay transactions when we've determined that the backend storage isn't able to accommodate the rate of incoming writes. .sp If there is already a transaction waiting, we delay relative to when that transaction will finish waiting. This way the calculated delay time is independent of the number of threads concurrently executing transactions. .sp If we are the only waiter, wait relative to when the transaction started, rather than the current time. This credits the transaction for "time already served", e.g. reading indirect blocks. .sp The minimum time for a transaction to take is calculated as:
.nf
min_time = zfs_delay_scale * (dirty - min) / (max - dirty)
min_time is then capped at 100 milliseconds.
.fi
.sp The delay has two degrees of freedom that can be adjusted via tunables. The percentage of dirty data at which we start to delay is defined by \fBzfs_delay_min_dirty_percent\fR. This should typically be at or above \fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to delay after writing at full speed has failed to keep up with the incoming write rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking, this variable determines the amount of delay at the midpoint of the curve. .sp
.nf
delay
 10ms +-------------------------------------------------------------*+
      |                                                             *|
  9ms +                                                             *+
      |                                                             *|
  8ms +                                                             *+
      |                                                            * |
  7ms +                                                            * +
      |                                                            * |
  6ms +                                                            * +
      |                                                            * |
  5ms +                                                            * +
      |                                                            * |
  4ms +                                                            * +
      |                                                            * |
  3ms +                                                            * +
      |                                                            * |
  2ms +                                              (midpoint)    * +
      |                                                  |       **  |
  1ms +                                                  v     ***   +
      |             zfs_delay_scale ---------->      ********        |
    0 +-------------------------------------*********----------------+
      0%                    <- zfs_dirty_data_max ->               100%
.fi
.sp Note that since the delay is added to the outstanding time remaining on the most recent transaction, the delay is effectively the inverse of IOPS. Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve was chosen such that small changes in the amount of accumulated dirty data in the first 3/4 of the curve yield relatively small differences in the amount of delay. .sp The effects can be easier to understand when the amount of delay is represented on a log scale: .sp
.nf
delay
100ms +-------------------------------------------------------------++
      +                                                              +
      |                                                              |
      +                                                             *+
 10ms +                                                             *+
      +                                                            ** +
      |                                            (midpoint)     **  |
      +                                                |         **   +
  1ms +                                                v       ****   +
      +             zfs_delay_scale ---------->     *****             +
      |                                         ****                  |
      +                                      ****                     +
100us +                                    **                         +
      +                                   *                           +
      |                                  *                            |
      +                                 *                             +
 10us +                                *                              +
      +                                                               +
      |                                                               |
      +                                                               +
      +---------------------------------------------------------------+
      0%                    <- zfs_dirty_data_max ->               100%
.fi
.sp Note here that only as the amount of dirty data approaches its limit does the delay start to increase rapidly.
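.sp
To make the midpoint concrete: when the amount of dirty data sits exactly halfway between the point where delays begin and \fBzfs_dirty_data_max\fR, the two terms of the formula above cancel and the minimum delay equals \fBzfs_delay_scale\fR:
.sp
.nf
dirty - min = max - dirty                     (at the midpoint)
min_time    = zfs_delay_scale = 500,000 ns    (500us with the default)
.fi
.sp
which corresponds to the 500us / 2000 IOPS midpoint shown in the graphs above.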
The goal of a properly tuned system should be to keep the amount of dirty data out of that range by first ensuring that the appropriate limits are set for the I/O scheduler to reach optimal throughput on the backend storage, and then by changing the value of \fBzfs_delay_scale\fR to increase the steepness of the curve. diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 8fe6b494be47..b9e0e1ad4a95 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -1,2628 +1,2636 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" .Dd April 27, 2018 .Dt ZPOOL 8 SMM .Os Linux .Sh NAME .Nm zpool .Nd configure ZFS storage pools .Sh SYNOPSIS .Nm .Fl ? .Nm .Cm add .Op Fl fgLnP .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool vdev Ns ... .Nm .Cm attach .Op Fl f .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool device new_device .Nm .Cm checkpoint .Op Fl d, -discard .Ar pool .Nm .Cm clear .Ar pool .Op Ar device .Nm .Cm create .Op Fl dfn .Op Fl m Ar mountpoint .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Oo Fl o Ar feature@feature Ns = Ns Ar value Oc .Oo Fl O Ar file-system-property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Ar pool vdev Ns ... .Nm .Cm destroy .Op Fl f .Ar pool .Nm .Cm detach .Ar pool device .Nm .Cm events .Op Fl vHf Oo Ar pool Oc | Fl c .Nm .Cm export .Op Fl a .Op Fl f .Ar pool Ns ... .Nm .Cm get .Op Fl Hp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... .Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns ... .Oo Ar pool Oc Ns ... .Nm .Cm history .Op Fl il .Oo Ar pool Oc Ns ... .Nm .Cm import .Op Fl D .Op Fl d Ar dir Ns | Ns device .Nm .Cm import .Fl a .Op Fl DflmN .Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc .Op Fl -rewind-to-checkpoint .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Nm .Cm import .Op Fl Dflm .Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc .Op Fl -rewind-to-checkpoint .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Op Fl s .Ar pool Ns | Ns Ar id .Op Ar newpool Oo Fl t Oc .Nm .Cm iostat .Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw .Op Fl T Sy u Ns | Ns Sy d .Op Fl ghHLpPvy .Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... 
Oc Oc .Op Ar interval Op Ar count .Nm .Cm labelclear .Op Fl f .Ar device .Nm .Cm list .Op Fl HgLpPv .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count .Nm .Cm offline .Op Fl f .Op Fl t .Ar pool Ar device Ns ... .Nm .Cm online .Op Fl e .Ar pool Ar device Ns ... .Nm .Cm reguid .Ar pool .Nm .Cm reopen .Op Fl n .Ar pool .Nm .Cm remove .Op Fl np .Ar pool Ar device Ns ... .Nm .Cm remove .Fl s .Ar pool .Nm .Cm replace .Op Fl f .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool Ar device Op Ar new_device .Nm .Cm resilver .Ar pool Ns ... .Nm .Cm scrub .Op Fl s | Fl p .Ar pool Ns ... .Nm .Cm set .Ar property Ns = Ns Ar value .Ar pool .Nm .Cm split .Op Fl gLlnP .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Ar pool newpool .Oo Ar device Oc Ns ... .Nm .Cm status .Oo Fl c Ar SCRIPT Oc -.Op Fl gLPvxD +.Op Fl DgLpPsvx .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count .Nm .Cm sync .Oo Ar pool Oc Ns ... .Nm .Cm upgrade .Nm .Cm upgrade .Fl v .Nm .Cm upgrade .Op Fl V Ar version .Fl a Ns | Ns Ar pool Ns ... .Sh DESCRIPTION The .Nm command configures ZFS storage pools. A storage pool is a collection of devices that provides physical storage and data replication for ZFS datasets. All datasets within a storage pool share the same space. See .Xr zfs 8 for information on managing datasets. .Ss Virtual Devices (vdevs) A "virtual device" describes a single device or a collection of devices organized according to certain performance and fault characteristics. The following virtual devices are supported: .Bl -tag -width Ds .It Sy disk A block device, typically located under .Pa /dev . ZFS can use individual slices or partitions, though the recommended mode of operation is to use whole disks. A disk can be specified by a full path, or it can be a shorthand name .Po the relative portion of the path under .Pa /dev .Pc . A whole disk can be specified by omitting the slice or partition designation. For example, .Pa sda is equivalent to .Pa /dev/sda . When given a whole disk, ZFS automatically labels the disk, if necessary. .It Sy file A regular file. The use of files as a backing store is strongly discouraged. It is designed primarily for experimental purposes, as the fault tolerance of a file is only as good as the file system of which it is a part. A file must be specified by a full path. .It Sy mirror A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with N disks of size X can hold X bytes and can withstand (N-1) devices failing before data integrity is compromised. .It Sy raidz , raidz1 , raidz2 , raidz3 A variation on RAID-5 that allows for better distribution of parity and eliminates the RAID-5 .Qq write hole .Pq in which data and parity become inconsistent after a power loss . Data and parity are striped across all disks within a raidz group. .Pp A raidz group can have single-, double-, or triple-parity, meaning that the raidz group can sustain one, two, or three failures, respectively, without losing any data. The .Sy raidz1 vdev type specifies a single-parity raidz group; the .Sy raidz2 vdev type specifies a double-parity raidz group; and the .Sy raidz3 vdev type specifies a triple-parity raidz group. The .Sy raidz vdev type is an alias for .Sy raidz1 . .Pp A raidz group with N disks of size X with P parity disks can hold approximately (N-P)*X bytes and can withstand P device(s) failing before data integrity is compromised.
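.Pp
For example, a
.Sy raidz2
group of six disks of size 4TB (N=6, P=2) can hold approximately (6-2)*4TB = 16TB and can withstand any two of its disks failing.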
The minimum number of devices in a raidz group is one more than the number of parity disks. The recommended number is between 3 and 9 to help increase performance. .It Sy spare A special pseudo-vdev which keeps track of available hot spares for a pool. For more information, see the .Sx Hot Spares section. .It Sy log A separate intent log device. If more than one log device is specified, then writes are load-balanced between devices. Log devices can be mirrored. However, raidz vdev types are not supported for the intent log. For more information, see the .Sx Intent Log section. .It Sy dedup A device dedicated solely for allocating dedup data. The redundancy of this device should match the redundancy of the other normal devices in the pool. If more than one dedup device is specified, then allocations are load-balanced between devices. .It Sy special A device dedicated solely for allocating various kinds of internal metadata, and optionally small file data. The redundancy of this device should match the redundancy of the other normal devices in the pool. If more than one special device is specified, then allocations are load-balanced between devices. .Pp For more information on special allocations, see the .Sx Special Allocation Class section. .It Sy cache A device used to cache storage pool data. A cache device cannot be configured as a mirror or raidz group. For more information, see the .Sx Cache Devices section. .El .Pp Virtual devices cannot be nested, so a mirror or raidz virtual device can only contain files or disks. Mirrors of mirrors .Pq or other combinations are not allowed. .Pp A pool can have any number of virtual devices at the top of the configuration .Po known as .Qq root vdevs .Pc . Data is dynamically distributed across all top-level devices to balance data among devices. As new virtual devices are added, ZFS automatically places data on the newly available devices. .Pp Virtual devices are specified one at a time on the command line, separated by whitespace. The keywords .Sy mirror and .Sy raidz are used to distinguish where a group ends and another begins. For example, the following creates two root vdevs, each a mirror of two disks: .Bd -literal # zpool create mypool mirror sda sdb mirror sdc sdd .Ed .Ss Device Failure and Recovery ZFS supports a rich set of mechanisms for handling device failure and data corruption. All metadata and data is checksummed, and ZFS automatically repairs bad data from a good copy when corruption is detected. .Pp In order to take advantage of these features, a pool must make use of some form of redundancy, using either mirrored or raidz groups. While ZFS supports running in a non-redundant configuration, where each root vdev is simply a disk or file, this is strongly discouraged. A single case of bit corruption can render some or all of your data unavailable. .Pp A pool's health status is described by one of three states: online, degraded, or faulted. An online pool has all devices operating normally. A degraded pool is one in which one or more devices have failed, but the data is still available due to a redundant configuration. A faulted pool has corrupted metadata, or one or more faulted devices, and insufficient replicas to continue functioning. .Pp The health of the top-level vdev, such as mirror or raidz device, is potentially impacted by the state of its associated vdevs, or component devices. 
A top-level vdev or component device is in one of the following states: .Bl -tag -width "DEGRADED" .It Sy DEGRADED One or more top-level vdevs is in the degraded state because one or more component devices are offline. Sufficient replicas exist to continue functioning. .Pp One or more component devices is in the degraded or faulted state, but sufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet .It The number of checksum errors exceeds acceptable levels and the device is degraded as an indication that something may be wrong. ZFS continues to use the device as necessary. .It The number of I/O errors exceeds acceptable levels. The device could not be marked as faulted because there are insufficient replicas to continue functioning. .El .It Sy FAULTED One or more top-level vdevs is in the faulted state because one or more component devices are offline. Insufficient replicas exist to continue functioning. .Pp One or more component devices is in the faulted state, and insufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet .It The device could be opened, but the contents did not match expected values. .It The number of I/O errors exceeds acceptable levels and the device is faulted to prevent further use of the device. .El .It Sy OFFLINE The device was explicitly taken offline by the .Nm zpool Cm offline command. .It Sy ONLINE The device is online and functioning. .It Sy REMOVED The device was physically removed while the system was running. Device removal detection is hardware-dependent and may not be supported on all platforms. .It Sy UNAVAIL The device could not be opened. If a pool is imported when a device was unavailable, then the device will be identified by a unique identifier instead of its path since the path was never correct in the first place. .El .Pp If a device is removed and later re-attached to the system, ZFS attempts to put the device online automatically. Device attach detection is hardware-dependent and might not be supported on all platforms. .Ss Hot Spares ZFS allows devices to be associated with pools as .Qq hot spares . These devices are not actively used in the pool, but when an active device fails, it is automatically replaced by a hot spare. To create a pool with hot spares, specify a .Sy spare vdev with any number of devices. For example, .Bd -literal # zpool create pool mirror sda sdb spare sdc sdd .Ed .Pp Spares can be shared across multiple pools, and can be added with the .Nm zpool Cm add command and removed with the .Nm zpool Cm remove command. Once a spare replacement is initiated, a new .Sy spare vdev is created within the configuration that will remain there until the original device is replaced. At this point, the hot spare becomes available again if another device fails. .Pp If a pool has a shared spare that is currently being used, the pool cannot be exported since other pools may use this shared spare, which may lead to potential data corruption. .Pp An in-progress spare replacement can be cancelled by detaching the hot spare. If the original faulted device is detached, then the hot spare assumes its place in the configuration, and is removed from the spare list of all active pools. .Pp Spares cannot replace log devices. .Ss Intent Log The ZFS Intent Log (ZIL) satisfies POSIX requirements for synchronous transactions. For instance, databases often require their transactions to be on stable storage devices when returning from a system call.
NFS and other applications can also use .Xr fsync 2 to ensure data stability. By default, the intent log is allocated from blocks within the main pool. However, it might be possible to get better performance using separate intent log devices such as NVRAM or a dedicated disk. For example: .Bd -literal # zpool create pool sda sdb log sdc .Ed .Pp Multiple log devices can also be specified, and they can be mirrored. See the .Sx EXAMPLES section for an example of mirroring multiple log devices. .Pp Log devices can be added, replaced, attached, detached and removed. In addition, log devices are imported and exported as part of the pool that contains them. Mirrored devices can be removed by specifying the top-level mirror vdev. .Ss Cache Devices Devices can be added to a storage pool as .Qq cache devices . These devices provide an additional layer of caching between main memory and disk. For read-heavy workloads, where the working set size is much larger than what can be cached in main memory, using cache devices allows much more of this working set to be served from low latency media. Using cache devices provides the greatest performance improvement for random read-workloads of mostly static content. .Pp To create a pool with cache devices, specify a .Sy cache vdev with any number of devices. For example: .Bd -literal # zpool create pool sda sdb cache sdc sdd .Ed .Pp Cache devices cannot be mirrored or part of a raidz configuration. If a read error is encountered on a cache device, that read I/O is reissued to the original storage pool device, which might be part of a mirrored or raidz configuration. .Pp The content of the cache devices is considered volatile, as is the case with other system caches. .Ss Pool checkpoint Before starting critical procedures that include destructive actions (e.g. .Nm zfs Cm destroy ), an administrator can checkpoint the pool's state and, in the case of a mistake or failure, rewind the entire pool back to the checkpoint. Otherwise, the checkpoint can be discarded when the procedure has completed successfully. .Pp A pool checkpoint can be thought of as a pool-wide snapshot and should be used with care as it contains every part of the pool's state, from properties to vdev configuration. Thus, while a pool has a checkpoint, certain operations are not allowed. Specifically, vdev removal/attach/detach, mirror splitting, and changing the pool's guid. Adding a new vdev is supported but in the case of a rewind it will have to be added again. Finally, users of this feature should keep in mind that scrubs in a pool that has a checkpoint do not repair checkpointed data. .Pp To create a checkpoint for a pool: .Bd -literal # zpool checkpoint pool .Ed .Pp To later rewind to its checkpointed state, you need to first export it and then rewind it during import: .Bd -literal # zpool export pool # zpool import --rewind-to-checkpoint pool .Ed .Pp To discard the checkpoint from a pool: .Bd -literal # zpool checkpoint -d pool .Ed .Pp Dataset reservations (controlled by the .Nm reservation or .Nm refreservation zfs properties) may be unenforceable while a checkpoint exists, because the checkpoint is allowed to consume the dataset's reservation. Finally, data that is part of the checkpoint but has been freed in the current state of the pool won't be scanned during a scrub. .Ss Special Allocation Class The allocations in the special class are dedicated to specific block types. By default this includes all metadata, the indirect blocks of user data, and any dedup data.
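.Pp
For example, the following command creates a pool whose metadata (and, by default, dedup data) is allocated from a mirrored special vdev; the device names are illustrative:
.Bd -literal
# zpool create pool mirror sda sdb special mirror sdc sdd
.Ed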
The class can also be provisioned to accept a limited percentage of small file data blocks. .Pp A pool must always have at least one normal (non-dedup/special) vdev before other devices can be assigned to the special class. If the special class becomes full, then allocations intended for it will spill back into the normal class. .Pp Dedup data can be excluded from the special class by setting the .Sy zfs_ddt_data_is_special zfs module parameter to false (0). .Pp Inclusion of small file blocks in the special class is opt-in. Each dataset can control the size of small file blocks allowed in the special class by setting the .Sy special_small_blocks dataset property. It defaults to zero, so you must opt in by setting it to a non-zero value. See .Xr zfs 8 for more info on setting this property. .Ss Properties Each pool has several properties associated with it. Some properties are read-only statistics while others are configurable and change the behavior of the pool. .Pp The following are read-only properties: .Bl -tag -width Ds .It Sy allocated Amount of storage used within the pool. .It Sy capacity Percentage of pool space used. This property can also be referred to by its shortened column name, .Sy cap . .It Sy expandsize Amount of uninitialized space within the pool or device that can be used to increase the total capacity of the pool. Uninitialized space consists of any space on an EFI labeled vdev which has not been brought online .Po e.g., using .Nm zpool Cm online Fl e .Pc . This space occurs when a LUN is dynamically expanded. .It Sy fragmentation The amount of fragmentation in the pool. .It Sy free The amount of free space available in the pool. .It Sy freeing After a file system or snapshot is destroyed, the space it was using is returned to the pool asynchronously. .Sy freeing is the amount of space remaining to be reclaimed. Over time .Sy freeing will decrease while .Sy free increases. .It Sy health The current health of the pool. Health can be one of .Sy ONLINE , DEGRADED , FAULTED , OFFLINE , REMOVED , UNAVAIL . .It Sy guid A unique identifier for the pool. .It Sy load_guid A unique identifier for the pool. Unlike the .Sy guid property, this identifier is generated every time we load the pool (e.g. does not persist across imports/exports) and never changes while the pool is loaded (even if a .Sy reguid operation takes place). .It Sy size Total size of the storage pool. .It Sy unsupported@ Ns Em feature_guid Information about unsupported features that are enabled on the pool. See .Xr zpool-features 5 for details. .El .Pp The space usage properties report actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a raidz configuration depends on the characteristics of the data being written. In addition, ZFS reserves some space for internal accounting that the .Xr zfs 8 command takes into account, but the .Nm command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable. .Pp The following property can be set at creation time and import time: .Bl -tag -width Ds .It Sy altroot Alternate root directory. If set, this directory is prepended to any mount points within the pool.
This can be used when examining an unknown pool where the mount points cannot be trusted, or in an alternate boot environment, where the typical paths are not valid. .Sy altroot is not a persistent property. It is valid only while the system is up. Setting .Sy altroot defaults to using .Sy cachefile Ns = Ns Sy none , though this may be overridden using an explicit setting. .El .Pp The following property can be set only at import time: .Bl -tag -width Ds .It Sy readonly Ns = Ns Sy on Ns | Ns Sy off If set to .Sy on , the pool will be imported in read-only mode. This property can also be referred to by its shortened column name, .Sy rdonly . .El .Pp The following properties can be set at creation time and import time, and later changed with the .Nm zpool Cm set command: .Bl -tag -width Ds .It Sy ashift Ns = Ns Sy ashift Pool sector size exponent, to the power of .Sy 2 (internally referred to as .Sy ashift ). Values from 9 to 16, inclusive, are valid; also, the special value 0 (the default) means to auto-detect using the kernel's block layer and a ZFS internal exception list. I/O operations will be aligned to the specified size boundaries. Additionally, the minimum (disk) write size will be set to the specified size, so this represents a space vs. performance trade-off. For optimal performance, the pool sector size should be greater than or equal to the sector size of the underlying disks. The typical case for setting this property is when performance is important and the underlying disks use 4KiB sectors but report 512B sectors to the OS (for compatibility reasons); in that case, set .Sy ashift=12 (which is 1<<12 = 4096). When set, this property is used as the default hint value in subsequent vdev operations (add, attach and replace). Changing this value will not modify any existing vdev, not even on disk replacement; however, it can be used, for instance, to replace a dying 512B-sector disk with a newer 4KiB-sector device: this will probably result in bad performance but at the same time could prevent loss of data. .It Sy autoexpand Ns = Ns Sy on Ns | Ns Sy off Controls automatic pool expansion when the underlying LUN is grown. If set to .Sy on , the pool will be resized according to the size of the expanded device. If the device is part of a mirror or raidz then all devices within that mirror/raidz group must be expanded before the new space is made available to the pool. The default behavior is .Sy off . This property can also be referred to by its shortened column name, .Sy expand . .It Sy autoreplace Ns = Ns Sy on Ns | Ns Sy off Controls automatic device replacement. If set to .Sy off , device replacement must be initiated by the administrator by using the .Nm zpool Cm replace command. If set to .Sy on , any new device, found in the same physical location as a device that previously belonged to the pool, is automatically formatted and replaced. The default behavior is .Sy off . This property can also be referred to by its shortened column name, .Sy replace . Autoreplace can also be used with virtual disks (like device mapper) provided that you use the /dev/disk/by-vdev paths set up by vdev_id.conf. See the .Xr vdev_id 8 man page for more details. Autoreplace and autoonline require that the ZFS Event Daemon be configured and running. See the .Xr zed 8 man page for more details. .It Sy bootfs Ns = Ns Sy (unset) Ns | Ns Ar pool Ns / Ns Ar dataset Identifies the default bootable dataset for the root pool. This property is expected to be set mainly by the installation and upgrade programs.
Not all Linux distribution boot processes use the bootfs property. .It Sy cachefile Ns = Ns Ar path Ns | Ns Sy none Controls where the pool configuration is cached. Discovering all pools on system startup requires a cached copy of the configuration data that is stored on the root file system. All pools in this cache are automatically imported when the system boots. Some environments, such as install and clustering, need to cache this information in a different location so that pools are not automatically imported. Setting this property caches the pool configuration in a different location that can later be imported with .Nm zpool Cm import Fl c . Setting it to the special value .Sy none creates a temporary pool that is never cached, and the special value .Qq .Pq empty string uses the default location. .Pp Multiple pools can share the same cache file. Because the kernel destroys and recreates this file when pools are added and removed, care should be taken when attempting to access this file. When the last pool using a .Sy cachefile is exported or destroyed, the file will be empty. .It Sy comment Ns = Ns Ar text A text string consisting of printable ASCII characters that will be stored such that it is available even if the pool becomes faulted. An administrator can provide additional information about a pool using this property. .It Sy dedupditto Ns = Ns Ar number Threshold for the number of block ditto copies. If the reference count for a deduplicated block increases above this number, a new ditto copy of this block is automatically stored. The default setting is .Sy 0 which causes no ditto copies to be created for deduplicated blocks. The minimum legal nonzero setting is .Sy 100 . .It Sy delegation Ns = Ns Sy on Ns | Ns Sy off Controls whether a non-privileged user is granted access based on the dataset permissions defined on the dataset. See .Xr zfs 8 for more information on ZFS delegated administration. .It Sy failmode Ns = Ns Sy wait Ns | Ns Sy continue Ns | Ns Sy panic Controls the system behavior in the event of catastrophic pool failure. This condition is typically a result of a loss of connectivity to the underlying storage device(s) or a failure of all devices within the pool. The behavior of such an event is determined as follows: .Bl -tag -width "continue" .It Sy wait Blocks all I/O access until the device connectivity is recovered and the errors are cleared. This is the default behavior. .It Sy continue Returns .Er EIO to any new write I/O requests but allows reads to any of the remaining healthy devices. Any write requests that have yet to be committed to disk would be blocked. .It Sy panic Prints out a message to the console and generates a system crash dump. .El .It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled The value of this property is the current state of .Ar feature_name . The only valid value when setting this property is .Sy enabled which moves .Ar feature_name to the enabled state. See .Xr zpool-features 5 for details on feature states. .It Sy listsnapshots Ns = Ns Sy on Ns | Ns Sy off Controls whether information about snapshots associated with this pool is output when .Nm zfs Cm list is run without the .Fl t option. The default value is .Sy off . This property can also be referred to by its shortened name, .Sy listsnaps . .It Sy multihost Ns = Ns Sy on Ns | Ns Sy off Controls whether a pool activity check should be performed during .Nm zpool Cm import . When a pool is determined to be active it cannot be imported, even with the .Fl f option.
This property is intended to be used in failover configurations where multiple hosts have access to a pool on shared storage. When this property is on, periodic writes to storage occur to show the pool is in use. See .Sy zfs_multihost_interval in the .Xr zfs-module-parameters 5 man page. In order to enable this property each host must set a unique hostid. See .Xr genhostid 1 , .Xr zgenhostid 8 , and .Xr spl-module-parameters 5 for additional details. The default value is .Sy off . .It Sy version Ns = Ns Ar version The current on-disk version of the pool. This can be increased, but never decreased. The preferred method of updating pools is with the .Nm zpool Cm upgrade command, though this property can be used when a specific version is needed for backwards compatibility. Once feature flags are enabled on a pool this property will no longer have a value. .El .Ss Subcommands All subcommands that modify state are logged persistently to the pool in their original form. .Pp The .Nm command provides subcommands to create and destroy storage pools, add capacity to storage pools, and provide information about the storage pools. The following subcommands are supported: .Bl -tag -width Ds .It Xo .Nm .Fl ? .Xc Displays a help message. .It Xo .Nm .Cm add .Op Fl fgLnP .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool vdev Ns ... .Xc Adds the specified virtual devices to the given pool. The .Ar vdev specification is described in the .Sx Virtual Devices section. The behavior of the .Fl f option and the device checks performed are described in the .Nm zpool Cm create subcommand. .Bl -tag -width Ds .It Fl f Forces use of .Ar vdev Ns s , even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner. .It Fl g Display .Ar vdev GUIDs instead of the normal device names. These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl L Display real paths for .Ar vdev Ns s resolving all symbolic links. This can be used to look up the current block device name regardless of the /dev/disk/ path used to open it. .It Fl n Displays the configuration that would be used without actually adding the .Ar vdev Ns s . The actual pool creation can still fail due to insufficient privileges or device sharing. .It Fl P Display real paths for .Ar vdev Ns s instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .It Fl o Ar property Ns = Ns Ar value Sets the given pool properties. See the .Sx Properties section for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . .El .It Xo .Nm .Cm attach .Op Fl f .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool device new_device .Xc Attaches .Ar new_device to the existing .Ar device . The existing device cannot be part of a raidz configuration. If .Ar device is not currently part of a mirrored configuration, .Ar device automatically transforms into a two-way mirror of .Ar device and .Ar new_device . If .Ar device is part of a two-way mirror, attaching .Ar new_device creates a three-way mirror, and so on. In either case, .Ar new_device begins to resilver immediately. .Bl -tag -width Ds .It Fl f Forces use of .Ar new_device , even if it appears to be in use. Not all devices can be overridden in this manner. .It Fl o Ar property Ns = Ns Ar value Sets the given pool properties. See the .Sx Properties section for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift .
.El .It Xo .Nm .Cm checkpoint .Op Fl d, -discard .Ar pool .Xc Checkpoints the current state of .Ar pool , which can be later restored by .Nm zpool Cm import --rewind-to-checkpoint . The existence of a checkpoint in a pool prohibits the following .Nm zpool commands: .Cm remove , .Cm attach , .Cm detach , .Cm split , and .Cm reguid . In addition, it may break reservation boundaries if the pool lacks free space. The .Nm zpool Cm status command indicates the existence of a checkpoint or the progress of discarding a checkpoint from a pool. The .Nm zpool Cm list command reports how much space the checkpoint takes from the pool. .Bl -tag -width Ds .It Fl d, -discard Discards an existing checkpoint from .Ar pool . .El .It Xo .Nm .Cm clear .Ar pool .Op Ar device .Xc Clears device errors in a pool. If no arguments are specified, all device errors within the pool are cleared. If one or more devices is specified, only those errors associated with the specified device or devices are cleared. .It Xo .Nm .Cm create .Op Fl dfn .Op Fl m Ar mountpoint .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Oo Fl o Ar feature@feature Ns = Ns Ar value Oc Ns ... .Oo Fl O Ar file-system-property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Op Fl t Ar tname .Ar pool vdev Ns ... .Xc Creates a new storage pool containing the virtual devices specified on the command line. The pool name must begin with a letter, and can only contain alphanumeric characters as well as underscore .Pq Qq Sy _ , dash .Pq Qq Sy \&- , colon .Pq Qq Sy \&: , space .Pq Qq Sy \&\ , and period .Pq Qq Sy \&. . The pool names .Sy mirror , .Sy raidz , .Sy spare , and .Sy log are reserved, as are names beginning with .Sy mirror , .Sy raidz , .Sy spare , and the pattern .Sy c[0-9] . The .Ar vdev specification is described in the .Sx Virtual Devices section. .Pp The command verifies that each device specified is accessible and not currently in use by another subsystem. There are some uses, such as being currently mounted, or specified as the dedicated dump device, that prevent a device from ever being used by ZFS. Other uses, such as having a preexisting UFS file system, can be overridden with the .Fl f option. .Pp The command also checks that the replication strategy for the pool is consistent. An attempt to combine redundant and non-redundant storage in a single pool, or to mix disks and files, results in an error unless .Fl f is specified. The use of differently sized devices within a single raidz or mirror group is also flagged as an error unless .Fl f is specified. .Pp Unless the .Fl R option is specified, the default mount point is .Pa / Ns Ar pool . The mount point must not exist or must be empty, or else the root dataset cannot be mounted. This can be overridden with the .Fl m option. .Pp By default all supported features are enabled on the new pool unless the .Fl d option is specified. .Bl -tag -width Ds .It Fl d Do not enable any features on the new pool. Individual features can be enabled by setting their corresponding properties to .Sy enabled with the .Fl o option. See .Xr zpool-features 5 for details about feature properties. .It Fl f Forces use of .Ar vdev Ns s , even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner. .It Fl m Ar mountpoint Sets the mount point for the root dataset. The default mount point is .Pa /pool or .Pa altroot/pool if .Ar altroot is specified. The mount point must be an absolute path, .Sy legacy , or .Sy none .
For more information on dataset mount points, see .Xr zfs 8 . .It Fl n Displays the configuration that would be used without actually creating the pool. The actual pool creation can still fail due to insufficient privileges or device sharing. .It Fl o Ar property Ns = Ns Ar value Sets the given pool properties. See the .Sx Properties section for a list of valid properties that can be set. .It Fl o Ar feature@feature Ns = Ns Ar value Sets the given pool feature. See .Xr zpool-features 5 for a list of valid features that can be set. The value can be either disabled or enabled. .It Fl O Ar file-system-property Ns = Ns Ar value Sets the given file system properties in the root file system of the pool. See the .Sx Properties section of .Xr zfs 8 for a list of valid properties that can be set. .It Fl R Ar root Equivalent to .Fl o Sy cachefile Ns = Ns Sy none Fl o Sy altroot Ns = Ns Ar root .It Fl t Ar tname Sets the in-core pool name to .Sy tname while the on-disk name will be the name specified as the pool name .Sy pool . This will set the default cachefile property to none. This is intended to handle name space collisions when creating pools for other systems, such as virtual machines or physical machines whose pools live on network block devices. .El .It Xo .Nm .Cm destroy .Op Fl f .Ar pool .Xc Destroys the given pool, freeing up any devices for other use. This command tries to unmount any active datasets before destroying the pool. .Bl -tag -width Ds .It Fl f Forces any active datasets contained within the pool to be unmounted. .El .It Xo .Nm .Cm detach .Ar pool device .Xc Detaches .Ar device from a mirror. The operation is refused if there are no other valid replicas of the data. If the device may be re-added to the pool later on, consider the .Nm zpool Cm offline command instead. .It Xo .Nm .Cm events .Op Fl vHf Oo Ar pool Oc | Fl c .Xc Lists all recent events generated by the ZFS kernel modules. These events are consumed by .Xr zed 8 and used to automate administrative tasks such as replacing a failed device with a hot spare. For more information about the subclasses and event payloads that can be generated, see the .Xr zfs-events 5 man page. .Bl -tag -width Ds .It Fl c Clear all previous events. .It Fl f Follow mode. .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl v Print the entire payload for each event. .El .It Xo .Nm .Cm export .Op Fl a .Op Fl f .Ar pool Ns ... .Xc Exports the given pools from the system. All devices are marked as exported, but are still considered in use by other subsystems. The devices can be moved between systems .Pq even those of different endianness and imported as long as a sufficient number of devices are present. .Pp Before exporting the pool, all datasets within the pool are unmounted. A pool cannot be exported if it has a shared spare that is currently being used. .Pp For pools to be portable, you must give the .Nm command whole disks, not just partitions, so that ZFS can label the disks with portable EFI labels. Otherwise, disk drivers on platforms of different endianness will not recognize the disks. .Bl -tag -width Ds .It Fl a Exports all pools imported on the system. .It Fl f Forcefully unmount all datasets, using the .Nm unmount Fl f command. .Pp This command will forcefully export the pool even if it has a shared spare that is currently being used. This may lead to potential data corruption. .El .It Xo .Nm .Cm get .Op Fl Hp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ...
.Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns ... .Oo Ar pool Oc Ns ... .Xc Retrieves the given list of properties .Po or all properties if .Sy all is used .Pc for the specified storage pool(s). These properties are displayed with the following fields: .Bd -literal name Name of storage pool property Property name value Property value source Property source, either 'default' or 'local'. .Ed .Pp See the .Sx Properties section for more information on the available pool properties. .Bl -tag -width Ds .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl o Ar field A comma-separated list of columns to display. .Sy name Ns \&, Ns Sy property Ns \&, Ns Sy value Ns \&, Ns Sy source is the default value. .It Fl p Display numbers in parsable (exact) values. .El .It Xo .Nm .Cm history .Op Fl il .Oo Ar pool Oc Ns ... .Xc Displays the command history of the specified pool(s) or all pools if no pool is specified. .Bl -tag -width Ds .It Fl i Displays internally logged ZFS events in addition to user initiated events. .It Fl l Displays log records in long format, which, in addition to the standard format, includes the user name, the hostname, and the zone in which the operation was performed. .El .It Xo .Nm .Cm import .Op Fl D .Op Fl d Ar dir Ns | Ns device .Xc Lists pools available to import. If the .Fl d option is not specified, this command searches for devices in .Pa /dev . The .Fl d option can be specified multiple times, and all directories are searched. If the device appears to be part of an exported pool, this command displays a summary of the pool with the name of the pool, a numeric identifier, as well as the vdev layout and current health of the device for each device or file. Destroyed pools, pools that were previously destroyed with the .Nm zpool Cm destroy command, are not listed unless the .Fl D option is specified. .Pp The numeric identifier is unique, and can be used instead of the pool name when multiple exported pools of the same name are available. .Bl -tag -width Ds .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Ns | Ns Ar device Uses .Ar device or searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. .It Fl D Lists destroyed pools only. .El .It Xo .Nm .Cm import .Fl a .Op Fl DflmN .Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Op Fl s .Xc Imports all pools found in the search directories. Identical to the previous command, except that all pools with a sufficient number of devices available are imported. Destroyed pools, pools that were previously destroyed with the .Nm zpool Cm destroy command, will not be imported unless the .Fl D option is specified. .Bl -tag -width Ds .It Fl a Searches for and imports all pools found. .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Ns | Ns Ar device Uses .Ar device or searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. This option is incompatible with the .Fl c option. .It Fl D Imports destroyed pools only. The .Fl f option is also required.
.It Fl f Forces import, even if the pool appears to be potentially active. .It Fl F Recovery mode for a non-importable pool. Attempt to return the pool to an importable state by discarding the last few transactions. Not all damaged pools can be recovered by using this option. If successful, the data from the discarded transactions is irretrievably lost. This option is ignored if the pool is importable or already imported. .It Fl l Indicates that this command will request encryption keys for all encrypted datasets it attempts to mount as it is bringing the pool online. Note that if any datasets have a .Sy keylocation of .Sy prompt this command will block waiting for the keys to be entered. Without this flag, encrypted datasets will be left unavailable until the keys are loaded. .It Fl m Allows a pool to import when there is a missing log device. Recent transactions can be lost because the log device will be discarded. .It Fl n Used with the .Fl F recovery option. Determines whether a non-importable pool can be made importable again, but does not actually perform the pool recovery. For more details about pool recovery mode, see the .Fl F option, above. .It Fl N Import the pool without mounting any file systems. .It Fl o Ar mntopts Comma-separated list of mount options to use when mounting datasets within the pool. See .Xr zfs 8 for a description of dataset properties and mount options. .It Fl o Ar property Ns = Ns Ar value Sets the specified property on the imported pool. See the .Sx Properties section for more information on the available pool properties. .It Fl R Ar root Sets the .Sy cachefile property to .Sy none and the .Sy altroot property to .Ar root . .It Fl -rewind-to-checkpoint Rewinds pool to the checkpointed state. Once the pool is imported with this flag there is no way to undo the rewind. All changes and data that were written after the checkpoint are lost! The only exception is when the .Sy readonly mounting option is enabled. In this case, the checkpointed state of the pool is opened and an administrator can see what the pool would look like if they were to fully rewind. .It Fl s Scan using the default search path; the libblkid cache will not be consulted. A custom search path may be specified by setting the ZPOOL_IMPORT_PATH environment variable. .It Fl X Used with the .Fl F recovery option. Determines whether extreme measures to find a valid txg should take place. This allows the pool to be rolled back to a txg which is no longer guaranteed to be consistent. Pools imported at an inconsistent txg may contain uncorrectable checksum errors. For more details about pool recovery mode, see the .Fl F option, above. WARNING: This option can be extremely hazardous to the health of your pool and should only be used as a last resort. .It Fl T Specify the txg to use for rollback. Implies .Fl FX . For more details about pool recovery mode, see the .Fl X option, above. WARNING: This option can be extremely hazardous to the health of your pool and should only be used as a last resort. .El .It Xo .Nm .Cm import .Op Fl Dflm .Op Fl F Oo Fl n Oc Oo Fl t Oc Oo Fl T Oc Oo Fl X Oc .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Op Fl s .Ar pool Ns | Ns Ar id .Op Ar newpool .Xc Imports a specific pool. A pool can be identified by its name or the numeric identifier. If .Ar newpool is specified, the pool is imported using the name .Ar newpool .
Otherwise, it is imported with the same name as its exported name. .Pp If a device is removed from a system without running .Nm zpool Cm export first, the device appears as potentially active. It cannot be determined if this was a failed export, or whether the device is really in use from another host. To import a pool in this state, the .Fl f option is required. .Bl -tag -width Ds .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Ns | Ns Ar device Uses .Ar device or searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. This option is incompatible with the .Fl c option. .It Fl D Imports a destroyed pool. The .Fl f option is also required. .It Fl f Forces import, even if the pool appears to be potentially active. .It Fl F Recovery mode for a non-importable pool. Attempt to return the pool to an importable state by discarding the last few transactions. Not all damaged pools can be recovered by using this option. If successful, the data from the discarded transactions is irretrievably lost. This option is ignored if the pool is importable or already imported. .It Fl l Indicates that this command will request encryption keys for all encrypted datasets it attempts to mount as it is bringing the pool online. Note that if any datasets have a .Sy keylocation of .Sy prompt this command will block waiting for the keys to be entered. Without this flag, encrypted datasets will be left unavailable until the keys are loaded. .It Fl m Allows a pool to import when there is a missing log device. Recent transactions can be lost because the log device will be discarded. .It Fl n Used with the .Fl F recovery option. Determines whether a non-importable pool can be made importable again, but does not actually perform the pool recovery. For more details about pool recovery mode, see the .Fl F option, above. .It Fl o Ar mntopts Comma-separated list of mount options to use when mounting datasets within the pool. See .Xr zfs 8 for a description of dataset properties and mount options. .It Fl o Ar property Ns = Ns Ar value Sets the specified property on the imported pool. See the .Sx Properties section for more information on the available pool properties. .It Fl R Ar root Sets the .Sy cachefile property to .Sy none and the .Sy altroot property to .Ar root . .It Fl s Scan using the default search path; the libblkid cache will not be consulted. A custom search path may be specified by setting the ZPOOL_IMPORT_PATH environment variable. .It Fl X Used with the .Fl F recovery option. Determines whether extreme measures to find a valid txg should take place. This allows the pool to be rolled back to a txg which is no longer guaranteed to be consistent. Pools imported at an inconsistent txg may contain uncorrectable checksum errors. For more details about pool recovery mode, see the .Fl F option, above. WARNING: This option can be extremely hazardous to the health of your pool and should only be used as a last resort. .It Fl T Specify the txg to use for rollback. Implies .Fl FX . For more details about pool recovery mode, see the .Fl X option, above. WARNING: This option can be extremely hazardous to the health of your pool and should only be used as a last resort. .It Fl t Used with .Sy newpool . Specifies that .Sy newpool is temporary. Temporary pool names last until export.
Ensures that the original pool name will be used in all label updates and therefore is retained upon export. Will also set -o cachefile=none when not explicitly specified. .El .It Xo .Nm .Cm iostat .Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw .Op Fl T Sy u Ns | Ns Sy d .Op Fl ghHLpPvy .Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... Oc Oc .Op Ar interval Op Ar count .Xc Displays I/O statistics for the given pools/vdevs. You can pass in a list of pools, a pool and list of vdevs in that pool, or a list of any vdevs from any pool. If no items are specified, statistics for every pool in the system are shown. When given an .Ar interval , the statistics are printed every .Ar interval seconds until ^C is pressed. If count is specified, the command exits after count reports are printed. The first report printed is always the statistics since boot regardless of whether .Ar interval and .Ar count are passed. However, this behavior can be suppressed with the .Fl y flag. Also note that the units of .Sy K , .Sy M , .Sy G ... that are printed in the report are in base 1024. To get the raw values, use the .Fl p flag. .Bl -tag -width Ds .It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ... Run a script (or scripts) on each vdev and include the output as a new column in the .Nm zpool Cm iostat output. Users can run any script found in their .Pa ~/.zpool.d directory or from the system .Pa /etc/zfs/zpool.d directory. Script names containing the slash (/) character are not allowed. The default search path can be overridden by setting the ZPOOL_SCRIPTS_PATH environment variable. A privileged user can run .Fl c if they have the ZPOOL_SCRIPTS_AS_ROOT environment variable set. If a script requires the use of a privileged command, like .Xr smartctl 8 , then it's recommended you allow the user access to it in .Pa /etc/sudoers or add the user to the .Pa /etc/sudoers.d/zfs file. .Pp If .Fl c is passed without a script name, it prints a list of all scripts. .Fl c also sets verbose mode .No \&( Ns Fl v Ns No \&). .Pp Script output should be in the form of "name=value". The column name is set to "name" and the value is set to "value". Multiple lines can be used to output multiple columns. The first line of output not in the "name=value" format is displayed without a column title, and no more output after that is displayed. This can be useful for printing error messages. Blank or NULL values are printed as a '-' to make output awk-able. .Pp The following environment variables are set before running each script: .Bl -tag -width "VDEV_PATH" .It Sy VDEV_PATH Full path to the vdev .El .Bl -tag -width "VDEV_UPATH" .It Sy VDEV_UPATH Underlying path to the vdev (/dev/sd*). For use with device mapper, multipath, or partitioned vdevs. .El .Bl -tag -width "VDEV_ENC_SYSFS_PATH" .It Sy VDEV_ENC_SYSFS_PATH The sysfs path to the enclosure for the vdev (if any). .El .It Fl T Sy u Ns | Ns Sy d Display a time stamp. Specify .Sy u for a printed representation of the internal representation of time. See .Xr time 2 . Specify .Sy d for standard date format. See .Xr date 1 . .It Fl g Display vdev GUIDs instead of the normal device names. These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl L Display real paths for vdevs resolving all symbolic links. 
This can be used to look up the current block device name regardless of the .Pa /dev/disk/ path used to open it. .It Fl p Display numbers in parsable (exact) values. Time values are in nanoseconds. .It Fl P Display full paths for vdevs instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .It Fl r Print request size histograms for the leaf ZIOs. This includes histograms of individual ZIOs ( .Ar ind ) and aggregate ZIOs ( .Ar agg ). These stats can be useful for seeing how well the ZFS IO aggregator is working. Do not confuse these request size stats with the block layer requests; it's possible ZIOs can be broken up before being sent to the block device. .It Fl v Verbose statistics. Reports usage statistics for individual vdevs within the pool, in addition to the pool-wide statistics. .It Fl y Omit statistics since boot. Normally the first line of output reports the statistics since boot. This option suppresses that first line of output. .It Fl w Display latency histograms: .Pp .Ar total_wait : Total IO time (queuing + disk IO time). .Ar disk_wait : Disk IO time (time reading/writing the disk). .Ar syncq_wait : Amount of time IO spent in synchronous priority queues. Does not include disk time. .Ar asyncq_wait : Amount of time IO spent in asynchronous priority queues. Does not include disk time. .Ar scrub : Amount of time IO spent in scrub queue. Does not include disk time. .It Fl l Include average latency statistics: .Pp .Ar total_wait : Average total IO time (queuing + disk IO time). .Ar disk_wait : Average disk IO time (time reading/writing the disk). .Ar syncq_wait : Average amount of time IO spent in synchronous priority queues. Does not include disk time. .Ar asyncq_wait : Average amount of time IO spent in asynchronous priority queues. Does not include disk time. .Ar scrub : Average queuing time in scrub queue. Does not include disk time. .It Fl q Include active queue statistics. Each priority queue has both pending ( .Ar pend ) and active ( .Ar activ ) IOs. Pending IOs are waiting to be issued to the disk, and active IOs have been issued to disk and are waiting for completion. These stats are broken out by priority queue: .Pp .Ar syncq_read/write : Current number of entries in synchronous priority queues. .Ar asyncq_read/write : Current number of entries in asynchronous priority queues. .Ar scrubq_read : Current number of entries in scrub queue. .Pp All queue statistics are instantaneous measurements of the number of entries in the queues. If you specify an interval, the measurements will be sampled from the end of the interval. .El .It Xo .Nm .Cm labelclear .Op Fl f .Ar device .Xc Removes ZFS label information from the specified .Ar device . The .Ar device must not be part of an active pool configuration. .Bl -tag -width Ds .It Fl f Treat exported or foreign devices as inactive. .El .It Xo .Nm .Cm list .Op Fl HgLpPv .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count .Xc Lists the given pools along with a health status and space usage. If no .Ar pool Ns s are specified, all pools in the system are listed. When given an .Ar interval , the information is printed every .Ar interval seconds until ^C is pressed. If .Ar count is specified, the command exits after .Ar count reports are printed. .Bl -tag -width Ds .It Fl g Display vdev GUIDs instead of the normal device names.
These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl o Ar property Comma-separated list of properties to display. See the .Sx Properties section for a list of valid properties. The default list is .Cm name , size , allocated , free , checkpoint , expandsize , fragmentation , .Cm capacity , dedupratio , health , altroot . .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the /dev/disk/ path used to open it. .It Fl p Display numbers in parsable .Pq exact values. .It Fl P Display full paths for vdevs instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .It Fl T Sy u Ns | Ns Sy d Display a time stamp. Specify .Sy u for a printed representation of the internal representation of time. See .Xr time 2 . Specify .Sy d for standard date format. See .Xr date 1 . .It Fl v Verbose statistics. Reports usage statistics for individual vdevs within the pool, in addition to the pool-wide statistics. .El .It Xo .Nm .Cm offline .Op Fl f .Op Fl t .Ar pool Ar device Ns ... .Xc Takes the specified physical device offline. While the .Ar device is offline, no attempt is made to read or write to the device. This command is not applicable to spares. .Bl -tag -width Ds .It Fl f Force fault. Instead of offlining the disk, put it into a faulted state. The fault will persist across imports unless the .Fl t flag was specified. .It Fl t Temporary. Upon reboot, the specified physical device reverts to its previous state. .El .It Xo .Nm .Cm online .Op Fl e .Ar pool Ar device Ns ... .Xc Brings the specified physical device online. This command is not applicable to spares. .Bl -tag -width Ds .It Fl e Expand the device to use all available space. If the device is part of a mirror or raidz then all devices must be expanded before the new space will become available to the pool. .El .It Xo .Nm .Cm reguid .Ar pool .Xc Generates a new unique identifier for the pool. You must ensure that all devices in this pool are online and healthy before performing this action. .It Xo .Nm .Cm reopen .Op Fl n .Ar pool .Xc Reopen all the vdevs associated with the pool. .Bl -tag -width Ds .It Fl n Do not restart an in-progress scrub operation. This is not recommended and can result in partially resilvered devices unless a second scrub is performed. .El .It Xo .Nm .Cm remove .Op Fl np .Ar pool Ar device Ns ... .Xc Removes the specified device from the pool. This command supports removing hot spare, cache, log, and both mirrored and non-redundant primary top-level vdevs, including dedup and special vdevs. When the primary pool storage includes a top-level raidz vdev, only hot spare, cache, and log devices can be removed. .Pp Removing a top-level vdev reduces the total amount of space in the storage pool. The specified device will be evacuated by copying all allocated space from it to the other devices in the pool. In this case, the .Nm zpool Cm remove command initiates the removal and returns, while the evacuation continues in the background. The removal progress can be monitored with .Nm zpool Cm status . The .Sy device_removal feature flag must be enabled to remove a top-level vdev, see .Xr zpool-features 5 . .Pp A mirrored top-level device (log or data) can be removed by specifying the top-level mirror for the same.
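.Pp
For example, a top-level data mirror reported by
.Nm zpool Cm status
as
.Sy mirror-1
(an illustrative name) could be removed with:
.Bd -literal
# zpool remove tank mirror-1
.Ed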
Non-log devices or data devices that are part of a mirrored configuration can be removed using the .Nm zpool Cm detach command. .Bl -tag -width Ds .It Fl n Do not actually perform the removal ("no-op"). Instead, print the estimated amount of memory that will be used by the mapping table after the removal completes. This is nonzero only for top-level vdevs. .El .Bl -tag -width Ds .It Fl p Used in conjunction with the .Fl n flag, displays numbers as parsable (exact) values. .El .It Xo .Nm .Cm remove .Fl s .Ar pool .Xc Stops and cancels an in-progress removal of a top-level vdev. .It Xo .Nm .Cm replace .Op Fl f .Op Fl o Ar property Ns = Ns Ar value .Ar pool Ar device Op Ar new_device .Xc Replaces .Ar old_device with .Ar new_device . This is equivalent to attaching .Ar new_device , waiting for it to resilver, and then detaching .Ar old_device . .Pp The size of .Ar new_device must be greater than or equal to the minimum size of all the devices in a mirror or raidz configuration. .Pp .Ar new_device is required if the pool is not redundant. If .Ar new_device is not specified, it defaults to .Ar old_device . This form of replacement is useful after an existing disk has failed and has been physically replaced. In this case, the new disk may have the same .Pa /dev path as the old device, even though it is actually a different disk. ZFS recognizes this. .Bl -tag -width Ds .It Fl f Forces use of .Ar new_device , even if it appears to be in use. Not all devices can be overridden in this manner. .It Fl o Ar property Ns = Ns Ar value Sets the given pool properties. See the .Sx Properties section for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . .El .It Xo .Nm .Cm scrub .Op Fl s | Fl p .Ar pool Ns ... .Xc Begins a scrub or resumes a paused scrub. The scrub examines all data in the specified pools to verify that it checksums correctly. For replicated .Pq mirror or raidz devices, ZFS automatically repairs any damage discovered during the scrub. The .Nm zpool Cm status command reports the progress of the scrub and summarizes the results of the scrub upon completion. .Pp Scrubbing and resilvering are very similar operations. The difference is that resilvering only examines data that ZFS knows to be out of date .Po for example, when attaching a new device to a mirror or replacing an existing device .Pc , whereas scrubbing examines all data to discover silent errors due to hardware faults or disk failure. .Pp Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows one at a time. If a scrub is paused, the .Nm zpool Cm scrub command resumes it. If a resilver is in progress, ZFS does not allow a scrub to be started until the resilver completes. .Bl -tag -width Ds .It Fl s Stop scrubbing. .El .Bl -tag -width Ds .It Fl p Pause scrubbing. Scrub pause state and progress are periodically synced to disk. If the system is restarted or the pool is exported during a paused scrub, even after import, the scrub will remain paused until it is resumed. Once resumed, the scrub will pick up from the place where it was last checkpointed to disk. To resume a paused scrub, issue .Nm zpool Cm scrub again. .El .It Xo .Nm .Cm resilver .Ar pool Ns ... .Xc Starts a resilver. If an existing resilver is already running, it will be restarted from the beginning. Any drives that were scheduled for a deferred resilver will be added to the new one. .It Xo .Nm .Cm set .Ar property Ns = Ns Ar value .Ar pool .Xc Sets the given property on the specified pool.
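.Pp
For example, to enable the
.Sy autoexpand
property on a pool named
.Ar tank :
.Bd -literal
# zpool set autoexpand=on tank
.Ed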
.It Xo
.Nm
.Cm set
.Ar property Ns = Ns Ar value
.Ar pool
.Xc
Sets the given property on the specified pool.
See the
.Sx Properties
section for more information on what properties can be set and acceptable
values.
.It Xo
.Nm
.Cm split
.Op Fl gLlnP
.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
.Op Fl R Ar root
.Ar pool newpool
.Op Ar device ...
.Xc
Splits devices off
.Ar pool
creating
.Ar newpool .
All vdevs in
.Ar pool
must be mirrors, and the pool must not be in the process of resilvering.
At the time of the split,
.Ar newpool
will be a replica of
.Ar pool .
By default, the last device in each mirror is split from
.Ar pool
to create
.Ar newpool .
.Pp
The optional device specification causes the specified device(s) to be
included in
.Ar newpool
and, should any devices remain unspecified, the last device in each mirror is
used as it would be by default.
.Bl -tag -width Ds
.It Fl g
Display vdev GUIDs instead of the normal device names.
These GUIDs can be used in place of device names for the zpool
detach/offline/remove/replace commands.
.It Fl L
Display real paths for vdevs resolving all symbolic links.
This can be used to look up the current block device name regardless of the
.Pa /dev/disk/
path used to open it.
.It Fl l
Indicates that this command will request encryption keys for all encrypted
datasets it attempts to mount as it is bringing the new pool online.
Note that if any datasets have a
.Sy keylocation
of
.Sy prompt ,
this command will block waiting for the keys to be entered.
Without this flag, encrypted datasets will be left unavailable until the keys
are loaded.
.It Fl n
Perform a dry run; do not actually perform the split.
Print out the expected configuration of
.Ar newpool .
.It Fl P
Display full paths for vdevs instead of only the last component of the path.
This can be used in conjunction with the
.Fl L
flag.
.It Fl o Ar property Ns = Ns Ar value
Sets the specified property for
.Ar newpool .
See the
.Sx Properties
section for more information on the available pool properties.
.It Fl R Ar root
Set
.Sy altroot
for
.Ar newpool
to
.Ar root
and automatically import it.
.El
.It Xo
.Nm
.Cm status
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ...
-.Op Fl gLPvxD
+.Op Fl DgLpPsvx
.Op Fl T Sy u Ns | Ns Sy d
.Oo Ar pool Oc Ns ...
.Op Ar interval Op Ar count
.Xc
Displays the detailed health status for the given pools.
If no
.Ar pool
is specified, then the status of each pool in the system is displayed.
For more information on pool and device health, see the
.Sx Device Failure and Recovery
section.
.Pp
If a scrub or resilver is in progress, this command reports the percentage
done and the estimated time to completion.
Both of these are only approximate, because the amount of data in the pool and
the other workloads on the system can change.
.Bl -tag -width Ds
.It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ...
Run a script (or scripts) on each vdev and include the output as a new column
in the
.Nm zpool Cm status
output.
See the
.Fl c
option of
.Nm zpool Cm iostat
for complete details.
.It Fl g
Display vdev GUIDs instead of the normal device names.
These GUIDs can be used in place of device names for the zpool
detach/offline/remove/replace commands.
.It Fl L
Display real paths for vdevs resolving all symbolic links.
This can be used to look up the current block device name regardless of the
.Pa /dev/disk/
path used to open it.
+.It Fl p
+Display numbers in parsable (exact) values.
.It Fl P
Display full paths for vdevs instead of only the last component of the path.
This can be used in conjunction with the
.Fl L
flag.
.It Fl D
Display a histogram of deduplication statistics, showing the allocated
.Pq physically present on disk
and referenced
.Pq logically referenced in the pool
block counts and sizes by reference count.
+.It Fl s
+Display the number of leaf VDEV slow IOs. This is the number of IOs that
+didn't complete in \fBzio_slow_io_ms\fR milliseconds (default 30 seconds).
+This does not necessarily mean the IOs failed to complete, just that they
+took an unreasonably long time. This may indicate a problem with the
+underlying storage.
.It Fl T Sy u Ns | Ns Sy d
Display a time stamp.
Specify
.Fl u
for a printed representation of the internal representation of time.
See
.Xr time 2 .
Specify
.Fl d
for standard date format.
See
.Xr date 1 .
.It Fl v
Displays verbose data error information, printing out a complete list of all
data errors since the last complete pool scrub.
.It Fl x
Only display status for pools that are exhibiting errors or are otherwise
unavailable.
Warnings about pools not using the latest on-disk format will not be included.
.El
.It Xo
.Nm
.Cm sync
.Op Ar pool ...
.Xc
This command forces all in-core dirty data to be written to the primary pool
storage and not the ZIL.
It will also update administrative information including quota reporting.
Without arguments,
.Sy zpool sync
will sync all pools on the system.
Otherwise, it will sync only the specified pool(s).
.It Xo
.Nm
.Cm upgrade
.Xc
Displays pools that do not have all supported features enabled and pools
formatted using a legacy ZFS version number.
These pools can continue to be used, but some features may not be available.
Use
.Nm zpool Cm upgrade Fl a
to enable all features on all pools.
.It Xo
.Nm
.Cm upgrade
.Fl v
.Xc
Displays legacy ZFS versions supported by the current software.
See
.Xr zpool-features 5
for a description of the feature flags supported by the current software.
.It Xo
.Nm
.Cm upgrade
.Op Fl V Ar version
.Fl a Ns | Ns Ar pool Ns ...
.Xc
Enables all supported features on the given pool.
Once this is done, the pool will no longer be accessible on systems that do
not support feature flags.
See
.Xr zpool-features 5
for details on compatibility with systems that support feature flags, but do
not support all features enabled on the pool.
.Bl -tag -width Ds
.It Fl a
Enables all supported features on all pools.
.It Fl V Ar version
Upgrade to the specified legacy version.
If the
.Fl V
flag is specified, no features will be enabled on the pool.
This option can only be used to increase the version number up to the last
supported legacy version number.
.El
.El
.Sh EXIT STATUS
The following exit values are returned:
.Bl -tag -width Ds
.It Sy 0
Successful completion.
.It Sy 1
An error occurred.
.It Sy 2
Invalid command line options were specified.
.El
.Sh EXAMPLES
.Bl -tag -width Ds
.It Sy Example 1 No Creating a RAID-Z Storage Pool
The following command creates a pool with a single raidz root vdev that
consists of six disks.
.Bd -literal
# zpool create tank raidz sda sdb sdc sdd sde sdf
.Ed
.It Sy Example 2 No Creating a Mirrored Storage Pool
The following command creates a pool with two mirrors, where each mirror
contains two disks.
.Bd -literal
# zpool create tank mirror sda sdb mirror sdc sdd
.Ed
.It Sy Example 3 No Creating a ZFS Storage Pool by Using Partitions
The following command creates an unmirrored pool using two disk partitions.
.Bd -literal
# zpool create tank sda1 sdb2
.Ed
.It Sy Example 4 No Creating a ZFS Storage Pool by Using Files
The following command creates an unmirrored pool using files.
While not recommended, a pool based on files can be useful for experimental
purposes.
.Bd -literal
# zpool create tank /path/to/file/a /path/to/file/b
.Ed
.It Sy Example 5 No Adding a Mirror to a ZFS Storage Pool
The following command adds two mirrored disks to the pool
.Em tank ,
assuming the pool is already made up of two-way mirrors.
The additional space is immediately available to any datasets within the pool.
.Bd -literal
# zpool add tank mirror sda sdb
.Ed
.It Sy Example 6 No Listing Available ZFS Storage Pools
The following command lists all available pools on the system.
In this case, the pool
.Em zion
is faulted due to a missing device.
The results from this command are similar to the following:
.Bd -literal
# zpool list
NAME    SIZE  ALLOC   FREE  EXPANDSZ   FRAG    CAP  DEDUP  HEALTH  ALTROOT
rpool  19.9G  8.43G  11.4G         -    33%    42%  1.00x  ONLINE  -
tank   61.5G  20.0G  41.5G         -    48%    32%  1.00x  ONLINE  -
zion       -      -      -         -      -      -      -  FAULTED -
.Ed
.It Sy Example 7 No Destroying a ZFS Storage Pool
The following command destroys the pool
.Em tank
and any datasets contained within.
.Bd -literal
# zpool destroy -f tank
.Ed
.It Sy Example 8 No Exporting a ZFS Storage Pool
The following command exports the devices in pool
.Em tank
so that they can be relocated or later imported.
.Bd -literal
# zpool export tank
.Ed
.It Sy Example 9 No Importing a ZFS Storage Pool
The following command displays available pools, and then imports the pool
.Em tank
for use on the system.
The results from this command are similar to the following:
.Bd -literal
# zpool import
  pool: tank
    id: 15451357997522795478
 state: ONLINE
action: The pool can be imported using its name or numeric identifier.
config:

        tank        ONLINE
          mirror    ONLINE
            sda     ONLINE
            sdb     ONLINE

# zpool import tank
.Ed
.It Sy Example 10 No Upgrading All ZFS Storage Pools to the Current Version
The following command upgrades all ZFS storage pools to the current version of
the software.
.Bd -literal
# zpool upgrade -a
This system is currently running ZFS version 2.
.Ed
.It Sy Example 11 No Managing Hot Spares
The following command creates a new pool with an available hot spare:
.Bd -literal
# zpool create tank mirror sda sdb spare sdc
.Ed
.Pp
If one of the disks were to fail, the pool would be reduced to the degraded
state.
The failed device can be replaced using the following command:
.Bd -literal
# zpool replace tank sda sdd
.Ed
.Pp
Once the data has been resilvered, the spare is automatically removed and is
made available for use should another device fail.
The hot spare can be permanently removed from the pool using the following
command:
.Bd -literal
# zpool remove tank sdc
.Ed
.It Sy Example 12 No Creating a ZFS Pool with Mirrored Separate Intent Logs
The following command creates a ZFS storage pool consisting of two two-way
mirrors and mirrored log devices:
.Bd -literal
# zpool create pool mirror sda sdb mirror sdc sdd log mirror \\
  sde sdf
.Ed
.It Sy Example 13 No Adding Cache Devices to a ZFS Pool
The following command adds two disks for use as cache devices to a ZFS storage
pool:
.Bd -literal
# zpool add pool cache sdc sdd
.Ed
.Pp
Once added, the cache devices gradually fill with content from main memory.
Depending on the size of your cache devices, it could take over an hour for
them to fill.
Capacity and reads can be monitored using the
.Cm iostat
subcommand as follows:
.Bd -literal
# zpool iostat -v pool 5
.Ed
.It Sy Example 14 No Removing a Mirrored Top-Level (Log or Data) Device
The following commands remove the mirrored log device
.Sy mirror-2
and mirrored top-level data device
.Sy mirror-1 .
.Pp
Given this configuration:
.Bd -literal
  pool: tank
 state: ONLINE
 scrub: none requested
config:

         NAME        STATE     READ WRITE CKSUM
         tank        ONLINE       0     0     0
           mirror-0  ONLINE       0     0     0
             sda     ONLINE       0     0     0
             sdb     ONLINE       0     0     0
           mirror-1  ONLINE       0     0     0
             sdc     ONLINE       0     0     0
             sdd     ONLINE       0     0     0
         logs
           mirror-2  ONLINE       0     0     0
             sde     ONLINE       0     0     0
             sdf     ONLINE       0     0     0
.Ed
.Pp
The command to remove the mirrored log
.Sy mirror-2
is:
.Bd -literal
# zpool remove tank mirror-2
.Ed
.Pp
The command to remove the mirrored data
.Sy mirror-1
is:
.Bd -literal
# zpool remove tank mirror-1
.Ed
.It Sy Example 15 No Displaying expanded space on a device
The following command displays the detailed information for the pool
.Em data .
This pool consists of a single raidz vdev where one of its devices increased
its capacity by 10GB.
In this example, the pool will not be able to utilize this extra capacity
until all the devices under the raidz vdev have been expanded.
.Bd -literal
# zpool list -v data
NAME         SIZE  ALLOC   FREE  EXPANDSZ   FRAG    CAP  DEDUP  HEALTH  ALTROOT
data        23.9G  14.6G  9.30G         -    48%    61%  1.00x  ONLINE  -
  raidz1    23.9G  14.6G  9.30G         -    48%
    sda         -      -      -         -      -
    sdb         -      -      -       10G      -
    sdc         -      -      -         -      -
.Ed
.It Sy Example 16 No Adding output columns
Additional columns can be added to the
.Nm zpool Cm status
and
.Nm zpool Cm iostat
output with the
.Fl c
option.
.Bd -literal
# zpool status -c vendor,model,size
   NAME     STATE  READ WRITE CKSUM vendor  model        size
   tank     ONLINE 0    0     0
   mirror-0 ONLINE 0    0     0
   U1       ONLINE 0    0     0     SEAGATE ST8000NM0075 7.3T
   U10      ONLINE 0    0     0     SEAGATE ST8000NM0075 7.3T
   U11      ONLINE 0    0     0     SEAGATE ST8000NM0075 7.3T
   U12      ONLINE 0    0     0     SEAGATE ST8000NM0075 7.3T
   U13      ONLINE 0    0     0     SEAGATE ST8000NM0075 7.3T
   U14      ONLINE 0    0     0     SEAGATE ST8000NM0075 7.3T

# zpool iostat -vc slaves
   capacity operations bandwidth
   pool       alloc free  read  write read  write slaves
   ---------- ----- ----- ----- ----- ----- ----- ---------
   tank       20.4G 7.23T 26    152   20.7M 21.6M
   mirror     20.4G 7.23T 26    152   20.7M 21.6M
   U1         -     -     0     31    1.46K 20.6M sdb sdff
   U10        -     -     0     1     3.77K 13.3K sdas sdgw
   U11        -     -     0     1     288K  13.3K sdat sdgx
   U12        -     -     0     1     78.4K 13.3K sdau sdgy
   U13        -     -     0     1     128K  13.3K sdav sdgz
   U14        -     -     0     1     63.2K 13.3K sdfk sdg
.Ed
.El
.Sh ENVIRONMENT VARIABLES
.Bl -tag -width "ZFS_ABORT"
.It Ev ZFS_ABORT
Cause
.Nm zpool
to dump core on exit for the purposes of running
.Sy ::findleaks .
.El
.Bl -tag -width "ZPOOL_IMPORT_PATH"
.It Ev ZPOOL_IMPORT_PATH
The search path for devices or files to use with the pool.
This is a colon-separated list of directories in which
.Nm zpool
looks for device nodes and files.
Similar to the
.Fl d
option in
.Nm zpool import .
.El
.Bl -tag -width "ZPOOL_VDEV_NAME_GUID"
.It Ev ZPOOL_VDEV_NAME_GUID
Cause
.Nm zpool
subcommands to output vdev guids by default.
This behavior is identical to the
.Nm zpool status -g
command line option.
.El
.Bl -tag -width "ZPOOL_VDEV_NAME_FOLLOW_LINKS"
.It Ev ZPOOL_VDEV_NAME_FOLLOW_LINKS
Cause
.Nm zpool
subcommands to follow links for vdev names by default.
This behavior is identical to the
.Nm zpool status -L
command line option.
.El
.Bl -tag -width "ZPOOL_VDEV_NAME_PATH"
.It Ev ZPOOL_VDEV_NAME_PATH
Cause
.Nm zpool
subcommands to output full vdev path names by default.
This behavior is identical to the
.Nm zpool status -P
command line option.
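.Pp
For example, this hypothetical invocation prints full vdev path names without
passing
.Fl P
on the command line:
.Bd -literal
# ZPOOL_VDEV_NAME_PATH=1 zpool status tank
.Ed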
.El
.Bl -tag -width "ZFS_VDEV_DEVID_OPT_OUT"
.It Ev ZFS_VDEV_DEVID_OPT_OUT
Older ZFS on Linux implementations had issues when attempting to display pool
config VDEV names if a
.Sy devid
NVP value was present in the pool's config.
.Pp
For example, a pool that originated on the illumos platform would have a devid
value in the config and
.Nm zpool status
would fail when listing the config.
This would also be true for future Linux-based pools.
.Pp
A pool can be stripped of any
.Sy devid
values on import or prevented from adding them on
.Nm zpool create
or
.Nm zpool add
by setting
.Sy ZFS_VDEV_DEVID_OPT_OUT .
.El
.Bl -tag -width "ZPOOL_SCRIPTS_AS_ROOT"
.It Ev ZPOOL_SCRIPTS_AS_ROOT
Allow a privileged user to run
.Nm zpool status/iostat
with the
.Fl c
option.
Normally, only unprivileged users are allowed to run
.Fl c .
.El
.Bl -tag -width "ZPOOL_SCRIPTS_PATH"
.It Ev ZPOOL_SCRIPTS_PATH
The search path for scripts when running
.Nm zpool status/iostat
with the
.Fl c
option.
This is a colon-separated list of directories and overrides the default
.Pa ~/.zpool.d
and
.Pa /etc/zfs/zpool.d
search paths.
.El
.Bl -tag -width "ZPOOL_SCRIPTS_ENABLED"
.It Ev ZPOOL_SCRIPTS_ENABLED
Allow a user to run
.Nm zpool status/iostat
with the
.Fl c
option.
If
.Sy ZPOOL_SCRIPTS_ENABLED
is not set, it is assumed that the user is allowed to run
.Nm zpool status/iostat -c .
.El
.Sh INTERFACE STABILITY
.Sy Evolving
.Sh SEE ALSO
.Xr zfs-events 5 ,
.Xr zfs-module-parameters 5 ,
.Xr zpool-features 5 ,
.Xr zed 8 ,
.Xr zfs 8
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 78e701c80518..a99eb93a40c2 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1,4650 +1,4653 @@
/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* target number of metaslabs per top-level vdev */ int vdev_max_ms_count = 200; /* minimum number of metaslabs per top-level vdev */ int vdev_min_ms_count = 16; /* practical upper limit of total metaslabs per top-level vdev */ int vdev_ms_count_limit = 1ULL << 17; /* lower limit for metaslab size (512M) */ int vdev_default_ms_shift = 29; /* upper limit for metaslab size (256G) */ int vdev_max_ms_shift = 38; int vdev_validate_skip = B_FALSE; /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. */ int vdev_dtl_sm_blksz = (1 << 12); /* - * Rate limit delay events to this many IO delays per second. + * Rate limit slow IO (delay) events to this many per second. */ -unsigned int zfs_delays_per_second = 20; +unsigned int zfs_slow_io_events_per_second = 20; /* * Rate limit checksum events after this many checksum errors per second. */ -unsigned int zfs_checksums_per_second = 20; +unsigned int zfs_checksum_events_per_second = 20; /* * Ignore errors during scrub/resilver. This allows working around * resilver failures on import when there are pool errors. */ int zfs_scan_ignore_errors = 0; /* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. */ int vdev_standard_sm_blksz = (1 << 17); /*PRINTFLIKE2*/ void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); if (vd->vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "<state %u>", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Virtual device management.
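 * (Descriptive note: the vdev_ops_table that follows maps each vdev type
 * name to its ops vector; vdev_getops() resolves a type string with a
 * linear scan of that table.)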
*/ static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, **opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(1M). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); /* * The allocatable space for a raidz vdev is N * sizeof(smallest child), * so each child must provide at least 1/Nth of its asize.
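 * Illustrative arithmetic: a raidz parent with 4 children and a min_asize
 * of 1001 units requires (1001 + 4 - 1) / 4 = 251 units per child; the
 * (children - 1) addend below makes the integer division round up.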
*/ if (pvd->vdev_ops == &vdev_raidz_ops) return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / pvd->vdev_children); return (pvd->vdev_min_asize); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { bcopy(pvd->vdev_child, newchild, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. 
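 * (This common path is shared by vdev_alloc() below and by internal
 * callers such as vdev_add_parent(); unlike vdev_alloc(), it does not
 * read any nvlist configuration.)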
*/ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. */ - zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_delays_per_second, 1); - zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksums_per_second, 1); + zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, + 1); + zfs_ratelimit_init(&vd->vdev_checksum_rl, + &zfs_checksum_events_per_second, 1); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); vdev_cache_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; vdev_indirect_config_t *vic; char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. 
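 * (The spare, l2cache and rootpool alloctypes likewise carry an explicit
 * ZPOOL_CONFIG_GUID in their configs, as the lookups below show.)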
*/ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); /* * Set the nparity property for RAID-Z vdevs. */ nparity = -1ULL; if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * devices. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) return (SET_ERROR(ENOTSUP)); if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) return (SET_ERROR(ENOTSUP)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ nparity = 1; } } else { nparity = 0; } ASSERT(nparity != -1ULL); /* * If creating a top-level vdev, check for allocation class input */ if (top_level && alloctype == VDEV_ALLOC_ADD) { char *bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } } vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f).
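 * For example, 'zpool offline -f tank sda' (pool and disk names are
 * illustrative) records the "external" aux state in the leaf's label, and
 * the lookup below re-applies the persistent fault on every import until
 * it is cleared.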
*/ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &vd->vdev_enc_sysfs_path) == 0) vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. 
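 * (The DTL is the dirty time log, which records the txgs for which the
 * vdev may be missing data and therefore needs resilvering.)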
*/ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_set_deferred_resilver(spa, vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); /* * Clean up vdev structure. 
*/ vdev_queue_fini(vd); vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. 
*/ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_removing); tvd->vdev_removing = svd->vdev_removing; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_removing = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. 
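 * (Note that the same guid_delta is applied to both vdev_guid and
 * vdev_guid_sum below, which keeps cvd's subtree guid sum consistent
 * with its newly adopted guid.)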
*/ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If the pool is not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } static void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); /* * The spa ashift values currently only reflect the * general vdev classes. Class destination is late * binding so ashift checking had to wait until now. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; } } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } #ifndef _KERNEL /* * To accommodate zdb_leak_init() fake indirect * metaslabs, we allocate a metaslab group for * indirect vdevs which normally don't have one.
*/ if (vd->vdev_mg == NULL) { ASSERT0(vdev_is_concrete(vd)); vdev_metaslab_group_create(vd); } #endif error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is being removed we don't activate * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { uint64_t count = vd->vdev_ms_count; metaslab_group_passivate(vd->vdev_mg); for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; } ASSERT0(vd->vdev_ms_count); ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. 
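 * (Probe I/Os are tagged with ZIO_FLAG_PROBE, so issuing a probe on
 * behalf of another probe would recurse; the check below cuts that off.)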
*/ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } void vdev_open_children(vdev_t *vd) { taskq_t *tq; int children = vd->vdev_children; /* * in order to handle pools on top of zvols, do the opens * in a single thread so that the same thread holds the * spa_namespace_lock */ if (vdev_uses_zvols(vd)) { retry_sync: for (int c = 0; c < children; c++) vd->vdev_child[c]->vdev_open_error = vdev_open(vd->vdev_child[c]); } else { tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); if (tq == NULL) goto retry_sync; for (int c = 0; c < children; c++) VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); taskq_destroy(tq); } vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; } /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the "typical" blocksize. * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, * otherwise it would inconsistently account for existing bp's. 
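 * Illustrative arithmetic: on a plain disk where asize equals psize, a
 * 128K block allocates 128K, so the ratio below is
 * (1 << 17) / (131072 >> SPA_MINBLOCKSHIFT) = 131072 / 256 = 512; raidz
 * parity overhead makes the divisor larger and the ratio proportionally
 * smaller.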
*/ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, ENXIO); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. 
*/ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; if (vd->vdev_ashift == 0) { vd->vdev_ashift = ashift; /* use detected value */ } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Detect if the alignment requirement has increased. * We don't want to make the pool unavailable, just * post an event instead. */ if (ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0, 0); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy, we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based on max_asize and asize, e.g. esize, are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the min and max ashift values for normal data devices. * * DJB - TBD these should perhaps be tracked per allocation class * (e.g.
spa_min_ashift is used to round up post compression buffers) */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_alloc_bias == VDEV_BIAS_NONE && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; } /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a scrub, * since this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && vdev_resilver_needed(vd, NULL, NULL)) { if (dsl_scan_resilvering(spa->spa_dsl_pool) && spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) vdev_set_deferred_resilver(spa, vd); else spa_async_request(spa, SPA_ASYNC_RESILVER); } return (0); } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; if (vdev_validate_skip) return (0); for (uint64_t c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c]) != 0) return (SET_ERROR(EBADF)); /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. 
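* * As an illustration (added for clarity, not from the original comment): if the machine crashes after the new guid reaches every vdev label but before zpool.cache is rewritten, the cached config still carries the old guid; without this relaxation every label would look corrupt at the next import even though the pool is intact.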
*/ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. 
*/ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path, svd->vdev_path); spa_strfree(dvd->vdev_path); dvd->vdev_path = spa_strdup(svd->vdev_path); } } else if (svd->vdev_path != NULL) { dvd->vdev_path = spa_strdup(svd->vdev_path); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have the same geometry; otherwise an error is returned. * Intended to copy paths from the userland config into the MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching a mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search for a vdev with the same guid and top vdev id in the source. * Intended to copy paths from the userland config into the MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; ASSERTV(spa_t *spa = vd->vdev_spa); ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. 
*/ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache && !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : ENXIO); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * In general, we aim for vdev_max_ms_count (200) metaslabs. The * ranges of the dimensions are as follows: * * 2^29 <= ms_size <= 2^38 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslab sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 256GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check. 
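* * Worked example (illustrative; assumes the defaults above and that highbit64() returns the 1-based position of the highest set bit): for a 1TB vdev, asize >> 29 = 2048 candidate metaslabs, which exceeds vdev_max_ms_count (200), so ms_shift = highbit64(2^40 / 200) = 33. That yields 2^(40 - 33) = 128 metaslabs of 8GB each -- within a factor of two of the 200 target, as the rounding to a power-of-two metaslab size guarantees.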
* * The net effect of applying the above constraints is summarized below. * * vdev size metaslab count * -------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 50TB ~200 * 50TB - 32PB one per 256GB * > 32PB ~131,072 * ------------------------------- */ if (ms_count < vdev_min_ms_count) ms_shift = highbit64(asize / vdev_min_ms_count); else if (ms_count > vdev_max_ms_count) ms_shift = highbit64(asize / vdev_max_ms_count); else ms_shift = vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > vdev_max_ms_shift) { ms_shift = vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > vdev_ms_count_limit) ms_shift = highbit64(asize / vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in more than 'maxfaults' children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. 
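* * Concrete illustration (added for clarity, not in the original comment): in a raidz2 vdev where txg 100 is missing on two children and txg 200 is missing on three, both txgs appear in the parent's DTL_PARTIAL (some child is less than fully replicated), but only txg 200 appears in the parent's DTL_MISSING, since only there do more than maxfaults (2) children lack the data.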
*/ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * Ignore the DTLs and try all devices. This avoids a recursive * mutex enter on the vdev_dtl_lock, and also makes us try hard * when loading the pool (relying on the checksum to ensure that * we get the right data -- note that while loading, we are * only reading the MOS, which is always checksummed). */ if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Returns B_TRUE if vdev determines offset needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_start - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_end); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd) { spa_t *spa = vd->vdev_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (vd->vdev_resilver_txg == 0 || range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); /* * When a resilver is initiated the scan will assign the scn_max_txg * value to the highest txg value that exists in all DTLs. If this * device's max DTL is not part of this scan (i.e. it is not in * the range (scn_min_txg, scn_max_txg]), then it is not eligible * for excision. 
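* * For example (illustrative): if the scan covered (scn_min_txg, scn_max_txg] = (0, 100] but this vdev's DTL_MISSING still extends to txg 120, the outage was not fully covered by the scan, so the check below fails and the DTL is left intact.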
*/ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); return (B_TRUE); } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan completed cleanly. */ if (zfs_scan_ignore_errors && scn) scn->scn_phys.scn_errors = 0; /* * If we've completed a scan cleanly then determine * if this vdev should remove any DTLs. We only want to * excise regions on vdevs that were available during * the entire duration of this scan. */ if (scrub_txg != 0 && (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) && vdev_dtl_should_excise(vd)) { /* * We completed a scrub up to scrub_txg. If we * did it without rebooting, then the scrub dtl * will be valid, so excise the old region and * fold in the scrub dtl. Otherwise, leave the * dtl as-is if there was an error. * * There's a little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering and no longer has any * DTLs then reset its resilvering flag and dirty * the top level so that we persist the change. */ if (txg != 0 && vd->vdev_resilver_txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. 
non-zero */ else if (vd->vdev_nparity != 0) minref = vd->vdev_nparity + 1; /* RAID-Z */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); mutex_enter(&vd->vdev_dtl_lock); /* * Now that we've opened the space_map we need to update * the in-core DTL. */ space_map_update(vd->vdev_dtl_sm); error = space_map_load(vd->vdev_dtl_sm, vd->vdev_dtl[DTL_MISSING], SM_ALLOC); mutex_exit(&vd->vdev_dtl_lock); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); 
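/* space_map_free() above reclaimed the DTL space map's on-disk blocks in this tx; now drop the in-core handle. */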
space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = range_tree_create(NULL, NULL); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); mutex_enter(&vd->vdev_dtl_lock); space_map_update(vd->vdev_dtl_sm); mutex_exit(&vd->vdev_dtl_lock); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); if (!required && zio_injection_enabled) required = !!zio_handle_device_injection(vd, NULL, ECHILD); return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. 
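* * Typical caller shape (an illustrative sketch; vdev_load() below follows this pattern): * * uint64_t obj; * int err = vdev_checkpoint_sm_object(vd, &obj); * if (err == 0 && obj != 0) * ... open and account the checkpoint space map ... * else if (err != 0) * ... propagate the error ...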
*/ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int error = 0; /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { error = vdev_load(vd->vdev_child[c]); if (error != 0) { return (error); } } vdev_set_deflate_ratio(vd); /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str) == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } else if ((error = vdev_metaslab_init(vd, 0)) != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift))) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); space_map_update(vd->vdev_checkpoint_sm); /* * Since the checkpoint_sm contains free entries * exclusively we can use sm_alloc to indicate the * cumulative checkpointed space that has been freed. */ vd->vdev_stat.vs_checkpoint_space = -vd->vdev_checkpoint_sm->sm_alloc; vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. 
*/ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } space_map_update(vd->vdev_obsolete_sm); } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose is to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp == NULL || msp->ms_sm == NULL) continue; mutex_enter(&msp->ms_lock); /* * If the metaslab was not loaded when the vdev * was removed then the histogram accounting may * not be accurate. 
Update the histogram information * here so that we ensure that the metaslab group * and metaslab class are up-to-date. */ metaslab_group_histogram_remove(mg, msp); VERIFY0(space_map_allocated(msp->ms_sm)); space_map_close(msp->ms_sm); msp->ms_sm = NULL; mutex_exit(&msp->ms_lock); } if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_has_checkpoint(spa)); space_map_close(vd->vdev_checkpoint_sm); vd->vdev_checkpoint_sm = NULL; } metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); } dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))) metaslab_sync_done(msp, txg); if (reassess) metaslab_sync_reassess(vd->vdev_mg); } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; dmu_tx_t *tx; if (range_tree_space(vd->vdev_obsolete_segments) > 0) { dmu_tx_t *tx; ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vdev_indirect_sync_obsolete(vd, tx); dmu_tx_commit(tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); dmu_tx_commit(tx); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. 
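* * From the administrator's side (illustrative, assuming a pool 'tank' with disk 'sda'): 'zpool offline -f tank sda' produces the persistent variant described above, and a subsequent 'zpool clear tank sda' removes it.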
* * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. 
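* * Caller sketch (illustrative only): * * vdev_state_t newstate; * (void) vdev_online(spa, guid, ZFS_ONLINE_UNSPARE, &newstate); * if (newstate != VDEV_STATE_HEALTHY) * ... the device came back but is not fully healthy ...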
*/ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc, !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. 
* If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; + vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect vdev. */ if (!vdev_is_concrete(vd)) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) { if (dsl_scan_resilvering(spa->spa_dsl_pool) && spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) vdev_set_deferred_resilver(spa, vd); else spa_async_request(spa, SPA_ASYNC_RESILVER); } spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. 
*/ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { int t; for (t = 0; t < ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { /* * Assuming 47 bits of the space map entry dedicated for the entry's * offset (see description in space_map.h), we calculate the maximum * address that can be described by a space map entry for the given * device. */ uint64_t shift = vd->vdev_ashift + 47; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). 
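* Aggregation happens on demand: the code below zeroes the caller-supplied buffers and folds each child's counts and histograms into them, rather than maintaining rolled-up totals at interior vdevs.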
*/ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_class[t].vqc_active; vsx->vsx_pend_queue[t] = avl_numnodes( &vd->vdev_queue.vq_class[t].vqc_queued_tree); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { bcopy(&vd->vdev_stat, vs, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } if (vd->vdev_ops->vdev_op_leaf) vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_READER) != 0); vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. 
* When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_phys_t *scn_phys = &spa->spa_dsl_pool->dp_scan->scn_phys; uint64_t *processed = &scn_phys->scn_processed; /* XXX cleanup? */ if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { vs->vs_ops[type]++; vs->vs_bytes[type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[zio->io_priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[zio->io_priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[zio->io_priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc.). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent log writes won't propagate their error to the root * I/O, so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); if (spa->spa_load_state == SPA_LOAD_NONE && type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. 
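* * In short (a recap of the three cases above): commit_txg is txg for a normal write, spa_syncing_txg(spa) for a scrub-induced repair, and spa_first_txg(spa) for a zil_claim() repair.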
*/ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle of the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion.
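[Editor's note] vdev_deflated_space() above works in 512-byte units (SPA_MINBLOCKSHIFT is 9) and scales them by the per-vdev deflate ratio. A compilable sketch of just that arithmetic; the sample ratios below are assumptions chosen for illustration:

#include <stdint.h>
#include <stdio.h>

#define TOY_MINBLOCKSHIFT 9	/* 512-byte accounting units */

static int64_t
toy_deflated_space(int64_t space, uint64_t deflate_ratio)
{
	/* space must be a multiple of 512 for the shift to be exact */
	return ((space >> TOY_MINBLOCKSHIFT) * deflate_ratio);
}

int
main(void)
{
	/*
	 * A ratio of 512 means "no expansion" (512 deflated bytes per
	 * 512-byte unit); ~409 approximates a 4+1 RAID-Z expansion.
	 */
	printf("%lld\n", (long long)toy_deflated_space(1 << 20, 512));
	printf("%lld\n", (long long)toy_deflated_space(1 << 20, 409));
	return (0);
}

For a 1 MiB delta this prints 1048576 and 837632: the second pool only credits about 80% of the raw bytes as usable, which is the "inverse of the space-expansion factor" the comment above describes.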
*/ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. 
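[Editor's note] vdev_propagate_state() above only counts faulted and degraded children; what those counts mean is left to each vdev_op_state_change() implementation. A toy mirror policy, purely illustrative -- the types and the thresholds are assumptions, not the real per-vdev-ops logic:

enum toy_state { TOY_HEALTHY, TOY_DEGRADED, TOY_FAULTED };

static enum toy_state
toy_mirror_state(const enum toy_state *child, int nchildren)
{
	int faulted = 0, degraded = 0;

	for (int c = 0; c < nchildren; c++) {
		if (child[c] == TOY_FAULTED)
			faulted++;
		else if (child[c] == TOY_DEGRADED)
			degraded++;
	}
	if (faulted == nchildren)	/* no readable replica left */
		return (TOY_FAULTED);
	if (faulted + degraded > 0)	/* readable, but not fully redundant */
		return (TOY_DEGRADED);
	return (TOY_HEALTHY);
}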
*/ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. 
*/ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } zfs_ereport_post(class, spa, vd, NULL, NULL, save_state, 0); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. * */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { return (B_FALSE); } } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. 
*/ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; vdev_remove_child(pvd, vd); vdev_compact_children(pvd); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %lu active IOs", vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd) { for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_set_deferred_resilver(spa, vd->vdev_child[i]); if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd) || range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { return; } vd->vdev_resilver_deferred = B_TRUE; spa->spa_resilver_deferred = B_TRUE; } #if defined(_KERNEL) EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); /* BEGIN CSTYLED */ module_param(vdev_max_ms_count, int, 0644); MODULE_PARM_DESC(vdev_max_ms_count, "Target number of metaslabs per top-level vdev"); module_param(vdev_min_ms_count, int, 0644); MODULE_PARM_DESC(vdev_min_ms_count, "Minimum number of metaslabs per top-level vdev"); module_param(vdev_ms_count_limit, int, 0644); MODULE_PARM_DESC(vdev_ms_count_limit, "Practical upper limit of total metaslabs per top-level vdev"); -module_param(zfs_delays_per_second, uint, 0644); -MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many " - "IO delays per second"); +module_param(zfs_slow_io_events_per_second, uint, 0644); +MODULE_PARM_DESC(zfs_slow_io_events_per_second, + "Rate limit slow IO (delay) events to this many per second"); -module_param(zfs_checksums_per_second, uint, 0644); - MODULE_PARM_DESC(zfs_checksums_per_second, "Rate limit checksum events " +module_param(zfs_checksum_events_per_second, uint, 0644); +MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events " "to this many checksum errors per second (do not set below zed " "threshold)."); module_param(zfs_scan_ignore_errors, int, 0644); MODULE_PARM_DESC(zfs_scan_ignore_errors, "Ignore errors during resilver/scrub"); module_param(vdev_validate_skip, int, 0644); MODULE_PARM_DESC(vdev_validate_skip, "Bypass vdev_validate()"); /* END CSTYLED */ #endif diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7e86e3a8b3d9..b3425cf26243 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1,1694 +1,1697 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License.
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ /* * Virtual Device Labels * --------------------- * * The vdev label serves several distinct purposes: * * 1. Uniquely identify this device as part of a ZFS pool and confirm its * identity within the pool. * * 2. Verify that all the devices given in a configuration are present * within the pool. * * 3. Determine the uberblock for the pool. * * 4. In case of an import operation, determine the configuration of the * toplevel vdev of which it is a part. * * 5. If an import operation cannot find all the devices in the pool, * provide enough information to the administrator to determine which * devices are missing. * * It is important to note that while the kernel is responsible for writing the * label, it only consumes the information in the first three cases. The * latter information is only consumed in userland when determining the * configuration to import a pool. * * * Label Organization * ------------------ * * Before describing the contents of the label, it's important to understand how * the labels are written and updated with respect to the uberblock. * * When the pool configuration is altered, either because it was newly created * or a device was added, we want to update all the labels such that we can deal * with fatal failure at any point. To this end, each disk has two labels which * are updated before and after the uberblock is synced. Assuming we have * labels and an uberblock with the following transaction groups: * * L1 UB L2 * +------+ +------+ +------+ * | | | | | | * | t10 | | t10 | | t10 | * | | | | | | * +------+ +------+ +------+ * * In this stable state, the labels and the uberblock were all updated within * the same transaction group (10). Each label is mirrored and checksummed, so * that we can detect when we fail partway through writing the label. * * In order to identify which labels are valid, the labels are written in the * following manner: * * 1. For each vdev, update 'L1' to the new label * 2. Update the uberblock * 3. For each vdev, update 'L2' to the new label * * Given arbitrary failure, we can determine the correct label to use based on * the transaction group. If we fail after updating L1 but before updating the * UB, we will notice that L1's transaction group is greater than the uberblock, * so L2 must be valid. If we fail after writing the uberblock but before * writing L2, we will notice that L2's transaction group is less than L1, and * therefore L1 is valid. * * Another added complexity is that not every label is updated when the config * is synced. If we add a single device, we do not want to have to re-write * every label for every device in the pool. This means that both L1 and L2 may * be older than the pool uberblock, because the necessary information is stored * on another vdev. * * * On-disk Format * -------------- * * The vdev label consists of two distinct parts, and is wrapped within the * vdev_label_t structure.
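[Editor's note] The front/back split implied above is what vdev_label_offset() (just below) encodes: of the VDEV_LABELS (4) copies, two 256 KB labels sit at the start of the device and two at the tail. A worked example of that computation; the label size and count match the on-disk format, while the 1 GiB device size is made up:

#include <stdint.h>
#include <stdio.h>

#define TOY_LABELS	4
#define TOY_LABEL_SIZE	(256ULL * 1024)		/* sizeof (vdev_label_t) */

static uint64_t
toy_label_offset(uint64_t psize, int l, uint64_t offset)
{
	/* labels 0..1 count from the front, labels 2..3 from the tail */
	return (offset + l * TOY_LABEL_SIZE + (l < TOY_LABELS / 2 ?
	    0 : psize - TOY_LABELS * TOY_LABEL_SIZE));
}

int
main(void)
{
	uint64_t psize = 1ULL << 30;	/* 1 GiB device, for illustration */

	for (int l = 0; l < TOY_LABELS; l++)
		printf("L%d at %llu\n", l,
		    (unsigned long long)toy_label_offset(psize, l, 0));
	return (0);
}

This prints offsets 0 and 262144 for L0/L1, and 1073217536 and 1073479680 for L2/L3, i.e. the last two 256 KB slots before the end of the device.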
The label includes 8k of padding to permit legacy * VTOC disk labels, but is otherwise ignored. * * The first half of the label is a packed nvlist which contains pool wide * properties, per-vdev properties, and configuration information. It is * described in more detail below. * * The latter half of the label consists of a redundant array of uberblocks. * These uberblocks are updated whenever a transaction group is committed, * or when the configuration is updated. When a pool is loaded, we scan each * vdev for the 'best' uberblock. * * * Configuration Information * ------------------------- * * The nvlist describing the pool and vdev contains the following elements: * * version ZFS on-disk version * name Pool name * state Pool state * txg Transaction group in which this label was written * pool_guid Unique identifier for this pool * vdev_tree An nvlist describing vdev tree. * features_for_read * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * * top_guid Unique ID for top-level vdev in which this is contained * guid Unique ID for the leaf vdev * * The 'vs' configuration follows the format described in 'spa_config.c'. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Basic routines to read and write from a vdev label. * Used throughout the rest of this file. */ uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset) { ASSERT(offset < sizeof (vdev_label_t)); ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); } /* * Returns the vdev label number associated with the given offset. */ int vdev_label_number(uint64_t psize, uint64_t offset) { int l; if (offset >= psize - VDEV_LABEL_END_SIZE) { offset -= psize - VDEV_LABEL_END_SIZE; offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); } l = offset / sizeof (vdev_label_t); return (l < VDEV_LABELS ?
l : -1); } static void vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_read_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); } void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_write_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); } /* * Generate the nvlist representing this vdev's stats */ void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) { nvlist_t *nvx; vdev_stat_t *vs; vdev_stat_ex_t *vsx; vs = kmem_alloc(sizeof (*vs), KM_SLEEP); vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP); vdev_get_stats_ex(vd, vs, vsx); fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t)); kmem_free(vs, sizeof (*vs)); /* * Add extended stats into a special extended stats nvlist. This keeps * all the extended stats nicely grouped together. The extended stats * nvlist is then added to the main nvlist. */ nvx = fnvlist_alloc(); /* ZIOs in flight to disk */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]); /* ZIOs pending */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]); /* Histograms */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_READ], ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_WRITE], ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, vsx->vsx_disk_histo[ZIO_TYPE_READ], ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, vsx->vsx_disk_histo[ZIO_TYPE_WRITE], ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ], 
ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB])); /* Request sizes */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB])); + /* IO delays */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); + /* Add extended stats nvlist to main nvlist */ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); fnvlist_free(nvx); kmem_free(vsx, sizeof (*vsx)); } static void root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) { spa_t *spa = vd->vdev_spa; if (vd != spa->spa_root_vdev) return; /* provide either current or previous scan information */ pool_scan_stat_t ps; if (spa_scan_get_stats(spa, &ps) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, sizeof (pool_scan_stat_t) / sizeof (uint64_t)); } pool_removal_stat_t prs; if (spa_removal_get_stats(spa, &prs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, sizeof (prs) / sizeof (uint64_t)); } pool_checkpoint_stat_t pcs; if (spa_checkpoint_get_stats(spa, &pcs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } } /* * Generate the nvlist representing this vdev's config. 
*/ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) { nvlist_t *nv = NULL; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; nv = fnvlist_alloc(); fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_path != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); if (vd->vdev_devid != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); if (vd->vdev_physpath != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vd->vdev_physpath); if (vd->vdev_enc_sysfs_path != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path); if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); if (vd->vdev_nparity != 0) { ASSERT(strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_RAIDZ) == 0); /* * Make sure someone hasn't managed to sneak a fancy new vdev * into a crufty old storage pool. */ ASSERT(vd->vdev_nparity == 1 || (vd->vdev_nparity <= 2 && spa_version(spa) >= SPA_VERSION_RAIDZ2) || (vd->vdev_nparity <= 3 && spa_version(spa) >= SPA_VERSION_RAIDZ3)); /* * Note that we'll add the nparity tag even on storage pools * that only support a single parity device -- older software * will just ignore it. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); } if (vd->vdev_wholedisk != -1ULL) fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); if (vd->vdev_isspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array); fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, vd->vdev_ms_shift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_removing) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } /* zpool command expects alloc class data */ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { const char *bias = NULL; switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: bias = VDEV_ALLOC_BIAS_LOG; break; case VDEV_BIAS_SPECIAL: bias = VDEV_ALLOC_BIAS_SPECIAL; break; case VDEV_BIAS_DEDUP: bias = VDEV_ALLOC_BIAS_DEDUP; break; default: ASSERT3U(vd->vdev_alloc_bias, ==, VDEV_BIAS_NONE); } fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, bias); } } if (vd->vdev_dtl_sm != NULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, space_map_object(vd->vdev_dtl_sm)); } if (vic->vic_mapping_object != 0) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, vic->vic_mapping_object); } if (vic->vic_births_object != 0) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, vic->vic_births_object); } if (vic->vic_prev_indirect_vdev != UINT64_MAX) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, vic->vic_prev_indirect_vdev); } if (vd->vdev_crtxg) fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); if (flags & VDEV_CONFIG_MOS) { if (vd->vdev_leaf_zap != 0) { ASSERT(vd->vdev_ops->vdev_op_leaf); fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, vd->vdev_leaf_zap); } if (vd->vdev_top_zap != 0) { ASSERT(vd == vd->vdev_top); 
fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, vd->vdev_top_zap); } if (vd->vdev_resilver_deferred) { ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(spa->spa_resilver_deferred); fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER); } } if (getstats) { vdev_config_generate_stats(vd, nv); root_vdev_actions_getprogress(vd, nv); /* * Note: this can be called from open context * (spa_get_stats()), so we need the rwlock to prevent * the mapping from being changed by condensing. */ rw_enter(&vd->vdev_indirect_rwlock, RW_READER); if (vd->vdev_indirect_mapping != NULL) { ASSERT(vd->vdev_indirect_births != NULL); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, vdev_indirect_mapping_size(vim)); } rw_exit(&vd->vdev_indirect_rwlock); if (vd->vdev_mg != NULL && vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { /* * Compute approximately how much memory would be used * for the indirect mapping if this device were to * be removed. * * Note: If the frag metric is invalid, then not * enough metaslabs have been converted to have * histograms. */ uint64_t seg_count = 0; uint64_t to_alloc = vd->vdev_stat.vs_alloc; /* * There are the same number of allocated segments * as free segments, so we will have at least one * entry per free segment. However, small free * segments (smaller than vdev_removal_max_span) * will be combined with adjacent allocated segments * as a single mapping. */ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { if (1ULL << (i + 1) < vdev_removal_max_span) { to_alloc += vd->vdev_mg->mg_histogram[i] << (i + 1); } else { seg_count += vd->vdev_mg->mg_histogram[i]; } } /* * The maximum length of a mapping is * zfs_remove_max_segment, so we need at least one entry * per zfs_remove_max_segment of allocated data. */ seg_count += to_alloc / zfs_remove_max_segment; fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, seg_count * sizeof (vdev_indirect_mapping_entry_phys_t)); } } if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; int c, idx; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); for (c = 0, idx = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; /* * If we're generating an nvlist of removing * vdevs then skip over any device which is * not being removed. */ if ((flags & VDEV_CONFIG_REMOVING) && !cvd->vdev_removing) continue; child[idx++] = vdev_config_generate(spa, cvd, getstats, flags); } if (idx) { fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, idx); } for (c = 0; c < idx; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); } else { const char *aux = NULL; if (vd->vdev_offline && !vd->vdev_tmpoffline) fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); if (vd->vdev_resilver_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, vd->vdev_resilver_txg); if (vd->vdev_faulted) fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); if (vd->vdev_degraded) fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); if (vd->vdev_removed) fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); if (vd->vdev_unspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); if (vd->vdev_ishole) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); /* Set the reason why we're FAULTED/DEGRADED. 
*/ switch (vd->vdev_stat.vs_aux) { case VDEV_AUX_ERR_EXCEEDED: aux = "err_exceeded"; break; case VDEV_AUX_EXTERNAL: aux = "external"; break; } if (aux != NULL && !vd->vdev_tmpoffline) { fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); } else { /* * We're healthy - clear any previous AUX_STATE values. */ if (nvlist_exists(nv, ZPOOL_CONFIG_AUX_STATE)) nvlist_remove_all(nv, ZPOOL_CONFIG_AUX_STATE); } if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, vd->vdev_orig_guid); } } return (nv); } /* * Generate a view of the top-level vdevs. If we currently have holes * in the namespace, then generate an array which contains a list of holey * vdevs. Additionally, add the number of top-level children that currently * exist. */ void vdev_top_config_generate(spa_t *spa, nvlist_t *config) { vdev_t *rvd = spa->spa_root_vdev; uint64_t *array; uint_t c, idx; array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_ishole) { array[idx++] = c; } } if (idx) { VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, array, idx) == 0); } VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, rvd->vdev_children) == 0); kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } /* * Returns the configuration from the label of the given vdev. For vdevs * which don't have a txg value stored on their label (i.e. spares/cache) * or have not been completely initialized (txg = 0) just return * the configuration from the first valid label we find. Otherwise, * find the most up-to-date label that does not exceed the specified * 'txg' value. */ nvlist_t * vdev_label_read_config(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp; abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; uint64_t label_txg = 0; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (!vdev_readable(vd)) return (NULL); vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp = abd_to_buf(vp_abd); retry: for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *label = NULL; zio = zio_root(spa, NULL, NULL, flags); vdev_label_read(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), &label, 0) == 0) { /* * Auxiliary vdevs won't have txg values in their * labels and newly added vdevs may not have been * completely initialized so just return the * configuration from the first valid label we * encounter. */ error = nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg); if ((error || label_txg == 0) && !config) { config = label; break; } else if (label_txg <= txg && label_txg > best_txg) { best_txg = label_txg; nvlist_free(config); config = fnvlist_dup(label); } } if (label != NULL) { nvlist_free(label); label = NULL; } } if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } /* * We found a valid label but it didn't pass txg restrictions. */ if (config == NULL && label_txg != 0) { vdev_dbgmsg(vd, "label discarded as txg is too large " "(%llu > %llu)", (u_longlong_t)label_txg, (u_longlong_t)txg); } abd_free(vp_abd); return (config); } /* * Determine if a device is in use. 
The 'spare_guid' parameter will be filled * in with the device guid if this spare is active elsewhere on the system. */ static boolean_t vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, uint64_t *spare_guid, uint64_t *l2cache_guid) { spa_t *spa = vd->vdev_spa; uint64_t state, pool_guid, device_guid, txg, spare_pool; uint64_t vdtxg = 0; nvlist_t *label; if (spare_guid) *spare_guid = 0ULL; if (l2cache_guid) *l2cache_guid = 0ULL; /* * Read the label, if any, and perform some basic sanity checks. */ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) return (B_FALSE); (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, &vdtxg); if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &device_guid) != 0) { nvlist_free(label); return (B_FALSE); } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &txg) != 0)) { nvlist_free(label); return (B_FALSE); } nvlist_free(label); /* * Check to see if this device indeed belongs to the pool it claims to * be a part of. The only way this is allowed is if the device is a hot * spare (which we check for later on). */ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && !spa_guid_exists(pool_guid, device_guid) && !spa_spare_exists(device_guid, NULL, NULL) && !spa_l2cache_exists(device_guid, NULL)) return (B_FALSE); /* * If the transaction group is zero, then this is an initialized (but * unused) label. This is only an error if the create transaction * on-disk is the same as the one we're using now, in which case the * user has attempted to add the same vdev multiple times in the same * transaction. */ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && txg == 0 && vdtxg == crtxg) return (B_TRUE); /* * Check to see if this is a spare device. We do an explicit check for * spa_has_spare() here because it may be on our pending list of spares * to add. We also check if it is an l2cache device. */ if (spa_spare_exists(device_guid, &spare_pool, NULL) || spa_has_spare(spa, device_guid)) { if (spare_guid) *spare_guid = device_guid; switch (reason) { case VDEV_LABEL_CREATE: case VDEV_LABEL_L2CACHE: return (B_TRUE); case VDEV_LABEL_REPLACE: return (!spa_has_spare(spa, device_guid) || spare_pool != 0ULL); case VDEV_LABEL_SPARE: return (spa_has_spare(spa, device_guid)); default: break; } } /* * Check to see if this is an l2cache device. */ if (spa_l2cache_exists(device_guid, NULL)) return (B_TRUE); /* * We can't rely on a pool's state if it's been imported * read-only. Instead we look to see if the pool is marked * read-only in the namespace and set the state to active. */ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (spa = spa_by_guid(pool_guid, device_guid)) != NULL && spa_mode(spa) == FREAD) state = POOL_STATE_ACTIVE; /* * If the device is marked ACTIVE, then this device is in use by another * pool on the system. */ return (state == POOL_STATE_ACTIVE); } /* * Initialize a vdev label. We check to make sure each leaf device is not in * use, and writable. We put down an initial label which we will later * overwrite with a complete label. Note that it's important to do this * sequentially, not in parallel, so that we catch cases of multiple use of the * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with * itself.
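[Editor's note] vdev_inuse() above boils down to a small decision table over the label's pool state plus the spare/l2cache registries. A deliberately simplified sketch of that shape; the states and policy here are approximations for illustration, not the full set of checks:

enum toy_pool_state { TOY_EXPORTED, TOY_ACTIVE, TOY_SPARE, TOY_L2CACHE };

/* Returns nonzero if the device must be treated as in use. */
static int
toy_device_inuse(enum toy_pool_state st, int same_create_txg)
{
	switch (st) {
	case TOY_ACTIVE:	/* claimed by a live pool elsewhere */
		return (1);
	case TOY_SPARE:		/* shared hot spares stay addable ... */
	case TOY_L2CACHE:	/* ... and so do cache devices */
		return (0);
	default:		/* exported/initialized: reuse is OK unless */
		return (same_create_txg);	/* added twice in one txg */
	}
}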
*/ int vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) { spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; abd_t *vp_abd; abd_t *pad2; uberblock_t *ub; abd_t *ub_abd; zio_t *zio; char *buf; size_t buflen; int error; uint64_t spare_guid = 0, l2cache_guid = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); for (int c = 0; c < vd->vdev_children; c++) if ((error = vdev_label_init(vd->vdev_child[c], crtxg, reason)) != 0) return (error); /* Track the creation time for this vdev */ vd->vdev_crtxg = crtxg; if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa)) return (0); /* * Dead vdevs cannot be initialized. */ if (vdev_is_dead(vd)) return (SET_ERROR(EIO)); /* * Determine if the vdev is in use. */ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (SET_ERROR(EBUSY)); /* * If this is a request to add or replace a spare or l2cache device * that is in use elsewhere on the system, then we must update the * guid (which was initialized to a random value) to reflect the * actual GUID (which is shared between multiple pools). */ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && spare_guid != 0ULL) { uint64_t guid_delta = spare_guid - vd->vdev_guid; vd->vdev_guid += guid_delta; for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += guid_delta; /* * If this is a replacement, then we want to fallthrough to the * rest of the code. If we're adding a spare, then it's already * labeled appropriately and we can just return. */ if (reason == VDEV_LABEL_SPARE) return (0); ASSERT(reason == VDEV_LABEL_REPLACE || reason == VDEV_LABEL_SPLIT); } if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && l2cache_guid != 0ULL) { uint64_t guid_delta = l2cache_guid - vd->vdev_guid; vd->vdev_guid += guid_delta; for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += guid_delta; /* * If this is a replacement, then we want to fallthrough to the * rest of the code. If we're adding an l2cache, then it's * already labeled appropriately and we can just return. */ if (reason == VDEV_LABEL_L2CACHE) return (0); ASSERT(reason == VDEV_LABEL_REPLACE); } /* * Initialize its label. */ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); abd_zero(vp_abd, sizeof (vdev_phys_t)); vp = abd_to_buf(vp_abd); /* * Generate a label describing the pool and our top-level vdev. * We mark it as being from txg 0 to indicate that it's not * really part of an active pool just yet. The labels will * be written again with a meaningful txg by spa_sync(). */ if (reason == VDEV_LABEL_SPARE || (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { /* * For inactive hot spares, we generate a special label that * identifies as a mutually shared hot spare. We write the * label if we are adding a hot spare, or if we are removing an * active hot spare (in which case we want to revert the * labels). */ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_SPARE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } else if (reason == VDEV_LABEL_L2CACHE || (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { /* * For level 2 ARC devices, add a special label. 
*/ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_L2CACHE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } else { uint64_t txg = 0ULL; if (reason == VDEV_LABEL_SPLIT) txg = spa->spa_uberblock.ub_txg; label = spa_config_generate(spa, vd, txg, B_FALSE); /* * Add our creation time. This allows us to detect multiple * vdev uses as described above, and automatically expires if we * fail. */ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, crtxg) == 0); } buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error != 0) { nvlist_free(label); abd_free(vp_abd); /* EFAULT means nvlist_pack ran out of room */ return (SET_ERROR(error == EFAULT ? ENAMETOOLONG : EINVAL)); } /* * Initialize uberblock template. */ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); abd_zero(ub_abd, VDEV_UBERBLOCK_RING); abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); ub = abd_to_buf(ub_abd); ub->ub_txg = 0; /* Initialize the 2nd padding area. */ pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); abd_zero(pad2, VDEV_PAD_SIZE); /* * Write everything in parallel. */ retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); /* * Skip the 1st padding area. * Zero out the 2nd padding area where it might have * left over data from previous filesystem format. */ vdev_label_write(zio, vd, l, pad2, offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); vdev_label_write(zio, vd, l, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } error = zio_wait(zio); if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } nvlist_free(label); abd_free(pad2); abd_free(ub_abd); abd_free(vp_abd); /* * If this vdev hasn't been previously identified as a spare, then we * mark it as such only if a) we are labeling it as a spare, or b) it * exists as a spare elsewhere in the system. Do the same for * level 2 ARC devices. */ if (error == 0 && !vd->vdev_isspare && (reason == VDEV_LABEL_SPARE || spa_spare_exists(vd->vdev_guid, NULL, NULL))) spa_spare_add(vd); if (error == 0 && !vd->vdev_isl2cache && (reason == VDEV_LABEL_L2CACHE || spa_l2cache_exists(vd->vdev_guid, NULL))) spa_l2cache_add(vd); return (error); } /* * ========================================================================== * uberblock load/sync * ========================================================================== */ /* * Consider the following situation: txg is safely synced to disk. We've * written the first uberblock for txg + 1, and then we lose power. When we * come back up, we fail to see the uberblock for txg + 1 because, say, * it was on a mirrored device and the replica to which we wrote txg + 1 * is now offline. If we then make some changes and sync txg + 1, and then * the missing replica comes back, then for a few seconds we'll have two * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. 
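[Editor's note] The tie-break just described -- highest txg wins, equal txgs fall back to the later timestamp -- is what vdev_uberblock_compare() below implements via AVL_CMP. The same logic in a self-contained form, with a toy struct standing in for uberblock_t:

#include <stdint.h>

struct toy_ub {
	uint64_t txg;
	uint64_t timestamp;
};

static int
toy_ub_compare(const struct toy_ub *a, const struct toy_ub *b)
{
	if (a->txg != b->txg)			/* primary key: txg */
		return (a->txg < b->txg ? -1 : 1);
	if (a->timestamp != b->timestamp)	/* tie-break: timestamp */
		return (a->timestamp < b->timestamp ? -1 : 1);
	return (0);
}

Given ub1 = {txg 100, ts 5000} and ub2 = {txg 100, ts 6000}, ub2 compares greater and is kept, which resolves the power-loss scenario described above.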
*/ static int vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); if (likely(cmp)) return (cmp); return (AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp)); } struct ubl_cbdata { uberblock_t *ubl_ubbest; /* Best uberblock */ vdev_t *ubl_vd; /* vdev associated with the above */ }; static void vdev_uberblock_load_done(zio_t *zio) { vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = abd_to_buf(zio->io_abd); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* * Keep track of the vdev in which this uberblock * was found. We will use this information later * to obtain the config nvlist associated with * this uberblock. */ *cbp->ubl_ubbest = *ub; cbp->ubl_vd = vd; } mutex_exit(&rio->io_lock); } abd_free(zio->io_abd); } static void vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, struct ubl_cbdata *cbp) { for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); } } } } /* * Reads the 'best' uberblock from disk along with its associated * configuration. First, we read the uberblock array of each label of each * vdev, keeping track of the uberblock with the highest txg in each array. * Then, we read the configuration from the same vdev as the best uberblock. */ void vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) { zio_t *zio; spa_t *spa = rvd->vdev_spa; struct ubl_cbdata cb; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; ASSERT(ub); ASSERT(config); bzero(ub, sizeof (uberblock_t)); *config = NULL; cb.ubl_ubbest = ub; cb.ubl_vd = NULL; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); vdev_uberblock_load_impl(zio, rvd, flags, &cb); (void) zio_wait(zio); /* * It's possible that the best uberblock was discovered on a label * that has a configuration which was written in a future txg. * Search all labels on this vdev to find the configuration that * matches the txg for our uberblock. */ if (cb.ubl_vd != NULL) { vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " "Trying again without txg restrictions."); *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX); } if (*config == NULL) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config"); } } spa_config_exit(spa, SCL_ALL, FTAG); } /* * For use when a leaf vdev is expanded. * The location of labels 2 and 3 changed, and at the new location the * uberblock rings are either empty or contain garbage. The sync will write * new configs there because the vdev is dirty, but expansion also needs the * uberblock rings copied. Read them from label 0 which did not move. 
* * Since the point is to populate labels {2,3} with valid uberblocks, * we zero uberblocks we fail to read or which are not valid. */ static void vdev_copy_uberblocks(vdev_t *vd) { abd_t *ub_abd; zio_t *write_zio; int locks = (SCL_L2ARC | SCL_ZIO); int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_READER) == SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); write_zio = zio_root(vd->vdev_spa, NULL, NULL, flags); for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { const int src_label = 0; zio_t *zio; zio = zio_root(vd->vdev_spa, NULL, NULL, flags); vdev_label_read(zio, vd, src_label, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); if (zio_wait(zio) || uberblock_verify(abd_to_buf(ub_abd))) abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); for (int l = 2; l < VDEV_LABELS; l++) vdev_label_write(write_zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags | ZIO_FLAG_DONT_PROPAGATE); } (void) zio_wait(write_zio); spa_config_exit(vd->vdev_spa, locks, FTAG); abd_free(ub_abd); } /* * On success, increment root zio's count of good writes. * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). */ static void vdev_uberblock_sync_done(zio_t *zio) { uint64_t *good_writes = zio->io_private; if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) atomic_inc_64(good_writes); } /* * Write the uberblock to all labels of all leaves of the specified vdev. */ static void vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, uberblock_t *ub, vdev_t *vd, int flags) { for (uint64_t c = 0; c < vd->vdev_children; c++) { vdev_uberblock_sync(zio, good_writes, ub, vd->vdev_child[c], flags); } if (!vd->vdev_ops->vdev_op_leaf) return; if (!vdev_writeable(vd)) return; /* If the vdev was expanded, need to copy uberblock rings. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && vd->vdev_copy_uberblocks == B_TRUE) { vdev_copy_uberblocks(vd); vd->vdev_copy_uberblocks = B_FALSE; } int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0; int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); /* Copy the uberblock_t into the ABD */ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); for (int l = 0; l < VDEV_LABELS; l++) vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); abd_free(ub_abd); } /* Sync the uberblocks to all vdevs in svd[] */ int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { spa_t *spa = svd[0]->vdev_spa; zio_t *zio; uint64_t good_writes = 0; zio = zio_root(spa, NULL, NULL, flags); for (int v = 0; v < svdcount; v++) vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); (void) zio_wait(zio); /* * Flush the uberblocks to disk. This ensures that the odd labels * are no longer needed (because the new uberblocks and the even * labels are safely on disk), so it is safe to overwrite them. */ zio = zio_root(spa, NULL, NULL, flags); for (int v = 0; v < svdcount; v++) { if (vdev_writeable(svd[v])) { zio_flush(zio, svd[v]); } } (void) zio_wait(zio); return (good_writes >= 1 ? 0 : EIO); } /* * On success, increment the count of good writes for our top-level vdev. 
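[Editor's note] In vdev_uberblock_sync() above, the ring slot is ub_txg modulo the usable ring size, with MMP_BLOCKS_PER_LABEL slots held back on multihost pools. A sketch of just that arithmetic; the 128-slot ring below is an assumption (the real VDEV_UBERBLOCK_COUNT depends on ashift):

#include <stdint.h>

static int
toy_uberblock_slot(uint64_t txg, int ring_slots, int mmp_reserved)
{
	/* only the first (ring_slots - mmp_reserved) slots rotate with txg */
	return ((int)(txg % (uint64_t)(ring_slots - mmp_reserved)));
}

With 128 slots and one reserved, txg 300 maps to slot 300 % 127 = 46; without multihost it would be 300 % 128 = 44. Successive txgs therefore overwrite the oldest slot while the reserved tail stays free for MMP writes.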
*/ static void vdev_label_sync_done(zio_t *zio) { uint64_t *good_writes = zio->io_private; if (zio->io_error == 0) atomic_inc_64(good_writes); } /* * If there weren't enough good writes, indicate failure to the parent. */ static void vdev_label_sync_top_done(zio_t *zio) { uint64_t *good_writes = zio->io_private; if (*good_writes == 0) zio->io_error = SET_ERROR(EIO); kmem_free(good_writes, sizeof (uint64_t)); } /* * We ignore errors for log and cache devices, simply free the private data. */ static void vdev_label_sync_ignore_done(zio_t *zio) { kmem_free(zio->io_private, sizeof (uint64_t)); } /* * Write all even or odd labels to all leaves of the specified vdev. */ static void vdev_label_sync(zio_t *zio, uint64_t *good_writes, vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; abd_t *vp_abd; char *buf; size_t buflen; for (int c = 0; c < vd->vdev_children; c++) { vdev_label_sync(zio, good_writes, vd->vdev_child[c], l, txg, flags); } if (!vd->vdev_ops->vdev_op_leaf) return; if (!vdev_writeable(vd)) return; /* * Generate a label describing the top-level config to which we belong. */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); abd_zero(vp_abd, sizeof (vdev_phys_t)); vp = abd_to_buf(vp_abd); buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { for (; l < VDEV_LABELS; l += 2) { vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_label_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); } } abd_free(vp_abd); nvlist_free(label); } int vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) { list_t *dl = &spa->spa_config_dirty_list; vdev_t *vd; zio_t *zio; int error; /* * Write the new labels to disk. */ zio = zio_root(spa, NULL, NULL, flags); for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { uint64_t *good_writes; ASSERT(!vd->vdev_ishole); good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); zio_t *vio = zio_null(zio, spa, NULL, (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); vdev_label_sync(vio, good_writes, vd, l, txg, flags); zio_nowait(vio); } error = zio_wait(zio); /* * Flush the new labels to disk. */ zio = zio_root(spa, NULL, NULL, flags); for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) zio_flush(zio, vd); (void) zio_wait(zio); return (error); } /* * Sync the uberblock and any changes to the vdev configuration. * * The order of operations is carefully crafted to ensure that * if the system panics or loses power at any time, the state on disk * is still transactionally consistent. The in-line comments below * describe the failure semantics at each stage. * * Moreover, vdev_config_sync() is designed to be idempotent: if it fails * at any time, you can just call it again, and it will resume its work. */ int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; ASSERT(svdcount != 0); retry: /* * Normally, we don't want to try too hard to write every label and * uberblock. If there is a flaky disk, we don't want the rest of the * sync process to block while we retry. But if we can't write a * single label out, we should retry with ZIO_FLAG_TRYHARD before * bailing out and declaring the pool faulted. 
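 *
 * A condensed sketch of the retry logic implemented below:
 *
 *	retry:
 *		if (error != 0 && (flags & ZIO_FLAG_TRYHARD))
 *			return (error);		(give up; pool faulted)
 *		if (error != 0)
 *			flags |= ZIO_FLAG_TRYHARD;
 *		flush caches; sync even labels; sync uberblocks;
 *		sync odd labels; on any failure, record error, goto retry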
*/ if (error != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) return (error); flags |= ZIO_FLAG_TRYHARD; } ASSERT(ub->ub_txg <= txg); /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, * and the vdev configuration hasn't changed, * then there's nothing to do. */ if (ub->ub_txg < txg) { boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); if (!changed && list_is_empty(&spa->spa_config_dirty_list)) return (0); } if (txg > spa_freeze_txg(spa)) return (0); ASSERT(txg <= spa->spa_final_txg); /* * Flush the write cache of every disk that's been written to * in this transaction group. This ensures that all blocks * written in this txg will be committed to stable storage * before any uberblock that references them. */ zio_t *zio = zio_root(spa, NULL, NULL, flags); for (vdev_t *vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) zio_flush(zio, vd); (void) zio_wait(zio); /* * Sync out the even labels (L0, L2) for every dirty vdev. If the * system dies in the middle of this process, that's OK: all of the * even labels that made it to disk will be newer than any uberblock, * and will therefore be considered invalid. The odd labels (L1, L3), * which have not yet been touched, will still be valid. We flush * the new labels to disk to ensure that all even-label updates * are committed to stable storage before the uberblock update. */ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the even labels " "of dirty vdevs", error, spa_name(spa)); } goto retry; } /* * Sync the uberblocks to all vdevs in svd[]. * If the system dies in the middle of this step, there are two cases * to consider, and the on-disk state is consistent either way: * * (1) If none of the new uberblocks made it to disk, then the * previous uberblock will be the newest, and the odd labels * (which had not yet been touched) will be valid with respect * to that uberblock. * * (2) If one or more new uberblocks made it to disk, then they * will be the newest, and the even labels (which had all * been successfully committed) will be valid with respect * to the new uberblocks. */ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_uberblock_sync_list() returned error " "%d for pool '%s'", error, spa_name(spa)); } goto retry; } if (spa_multihost(spa)) mmp_update_uberblock(spa, ub); /* * Sync out odd labels for every dirty vdev. If the system dies * in the middle of this process, the even labels and the new * uberblocks will suffice to open the pool. The next time * the pool is opened, the first thing we'll do -- before any * user data is modified -- is mark every vdev dirty so that * all labels will be brought up to date. We flush the new labels * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. 
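 *
 * To summarize the ordering established above: (1) flush the write
 * caches of all dirty vdevs, (2) sync and flush the even labels,
 * (3) sync and flush the uberblocks, (4) sync and flush the odd labels;
 * any failure restarts the sequence with ZIO_FLAG_TRYHARD set.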
*/ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the odd labels of " "dirty vdevs", error, spa_name(spa)); } goto retry; } return (0); } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index e604f33c8389..579aa0380411 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1,1050 +1,1083 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * This general routine is responsible for generating all the different ZFS * ereports. The payload is dependent on the class, and which arguments are * supplied to the function: * * EREPORT POOL VDEV IO * block X X X * data X X * device X X * pool X * * If we are in a loading state, all errors are chained together by the same * SPA-wide ENA (Error Numeric Association). * * For isolated I/O requests, we get the ENA from the zio_t. The propagation * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want * to chain together all ereports associated with a logical piece of data. For * read I/Os, there are basically three 'types' of I/O, which form a roughly * layered diagram: * * +---------------+ * | Aggregate I/O | No associated logical data or device * +---------------+ * | * V * +---------------+ Reads associated with a piece of logical data. * | Read I/O | This includes reads on behalf of RAID-Z, * +---------------+ mirrors, gang blocks, retries, etc. * | * V * +---------------+ Reads associated with a particular device, but * | Physical I/O | no logical data. Issued as part of vdev caching * +---------------+ and I/O aggregation. * * Note that 'physical I/O' here is not the same terminology as used in the rest * of ZIO. Typically, 'physical I/O' simply means that there is no attached * blockpointer. But I/O with no associated block pointer can still be related * to a logical piece of data (i.e. RAID-Z requests). * * Purely physical I/O always have unique ENAs. They are not related to a * particular piece of logical data, and therefore cannot be chained together. * We still generate an ereport, but the DE doesn't correlate it with any * logical piece of data. When such an I/O fails, the delegated I/O requests * will issue a retry, which will trigger the 'real' ereport with the correct * ENA. * * We keep track of the ENA for a ZIO chain through the 'io_logical' member. * When a new logical I/O is issued, we set this to point to itself. 
Child I/Os * then inherit this pointer, so that when it is first set subsequent failures * will use the same ENA. For vdev cache fill and queue aggregation I/O, * this pointer is set to NULL, and no ereport will be generated (since it * doesn't actually correspond to any particular device or piece of data, * and the caller will always retry without caching or queueing anyway). * * For checksum errors, we want to include more information about the actual * error which occurs. Accordingly, we build an ereport when the error is * noticed, but instead of sending it in immediately, we hang it off of the * io_cksum_report field of the logical IO. When the logical IO completes * (successfully or not), zfs_ereport_finish_checksum() is called with the * good and bad versions of the buffer (if available), and we annotate the * ereport with information about the differences. */ #ifdef _KERNEL void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) { if (nvl) fm_nvlist_destroy(nvl, FM_NVA_FREE); if (detector) fm_nvlist_destroy(detector, FM_NVA_FREE); } /* * We want to rate limit ZIO delay and checksum events so as to not * flood ZED when a disk is acting up. * * Returns 1 if we're ratelimiting, 0 if not. */ static int zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) { int rc = 0; /* * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we * are. Invert it to get our return value. */ if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { rc = !zfs_ratelimit(&vd->vdev_delay_rl); } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { rc = !zfs_ratelimit(&vd->vdev_checksum_rl); } if (rc) { /* We're rate limiting */ fm_erpt_dropped_increment(); } return (rc); } -static void +/* + * Return B_TRUE if the event actually posted, B_FALSE if not. + */ +static boolean_t zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) { nvlist_t *ereport, *detector; uint64_t ena; char class[64]; - /* - * If we are doing a spa_tryimport() or in recovery mode, - * ignore errors. - */ - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || - spa_load_state(spa) == SPA_LOAD_RECOVER) - return; - - /* - * If we are in the middle of opening a pool, and the previous attempt - * failed, don't bother logging any new ereports - we're just going to - * get the same diagnosis anyway. - */ - if (spa_load_state(spa) != SPA_LOAD_NONE && - spa->spa_last_open_failed) - return; - - if (zio != NULL) { - /* - * If this is not a read or write zio, ignore the error. This - * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. - */ - if (zio->io_type != ZIO_TYPE_READ && - zio->io_type != ZIO_TYPE_WRITE) - return; - - if (vd != NULL) { - /* - * If the vdev has already been marked as failing due - * to a failed probe, then ignore any subsequent I/O - * errors, as the DE will automatically fault the vdev - * on the first such failure. This also catches cases - * where vdev_remove_wanted is set and the device has - * not yet been asynchronously placed into the REMOVED - * state. - */ - if (zio->io_vd == vd && !vdev_accessible(vd, zio)) - return; - - /* - * Ignore checksum errors for reads from DTL regions of - * leaf vdevs. 
- */ - if (zio->io_type == ZIO_TYPE_READ && - zio->io_error == ECKSUM && - vd->vdev_ops->vdev_op_leaf && - vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) - return; - } - } - - /* - * For probe failure, we want to avoid posting ereports if we've - * already removed the device in the meantime. - */ - if (vd != NULL && - strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && - (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) - return; - - if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && - (zio != NULL) && (!zio->io_timestamp)) { - /* Ignore bogus delay events */ - return; - } + if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) + return (B_FALSE); if ((ereport = fm_nvlist_create(NULL)) == NULL) - return; + return (B_FALSE); if ((detector = fm_nvlist_create(NULL)) == NULL) { fm_nvlist_destroy(ereport, FM_NVA_FREE); - return; + return (B_FALSE); } /* * Serialize ereport generation */ mutex_enter(&spa->spa_errlist_lock); /* * Determine the ENA to use for this event. If we are in a loading * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use * a root zio-wide ENA. Otherwise, simply use a unique ENA. */ if (spa_load_state(spa) != SPA_LOAD_NONE) { if (spa->spa_ena == 0) spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); ena = spa->spa_ena; } else if (zio != NULL && zio->io_logical != NULL) { if (zio->io_logical->io_ena == 0) zio->io_logical->io_ena = fm_ena_generate(0, FM_ENA_FMT1); ena = zio->io_logical->io_ena; } else { ena = fm_ena_generate(0, FM_ENA_FMT1); } /* * Construct the full class, detector, and other standard FMA fields. */ (void) snprintf(class, sizeof (class), "%s.%s", ZFS_ERROR_CLASS, subclass); fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), vd != NULL ? vd->vdev_guid : 0); fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); /* * Construct the per-ereport payload, depending on which parameters are * passed in. */ /* * Generic payload members common to all ereports. */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64, (uint64_t)spa_state(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, (int32_t)spa_load_state(spa), NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, DATA_TYPE_STRING, spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? FM_EREPORT_FAILMODE_WAIT : spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 
FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, NULL); if (vd != NULL) { vdev_t *pvd = vd->vdev_parent; vdev_queue_t *vq = &vd->vdev_queue; vdev_stat_t *vs = &vd->vdev_stat; vdev_t *spare_vd; uint64_t *spare_guids; char **spare_paths; int i, spare_count; fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, DATA_TYPE_UINT64, vd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); if (vd->vdev_path != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, DATA_TYPE_STRING, vd->vdev_path, NULL); if (vd->vdev_devid != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, DATA_TYPE_STRING, vd->vdev_devid, NULL); if (vd->vdev_fru != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, DATA_TYPE_STRING, vd->vdev_fru, NULL); if (vd->vdev_enc_sysfs_path != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL); if (vd->vdev_ashift) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, DATA_TYPE_UINT64, vd->vdev_ashift, NULL); if (vq != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS, DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS, DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL); } if (vs != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, DATA_TYPE_UINT64, vs->vs_read_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, DATA_TYPE_UINT64, vs->vs_write_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, - DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL); + DATA_TYPE_UINT64, vs->vs_checksum_errors, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, + DATA_TYPE_UINT64, vs->vs_slow_ios, + NULL); } if (pvd != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, DATA_TYPE_UINT64, pvd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, NULL); if (pvd->vdev_path) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, DATA_TYPE_STRING, pvd->vdev_path, NULL); if (pvd->vdev_devid) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, DATA_TYPE_STRING, pvd->vdev_devid, NULL); } spare_count = spa->spa_spares.sav_count; spare_paths = kmem_zalloc(sizeof (char *) * spare_count, KM_SLEEP); spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, KM_SLEEP); for (i = 0; i < spare_count; i++) { spare_vd = spa->spa_spares.sav_vdevs[i]; if (spare_vd) { spare_paths[i] = spare_vd->vdev_path; spare_guids[i] = spare_vd->vdev_guid; } } fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); kmem_free(spare_guids, sizeof (uint64_t) * spare_count); kmem_free(spare_paths, sizeof (char *) * spare_count); } if (zio != NULL) { /* * Payload common to all I/Os. 
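 * This covers the error, flags, stage and pipeline masks, and the
 * delay/timestamp/delta timing fields set immediately below.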
*/ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, DATA_TYPE_INT32, zio->io_error, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, DATA_TYPE_INT32, zio->io_flags, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, DATA_TYPE_UINT32, zio->io_stage, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, DATA_TYPE_UINT32, zio->io_pipeline, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, DATA_TYPE_UINT64, zio->io_delay, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP, DATA_TYPE_UINT64, zio->io_timestamp, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, DATA_TYPE_UINT64, zio->io_delta, NULL); /* * If the 'size' parameter is non-zero, it indicates this is a * RAID-Z or other I/O where the physical offset and length are * provided for us, instead of within the zio_t. */ if (vd != NULL) { if (size) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, DATA_TYPE_UINT64, stateoroffset, FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, DATA_TYPE_UINT64, size, NULL); else fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, DATA_TYPE_UINT64, zio->io_offset, FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, DATA_TYPE_UINT64, zio->io_size, NULL); } } else if (vd != NULL) { /* * If we have a vdev but no zio, this is a device fault, and the * 'stateoroffset' parameter indicates the previous state of the * vdev. */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, DATA_TYPE_UINT64, stateoroffset, NULL); } /* * Payload for I/Os with corresponding logical information. */ - if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) + if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, DATA_TYPE_UINT64, zb->zb_objset, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, DATA_TYPE_UINT64, zb->zb_object, FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, DATA_TYPE_INT64, zb->zb_level, FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, DATA_TYPE_UINT64, zb->zb_blkid, NULL); + } mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; *detector_out = detector; + return (B_TRUE); } /* if it's <= 128 bytes, save the corruption directly */ #define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) #define MAX_RANGES 16 typedef struct zfs_ecksum_info { /* histograms of set and cleared bits by bit number in a 64-bit word */ uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; /* inline arrays of bits set and cleared. */ uint64_t zei_bits_set[ZFM_MAX_INLINE]; uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; /* * for each range, the number of bits set and cleared. The Hamming * distance between the good and bad buffers is the sum of them all. */ uint32_t zei_range_sets[MAX_RANGES]; uint32_t zei_range_clears[MAX_RANGES]; struct zei_ranges { uint32_t zr_start; uint32_t zr_end; } zei_ranges[MAX_RANGES]; size_t zei_range_count; uint32_t zei_mingap; uint32_t zei_allowed_mingap; } zfs_ecksum_info_t; static void update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) { size_t i; size_t bits = 0; uint64_t value = BE_64(value_arg); /* We store the bits in big-endian (largest-first) order */ for (i = 0; i < 64; i++) { if (value & (1ull << i)) { hist[63 - i]++; ++bits; } } /* update the count of bits changed */ *count += bits; } /* * We've now filled up the range array, and need to increase "mingap" and * shrink the range list accordingly. zei_mingap is always the smallest * distance between array entries, so we set the new_allowed_gap to be * one greater than that. 
We then go through the list, joining together * any ranges which are closer than the new_allowed_gap. * * By construction, there will be at least one. We also update zei_mingap * to the new smallest gap, to prepare for our next invocation. */ static void zei_shrink_ranges(zfs_ecksum_info_t *eip) { uint32_t mingap = UINT32_MAX; uint32_t new_allowed_gap = eip->zei_mingap + 1; size_t idx, output; size_t max = eip->zei_range_count; struct zei_ranges *r = eip->zei_ranges; ASSERT3U(eip->zei_range_count, >, 0); ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); output = idx = 0; while (idx < max - 1) { uint32_t start = r[idx].zr_start; uint32_t end = r[idx].zr_end; while (idx < max - 1) { idx++; uint32_t nstart = r[idx].zr_start; uint32_t nend = r[idx].zr_end; uint32_t gap = nstart - end; if (gap < new_allowed_gap) { end = nend; continue; } if (gap < mingap) mingap = gap; break; } r[output].zr_start = start; r[output].zr_end = end; output++; } ASSERT3U(output, <, eip->zei_range_count); eip->zei_range_count = output; eip->zei_mingap = mingap; eip->zei_allowed_mingap = new_allowed_gap; } static void zei_add_range(zfs_ecksum_info_t *eip, int start, int end) { struct zei_ranges *r = eip->zei_ranges; size_t count = eip->zei_range_count; if (count >= MAX_RANGES) { zei_shrink_ranges(eip); count = eip->zei_range_count; } if (count == 0) { eip->zei_mingap = UINT32_MAX; eip->zei_allowed_mingap = 1; } else { int gap = start - r[count - 1].zr_end; if (gap < eip->zei_allowed_mingap) { r[count - 1].zr_end = end; return; } if (gap < eip->zei_mingap) eip->zei_mingap = gap; } r[count].zr_start = start; r[count].zr_end = end; eip->zei_range_count++; } static size_t zei_range_total_size(zfs_ecksum_info_t *eip) { struct zei_ranges *r = eip->zei_ranges; size_t count = eip->zei_range_count; size_t result = 0; size_t idx; for (idx = 0; idx < count; idx++) result += (r[idx].zr_end - r[idx].zr_start); return (result); } static zfs_ecksum_info_t * annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, const abd_t *goodabd, const abd_t *badabd, size_t size, boolean_t drop_if_identical) { const uint64_t *good; const uint64_t *bad; uint64_t allset = 0; uint64_t allcleared = 0; size_t nui64s = size / sizeof (uint64_t); size_t inline_size; int no_inline = 0; size_t idx; size_t range; size_t offset = 0; ssize_t start = -1; zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); /* don't do any annotation for injected checksum errors */ if (info != NULL && info->zbc_injected) return (eip); if (info != NULL && info->zbc_has_cksum) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, DATA_TYPE_UINT64_ARRAY, sizeof (info->zbc_expected) / sizeof (uint64_t), (uint64_t *)&info->zbc_expected, FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, DATA_TYPE_UINT64_ARRAY, sizeof (info->zbc_actual) / sizeof (uint64_t), (uint64_t *)&info->zbc_actual, FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, DATA_TYPE_STRING, info->zbc_checksum_name, NULL); if (info->zbc_byteswapped) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, DATA_TYPE_BOOLEAN, 1, NULL); } } if (badabd == NULL || goodabd == NULL) return (eip); ASSERT3U(nui64s, <=, UINT32_MAX); ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, <=, UINT32_MAX); good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size); bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size); /* build up the range list by comparing the two buffers. 
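 * As a worked example: if the good and bad buffers differ at 64-bit
 * word indices 3, 4, and 9, the loop below records the half-open
 * ranges [3,5) and [9,10); zei_add_range() merges ranges closer than
 * zei_allowed_mingap, and zei_shrink_ranges() coalesces further once
 * all MAX_RANGES slots are in use.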
*/ for (idx = 0; idx < nui64s; idx++) { if (good[idx] == bad[idx]) { if (start == -1) continue; zei_add_range(eip, start, idx); start = -1; } else { if (start != -1) continue; start = idx; } } if (start != -1) zei_add_range(eip, start, idx); /* See if it will fit in our inline buffers */ inline_size = zei_range_total_size(eip); if (inline_size > ZFM_MAX_INLINE) no_inline = 1; /* * If there is no change and we want to drop if the buffers are * identical, do so. */ if (inline_size == 0 && drop_if_identical) { kmem_free(eip, sizeof (*eip)); abd_return_buf((abd_t *)goodabd, (void *)good, size); abd_return_buf((abd_t *)badabd, (void *)bad, size); return (NULL); } /* * Now walk through the ranges, filling in the details of the * differences. Also convert our uint64_t-array offsets to byte * offsets. */ for (range = 0; range < eip->zei_range_count; range++) { size_t start = eip->zei_ranges[range].zr_start; size_t end = eip->zei_ranges[range].zr_end; for (idx = start; idx < end; idx++) { uint64_t set, cleared; // bits set in bad, but not in good set = ((~good[idx]) & bad[idx]); // bits set in good, but not in bad cleared = (good[idx] & (~bad[idx])); allset |= set; allcleared |= cleared; if (!no_inline) { ASSERT3U(offset, <, inline_size); eip->zei_bits_set[offset] = set; eip->zei_bits_cleared[offset] = cleared; offset++; } update_histogram(set, eip->zei_histogram_set, &eip->zei_range_sets[range]); update_histogram(cleared, eip->zei_histogram_cleared, &eip->zei_range_clears[range]); } /* convert to byte offsets */ eip->zei_ranges[range].zr_start *= sizeof (uint64_t); eip->zei_ranges[range].zr_end *= sizeof (uint64_t); } abd_return_buf((abd_t *)goodabd, (void *)good, size); abd_return_buf((abd_t *)badabd, (void *)bad, size); eip->zei_allowed_mingap *= sizeof (uint64_t); inline_size *= sizeof (uint64_t); /* fill in ereport */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, (uint32_t *)eip->zei_ranges, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, DATA_TYPE_UINT32, eip->zei_allowed_mingap, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, NULL); if (!no_inline) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_set, FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_cleared, NULL); } else { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, DATA_TYPE_UINT32_ARRAY, NBBY * sizeof (uint64_t), eip->zei_histogram_set, FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, DATA_TYPE_UINT32_ARRAY, NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, NULL); } return (eip); } #endif -void +/* + * Make sure our event is still valid for the given zio/vdev/pool. For example, + * we don't want to keep logging events for a faulted or missing vdev. + */ +boolean_t +zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) +{ +#ifdef _KERNEL + /* + * If we are doing a spa_tryimport() or in recovery mode, + * ignore errors. + */ + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) + return (B_FALSE); + + /* + * If we are in the middle of opening a pool, and the previous attempt + * failed, don't bother logging any new ereports - we're just going to + * get the same diagnosis anyway. 
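+ *
+ * (These checks were hoisted out of zfs_ereport_start() so that the
+ * validity test can be reused without allocating any nvlists first.)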
+ */ + if (spa_load_state(spa) != SPA_LOAD_NONE && + spa->spa_last_open_failed) + return (B_FALSE); + + if (zio != NULL) { + /* + * If this is not a read or write zio, ignore the error. This + * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. + */ + if (zio->io_type != ZIO_TYPE_READ && + zio->io_type != ZIO_TYPE_WRITE) + return (B_FALSE); + + if (vd != NULL) { + /* + * If the vdev has already been marked as failing due + * to a failed probe, then ignore any subsequent I/O + * errors, as the DE will automatically fault the vdev + * on the first such failure. This also catches cases + * where vdev_remove_wanted is set and the device has + * not yet been asynchronously placed into the REMOVED + * state. + */ + if (zio->io_vd == vd && !vdev_accessible(vd, zio)) + return (B_FALSE); + + /* + * Ignore checksum errors for reads from DTL regions of + * leaf vdevs. + */ + if (zio->io_type == ZIO_TYPE_READ && + zio->io_error == ECKSUM && + vd->vdev_ops->vdev_op_leaf && + vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) + return (B_FALSE); + } + } + + /* + * For probe failure, we want to avoid posting ereports if we've + * already removed the device in the meantime. + */ + if (vd != NULL && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && + (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) + return (B_FALSE); + + /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ + if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && + (zio != NULL) && (!zio->io_timestamp)) { + return (B_FALSE); + } +#endif + return (B_TRUE); +} + +/* + * Return 0 if event was posted, EINVAL if there was a problem posting it or + * EBUSY if the event was rate limited. + */ +int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) { + int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; if (zfs_is_ratelimiting_event(subclass, vd)) - return; + return (SET_ERROR(EBUSY)); - zfs_ereport_start(&ereport, &detector, subclass, spa, vd, - zb, zio, stateoroffset, size); + if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, + zb, zio, stateoroffset, size)) + return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) - return; + return (SET_ERROR(EINVAL)); /* Cleanup is handled by the callback function */ - zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); + rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); #endif + return (rc); } void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, void *arg, zio_bad_cksum_t *info) { zio_cksum_report_t *report; - #ifdef _KERNEL if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) return; #endif report = kmem_zalloc(sizeof (*report), KM_SLEEP); if (zio->io_vsd != NULL) zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); else zio_vsd_default_cksum_report(zio, report, arg); /* copy the checksum failure information if it was provided */ if (info != NULL) { report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); bcopy(info, report->zcr_ckinfo, sizeof (*info)); } report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; report->zcr_length = length; #ifdef _KERNEL zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); if (report->zcr_ereport == NULL) { zfs_ereport_free_checksum(report); return; } #endif mutex_enter(&spa->spa_errlist_lock); report->zcr_next = 
zio->io_logical->io_cksum_report; zio->io_logical->io_cksum_report = report; mutex_exit(&spa->spa_errlist_lock); } void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical) { #ifdef _KERNEL zfs_ecksum_info_t *info; info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, good_data, bad_data, report->zcr_length, drop_if_identical); if (info != NULL) zfs_zevent_post(report->zcr_ereport, report->zcr_detector, zfs_zevent_post_cb); else zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector); report->zcr_ereport = report->zcr_detector = NULL; if (info != NULL) kmem_free(info, sizeof (*info)); #endif } void zfs_ereport_free_checksum(zio_cksum_report_t *rpt) { #ifdef _KERNEL if (rpt->zcr_ereport != NULL) { fm_nvlist_destroy(rpt->zcr_ereport, FM_NVA_FREE); fm_nvlist_destroy(rpt->zcr_detector, FM_NVA_FREE); } #endif rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); if (rpt->zcr_ckinfo != NULL) kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); kmem_free(rpt, sizeof (*rpt)); } -void +int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) { + int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; zfs_ecksum_info_t *info; - zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, - spa, vd, zb, zio, offset, length); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) + return (EBUSY); - if (ereport == NULL) - return; + if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, + spa, vd, zb, zio, offset, length) || (ereport == NULL)) { + return (SET_ERROR(EINVAL)); + } info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, B_FALSE); if (info != NULL) { - zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); + rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); kmem_free(info, sizeof (*info)); } #endif + return (rc); } /* * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h * and are designed to be consumed by the ZFS Event Daemon (ZED). For * additional details refer to the zed(8) man page. 
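 * For example, the helpers at the bottom of this file compose classes
 * such as "resource.fs.zfs.statechange" from the "%s.%s.%s" format in
 * zfs_event_create() below.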
*/ nvlist_t * zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux) { nvlist_t *resource = NULL; #ifdef _KERNEL char class[64]; if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) return (NULL); if ((resource = fm_nvlist_create(NULL)) == NULL) return (NULL); (void) snprintf(class, sizeof (class), "%s.%s.%s", type, ZFS_ERROR_CLASS, name); VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa))); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa))); VERIFY0(nvlist_add_int32(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); if (vd) { VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); if (vd->vdev_path != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path)); if (vd->vdev_devid != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid)); if (vd->vdev_fru != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru)); if (vd->vdev_enc_sysfs_path != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path)); } /* also copy any optional payload data */ if (aux) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) (void) nvlist_add_nvpair(resource, elem); } #endif return (resource); } static void zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux) { #ifdef _KERNEL nvlist_t *resource; resource = zfs_event_create(spa, vd, type, name, aux); if (resource) zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); #endif } /* * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev * has been removed from the system. This will cause the DE to ignore any * recent I/O errors, inferring that they are due to the asynchronous device * removal. */ void zfs_post_remove(spa_t *spa, vdev_t *vd) { zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL); } /* * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool * has the 'autoreplace' property set, and therefore any broken vdevs will be * handled by higher level logic, and no vdev fault should be generated. */ void zfs_post_autoreplace(spa_t *spa, vdev_t *vd) { zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL); } /* * The 'resource.fs.zfs.statechange' event is an internal signal that the * given vdev has transitioned its state to DEGRADED or HEALTHY. This will * cause the retire agent to repair any outstanding fault management cases * open because the device was not found (fault.fs.zfs.device). 
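 *
 * The optional aux payload assembled below carries the vdev's physical
 * path and enclosure sysfs path (when present), along with the previous
 * state as FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE.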
*/ void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) { #ifdef _KERNEL nvlist_t *aux; /* * Add optional supplemental keys to payload */ aux = fm_nvlist_create(NULL); if (vd && aux) { if (vd->vdev_physpath) { (void) nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, vd->vdev_physpath); } if (vd->vdev_enc_sysfs_path) { (void) nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path); } (void) nvlist_add_uint64(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); } zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE, aux); if (aux) fm_nvlist_destroy(aux, FM_NVA_FREE); #endif } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_ereport_post); +EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_autoreplace); EXPORT_SYMBOL(zfs_post_state_change); #endif /* _KERNEL */ diff --git a/module/zfs/zio.c b/module/zfs/zio.c index e8c2ca89aff9..6f1aa640dadb 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1,4847 +1,4867 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. 
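 * (The Linux kernel truncates thread names to about 15 characters, so
 * the short forms below leave room for per-taskq suffixes such as
 * "z_wr_iss".)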
*/ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl" }; int zio_dva_throttle_enabled = B_TRUE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif -int zio_delay_max = ZIO_DELAY_MAX; +/* Mark IOs as "slow" if they take longer than 30 seconds */ +int zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. */ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG int zio_buf_debug_limit = 16384; #else int zio_buf_debug_limit = 0; #endif static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); void zio_init(void) { size_t c; vmem_t *data_alloc_arena = NULL; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* * For small buffers, we want a cache for each multiple of * SPA_MINBLOCKSIZE. For larger buffers, we want a cache * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; #if defined(_ILP32) && defined(_KERNEL) /* * Cache size limited to 1M on 32-bit platforms until ARC * buffers no longer require virtual address space. */ if (size > zfs_max_recordsize) break; #endif while (!ISP2(p2)) p2 &= p2 - 1; #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. 
*/ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; /* * Here's the problem - on 4K native devices in userland on * Linux using O_DIRECT, buffers must be 4K aligned or I/O * will fail with EINVAL, causing zdb (and others) to coredump. * Since userland probably doesn't need optimized buffer caches, * we just force 4K alignment on everything. */ align = 8 * SPA_MINBLOCKSIZE; #else if (size < PAGESIZE) { align = SPA_MINBLOCKSIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = PAGESIZE; } #endif if (align != 0) { char name[36]; (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, data_alloc_arena, cflags); } } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } zio_inject_init(); lz4_init(); } void zio_fini(void) { size_t c; kmem_cache_t *last_cache = NULL; kmem_cache_t *last_data_cache = NULL; for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { #ifdef _ILP32 /* * Cache size limited to 1M on 32-bit platforms until ARC * buffers no longer require virtual address space. */ if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize) break; #endif #if defined(ZFS_DEBUG) && !defined(_KERNEL) if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c]) (void) printf("zio_fini: [%d] %llu != %llu\n", (int)((c + 1) << SPA_MINBLOCKSHIFT), (long long unsigned)zio_buf_cache_allocs[c], (long long unsigned)zio_buf_cache_frees[c]); #endif if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; kmem_cache_destroy(zio_buf_cache[c]); } zio_buf_cache[c] = NULL; if (zio_data_buf_cache[c] != last_data_cache) { last_data_cache = zio_data_buf_cache[c]; kmem_cache_destroy(zio_data_buf_cache[c]); } zio_data_buf_cache[c] = NULL; } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); lz4_fini(); } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif kmem_cache_free(zio_buf_cache[c], buf); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); kmem_cache_free(zio_data_buf_cache[c], buf); } static void zio_abd_free(void *abd, size_t size) { abd_free((abd_t *)abd); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); /* * Ensure that anyone expecting this zio to contain a linear ABD isn't * going to get a nasty surprise when they try to access the data. */ IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks, decompression, and decryption * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, tmp, zio->io_size, size); abd_return_buf_copy(data, tmp, size); if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } static void zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) { int ret; void *tmp; blkptr_t *bp = zio->io_bp; spa_t *spa = zio->io_spa; uint64_t dsobj = zio->io_bookmark.zb_objset; uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t ot = BP_GET_TYPE(bp); uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(size, !=, 0); if (zio->io_error != 0) return; /* * Verify the cksum of MACs stored in an indirect bp. It will always * be possible to verify this since it does not require an encryption * key. 
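 * Note that the MAC covers the decompressed contents, which is why a
 * compressed indirect block is transiently decompressed into a scratch
 * buffer below before the checksum is recomputed.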
*/ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { zio_crypt_decode_mac_bp(bp, mac); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* * We haven't decompressed the data yet, but * zio_crypt_do_indirect_mac_checksum() requires * decompressed data to be able to parse out the MACs * from the indirect block. We decompress it now and * throw away the result after we are finished. */ tmp = zio_buf_alloc(lsize); ret = zio_decompress_data(BP_GET_COMPRESS(bp), zio->io_abd, tmp, zio->io_size, lsize); if (ret != 0) { ret = SET_ERROR(EIO); goto error; } ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); zio_buf_free(tmp, lsize); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); } abd_copy(data, zio->io_abd, size); if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } if (ret != 0) goto error; return; } /* * If this is an authenticated block, just check the MAC. It would be * nice to separate this out into its own flag, but for the moment * enum zio_flag is out of bits. */ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); } else { zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, mac); if (zio_injection_enabled && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } } abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; } zio_crypt_decode_params_bp(bp, salt, iv); if (ot == DMU_OT_INTENT_LOG) { tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmp, mac); abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, mac); } ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, zio->io_abd, &no_crypt); if (no_crypt) abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; error: /* assert that the key was found unless this was speculative */ ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); /* * If there was a decryption / authentication error return EIO as * the io_error. If this was not a speculative zio, create an ereport. */ if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark); zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0, 0); } } else { zio->io_error = ret; } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; ASSERT(MUTEX_HELD(&pio->io_lock)); *zl = (*zl == NULL) ? 
list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); pio->io_child_count++; cio->io_parent_count++; mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); pio->io_child_count--; cio->io_parent_count--; mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } __attribute__((always_inline)) static inline void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * If we can tell the caller to execute this parent next, do * so. Otherwise dispatch the parent zio as its own task. * * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch * overhead, and has no recursion penalty. Note that one * read from disk typically causes at least 3 zio's: a * zio_null(), the logical zio_read(), and then a physical * zio. When the physical ZIO completes, we are able to call * zio_done() on all 3 of these zio's from one invocation of * zio_execute() by returning the parent back to * zio_execute(). Since the parent isn't executed until this * thread returns back to zio_execute(), the caller should do * so promptly. 
* * In other cases, dispatching the parent prevents * overflowing the stack when we have deeply nested * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. */ if (next_to_executep != NULL && *next_to_executep == NULL) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); } } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = 
flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { if (zio->io_metaslab_class == NULL) zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } taskq_init_ent(&zio->io_tqent); return (zio); } static void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) { if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { zfs_panic_recover("blkptr at %p has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Do not verify individual DVAs if the config is not trusted. This * will be done once the zio is executed in vdev_mirror_map_alloc. */ if (!spa->spa_trust_config) return; /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); if (vdevid >= spa->spa_root_vdev->vdev_children) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { zfs_panic_recover("blkptr at %p DVA %u has hole " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. 
*/ continue; } uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "OFFSET %llu", bp, i, (longlong_t)offset); } } } boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) { uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) return (B_FALSE); vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) return (B_FALSE); if (vd->vdev_ops == &vdev_hole_ops) return (B_FALSE); if (vd->vdev_ops == &vdev_missing_ops) { return (B_FALSE); } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) return (B_FALSE); return (B_TRUE); } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; zfs_blkptr_verify(spa, bp); zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_children_ready = children_ready; zio->io_physdone = physdone; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). Encrypted * dedup blocks need data as well so we also disable dedup in this * case. 
*/ if (data == NULL && (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { zfs_blkptr_verify(spa, bp); /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. */ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0))); } } zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, enum zio_flag flags) { zio_t *zio; enum zio_stage stage = ZIO_FREE_PIPELINE; ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); metaslab_check_free(spa, bp); arc_freed(spa, bp); dsl_scan_freed(spa, bp); /* * GANG and DEDUP blocks can induce a read (for the gang block header, * or the DDT), so issue them asynchronously so that this thread is * not tied up. */ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) stage |= ZIO_STAGE_ISSUE_ASYNC; zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); return (zio); } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; zfs_blkptr_verify(spa, bp); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. 
* * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, done, private, flags)); } return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; /* * vdev child I/Os do not propagate their error to the parent. * Therefore, for correct operation the caller *must* check for * and handle the error in the child i/o's done callback. * The only exceptions are i/os that we don't care about * (OPTIONAL or REPAIR). 
*/ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { ASSERT0(vd->vdev_children); offset += VDEV_LABEL_START_SIZE; } flags |= ZIO_VDEV_CHILD_FLAGS(pio); /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) zio->io_logical->io_phys_children++; return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3U(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || BP_HAS_INDIRECT_MAC_CKSUM(bp)) && zio->io_child_type == ZIO_CHILD_LOGICAL) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decrypt); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (zio); } static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(bp->blk_birth != zio->io_txg); ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) return (zio); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (zio); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (zio); } static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; int pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (NULL); } if (!IO_IS_ALLOCATING(zio)) return (zio); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). 
For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (zio); } else { /* * Round up compressed size up to the ashift * of the smallest-ashift device, and zero the tail. * This ensures that the compressed size of the BP * (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); size_t rounded = (size_t)P2ROUNDUP(psize, 1ULL << spa->spa_min_ashift); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cdata, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && zp->zp_type == DMU_OT_DNODE) { /* * The DMU actually relies on the zio layer's compression * to free metadnode blocks that have had all contained * dnodes freed. As a result, even when doing a raw * receive, we must check whether the block can be compressed * to a hole. */ psize = zio_compress_data(ZIO_COMPRESS_EMPTY, zio->io_abd, NULL, lsize); if (psize == 0) compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. 
*/ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!zp->zp_encrypt || DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (zio); } static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); return (zio); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if (zio->io_priority == ZIO_PRIORITY_NOW && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. 
*/ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { kthread_t *executor = zio->io_executor; spa_t *spa = zio->io_spa; for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (taskq_member(tqs->stqs_taskq[i], executor)) return (B_TRUE); } } return (B_FALSE); } static zio_t * zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } void zio_interrupt(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; clock_t expire_at_tick = ddi_get_lbolt() + NSEC_TO_TICK(diff); DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); if (NSEC_TO_TICK(diff) == 0) { /* Our delay is less than a jiffy - just spin */ zfs_sleep_until(zio->io_target_timestamp); } else { /* * Use taskq_dispatch_delay() in the place of * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, (task_func_t *)zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just * finish the zio without a delay. 
 */
					zio_interrupt(zio);
				}
			}
		}
		return;
	}
#endif
	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
	zio_interrupt(zio);
}

static void
zio_deadman_impl(zio_t *pio)
{
	zio_t *cio, *cio_next;
	zio_link_t *zl = NULL;
	vdev_t *vd = pio->io_vd;

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
		vdev_queue_t *vq = &vd->vdev_queue;
		zbookmark_phys_t *zb = &pio->io_bookmark;
		uint64_t delta = gethrtime() - pio->io_timestamp;
		uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);

		zfs_dbgmsg("slow zio: zio=%p timestamp=%llu "
		    "delta=%llu queued=%llu io=%llu "
		    "path=%s last=%llu "
		    "type=%d priority=%d flags=0x%x "
		    "stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
		    "objset=%llu object=%llu level=%llu blkid=%llu "
		    "offset=%llu size=%llu error=%d",
		    pio, pio->io_timestamp, delta, pio->io_delta,
		    pio->io_delay, vd->vdev_path, vq->vq_io_complete_ts,
		    pio->io_type, pio->io_priority, pio->io_flags,
		    pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
		    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
		    pio->io_offset, pio->io_size, pio->io_error);

		zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
		    pio->io_spa, vd, zb, pio, 0, 0);

		if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
		    taskq_empty_ent(&pio->io_tqent)) {
			zio_interrupt(pio);
		}
	}

	mutex_enter(&pio->io_lock);
	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio, &zl);
		zio_deadman_impl(cio);
	}
	mutex_exit(&pio->io_lock);
}

/*
 * Log the critical information describing this zio and all of its children
 * using the zfs_dbgmsg() interface, then post a deadman event for the ZED.
 */
void
zio_deadman(zio_t *pio, char *tag)
{
	spa_t *spa = pio->io_spa;
	char *name = spa_name(spa);

	if (!zfs_deadman_enabled || spa_suspended(spa))
		return;

	zio_deadman_impl(pio);

	switch (spa_get_deadman_failmode(spa)) {
	case ZIO_FAILURE_MODE_WAIT:
		zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name);
		break;

	case ZIO_FAILURE_MODE_CONTINUE:
		zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name);
		break;

	case ZIO_FAILURE_MODE_PANIC:
		fm_panic("%s determined I/O to pool '%s' is hung.", tag, name);
		break;
	}
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait_io().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

/*
 * zio_execute() is a wrapper around the static function
 * __zio_execute() so that we can force __zio_execute() to be
 * inlined.  This reduces stack overhead which is important
 * because __zio_execute() is called recursively in several zio
 * code paths.  zio_execute() itself cannot be inlined because
 * it is externally visible.
 */
void
zio_execute(zio_t *zio)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	__zio_execute(zio);
	spl_fstrans_unmark(cookie);
}

/*
 * Used to determine if in the current context the stack is sized large
 * enough to allow zio_execute() to be called recursively.  A minimum
 * stack size of 16K is required to avoid needing to re-dispatch the zio.
 */
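/*
 * For example (illustrative only): on a kernel built without
 * HAVE_LARGE_STACKS, a zio executed from the txg_sync thread is not run
 * recursively on the sync thread's stack; instead __zio_execute() takes
 * this path:
 *
 *	__zio_execute(zio)
 *	    zio_execute_stack_check(zio)	returns B_TRUE
 *	    zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut)
 *	    return				(an issue thread with a
 *						 fresh stack resumes the
 *						 pipeline via zio_execute())
 */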
*/ boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) dsl_pool_t *dp = spa_get_dsl(zio->io_spa); /* Executing in txg_sync_thread() context. */ if (dp && curthread == dp->dp_tx.tx_sync_thread) return (B_TRUE); /* Pool initialization outside of zio_taskq context. */ if (dp && spa_is_initializing(dp->dp_spa) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); } __attribute__((always_inline)) static inline void __zio_execute(zio_t *zio) { ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* * If the current context doesn't have large enough stacks * the zio must be issued asynchronously to prevent overflow. */ if (zio_execute_stack_check(zio)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; /* * The zio pipeline stage returns the next zio to execute * (typically the same as this one), or NULL if we should * stop. */ zio = zio_pipeline[highbit64(stage) - 1](zio); if (zio == NULL) return; } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) { error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, ddi_get_lbolt() + timeout); if (zfs_deadman_enabled && error == -1 && gethrtime() - zio->io_queued_timestamp > spa_deadman_ziotime(zio->io_spa)) { mutex_exit(&zio->io_lock); timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); zio_deadman(zio, FTAG); mutex_enter(&zio->io_lock); } } mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && zio_unique_parent(zio) == NULL) { zio_t *pio; /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. 
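 */

	/*
	 * Note: the godfather zios are created per-CPU at pool activation
	 * (spa_activate() fills spa_async_zio_root[]); using the current
	 * CPU's root below simply spreads async children across several
	 * parents so that a single parent's io_lock doesn't become a
	 * point of contention.
	 */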
*/ spa_t *spa = zio->io_spa; kpreempt_disable(); pio = spa->spa_async_zio_root[CPU_SEQID]; kpreempt_enable(); zio_add_child(pio, zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(zio_t *pio) { zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zio_link_t *zl = NULL; mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); } mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on it. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " "failure and has been suspended.\n", spa_name(spa)); zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = reason; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. 
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = ZIO_SUSPEND_NONE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.
This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_put(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_put(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. 
*/ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } /* ARGSUSED */ static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(zio->io_child_count == 0); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_put(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
*/ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (zio); } static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; ASSERTV(zio_t *gio = zio->io_gang_leader); if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { /* * The io_abd field will be NULL for a zio with no data. The io_flags * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) abd_put(zio->io_abd); } static zio_t * zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; int gbh_copies; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* * encrypted blocks need DVA[2] free so encrypted gang headers can't * have a third copy. */ gbh_copies = MIN(copies + 1, spa_max_replication(spa)); if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP) gbh_copies = SPA_DVAS_PER_BP - 1; int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator], pio)); /* * The logical zio has already placed a reservation for * 'copies' allocation slots but gang blocks may require * additional copies. These additional copies * (i.e. 
gbh_copies - copies) are guaranteed to succeed * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * If we failed to allocate the gang block header then * we remove any additional allocation reservations that * we placed here. The original reservation will * be removed when the logical I/O goes to the ready * stage. */ metaslab_class_throttle_unreserve(mc, gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; return (pio); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; bzero(zp.zp_salt, ZIO_DATA_SALT_LEN); bzero(zp.zp_iv, ZIO_DATA_IV_LEN); bzero(zp.zp_mac, ZIO_DATA_MAC_LEN); zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp, zio_write_gang_member_ready, NULL, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * Gang children won't throttle but we should * account for their work, so reserve an allocation * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; /* * We didn't allocate this bp, so make sure it doesn't get unmarked. */ pio->io_flags &= ~ZIO_FLAG_FASTWRITE; zio_nowait(zio); return (pio); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. 
To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (zio); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, sizeof (uint64_t)) == 0); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (zio); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_abd == NULL) dde->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (zio); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (zio); } 
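/* * A note on the stage-rewind idiom used by zio_ddt_read_done() below, sketched under the one-hot stage encoding from zio_impl.h: zio_execute() advances by shifting io_stage left until it reaches the next bit set in io_pipeline, so storing zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; before dispatching makes the zio re-enter ZIO_STAGE_DDT_READ_START on its next pass through the pipeline. zio_vdev_io_reissue() and zio_vdev_io_bypass() rely on the same trick. */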
static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (zio); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these * cases we can compare the io_abd directly. This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). */ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL && do_raw) { return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); } else if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) return (B_TRUE); ddt_exit(ddt); tmpabd = abd_alloc_for_io(psize, B_TRUE); error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, 
zio->io_bp); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { zio_link_t *zl = NULL; while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static void zio_ddt_ditto_write_done(zio_t *zio) { int p = DDT_PHYS_DITTO; ASSERTV(zio_prop_t *zp = &zio->io_prop); blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_key_t *ddk = &dde->dde_key; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); if (ddp->ddp_phys_birth != 0) ddt_phys_free(ddt, ddk, ddp, zio->io_txg); ddt_phys_fill(ddp, bp); } ddt_exit(ddt); } static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; int ditto_copies; zio_t *cio = NULL; zio_t *dio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; } zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); ASSERT(ditto_copies < SPA_DVAS_PER_BP); if (ditto_copies > ddt_ditto_copies_present(dde) && dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { zio_prop_t czp = *zp; czp.zp_copies = ditto_copies; /* * If we arrived here with an override bp, we won't have run * the transform stack, so we won't have the data we need to * generate a child i/o. So, toss the override bp and restart. * This is safe, because using the override bp is just an * optimization; and it's rare, so the cost doesn't matter. 
*/ if (zio->io_bp_override) { zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; zio->io_pipeline = ZIO_WRITE_PIPELINE; zio->io_bp_override = NULL; BP_ZERO(bp); ddt_exit(ddt); return (zio); } dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); if (cio) zio_nowait(cio); if (dio) zio_nowait(dio); return (zio); } ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { ddp = ddt_phys_select(dde, bp); if (ddp) ddt_phys_decref(ddp); } ddt_exit(ddt); return (zio); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); zio = avl_first(&spa->spa_alloc_trees[allocator]); if (zio == NULL) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { return (NULL); } avl_remove(&spa->spa_alloc_trees[allocator], zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); } static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; metaslab_class_t *mc; /* locate an appropriate allocation class */ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (zio); } ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); zbookmark_phys_t *bm = &zio->io_bookmark; /* * We want to try to use as many allocators as possible to help improve * performance, but we also want logically adjacent IOs to be physically * adjacent to improve sequential read performance. We chunk each object * into 2^20 block regions, and then hash based on the objset, object, * level, and region to accomplish both of these goals. 
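* For example (spa_alloc_count of 4 assumed for illustration): blkids 0x00000 and 0xfffff in the same object both give (zb_blkid >> 20) == 0 and so hash to the same allocator, while blkid 0x100000 lands in the next 2^20-block region and may be assigned a different one.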
*/ zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio->io_metaslab_class = mc; avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); nio = zio_io_to_allocate(spa, zio->io_allocator); mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); return (nio); } static void zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; mutex_enter(&spa->spa_alloc_locks[allocator]); zio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_alloc_locks[allocator]); if (zio == NULL) return; ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); ASSERT0(zio->io_error); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; /* * if not already chosen, locate an appropriate allocation class */ mc = zio->io_metaslab_class; if (mc == NULL) { mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); zio->io_metaslab_class = mc; } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); /* * Fallback to normal class when an alloc class is full */ if (error == ENOSPC && mc != spa_normal_class(spa)) { /* * If throttling, transfer reservation over to normal class. * The io_allocator slot can remain the same even though we * are switching classes. */ if (mc->mc_alloc_throttle_enabled && (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { metaslab_class_throttle_unreserve(mc, zio->io_prop.zp_copies, zio->io_allocator, zio); zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; mc = spa_normal_class(spa); VERIFY(metaslab_class_throttle_reserve(mc, zio->io_prop.zp_copies, zio->io_allocator, zio, flags | METASLAB_MUST_RESERVE)); } else { mc = spa_normal_class(spa); } zio->io_metaslab_class = mc; error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); } if (error != 0) { zfs_dbgmsg("%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) return (zio_write_gang_block(zio)); zio->io_error = error; } return (zio); } static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (zio); } static zio_t * zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (zio); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. 
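* For a gang block, the recursion below walks the in-core gang tree, freeing the gang header's DVAs at each interior node and the data DVAs at the leaves; holes (children that were never written) are skipped.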
*/ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); /* * Block pointer fields are useful to metaslabs for stats and debugging. * Fill in the obvious ones before calling into metaslab_alloc(). */ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_PSIZE(new_bp, size); BP_SET_LEVEL(new_bp, 0); /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL, cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); if (error == 0) { *slog = TRUE; } else { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL, cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); if (error == 0) *slog = FALSE; } metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); /* * encrypted blocks will require an IV and salt. We generate * these now since we will not be rewriting the bp at * rewrite time. */ if (os->os_encrypted) { uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t salt[ZIO_DATA_SALT_LEN]; BP_SET_CRYPT(new_bp, B_TRUE); VERIFY0(spa_crypt_get_salt(spa, dmu_objset_id(os), salt)); VERIFY0(zio_crypt_generate_iv(iv)); zio_crypt_encode_params_bp(new_bp, salt, iv); } } else { zfs_dbgmsg("%s: zil block allocation failure: " "size %llu, error %d", spa_name(spa), size, error); } return (error); } /* * ========================================================================== * Read and write to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. 
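* Concretely, every "return (NULL)" in zio_vdev_io_start() hands the zio off to a lower layer, which must eventually restart the pipeline by calling zio_execute() or zio_interrupt() on the active I/O.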
*/ static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; zio->io_delay = 0; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (NULL); } ASSERT3P(zio->io_logical, !=, zio); if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(spa->spa_trust_config); /* * Note: the code can handle other kinds of writes, * but we don't expect them. */ if (zio->io_vd->vdev_removing) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering. * * There are a few ways that we can end up creating these spurious * resilver i/os: * * 1. A resilver i/o will be issued if any DVA in the BP has a * dirty DTL. The mirror code will issue resilver writes to * each DVA, including the one(s) that are not on vdevs with dirty * DTLs. * * 2. With nested replication, which happens when we have a * "replacing" or "spare" vdev that's a child of a mirror or raidz. * For example, given mirror(replacing(A+B), C), it's likely that * only A is out of date (it's the new device). In this case, we'll * read from C, then use the data to resilver A+B -- but we don't * actually want to resilver B, just A. The top-level mirror has no * way to know this, so instead we just discard unnecessary repairs * as we work our way down the vdev tree. * * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. * The same logic applies to any form of nested replication: ditto * + mirror, RAID-Z + replacing, etc. * * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. 
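* Restated as the predicate evaluated below: a repair write is bypassed only when (io_flags & ZIO_FLAG_IO_REPAIR) && !(io_flags & ZIO_FLAG_SELF_HEAL) && io_txg != 0 && vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, io_txg, 1).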
*/ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) return (zio); if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (NULL); } zio->io_delay = gethrtime(); } vd->vdev_ops->vdev_op_io_start(zio); return (NULL); } static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); } /* * This function is used to change the priority of an existing zio that is * currently in-flight. This is used by the arc to upgrade priority in the * event that a demand read is made for a block that is currently queued * as a scrub or async read IO. Otherwise, the high priority read request * would end up having to wait for the lower priority IO. */ void zio_change_priority(zio_t *pio, zio_priority_t priority) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { vdev_queue_change_io_priority(pio, priority); } else { pio->io_priority = priority; } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_change_priority(cio, priority); } mutex_exit(&pio->io_lock); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. 
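* RAID-Z instead installs its own io_vsd_ops (vdev_raidz_vsd_ops in vdev_raidz.c) so the expected data can be reconstructed from parity when the report is finished.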
*/ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } /*ARGSUSED*/ void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = abd; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_abd_free; } static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (NULL); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vd->vdev_cant_write = B_TRUE; } /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent bit so * that we don't bother with it in the future. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && zio->io_physdone != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); zio->io_physdone(zio->io_logical); } return (zio); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Encrypt and store encryption parameters * ========================================================================== */ /* * This function is used for ZIO_STAGE_ENCRYPT. 
It is responsible for * managing the storage of encryption parameters and passing them to the * lower-level encryption functions. */ static zio_t * zio_encrypt(zio_t *zio) { zio_prop_t *zp = &zio->io_prop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_GET_PSIZE(bp); uint64_t dsobj = zio->io_bookmark.zb_objset; dmu_object_type_t ot = BP_GET_TYPE(bp); void *enc_buf = NULL; abd_t *eabd = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* the root zio already encrypted the data */ if (zio->io_child_type == ZIO_CHILD_GANG) return (zio); /* only ZIL blocks are re-encrypted on rewrite */ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) return (zio); if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { BP_SET_CRYPT(bp, B_FALSE); return (zio); } /* if we are doing raw encryption set the provided encryption params */ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { ASSERT0(BP_GET_LEVEL(bp)); BP_SET_CRYPT(bp, B_TRUE); BP_SET_BYTEORDER(bp, zp->zp_byteorder); if (ot != DMU_OT_OBJSET) zio_crypt_encode_mac_bp(bp, zp->zp_mac); /* dnode blocks must be written out in the provided byteorder */ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && ot == DMU_OT_DNODE) { void *bswap_buf = zio_buf_alloc(psize); abd_t *babd = abd_get_from_buf(bswap_buf, psize); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); abd_copy_to_buf(bswap_buf, zio->io_abd, psize); dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, psize); abd_take_ownership_of_buf(babd, B_TRUE); zio_push_transform(zio, babd, psize, psize, NULL); } if (DMU_OT_IS_ENCRYPTED(ot)) zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); return (zio); } /* indirect blocks only maintain a cksum of the lower level MACs */ if (BP_GET_LEVEL(bp) > 0) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Objset blocks are a special case since they have 2 256-bit MACs * embedded within them. */ if (ot == DMU_OT_OBJSET) { ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); return (zio); } /* unencrypted object types are only authenticated with a MAC */ if (!DMU_OT_IS_ENCRYPTED(ot)) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Later passes of sync-to-convergence may decide to rewrite data * in place to avoid more disk reallocations. This presents a problem * for encryption because this constitutes rewriting the new data with * the same encryption key and IV. However, this only applies to blocks * in the MOS (particularly the spacemaps) and we do not encrypt the * MOS. We assert that the zio is allocating or an intent log write * to enforce this. */ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); ASSERT3U(psize, !=, 0); enc_buf = zio_buf_alloc(psize); eabd = abd_get_from_buf(enc_buf, psize); abd_take_ownership_of_buf(eabd, B_TRUE); /* * For an explanation of what encryption parameters are stored * where, see the block comment in zio_crypt.c.
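* In short: ZIL blocks had their salt and IV generated and encoded into the bp at allocation time (see zio_alloc_zil() above), so they are decoded here; for other blocks spa_do_crypt_abd() fills them in and they are encoded into the bp below.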
*/ if (ot == DMU_OT_INTENT_LOG) { zio_crypt_decode_params_bp(bp, salt, iv); } else { BP_SET_CRYPT(bp, B_TRUE); } /* Perform the encryption. This should not fail */ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); /* encode encryption metadata into the bp */ if (ot == DMU_OT_INTENT_LOG) { /* * ZIL blocks store the MAC in the embedded checksum, so the * transform must always be applied. */ zio_crypt_encode_mac_zil(enc_buf, mac); zio_push_transform(zio, eabd, psize, psize, NULL); } else { BP_SET_CRYPT(bp, B_TRUE); zio_crypt_encode_params_bp(bp, salt, iv); zio_crypt_encode_mac_bp(bp, mac); if (no_crypt) { ASSERT3U(ot, ==, DMU_OT_DNODE); abd_free(eabd); } else { zio_push_transform(zio, eabd, psize, psize, NULL); } } return (zio); } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (zio); } static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, NULL, &info); } } return (zio); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * ========================================================================== * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ?
e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ metaslab_class_throttle_unreserve( zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio); zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (zio); } /* * Update the allocation throttle accounting. */ static void zio_dva_throttle_done(zio_t *zio) { ASSERTV(zio_t *lio = zio->io_logical); zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT(vd != NULL); ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); /* * Parents of gang children can have two flavors -- ones that * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) * and ones that allocated the constituent blocks. The allocation * throttle needs to know the allocating parent zio so we must find * it here. */ if (pio->io_child_type == ZIO_CHILD_GANG) { /* * If our parent is a rewrite gang child then our grandparent * would have been the one that performed the allocation. 
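* For example, in the tree gio -> zio_rewrite(gang header) -> vdev child, the header writer carries ZIO_FLAG_IO_REWRITE, so we step up one more level to reach the zio that actually holds the reservation.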
*/ if (pio->io_flags & ZIO_FLAG_IO_REWRITE) pio = zio_unique_parent(pio); flags |= METASLAB_GANG_CHILD; } ASSERT(IO_IS_ALLOCATING(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); ASSERT(zio->io_metaslab_class != NULL); mutex_enter(&pio->io_lock); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } static zio_t * zio_done(zio_t *zio) { /* * Always attempt to keep stack usage minimal here since * we can be called recursively up to 19 levels deep. */ const uint64_t psize = zio->io_size; zio_t *pio, *pio_next; zio_link_t *zl = NULL; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { return (NULL); } /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { ASSERT(zio->io_metaslab_class != NULL); ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } /* * If the allocation throttle is enabled, verify that * we have decremented the refcounts for every I/O that was throttled. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); VERIFY(zfs_refcount_not_held( &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator], zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data.
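* The finish callback expects data padded out to the report's alignment, hence the P2ROUNDUP() copy below: e.g. a psize of 5120 with zcr_align == 4096 yields an 8192-byte buffer whose tail is zero-filled.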
*/ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); abd_t *adata = zio->io_abd; if (asize != psize) { adata = abd_alloc(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, adata); zfs_ereport_free_checksum(zcr); if (asize != psize) abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev and is slow, exceeding * 30 seconds to complete, post an error describing the I/O delay. * We ignore these errors if the device is currently unavailable. */ - if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) { - if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) - zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, - zio->io_vd, &zio->io_bookmark, zio, 0, 0); + if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { + if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { + /* + * We want to only increment our slow IO counters if + * the IO is valid (i.e. not if the drive is removed). + * + * zfs_ereport_post() will also do these checks, but + * it can also ratelimit and have other failures, so we + * need to increment the slow_io counters independent + * of it. + */ + if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, + zio->io_spa, zio->io_vd, zio)) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_slow_ios++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + + zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + zio->io_spa, zio->io_vd, &zio->io_bookmark, + zio, 0, 0); + } + } } if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark); zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0); } } if (zio->io_error && zio == zio->io_logical) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). */ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums.
It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); /* * This is a rare code path, so we don't * bother with "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; /* * This is a rare code path, so we don't bother with * "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); } return (NULL); } ASSERT(zio->io_child_count == 0); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. 
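* Unlike the success path earlier in this function, these reports are finished with a NULL buffer: at this point no known-good copy of the data remains to attach to the report.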
*/ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * We are done executing this zio. We may want to execute a parent * next. See the comment in zio_notify_parent(). */ zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (next_to_execute); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_write_bp_init, zio_free_bp_init, zio_issue_async, zio_write_compress, zio_encrypt, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). * zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. 
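* Worked example (sizes assumed for illustration): with 128 KiB data blocks, dbss == 256 sectors, and with 512-byte dnodes each data block holds 256 dnodes. A meta-dnode bookmark <object 0, level 0, blkid 1> then canonicalizes to zb1obj = 1 * 256 = 256, zb1L0 = 0 and zb1level = 0 + 2^31, so it sorts after every bookmark in lower-numbered objects and before every bookmark in object 256 itself.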
*/ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level but different blkids, which requires the block sizes to * differ. There is presently no way to change the indirect block sizes. */ return (0); } /* * This function answers the following question: given that last_block is * the place our traversal stopped last time, does that guarantee that we * have visited every node under subtree_root? We can't just use the raw * output of zbookmark_compare to decide this. Instead we pass in a * modified version of subtree_root: by incrementing its block id and then * checking whether last_block is before or equal to that, we can tell * whether having visited last_block implies that all of subtree_root's * children have been visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT(last_block->zb_level == 0); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to verify that this code still works afterwards.
*/ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } #if defined(_KERNEL) EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); -module_param(zio_delay_max, int, 0644); -MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event"); +module_param(zio_slow_io_ms, int, 0644); +MODULE_PARM_DESC(zio_slow_io_ms, + "Max I/O completion time (milliseconds) before marking it as slow"); module_param(zio_requeue_io_start_cut_in_line, int, 0644); MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O"); module_param(zfs_sync_pass_deferred_free, int, 0644); MODULE_PARM_DESC(zfs_sync_pass_deferred_free, "Defer frees starting in this pass"); module_param(zfs_sync_pass_dont_compress, int, 0644); MODULE_PARM_DESC(zfs_sync_pass_dont_compress, "Don't compress starting in this pass"); module_param(zfs_sync_pass_rewrite, int, 0644); MODULE_PARM_DESC(zfs_sync_pass_rewrite, "Rewrite new bps starting in this pass"); module_param(zio_dva_throttle_enabled, int, 0644); MODULE_PARM_DESC(zio_dva_throttle_enabled, "Throttle block allocations in the ZIO pipeline"); #endif diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 4f556acde3b7..e52ab9078beb 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -1,880 +1,880 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. 
# [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 600 post_user = root post = cleanup outputdir = /var/tmp/test_results tags = ['functional'] [tests/functional/acl/posix] tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos'] tags = ['functional', 'acl', 'posix'] [tests/functional/alloc_class] tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', 'alloc_class_013_pos'] tags = ['functional', 'alloc_class'] [tests/functional/arc] tests = ['dbufstats_001_pos', 'dbufstats_002_pos'] tags = ['functional', 'arc'] [tests/functional/atime] tests = ['atime_001_pos', 'atime_002_neg', 'atime_003_pos'] tags = ['functional', 'atime'] [tests/functional/bootfs] tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', 'bootfs_008_pos'] tags = ['functional', 'bootfs'] [tests/functional/cache] tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', 'cache_009_pos', 'cache_010_neg', 'cache_011_pos'] tags = ['functional', 'cache'] [tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', 'cachefile_004_pos'] tags = ['functional', 'cachefile'] [tests/functional/casenorm] tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure', 'sensitive_none_lookup', 'sensitive_none_delete', 'sensitive_formd_lookup', 'sensitive_formd_delete', 'insensitive_none_lookup', 'insensitive_none_delete', 'insensitive_formd_lookup', 'insensitive_formd_delete', 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete', 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete'] tags = ['functional', 'casenorm'] [tests/functional/channel_program/lua_core] tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', 'tst.return_recursive_table', 'tst.timeout'] tags = ['functional', 'channel_program', 'lua_core'] [tests/functional/channel_program/synctask_core] tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', 'tst.get_userquota', 'tst.get_written', 'tst.list_children', 'tst.list_clones', 'tst.list_snapshots', 'tst.list_system_props', 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', 'tst.rollback_one', 'tst.snapshot_destroy', 'tst.snapshot_neg', 'tst.snapshot_recursive', 'tst.snapshot_simple'] tags = ['functional', 'channel_program', 'synctask_core'] [tests/functional/chattr] tests = ['chattr_001_pos', 'chattr_002_neg'] tags = ['functional', 'chattr'] [tests/functional/checksum] tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'filetest_001_pos'] tags = ['functional', 'checksum'] [tests/functional/clean_mirror] tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos', 
'clean_mirror_003_pos', 'clean_mirror_004_pos'] tags = ['functional', 'clean_mirror'] [tests/functional/cli_root/zdb] tests = ['zdb_001_neg', 'zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_006_pos'] pre = post = tags = ['functional', 'cli_root', 'zdb'] [tests/functional/cli_root/zfs] tests = ['zfs_001_neg', 'zfs_002_pos', 'zfs_003_neg'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_bookmark] tests = ['zfs_bookmark_cliargs'] tags = ['functional', 'cli_root', 'zfs_bookmark'] [tests/functional/cli_root/zfs_change-key] tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', 'zfs_change-key_pbkdf2iters'] tags = ['functional', 'cli_root', 'zfs_change-key'] [tests/functional/cli_root/zfs_clone] tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested'] tags = ['functional', 'cli_root', 'zfs_clone'] [tests/functional/cli_root/zfs_copies] tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos', 'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos'] tags = ['functional', 'cli_root', 'zfs_copies'] [tests/functional/cli_root/zfs_create] tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', 'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg', 'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos', 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', 'zfs_create_crypt_combos'] tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zfs_destroy] tests = ['zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', 'zfs_destroy_016_pos'] tags = ['functional', 'cli_root', 'zfs_destroy'] [tests/functional/cli_root/zfs_diff] tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp', 'zfs_diff_types', 'zfs_diff_encrypted'] tags = ['functional', 'cli_root', 'zfs_diff'] [tests/functional/cli_root/zfs_get] tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos', 'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg', 'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg'] tags = ['functional', 'cli_root', 'zfs_get'] [tests/functional/cli_root/zfs_inherit] tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos', 'zfs_inherit_mountpoint'] tags = ['functional', 'cli_root', 'zfs_inherit'] [tests/functional/cli_root/zfs_load-key] tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', 'zfs_load-key_location', 'zfs_load-key_noop', 'zfs_load-key_recursive'] tags = ['functional', 'cli_root', 'zfs_load-key'] [tests/functional/cli_root/zfs_mount] tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_006_pos', 'zfs_mount_007_pos', 'zfs_mount_008_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_neg', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 
'zfs_mount_remount', 'zfs_multi_mount'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] tests = ['zfs_program_json'] tags = ['functional', 'cli_root', 'zfs_program'] [tests/functional/cli_root/zfs_promote] tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] tags = ['functional', 'cli_root', 'zfs_promote'] [tests/functional/cli_root/zfs_property] tests = ['zfs_written_property_001_pos'] tags = ['functional', 'cli_root', 'zfs_property'] [tests/functional/cli_root/zfs_receive] tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', 'receive-o-x_props_override', 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', 'zfs_receive_raw', 'zfs_receive_raw_incremental'] tags = ['functional', 'cli_root', 'zfs_receive'] [tests/functional/cli_root/zfs_remap] tests = ['zfs_remap_cliargs', 'zfs_remap_obsolete_counts'] tags = ['functional', 'cli_root', 'zfs_remap'] [tests/functional/cli_root/zfs_rename] tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', 'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child', 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint'] tags = ['functional', 'cli_root', 'zfs_rename'] [tests/functional/cli_root/zfs_reservation] tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rollback] tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos', 'zfs_rollback_003_neg', 'zfs_rollback_004_neg'] tags = ['functional', 'cli_root', 'zfs_rollback'] [tests/functional/cli_root/zfs_send] tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos', 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b'] tags = ['functional', 'cli_root', 'zfs_send'] [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', 'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos', 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', 'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', 'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation'] tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_share] tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', 'zfs_share_004_pos', 'zfs_share_005_pos', 'zfs_share_006_pos', 'zfs_share_007_neg', 'zfs_share_008_neg', 'zfs_share_009_neg', 'zfs_share_010_neg', 'zfs_share_011_pos'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_snapshot] tests = 
['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', 'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg', 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg', 'zfs_snapshot_009_pos'] tags = ['functional', 'cli_root', 'zfs_snapshot'] [tests/functional/cli_root/zfs_sysfs] tests = ['zfeature_set_unsupported', 'zfs_get_unsupported', 'zfs_set_unsupported', 'zfs_sysfs_live', 'zpool_get_unsupported', 'zpool_set_unsupported'] tags = ['functional', 'cli_root', 'zfs_sysfs'] [tests/functional/cli_root/zfs_unload-key] tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] tags = ['functional', 'cli_root', 'zfs_unload-key'] [tests/functional/cli_root/zfs_unmount] tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', 'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos', 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', 'zfs_unmount_all_001_pos', 'zfs_unmount_nested'] tags = ['functional', 'cli_root', 'zfs_unmount'] [tests/functional/cli_root/zfs_unshare] tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', 'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos', 'zfs_unshare_007_pos'] tags = ['functional', 'cli_root', 'zfs_unshare'] [tests/functional/cli_root/zfs_upgrade] tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', 'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg', 'zfs_upgrade_007_neg'] tags = ['functional', 'cli_root', 'zfs_upgrade'] [tests/functional/cli_root/zpool] tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos'] tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_005_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', 'add-o_ashift', 'add_prop_ashift', 'add_nested_replacing_spare'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] tests = ['zpool_attach_001_neg', 'attach-o_ashift'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg', 'zpool_clear_readonly'] tags = ['functional', 'cli_root', 'zpool_clear'] [tests/functional/cli_root/zpool_create] tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos', 'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', 'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg', 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', 'zpool_create_016_pos', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', 'zpool_create_024_pos', 'zpool_create_encrypted', 'zpool_create_crypt_combos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos', 'create-o_ashift', 'zpool_create_tempname'] tags = ['functional', 'cli_root', 'zpool_create'] [tests/functional/cli_root/zpool_destroy] tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', 'zpool_destroy_003_neg'] pre = post = tags = ['functional', 'cli_root', 'zpool_destroy'] [tests/functional/cli_root/zpool_detach] tests =
['zpool_detach_001_neg'] tags = ['functional', 'cli_root', 'zpool_detach'] [tests/functional/cli_root/zpool_events] tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', 'zpool_events_poolname'] tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_expand] tests = ['zpool_expand_001_pos', 'zpool_expand_002_pos', 'zpool_expand_003_neg', 'zpool_expand_004_pos', 'zpool_expand_005_pos'] tags = ['functional', 'cli_root', 'zpool_expand'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg', 'zpool_export_004_pos'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', 'zpool_get_004_neg'] tags = ['functional', 'cli_root', 'zpool_get'] [tests/functional/cli_root/zpool_history] tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] tags = ['functional', 'cli_root', 'zpool_history'] [tests/functional/cli_root/zpool_import] tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos', 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', 'zpool_import_015_pos', 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', 'zpool_import_errata3', 'import_cachefile_device_added', 'import_cachefile_device_removed', 'import_cachefile_device_replaced', 'import_cachefile_mirror_attached', 'import_cachefile_mirror_detached', 'import_cachefile_shared_device', 'import_devices_missing', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced'] tags = ['functional', 'cli_root', 'zpool_import'] [tests/functional/cli_root/zpool_labelclear] tests = ['zpool_labelclear_active', 'zpool_labelclear_exported'] pre = post = tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_offline] tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg', 'zpool_offline_003_pos'] tags = ['functional', 'cli_root', 'zpool_offline'] [tests/functional/cli_root/zpool_online] tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tags = ['functional', 'cli_root', 'zpool_online'] [tests/functional/cli_root/zpool_remove] tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', 'zpool_remove_003_pos'] tags = ['functional', 'cli_root', 'zpool_remove'] [tests/functional/cli_root/zpool_reopen] tests = ['zpool_reopen_001_pos', 'zpool_reopen_002_pos', 'zpool_reopen_003_pos', 'zpool_reopen_004_pos', 'zpool_reopen_005_pos', 'zpool_reopen_006_neg', 'zpool_reopen_007_pos'] tags = ['functional', 'cli_root', 'zpool_reopen'] [tests/functional/cli_root/zpool_replace] tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 
'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', 'zpool_scrub_offline_device'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_resilver'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos','zpool_status_003_pos', 'zpool_status_-c_disable', 'zpool_status_-c_homedir', 'zpool_status_-c_searchpath'] user = tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg'] tags = ['functional', 'cli_root', 'zpool_sync'] [tests/functional/cli_root/zpool_upgrade] tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', 'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos', 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', 'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos', 'zpool_upgrade_009_neg'] tags = ['functional', 'cli_root', 'zpool_upgrade'] [tests/functional/cli_user/misc] tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', 'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', 'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg', 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', 'arc_summary_001_pos', 'arc_summary_002_neg', 'arc_summary3_001_pos', 'dbufstat_001_pos'] user = tags = ['functional', 'cli_user', 'misc'] [tests/functional/cli_user/zfs_list] tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', 'zfs_list_004_neg', 'zfs_list_007_pos', 'zfs_list_008_neg'] user = tags = ['functional', 'cli_user', 'zfs_list'] [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', 'zpool_iostat_005_pos', 'zpool_iostat_-c_disable', 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_iostat'] [tests/functional/cli_user/zpool_list] tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] user = tags = ['functional', 'cli_user', 'zpool_list'] [tests/functional/compression] tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', 'compress_004_pos'] tags = ['functional', 'compression'] [tests/functional/cp_files] tests = ['cp_files_001_pos'] tags = ['functional', 'cp_files'] [tests/functional/ctime] tests = ['ctime_001_pos' ] tags = ['functional', 'ctime'] [tests/functional/deadman] tests = ['deadman_sync', 'deadman_zio'] pre = post = tags = 
['functional', 'deadman'] [tests/functional/delegate] tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', 'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg', 'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg', 'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos', 'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos', 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] tags = ['functional', 'delegate'] [tests/functional/devices] tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] tags = ['functional', 'devices'] [tests/functional/events] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter'] tags = ['functional', 'events'] [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] [tests/functional/fault] tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple', 'auto_spare_shared', 'scrub_after_resilver', 'decrypt_fault', - 'decompress_fault'] + 'decompress_fault', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] tags = ['functional', 'features', 'async_destroy'] [tests/functional/features/large_dnode] tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos', 'large_dnode_007_neg', 'large_dnode_008_pos', 'large_dnode_009_pos'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/grow] pre = post = tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] tags = ['functional', 'grow'] [tests/functional/history] tests = ['history_001_pos', 'history_002_pos', 'history_003_pos', 'history_004_pos', 'history_005_neg', 'history_006_neg', 'history_007_pos', 'history_008_pos', 'history_009_pos', 'history_010_pos'] tags = ['functional', 'history'] [tests/functional/hkdf] tests = ['run_hkdf_test'] tags = ['functional', 'hkdf'] [tests/functional/inheritance] tests = ['inherit_001_pos'] pre = tags = ['functional', 'inheritance'] [tests/functional/io] tests = ['sync', 'psync', 'libaio', 'posixaio', 'mmap'] tags = ['functional', 'io'] [tests/functional/inuse] tests = ['inuse_001_pos', 'inuse_003_pos', 'inuse_004_pos', 'inuse_005_pos', 'inuse_006_pos', 'inuse_007_pos', 'inuse_008_pos', 'inuse_009_pos'] post = tags = ['functional', 'inuse'] [tests/functional/large_files] tests = ['large_files_001_pos', 'large_files_002_pos'] tags = ['functional', 'large_files'] [tests/functional/largest_pool] tests = ['largest_pool_001_pos'] pre = post = tags = ['functional', 'largest_pool'] [tests/functional/link_count] tests = ['link_count_001'] tags = ['functional', 'link_count'] [tests/functional/migration] tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] tags = ['functional', 'migration'] [tests/functional/mmap] tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_libaio_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mmp] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', 'mmp_on_zdb'] tags =
['functional', 'mmp'] [tests/functional/mount] tests = ['umount_001', 'umountall_001'] tags = ['functional', 'mount'] [tests/functional/mv_files] tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation'] tags = ['functional', 'mv_files'] [tests/functional/nestedfs] tests = ['nestedfs_001_pos'] tags = ['functional', 'nestedfs'] [tests/functional/no_space] tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos'] tags = ['functional', 'no_space'] [tests/functional/nopwrite] tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative', 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync', 'nopwrite_varying_compression', 'nopwrite_volume'] tags = ['functional', 'nopwrite'] [tests/functional/online_offline] tests = ['online_offline_001_pos', 'online_offline_002_neg', 'online_offline_003_neg'] tags = ['functional', 'online_offline'] [tests/functional/pool_checkpoint] tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind', 'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard', 'checkpoint_discard_busy', 'checkpoint_discard_many', 'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz', 'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind', 'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice', 'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat'] tags = ['functional', 'pool_checkpoint'] timeout = 1800 [tests/functional/pool_names] tests = ['pool_names_001_pos', 'pool_names_002_neg'] pre = post = tags = ['functional', 'pool_names'] [tests/functional/poolversion] tests = ['poolversion_001_pos', 'poolversion_002_pos'] tags = ['functional', 'poolversion'] [tests/functional/privilege] tests = ['privilege_001_pos', 'privilege_002_pos'] tags = ['functional', 'privilege'] [tests/functional/procfs] tests = ['procfs_list_basic', 'procfs_list_concurrent_readers', 'procfs_list_stale_read', 'pool_state'] tags = ['functional', 'procfs'] [tests/functional/projectquota] tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', 'projectquota_001_pos', 'projectquota_002_pos', 'projectquota_003_pos', 'projectquota_004_neg', 'projectquota_005_pos', 'projectquota_006_pos', 'projectquota_007_pos', 'projectquota_008_pos', 'projectquota_009_pos', 'projectspace_001_pos', 'projectspace_002_pos', 'projectspace_003_pos', 'projectspace_004_pos', 'projecttree_001_pos', 'projecttree_002_pos', 'projecttree_003_neg' ] tags = ['functional', 'projectquota'] [tests/functional/pyzfs] tests = ['pyzfs_unittest'] pre = post = tags = ['functional', 'pyzfs'] [tests/functional/quota] tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] tags = ['functional', 'quota'] [tests/functional/raidz] tests = ['raidz_001_neg', 'raidz_002_pos'] tags = ['functional', 'raidz'] [tests/functional/redundancy] tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos', 'redundancy_004_neg'] tags = ['functional', 'redundancy'] [tests/functional/refquota] tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg'] tags = ['functional', 'refquota'] [tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', 'refreserv_004_pos', 'refreserv_005_pos'] tags = ['functional', 'refreserv'] [tests/functional/removal] pre = tests = ['removal_all_vdev', 'removal_check_space', 'removal_condense_export', 'removal_multiple_indirection', 'removal_remap', 'removal_remap_deadlists', 
'removal_resume_export', 'removal_sanity', 'removal_with_add', 'removal_with_create_fs', 'removal_with_dedup', 'removal_with_export', 'removal_with_ganging', 'removal_with_remap', 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', 'removal_with_send_recv', 'removal_with_snapshot', 'removal_with_write', 'removal_with_zdb', 'remove_expanded', 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz'] tags = ['functional', 'removal'] [tests/functional/rename_dirs] tests = ['rename_dirs_001_pos'] tags = ['functional', 'rename_dirs'] [tests/functional/replacement] tests = ['replacement_001_pos', 'replacement_002_pos', 'replacement_003_pos'] tags = ['functional', 'replacement'] [tests/functional/reservation] tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', 'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos', 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', 'reservation_022_pos'] tags = ['functional', 'reservation'] [tests/functional/rootpool] tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos'] tags = ['functional', 'rootpool'] [tests/functional/rsend] tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos', 'rsend_014_pos', 'rsend_019_pos', 'rsend_020_pos', 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props', 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump', 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD', 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', 'send-c_recv_dedup', 'send_encrypted_files', 'send_encrypted_heirarchy', 'send_encrypted_props', 'send_freeobjects', 'send_realloc_dnode_size', 'send_hole_birth', 'send-wDR_encrypted_zvol'] tags = ['functional', 'rsend'] [tests/functional/scrub_mirror] tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos', 'scrub_mirror_003_pos', 'scrub_mirror_004_pos'] tags = ['functional', 'scrub_mirror'] [tests/functional/slog] tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', 'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg', 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs', 'slog_replay_volume'] tags = ['functional', 'slog'] [tests/functional/snapshot] tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', 'snapshot_015_pos', 'snapshot_016_pos', 'snapshot_017_pos'] tags = ['functional', 'snapshot'] [tests/functional/snapused] tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos', 'snapused_004_pos', 'snapused_005_pos'] tags = ['functional', 'snapused'] [tests/functional/sparse] tests = ['sparse_001_pos'] tags = ['functional', 
'sparse'] [tests/functional/threadsappend] tests = ['threadsappend_001_pos'] tags = ['functional', 'threadsappend'] [tests/functional/tmpfile] tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos'] tags = ['functional', 'tmpfile'] [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] tags = ['functional', 'truncate'] [tests/functional/upgrade] tests = ['upgrade_userobj_001_pos', 'upgrade_projectquota_001_pos'] tags = ['functional', 'upgrade'] [tests/functional/user_namespace] tests = ['user_namespace_001'] tags = ['functional', 'user_namespace'] [tests/functional/userquota] tests = [ 'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos', 'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos', 'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos', 'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg', 'userquota_013_pos', 'userspace_001_pos', 'userspace_002_pos', 'userspace_003_pos', 'groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos' ] tags = ['functional', 'userquota'] [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', 'vdev_zaps_007_pos'] tags = ['functional', 'vdev_zaps'] [tests/functional/write_dirs] tests = ['write_dirs_001_pos', 'write_dirs_002_pos'] tags = ['functional', 'write_dirs'] [tests/functional/xattr] tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', 'xattr_008_pos', 'xattr_009_neg', 'xattr_010_neg', 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos'] tags = ['functional', 'xattr'] [tests/functional/zvol/zvol_ENOSPC] tests = ['zvol_ENOSPC_001_pos'] tags = ['functional', 'zvol', 'zvol_ENOSPC'] [tests/functional/zvol/zvol_cli] tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] tags = ['functional', 'zvol', 'zvol_cli'] [tests/functional/zvol/zvol_misc] tests = ['zvol_misc_001_neg', 'zvol_misc_002_pos', 'zvol_misc_003_neg', 'zvol_misc_004_pos', 'zvol_misc_005_neg', 'zvol_misc_006_pos', 'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil'] tags = ['functional', 'zvol', 'zvol_misc'] [tests/functional/zvol/zvol_swap] tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_003_pos', 'zvol_swap_004_pos', 'zvol_swap_005_pos', 'zvol_swap_006_pos'] tags = ['functional', 'zvol', 'zvol_swap'] [tests/functional/libzfs] tests = ['many_fds', 'libzfs_input'] tags = ['functional', 'libzfs'] diff --git a/tests/zfs-tests/tests/functional/fault/Makefile.am b/tests/zfs-tests/tests/functional/fault/Makefile.am index ec07db57d3de..5c68ea26f14a 100644 --- a/tests/zfs-tests/tests/functional/fault/Makefile.am +++ b/tests/zfs-tests/tests/functional/fault/Makefile.am @@ -1,17 +1,18 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/fault dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ auto_online_001_pos.ksh \ auto_replace_001_pos.ksh \ auto_spare_001_pos.ksh \ auto_spare_002_pos.ksh \ auto_spare_ashift.ksh \ auto_spare_multiple.ksh \ auto_spare_shared.ksh \ decrypt_fault.ksh \ decompress_fault.ksh \ - scrub_after_resilver.ksh + scrub_after_resilver.ksh \ + zpool_status_-s.ksh dist_pkgdata_DATA = \ fault.cfg diff --git a/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh b/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh new file mode 100755 index 000000000000..b6a3e71fdfaf --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh @@ 
-0,0 +1,77 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# + +# DESCRIPTION: +# Verify zpool status -s (slow IOs) works +# +# STRATEGY: +# 1. Create a file +# 2. Inject slow IOs into the pool +# 3. Verify we can see the slow IOs with "zpool status -s". +# 4. Verify we can see delay events. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/zpool_script.shlib + +DISK=${DISKS%% *} + +verify_runnable "both" + +log_must zpool create $TESTPOOL mirror ${DISKS} + +function cleanup +{ + log_must zinject -c all + log_must set_tunable64 zio_slow_io_ms $OLD_SLOW_IO + log_must set_tunable64 zfs_slow_io_events_per_second $OLD_SLOW_IO_EVENTS + log_must destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool events -c + +# Mark any IOs greater than 10ms as slow IOs +OLD_SLOW_IO=$(get_tunable zio_slow_io_ms) +OLD_SLOW_IO_EVENTS=$(get_tunable zfs_slow_io_events_per_second) +log_must set_tunable64 zio_slow_io_ms 10 +log_must set_tunable64 zfs_slow_io_events_per_second 1000 + +# Create 20ms IOs +log_must zinject -d $DISK -D20:100 $TESTPOOL +log_must mkfile 1048576 /$TESTPOOL/testfile +log_must zpool sync $TESTPOOL + +log_must zinject -c all +SLOW_IOS=$(zpool status -sp | grep "$DISK" | awk '{print $6}') +DELAY_EVENTS=$(zpool events | grep delay | wc -l) + +if [ $SLOW_IOS -gt 0 ] && [ $DELAY_EVENTS -gt 0 ] ; then + log_pass "Correctly saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events" +else + log_fail "Only saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events" +fi diff --git a/tests/zfs-tests/tests/perf/perf.shlib b/tests/zfs-tests/tests/perf/perf.shlib index 3802c6aec70f..7165df759b1f 100644 --- a/tests/zfs-tests/tests/perf/perf.shlib +++ b/tests/zfs-tests/tests/perf/perf.shlib @@ -1,492 +1,492 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2015, 2016 by Delphix. All rights reserved. # Copyright (c) 2016, Intel Corporation. # . $STF_SUITE/include/libtest.shlib # If neither is specified, do a nightly run. [[ -z $PERF_REGRESSION_WEEKLY ]] && export PERF_REGRESSION_NIGHTLY=1 # Default runtime for each type of test run. 
export PERF_RUNTIME_WEEKLY=$((30 * 60)) export PERF_RUNTIME_NIGHTLY=$((10 * 60)) # Default fs creation options export PERF_FS_OPTS=${PERF_FS_OPTS:-'-o recsize=8k -o compress=lz4' \ ' -o checksum=sha256 -o redundant_metadata=most'} function get_sync_str { typeset sync=$1 typeset sync_str='' [[ $sync -eq 0 ]] && sync_str='async' [[ $sync -eq 1 ]] && sync_str='sync' echo $sync_str } function get_suffix { typeset threads=$1 typeset sync=$2 typeset iosize=$3 typeset sync_str=$(get_sync_str $sync) typeset filesystems=$(get_nfilesystems) typeset suffix="$sync_str.$iosize-ios" suffix="$suffix.$threads-threads.$filesystems-filesystems" echo $suffix } function do_fio_run_impl { typeset script=$1 typeset do_recreate=$2 typeset clear_cache=$3 typeset threads=$4 typeset threads_per_fs=$5 typeset sync=$6 typeset iosize=$7 typeset sync_str=$(get_sync_str $sync) log_note "Running with $threads $sync_str threads, $iosize ios" if [[ -n $threads_per_fs && $threads_per_fs -ne 0 ]]; then log_must test $do_recreate verify_threads_per_fs $threads $threads_per_fs fi if $do_recreate; then recreate_perf_pool # # A value of zero for "threads_per_fs" is "special", and # means a single filesystem should be used, regardless # of the number of threads. # if [[ -n $threads_per_fs && $threads_per_fs -ne 0 ]]; then populate_perf_filesystems $((threads / threads_per_fs)) else populate_perf_filesystems 1 fi fi if $clear_cache; then # Clear the ARC zpool export $PERFPOOL zpool import $PERFPOOL fi if [[ -n $ZINJECT_DELAYS ]]; then apply_zinject_delays else log_note "No per-device commands to execute." fi # # Allow this to be overridden by the individual test case. This # can be used to run the FIO job against something other than # the default filesystem (e.g. against a clone). # export DIRECTORY=$(get_directory) log_note "DIRECTORY: " $DIRECTORY export RUNTIME=$PERF_RUNTIME export FILESIZE=$((TOTAL_SIZE / threads)) export NUMJOBS=$threads export SYNC_TYPE=$sync export BLOCKSIZE=$iosize sync # This will be part of the output filename. typeset suffix=$(get_suffix $threads $sync $iosize) # Start the data collection do_collect_scripts $suffix # Define output file typeset logbase="$(get_perf_output_dir)/$(basename \ $SUDO_COMMAND)" typeset outfile="$logbase.fio.$suffix" # Start the load log_must fio --output $outfile $FIO_SCRIPTS/$script } # # This function will run fio in a loop, according to the .fio file passed # in and a number of environment variables. The following variables can be # set before launching zfstest to override the defaults. # # PERF_RUNTIME: The time in seconds each fio invocation should run. # PERF_RUNTYPE: A human readable tag that appears in logs. The defaults are # nightly and weekly. # PERF_NTHREADS: A list of how many threads each fio invocation will use. # PERF_SYNC_TYPES: Whether to use (O_SYNC) or not. 1 is sync IO, 0 is async IO. # PERF_IOSIZES: A list of blocksizes in which each fio invocation will do IO. # PERF_COLLECT_SCRIPTS: A comma delimited list of 'command args, logfile_tag' # pairs that will be added to the scripts specified in each test. 
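#
# As an illustrative sketch (the values below, and the use of the
# zfs-tests.sh -r runfile option, are assumptions for this example rather
# than project defaults), a short run could override these knobs like so
# before launching the suite:
#
#   PERF_RUNTIME=300 PERF_NTHREADS="8 16" PERF_NTHREADS_PER_FS="0" \
#       PERF_SYNC_TYPES="0 1" PERF_IOSIZES="8k 64k" \
#       PERF_COLLECT_SCRIPTS="vmstat 1, vmstat" \
#       ./scripts/zfs-tests.sh -r perf-regression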
#
function do_fio_run
{
	typeset script=$1
	typeset do_recreate=$2
	typeset clear_cache=$3
	typeset threads threads_per_fs sync iosize

	for threads in $PERF_NTHREADS; do
		for threads_per_fs in $PERF_NTHREADS_PER_FS; do
			for sync in $PERF_SYNC_TYPES; do
				for iosize in $PERF_IOSIZES; do
					do_fio_run_impl \
					    $script \
					    $do_recreate \
					    $clear_cache \
					    $threads \
					    $threads_per_fs \
					    $sync \
					    $iosize
				done
			done
		done
	done
}

#
# This function iterates through the value pairs in $PERF_COLLECT_SCRIPTS.
# The script at index N is launched in the background, with its output
# redirected to a logfile containing the tag specified at index N + 1.
#
function do_collect_scripts
{
	typeset suffix=$1

	[[ -n $collect_scripts ]] || log_fail "No data collection scripts."
	[[ -n $PERF_RUNTIME ]] || log_fail "No runtime specified."

	# Add in user supplied scripts and logfiles, if any.
	typeset oIFS=$IFS
	IFS=','
	for item in $PERF_COLLECT_SCRIPTS; do
		collect_scripts+=($(echo $item | sed 's/^ *//g'))
	done
	IFS=$oIFS

	typeset idx=0
	while [[ $idx -lt "${#collect_scripts[@]}" ]]; do
		typeset logbase="$(get_perf_output_dir)/$(basename \
		    $SUDO_COMMAND)"
		typeset outfile="$logbase.${collect_scripts[$idx + 1]}.$suffix"

		timeout $PERF_RUNTIME ${collect_scripts[$idx]} >$outfile 2>&1 &
		((idx += 2))
	done

	# Need to explicitly return 0 because timeout(1) will kill
	# a child process and cause us to return non-zero.
	return 0
}

# Find a place to deposit performance data collected while under load.
function get_perf_output_dir
{
	typeset dir="$(pwd)/perf_data"
	[[ -d $dir ]] || mkdir -p $dir

	echo $dir
}

function apply_zinject_delays
{
	typeset idx=0
	while [[ $idx -lt "${#ZINJECT_DELAYS[@]}" ]]; do
		[[ -n ${ZINJECT_DELAYS[$idx]} ]] || \
		    log_fail "No zinject delay found at index: $idx"

		for disk in $DISKS; do
			log_must zinject \
			    -d $disk -D ${ZINJECT_DELAYS[$idx]} $PERFPOOL
		done

		((idx += 1))
	done
}

function clear_zinject_delays
{
	log_must zinject -c all
}

#
# Destroy and create the pool used for performance tests.
#
function recreate_perf_pool
{
	[[ -n $PERFPOOL ]] || log_fail "The \$PERFPOOL variable isn't set."

	#
	# In case there's been some "leaked" zinject delays, or if the
	# performance test injected some delays itself, we clear all
	# delays before attempting to destroy the pool. Each delay
	# places a hold on the pool, so the destroy will fail if there
	# are any outstanding delays.
	#
	clear_zinject_delays

	#
	# This function handles the case where the pool already exists,
	# and will destroy the previous pool and recreate a new pool.
	#
	create_pool $PERFPOOL $DISKS
}

function verify_threads_per_fs
{
	typeset threads=$1
	typeset threads_per_fs=$2

	log_must test -n $threads
	log_must test -n $threads_per_fs

	#
	# A value of "0" is treated as a "special value", and it is
	# interpreted to mean all threads will run using a single
	# filesystem.
	#
	[[ $threads_per_fs -eq 0 ]] && return

	#
	# The number of threads per filesystem must be greater than zero;
	# we verified above that it isn't 0, so it only remains to rule
	# out negative values.
	#
	log_must test $threads_per_fs -ge 0

	#
	# This restriction can be lifted later if needed, but for now,
	# we restrict the number of threads per filesystem to a value
	# that evenly divides the thread count. This way, the threads
	# will be evenly distributed over all the filesystems.
# log_must test $((threads % threads_per_fs)) -eq 0 } function populate_perf_filesystems { typeset nfilesystems=${1:-1} export TESTFS="" for i in $(seq 1 $nfilesystems); do typeset dataset="$PERFPOOL/fs$i" create_dataset $dataset $PERF_FS_OPTS if [[ -z "$TESTFS" ]]; then TESTFS="$dataset" else TESTFS="$TESTFS $dataset" fi done } function get_nfilesystems { typeset filesystems=( $TESTFS ) echo ${#filesystems[@]} } function get_directory { typeset filesystems=( $TESTFS ) typeset directory= typeset idx=0 while [[ $idx -lt "${#filesystems[@]}" ]]; do mountpoint=$(get_prop mountpoint "${filesystems[$idx]}") if [[ -n $directory ]]; then directory=$directory:$mountpoint else directory=$mountpoint fi ((idx += 1)) done echo $directory } function get_max_arc_size { if is_linux; then typeset -l max_arc_size=`awk '$1 == "c_max" { print $3 }' \ /proc/spl/kstat/zfs/arcstats` else typeset -l max_arc_size=$(dtrace -qn 'BEGIN { printf("%u\n", `arc_stats.arcstat_c_max.value.ui64); exit(0); }') fi [[ $? -eq 0 ]] || log_fail "get_max_arc_size failed" echo $max_arc_size } function get_max_dbuf_cache_size { typeset -l max_dbuf_cache_size if is_linux; then max_dbuf_cache_size=$(get_tunable dbuf_cache_max_bytes) else max_dbuf_cache_size=$(dtrace -qn 'BEGIN { printf("%u\n", `dbuf_cache_max_bytes); exit(0); }') [[ $? -eq 0 ]] || log_fail "get_max_dbuf_cache_size failed" fi echo $max_dbuf_cache_size } # Create a file with some information about how this system is configured. function get_system_config { typeset config=$PERF_DATA_DIR/$1 echo "{" >>$config if is_linux; then echo " \"ncpus\": \"$(nproc --all)\"," >>$config echo " \"physmem\": \"$(free -b | \ awk '$1 == "Mem:" { print $2 }')\"," >>$config echo " \"c_max\": \"$(get_max_arc_size)\"," >>$config echo " \"hostname\": \"$(uname -n)\"," >>$config echo " \"kernel version\": \"$(uname -sr)\"," >>$config else dtrace -qn 'BEGIN{ printf(" \"ncpus\": %d,\n", `ncpus); printf(" \"physmem\": %u,\n", `physmem * `_pagesize); printf(" \"c_max\": %u,\n", `arc_stats.arcstat_c_max.value.ui64); printf(" \"kmem_flags\": \"0x%x\",", `kmem_flags); exit(0)}' >>$config echo " \"hostname\": \"$(uname -n)\"," >>$config echo " \"kernel version\": \"$(uname -v)\"," >>$config fi if is_linux; then lsblk -dino NAME,SIZE | awk 'BEGIN { printf(" \"disks\": {\n"); first = 1} {disk = $1} {size = $2; if (first != 1) {printf(",\n")} else {first = 0} printf(" \"%s\": \"%s\"", disk, size)} END {printf("\n },\n")}' >>$config zfs_tunables="/sys/module/zfs/parameters" printf " \"tunables\": {\n" >>$config for tunable in \ zfs_arc_max \ zfs_arc_meta_limit \ zfs_arc_sys_free \ zfs_dirty_data_max \ zfs_flags \ zfs_prefetch_disable \ zfs_txg_timeout \ zfs_vdev_aggregation_limit \ zfs_vdev_async_read_max_active \ zfs_vdev_async_write_max_active \ zfs_vdev_sync_read_max_active \ zfs_vdev_sync_write_max_active \ - zio_delay_max + zio_slow_io_ms do if [ "$tunable" != "zfs_arc_max" ] then printf ",\n" >>$config fi printf " \"$tunable\": \"$(<$zfs_tunables/$tunable)\"" \ >>$config done printf "\n }\n" >>$config else iostat -En | awk 'BEGIN { printf(" \"disks\": {\n"); first = 1} /^c/ {disk = $1} /^Size: [^0]/ {size = $2; if (first != 1) {printf(",\n")} else {first = 0} printf(" \"%s\": \"%s\"", disk, size)} END {printf("\n },\n")}' >>$config sed -n 's/^set \(.*\)[ ]=[ ]\(.*\)/\1=\2/p' /etc/system | \ awk -F= 'BEGIN {printf(" \"system\": {\n"); first = 1} {if (first != 1) {printf(",\n")} else {first = 0}; printf(" \"%s\": %s", $1, $2)} END {printf("\n }\n")}' >>$config fi echo "}" >>$config } function 
num_jobs_by_cpu
{
	if is_linux; then
		typeset ncpu=$(nproc --all)
	else
		typeset ncpu=$(psrinfo | wc -l)
	fi

	typeset num_jobs=$ncpu
	[[ $ncpu -gt 8 ]] && num_jobs=$(echo "$ncpu * 3 / 4" | bc)

	echo $num_jobs
}

#
# On illumos this looks like: ":sd3:sd4:sd1:sd2:"
#
function pool_to_lun_list
{
	typeset pool=$1
	typeset ctd ctds devname lun
	typeset lun_list=':'

	if is_linux; then
		ctds=$(zpool list -HLv $pool | \
		    awk '/sd[a-z]*|loop[0-9]*|dm-[0-9]*/ {print $1}')

		for ctd in $ctds; do
			lun_list="$lun_list$ctd:"
		done
	else
		ctds=$(zpool list -v $pool |
		    awk '/c[0-9]*t[0-9a-fA-F]*d[0-9]*/ {print $1}')

		for ctd in $ctds; do
			# Get the device name as it appears in
			# /etc/path_to_inst
			devname=$(readlink -f /dev/dsk/${ctd}s0 | sed -n \
			    's/\/devices\([^:]*\):.*/\1/p')
			# Add a string composed of the driver name and instance
			# number to the list for comparison with dev_statname.
			lun=$(sed 's/"//g' /etc/path_to_inst | grep \
			    $devname | awk '{print $3$2}')
			lun_list="$lun_list$lun:"
		done
	fi
	echo $lun_list
}

# Create a perf_data directory to hold performance statistics and
# configuration information.
export PERF_DATA_DIR=$(get_perf_output_dir)
[[ -f $PERF_DATA_DIR/config.json ]] || get_system_config config.json
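#
# A minimal usage sketch for the zinject helpers above (the 30:1 delay
# value is hypothetical; it follows the same latency:lanes form as the
# "zinject -D20:100" call used by the zpool_status_-s test in this change):
# inject a 30ms delay on every disk in $PERFPOOL, run the workload, then
# clear the delays.
#
#   ZINJECT_DELAYS=(30:1)
#   apply_zinject_delays
#   # ... run the fio workload ...
#   clear_zinject_delays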