Index: vendor/illumos/dist/cmd/zpool/zpool_main.c =================================================================== --- vendor/illumos/dist/cmd/zpool/zpool_main.c (revision 342531) +++ vendor/illumos/dist/cmd/zpool/zpool_main.c (revision 342532) @@ -1,6129 +1,6170 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. * Copyright 2016 Igor Kozhukhov . * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017 Datto Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" #include "statcommon.h" static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); static int zpool_do_add(int, char **); static int zpool_do_remove(int, char **); static int zpool_do_labelclear(int, char **); static int zpool_do_checkpoint(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); static int zpool_do_status(int, char **); static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); static int zpool_do_reopen(int, char **); static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); static int zpool_do_initialize(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_import(int, char **); static int zpool_do_export(int, char **); static int zpool_do_upgrade(int, char **); static int zpool_do_history(int, char **); static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_ADD, HELP_ATTACH, HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, HELP_HISTORY, HELP_IMPORT, HELP_IOSTAT, HELP_LABELCLEAR, HELP_LIST, HELP_OFFLINE, HELP_ONLINE, HELP_REPLACE, HELP_REMOVE, HELP_INITIALIZE, HELP_SCRUB, HELP_STATUS, HELP_UPGRADE, HELP_GET, HELP_SET, HELP_SPLIT, HELP_REGUID, HELP_REOPEN } zpool_help_t; typedef struct zpool_command { const char *name; int (*func)(int, char **); zpool_help_t usage; } zpool_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. */ static zpool_command_t command_table[] = { { "create", zpool_do_create, HELP_CREATE }, { "destroy", zpool_do_destroy, HELP_DESTROY }, { NULL }, { "add", zpool_do_add, HELP_ADD }, { "remove", zpool_do_remove, HELP_REMOVE }, { NULL }, { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, { NULL }, { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, { "status", zpool_do_status, HELP_STATUS }, { NULL }, { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "initialize", zpool_do_initialize, HELP_INITIALIZE }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: return (gettext("\tadd [-fn] ...\n")); case HELP_ATTACH: return (gettext("\tattach [-f] " "\n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-B] " "[-o property=value] ... \n" - "\t [-O file-system-property=value] ... \n" - "\t [-m mountpoint] [-R root] ...\n")); + "\t [-O file-system-property=value] ...\n" + "\t [-m mountpoint] [-R root] [-t tempname] " + " ...\n")); case HELP_CHECKPOINT: return (gettext("\tcheckpoint [--discard] ...\n")); case HELP_DESTROY: return (gettext("\tdestroy [-f] \n")); case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: return (gettext("\texport [-f] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] " "[-R root] [-F [-n]] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] " - "[-R root] [-F [-n]]\n" + "[-R root] [-F [-n]] [-t]\n" "\t [--rewind-to-checkpoint] [newpool]\n")); case HELP_IOSTAT: return (gettext("\tiostat [-v] [-T d|u] [pool] ... [interval " "[count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: return (gettext("\tlist [-Hp] [-o property[,...]] " "[-T d|u] [pool] ... [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [-t] ...\n")); case HELP_ONLINE: return (gettext("\tonline ...\n")); case HELP_REPLACE: return (gettext("\treplace [-f] " "[new-device]\n")); case HELP_REMOVE: return (gettext("\tremove [-nps] ...\n")); case HELP_REOPEN: return (gettext("\treopen \n")); case HELP_INITIALIZE: return (gettext("\tinitialize [-cs] [ ...]\n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] ...\n")); case HELP_STATUS: return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval " "[count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" "\tupgrade [-V version] <-a | pool ...>\n")); case HELP_GET: return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " "<\"all\" | property[,...]> ...\n")); case HELP_SET: return (gettext("\tset \n")); case HELP_SPLIT: return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: return (gettext("\treguid \n")); } abort(); /* NOTREACHED */ } /* * Callback routine that will print out a pool property value. */ static int print_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-15s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ void usage(boolean_t requested) { FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { int i; (void) fprintf(fp, gettext("usage: zpool command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && ((strcmp(current_command->name, "set") == 0) || (strcmp(current_command->name, "get") == 0) || (strcmp(current_command->name, "list") == 0))) { (void) fprintf(fp, gettext("\nthe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-15s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); (void) fprintf(fp, "\t%-15s ", "feature@..."); (void) fprintf(fp, "YES disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties must be " "appended with a feature name.\nSee zpool-features(5).\n")); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, boolean_t print_logs) { nvlist_t **child; uint_t c, children; char *vname; if (name != NULL) (void) printf("\t%*s%s\n", indent, "", name); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if ((is_log && !print_logs) || (!is_log && print_logs)) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); print_vdev_tree(zhp, vname, child[c], indent + 2, B_FALSE); free(vname); } } static boolean_t prop_list_contains_feature(nvlist_t *proplist) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; nvp = nvlist_next_nvpair(proplist, nvp)) { if (zpool_prop_feature(nvpair_name(nvp))) return (B_TRUE); } return (B_FALSE); } /* * Add a property pair (name, string-value) into a property nvlist. */ static int add_prop_list(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { zpool_prop_t prop = ZPROP_INVAL; zfs_prop_t fprop; nvlist_t *proplist; const char *normnm; char *strval; if (*props == NULL && nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); return (1); } proplist = *props; if (poolprop) { const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL && !zpool_prop_feature(propname)) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool property\n"), propname); return (2); } /* * feature@ properties and version should not be specified * at the same time. */ if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && prop_list_contains_feature(proplist))) { (void) fprintf(stderr, gettext("'feature@' and " "'version' properties cannot be specified " "together\n")); return (2); } if (zpool_prop_feature(propname)) normnm = propname; else normnm = zpool_prop_to_name(prop); } else { if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { normnm = zfs_prop_to_name(fprop); } else { normnm = propname; } } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && prop != ZPOOL_PROP_CACHEFILE) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (2); } if (nvlist_add_string(proplist, normnm, propval) != 0) { (void) fprintf(stderr, gettext("internal " "error: out of memory\n")); return (1); } return (0); } /* + * Set a default property pair (name, string-value) in a property nvlist + */ +static int +add_prop_list_default(const char *propname, char *propval, nvlist_t **props, + boolean_t poolprop) +{ + char *pval; + + if (nvlist_lookup_string(*props, propname, &pval) == 0) + return (0); + + return (add_prop_list(propname, propval, props, poolprop)); +} + +/* * zpool add [-fn] ... * * -f Force addition of devices, even if they appear in use * -n Do not add the devices, but display the resulting layout if * they were to be added. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by get_vdev_spec(), which constructs the nvlist needed to pass to * libzfs. */ int zpool_do_add(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; int c; nvlist_t *nvroot; char *poolname; zpool_boot_label_t boot_type; uint64_t boot_size; int ret; zpool_handle_t *zhp; nvlist_t *config; /* check options */ while ((c = getopt(argc, argv, "fn")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); usage(B_FALSE); } poolname = argv[0]; argc--; argv++; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); return (1); } if (zpool_is_bootable(zhp)) boot_type = ZPOOL_COPY_BOOT_LABEL; else boot_type = ZPOOL_NO_BOOT_LABEL; /* pass off to get_vdev_spec for processing */ boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL); nvroot = make_root_vdev(zhp, force, !force, B_FALSE, dryrun, boot_type, boot_size, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); } if (dryrun) { nvlist_t *poolnvroot; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " "configuration:\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, B_FALSE); print_vdev_tree(zhp, NULL, nvroot, 0, B_FALSE); /* Do the same for the logs */ if (num_logs(poolnvroot) > 0) { print_vdev_tree(zhp, "logs", poolnvroot, 0, B_TRUE); print_vdev_tree(zhp, NULL, nvroot, 0, B_TRUE); } else if (num_logs(nvroot) > 0) { print_vdev_tree(zhp, "logs", nvroot, 0, B_TRUE); } ret = 0; } else { ret = (zpool_add(zhp, nvroot) != 0); } nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool remove ... * * Removes the given vdev from the pool. */ int zpool_do_remove(int argc, char **argv) { char *poolname; int i, ret = 0; zpool_handle_t *zhp; boolean_t stop = B_FALSE; boolean_t noop = B_FALSE; boolean_t parsable = B_FALSE; char c; /* check options */ while ((c = getopt(argc, argv, "nps")) != -1) { switch (c) { case 'n': noop = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 's': stop = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if (stop && noop) { (void) fprintf(stderr, gettext("stop request ignored\n")); return (0); } if (stop) { if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (zpool_vdev_remove_cancel(zhp) != 0) ret = 1; } else { if (argc < 2) { (void) fprintf(stderr, gettext("missing device\n")); usage(B_FALSE); } for (i = 1; i < argc; i++) { if (noop) { uint64_t size; if (zpool_vdev_indirect_size(zhp, argv[i], &size) != 0) { ret = 1; break; } if (parsable) { (void) printf("%s %llu\n", argv[i], size); } else { char valstr[32]; zfs_nicenum(size, valstr, sizeof (valstr)); (void) printf("Memory that will be " "used after removing %s: %s\n", argv[i], valstr); } } else { if (zpool_vdev_remove(zhp, argv[i]) != 0) ret = 1; } } } return (ret); } /* * zpool labelclear [-f] * * -f Force clearing the label for the vdevs which are members of * the exported or foreign pools. * * Verifies that the vdev is not active and zeros out the label information * on the device. */ int zpool_do_labelclear(int argc, char **argv) { char vdev[MAXPATHLEN]; char *name = NULL; struct stat st; int c, fd, ret = 0; nvlist_t *config; pool_state_t state; boolean_t inuse = B_FALSE; boolean_t force = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get vdev name */ if (argc < 1) { (void) fprintf(stderr, gettext("missing vdev name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * Check if we were given absolute path and use it as is. * Otherwise if the provided vdev name doesn't point to a file, * try prepending dsk path and appending s0. */ (void) strlcpy(vdev, argv[0], sizeof (vdev)); if (vdev[0] != '/' && stat(vdev, &st) != 0) { char *s; (void) snprintf(vdev, sizeof (vdev), "%s/%s", ZFS_DISK_ROOT, argv[0]); if ((s = strrchr(argv[0], 's')) == NULL || !isdigit(*(s + 1))) (void) strlcat(vdev, "s0", sizeof (vdev)); if (stat(vdev, &st) != 0) { (void) fprintf(stderr, gettext( "failed to find device %s, try specifying absolute " "path instead\n"), argv[0]); return (1); } } if ((fd = open(vdev, O_RDWR)) < 0) { (void) fprintf(stderr, gettext("failed to open %s: %s\n"), vdev, strerror(errno)); return (1); } if (zpool_read_label(fd, &config) != 0) { (void) fprintf(stderr, gettext("failed to read label from %s\n"), vdev); return (1); } nvlist_free(config); ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); if (ret != 0) { (void) fprintf(stderr, gettext("failed to check state for %s\n"), vdev); return (1); } if (!inuse) goto wipe_label; switch (state) { default: case POOL_STATE_ACTIVE: case POOL_STATE_SPARE: case POOL_STATE_L2CACHE: (void) fprintf(stderr, gettext( "%s is a member (%s) of pool \"%s\"\n"), vdev, zpool_pool_state_to_name(state), name); ret = 1; goto errout; case POOL_STATE_EXPORTED: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of exported pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_POTENTIALLY_ACTIVE: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of potentially active pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_DESTROYED: /* inuse should never be set for a destroyed pool */ assert(0); break; } wipe_label: ret = zpool_clear_label(fd); if (ret != 0) { (void) fprintf(stderr, gettext("failed to clear label for %s\n"), vdev); } errout: free(name); (void) close(fd); return (ret); } /* * zpool create [-fnd] [-B] [-o property=value] ... * [-O file-system-property=value] ... - * [-R root] [-m mountpoint] ... + * [-R root] [-m mountpoint] [-t tempname] ... * * -B Create boot partition. * -f Force creation, even if devices appear in use * -n Do not create the pool, but display the resulting layout if it * were to be created. - * -R Create a pool under an alternate root - * -m Set default mountpoint for the root dataset. By default it's + * -R Create a pool under an alternate root + * -m Set default mountpoint for the root dataset. By default it's * '/' + * -t Use the temporary name until the pool is exported. * -o Set property=value. * -d Don't automatically enable all supported pool features * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The * bulk of the vdev processing is done in get_vdev_spec() in zpool_vdev.c. Once * we get the nvlist back from get_vdev_spec(), we either print out the contents * (if '-n' was specified), or pass it to libzfs to do the creation. */ #define SYSTEM256 (256 * 1024 * 1024) int zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t enable_all_pool_feat = B_TRUE; zpool_boot_label_t boot_type = ZPOOL_NO_BOOT_LABEL; uint64_t boot_size = 0; int c; nvlist_t *nvroot = NULL; char *poolname; + char *tname = NULL; int ret = 1; char *altroot = NULL; char *mountpoint = NULL; nvlist_t *fsprops = NULL; nvlist_t *props = NULL; char *propval; /* check options */ - while ((c = getopt(argc, argv, ":fndBR:m:o:O:")) != -1) { + while ((c = getopt(argc, argv, ":fndBR:m:o:O:t:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'd': enable_all_pool_feat = B_FALSE; break; case 'B': /* * We should create the system partition. * Also make sure the size is set. */ boot_type = ZPOOL_CREATE_BOOT_LABEL; if (boot_size == 0) boot_size = SYSTEM256; break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto errout; - if (nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), - &propval) == 0) - break; - if (add_prop_list(zpool_prop_to_name( + if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; break; case 'm': /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); goto errout; } *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; /* * Get bootsize value for make_root_vdev(). */ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_BOOTSIZE) { if (zfs_nicestrtonum(g_zfs, propval, &boot_size) < 0 || boot_size == 0) { (void) fprintf(stderr, gettext("bad boot partition size " "'%s': %s\n"), propval, libzfs_error_description(g_zfs)); goto errout; } } /* * If the user is creating a pool that doesn't support * feature flags, don't enable any features. */ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { char *end; u_longlong_t ver; ver = strtoull(propval, &end, 10); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_all_pool_feat = B_FALSE; } } if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) altroot = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -O option\n")); goto errout; } *propval = '\0'; propval++; /* * Mountpoints are checked and then added later. * Uniquely among properties, they can be specified * more than once, to avoid conflict with -m. */ if (0 == strcmp(optarg, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { mountpoint = propval; } else if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) { goto errout; } break; + case 't': + /* + * Sanity check temporary pool name. + */ + if (strchr(optarg, '/') != NULL) { + (void) fprintf(stderr, gettext("cannot create " + "'%s': invalid character '/' in temporary " + "name\n"), optarg); + (void) fprintf(stderr, gettext("use 'zfs " + "create' to create a dataset\n")); + goto errout; + } + + if (add_prop_list(zpool_prop_to_name( + ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) + goto errout; + if (add_prop_list_default(zpool_prop_to_name( + ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto errout; + tname = optarg; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); goto badusage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); goto badusage; } poolname = argv[0]; /* * As a special case, check for use of '/' in the name, and direct the * user to use 'zfs create' instead. */ if (strchr(poolname, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create '%s': invalid " "character '/' in pool name\n"), poolname); (void) fprintf(stderr, gettext("use 'zfs create' to " "create a dataset\n")); goto errout; } /* * Make sure the bootsize is set when ZPOOL_CREATE_BOOT_LABEL is used, * and not set otherwise. */ if (boot_type == ZPOOL_CREATE_BOOT_LABEL) { const char *propname; char *strptr, *buf = NULL; int rv; propname = zpool_prop_to_name(ZPOOL_PROP_BOOTSIZE); if (nvlist_lookup_string(props, propname, &strptr) != 0) { (void) asprintf(&buf, "%" PRIu64, boot_size); if (buf == NULL) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); goto errout; } rv = add_prop_list(propname, buf, &props, B_TRUE); free(buf); if (rv != 0) goto errout; } } else { const char *propname; char *strptr; propname = zpool_prop_to_name(ZPOOL_PROP_BOOTSIZE); if (nvlist_lookup_string(props, propname, &strptr) == 0) { (void) fprintf(stderr, gettext("error: setting boot " "partition size requires option '-B'\n")); goto errout; } } /* pass off to get_vdev_spec for bulk processing */ nvroot = make_root_vdev(NULL, force, !force, B_FALSE, dryrun, boot_type, boot_size, argc - 1, argv + 1); if (nvroot == NULL) goto errout; /* make_root_vdev() allows 0 toplevel children if there are spares */ if (!zfs_allocatable_devs(nvroot)) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto errout; } if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); goto errout; } /* * Check the validity of the mountpoint and direct the user to use the * '-m' mountpoint option if it looks like its in use. */ if (mountpoint == NULL || (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; DIR *dirp; if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " "'%s': must be an absolute path, 'legacy', or " "'none'\n"), mountpoint); goto errout; } if (mountpoint == NULL) { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s/%s", altroot, poolname); else (void) snprintf(buf, sizeof (buf), "/%s", poolname); } else { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s%s", altroot, mountpoint); else (void) snprintf(buf, sizeof (buf), "%s", mountpoint); } if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); (void) fprintf(stderr, gettext("use '-m' " "option to provide a different default\n")); goto errout; } else if (dirp) { int count = 0; while (count < 3 && readdir(dirp) != NULL) count++; (void) closedir(dirp); if (count > 2) { (void) fprintf(stderr, gettext("mountpoint " "'%s' exists and is not empty\n"), buf); (void) fprintf(stderr, gettext("use '-m' " "option to provide a " "different default\n")); goto errout; } } } /* * Now that the mountpoint's validity has been checked, ensure that * the property is set appropriately prior to creating the pool. */ if (mountpoint != NULL) { ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), mountpoint, &fsprops, B_FALSE); if (ret != 0) goto errout; } ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run * through all the vdevs in the list and print out in an * appropriate hierarchy. */ (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); print_vdev_tree(NULL, poolname, nvroot, 0, B_FALSE); if (num_logs(nvroot) > 0) print_vdev_tree(NULL, "logs", nvroot, 0, B_TRUE); ret = 0; } else { /* * Hand off to libzfs. */ if (enable_all_pool_feat) { spa_feature_t i; for (i = 0; i < SPA_FEATURES; i++) { char propname[MAXPATHLEN]; zfeature_info_t *feat = &spa_feature_table[i]; (void) snprintf(propname, sizeof (propname), "feature@%s", feat->fi_uname); /* * Skip feature if user specified it manually * on the command line. */ if (nvlist_exists(props, propname)) continue; ret = add_prop_list(propname, ZFS_FEATURE_ENABLED, &props, B_TRUE); if (ret != 0) goto errout; } } ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { - zfs_handle_t *pool = zfs_open(g_zfs, poolname, - ZFS_TYPE_FILESYSTEM); + zfs_handle_t *pool = zfs_open(g_zfs, + tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { if (zfs_mount(pool, NULL, 0) == 0) ret = zfs_shareall(pool); zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { (void) fprintf(stderr, gettext("pool name may have " "been omitted\n")); } } errout: nvlist_free(nvroot); nvlist_free(fsprops); nvlist_free(props); return (ret); badusage: nvlist_free(fsprops); nvlist_free(props); usage(B_FALSE); return (2); } /* * zpool destroy * - * -f Forcefully unmount any datasets + * -f Forcefully unmount any datasets * * Destroy the given pool. Automatically unmounts any datasets in the pool. */ int zpool_do_destroy(int argc, char **argv) { boolean_t force = B_FALSE; int c; char *pool; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { /* * As a special case, check for use of '/' in the name, and * direct the user to use 'zfs destroy' instead. */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("use 'zfs destroy' to " "destroy a dataset\n")); return (1); } if (zpool_disable_datasets(zhp, force) != 0) { (void) fprintf(stderr, gettext("could not destroy '%s': " "could not unmount datasets\n"), zpool_get_name(zhp)); return (1); } /* The history must be logged as part of the export */ log_history = B_FALSE; ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); return (ret); } /* * zpool export [-f] ... * * -f Forcefully unmount datasets * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, * then the datasets will be forcefully unmounted. */ int zpool_do_export(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t hardforce = B_FALSE; int c; zpool_handle_t *zhp; int ret; int i; /* check options */ while ((c = getopt(argc, argv, "fF")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'F': hardforce = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } ret = 0; for (i = 0; i < argc; i++) { if ((zhp = zpool_open_canfail(g_zfs, argv[i])) == NULL) { ret = 1; continue; } if (zpool_disable_datasets(zhp, force) != 0) { ret = 1; zpool_close(zhp); continue; } /* The history must be logged as part of the export */ log_history = B_FALSE; if (hardforce) { if (zpool_export_force(zhp, history_str) != 0) ret = 1; } else if (zpool_export(zhp, force, history_str) != 0) { ret = 1; } zpool_close(zhp); } return (ret); } /* * Given a vdev configuration, determine the maximum width needed for the device * name column. */ static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) { char *name = zpool_vdev_name(g_zfs, zhp, nv, B_TRUE); nvlist_t **child; uint_t c, children; int ret; if (strlen(name) + depth > max) max = strlen(name) + depth; free(name); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max)) > max) max = ret; } return (max); } typedef struct spare_cbdata { uint64_t cb_guid; zpool_handle_t *cb_zhp; } spare_cbdata_t; static boolean_t find_vdev(nvlist_t *nv, uint64_t search) { uint64_t guid; nvlist_t **child; uint_t c, children; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && search == guid) return (B_TRUE); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev(child[c], search)) return (B_TRUE); } return (B_FALSE); } static int find_spare(zpool_handle_t *zhp, void *data) { spare_cbdata_t *cbp = data; nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (find_vdev(nvroot, cbp->cb_guid)) { cbp->cb_zhp = zhp; return (1); } zpool_close(zhp); return (0); } /* * Print out configuration state as requested by status_callback. */ void print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int namewidth, int depth, boolean_t isspare) { nvlist_t **child; uint_t c, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t cb; const char *state; char *type; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) return; state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for * online drives. */ if (vs->vs_aux == VDEV_AUX_SPARED) state = "INUSE"; else if (vs->vs_state == VDEV_STATE_HEALTHY) state = "AVAIL"; } (void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth, name, state); if (!isspare) { zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ¬present) == 0) { char *path; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); (void) printf(" was %s", path); } else if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &cb.cb_guid) == 0); if (zpool_iter(g_zfs, find_spare, &cb) == 1) { if (strcmp(zpool_get_name(cb.cb_zhp), zpool_get_name(zhp)) == 0) (void) printf(gettext("currently in " "use")); else (void) printf(gettext("in use by " "pool '%s'"), zpool_get_name(cb.cb_zhp)); zpool_close(cb.cb_zhp); } else { (void) printf(gettext("currently in use")); } break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_IO_FAILURE: (void) printf(gettext("experienced I/O failures")); break; case VDEV_AUX_BAD_LOG: (void) printf(gettext("bad intent log")); break; case VDEV_AUX_EXTERNAL: (void) printf(gettext("external device fault")); break; case VDEV_AUX_SPLIT_POOL: (void) printf(gettext("split into new pool")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; default: (void) printf(gettext("corrupted data")); break; } } (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); if (ps && ps->pss_state == DSS_SCANNING && vs->vs_scan_processed != 0 && children == 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? "resilvering" : "repairing"); } if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_initialize_action_time; int initialize_pct = 100; if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) { initialize_pct = (vs->vs_initialize_bytes_done * 100 / (vs->vs_initialize_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_initialize_state) { case VDEV_INITIALIZE_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", suspended, started at %s", tbuf); break; case VDEV_INITIALIZE_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", started at %s", tbuf); break; case VDEV_INITIALIZE_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", completed at %s", tbuf); break; } (void) printf(gettext(" (%d%% initialized%s)"), initialize_pct, zbuf); } (void) printf("\n"); for (c = 0; c < children; c++) { uint64_t islog = B_FALSE, ishole = B_FALSE; /* Don't print logs or holes here */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole); if (islog || ishole) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE); print_status_config(zhp, vname, child[c], namewidth, depth + 2, isspare); free(vname); } } /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ void print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) { nvlist_t **child; uint_t c, children; vdev_stat_t *vs; char *type, *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MISSING) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", namewidth - depth, name); (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; default: (void) printf(gettext("corrupted data")); break; } } (void) printf("\n"); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; vname = zpool_vdev_name(g_zfs, NULL, child[c], B_TRUE); print_import_config(vname, child[c], namewidth, depth + 2); free(vname); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE); (void) printf("\t %s\n", vname); free(vname); } } } /* * Print log vdevs. * Logs are recorded as top level vdevs in the main pool child array * but with "is_log" set to 1. We use either print_status_config() or * print_import_config() to print the top level logs then any log * children (eg mirrored slogs) are printed recursively - which * works because only the top level vdev is marked "is_log" */ static void print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose) { uint_t c, children; nvlist_t **child; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; (void) printf(gettext("\tlogs\n")); for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; char *name; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (!is_log) continue; name = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE); if (verbose) print_status_config(zhp, name, child[c], namewidth, 2, B_FALSE); else print_import_config(name, child[c], namewidth, 2); free(name); } } /* * Display the status for the given pool. */ static void show_import(nvlist_t *config) { uint64_t pool_state; vdev_stat_t *vs; char *name; uint64_t guid; char *msgid; nvlist_t *nvroot; int reason; const char *health; uint_t vsc; int namewidth; char *comment; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); reason = zpool_import_status(config, &msgid); (void) printf(gettext(" pool: %s\n"), name); (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); (void) printf(gettext(" state: %s"), health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" status: One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: (void) printf(gettext(" status: One or more devices contains " "corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf( gettext(" status: The pool data is corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: (void) printf(gettext(" status: One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: (void) printf(gettext(" status: The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: (void) printf(gettext(" status: The pool is formatted using a " "legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" status: The pool is formatted using an " "incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: (void) printf(gettext(" status: Some supported features are " "not enabled on the pool.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("status: The pool uses the following " "feature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: (void) printf(gettext("status: The pool can only be accessed " "in read-only mode on this system. It\n\tcannot be " "accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); break; case ZPOOL_STATUS_HOSTID_MISMATCH: (void) printf(gettext(" status: The pool was last accessed by " "another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: (void) printf(gettext(" status: One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: (void) printf(gettext(" status: An intent log record cannot be " "read.\n")); break; case ZPOOL_STATUS_RESILVERING: (void) printf(gettext(" status: One or more devices were being " "resilvered.\n")); break; default: /* * No other status can be seen when importing pools. */ assert(reason == ZPOOL_STATUS_OK); } /* * Print out an action according to the overall state of the pool. */ if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); } else { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. The\n\tfault " "tolerance of the pool may be compromised if imported.\n")); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" action: The pool cannot be " "imported. Access the pool on a system running " "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("action: The pool cannot be " "imported. Access the pool on a system that " "supports\n\tthe required feature(s), or recreate " "the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: (void) printf(gettext("action: The pool cannot be " "imported in read-write mode. Import the pool " "with\n" "\t\"-o readonly=on\", access the pool on a system " "that supports the\n\trequired feature(s), or " "recreate the pool from backup.\n")); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" action: The pool cannot be " "imported. Attach the missing\n\tdevices and try " "again.\n")); break; default: (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); } } /* Print the comment attached to the pool. */ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) (void) printf(gettext("comment: %s\n"), comment); /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": */ if (((vs->vs_state == VDEV_STATE_CLOSED) || (vs->vs_state == VDEV_STATE_CANT_OPEN)) && (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext("\tThe pool was destroyed, " "but can be imported using the '-Df' flags.\n")); else if (pool_state != POOL_STATE_EXPORTED) (void) printf(gettext("\tThe pool may be active on " "another system, but can be imported using\n\t" "the '-f' flag.\n")); } if (msgid != NULL) (void) printf(gettext(" see: http://illumos.org/msg/%s\n"), msgid); (void) printf(gettext(" config:\n\n")); namewidth = max_width(NULL, nvroot, 0, 0); if (namewidth < 10) namewidth = 10; print_import_config(name, nvroot, namewidth, 0); if (num_logs(nvroot) > 0) print_logs(NULL, nvroot, namewidth, B_FALSE); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " "be part of this pool, though their\n\texact " "configuration cannot be determined.\n")); } } /* * Perform the import for the given configuration. This passes the heavy * lifting off to zpool_import_props(), and then mounts the datasets contained * within the pool. */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, nvlist_t *props, int flags) { zpool_handle_t *zhp; char *name; uint64_t state; uint64_t version; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " "is formatted using an unsupported ZFS version\n"), name); return (1); } else if (state != POOL_STATE_EXPORTED && !(flags & ZFS_IMPORT_ANY_HOST)) { uint64_t hostid; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { if ((unsigned long)hostid != gethostid()) { char *hostname; uint64_t timestamp; time_t t; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, ×tamp) == 0); t = timestamp; (void) fprintf(stderr, gettext("cannot import " "'%s': pool may be in use from other " "system, it was last accessed by %s " "(hostid: 0x%lx) on %s"), name, hostname, (unsigned long)hostid, asctime(localtime(&t))); (void) fprintf(stderr, gettext("use '-f' to " "import anyway\n")); return (1); } } else { (void) fprintf(stderr, gettext("cannot import '%s': " "pool may be in use from other system\n"), name); (void) fprintf(stderr, gettext("use '-f' to import " "anyway\n")); return (1); } } if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) return (1); if (newname != NULL) name = (char *)newname; if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) return (1); if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY) && zpool_enable_datasets(zhp, mntopts, 0) != 0) { zpool_close(zhp); return (1); } zpool_close(zhp); return (0); } /* * zpool checkpoint * checkpoint --discard * - * -d Discard the checkpoint from a checkpointed - * --discard pool. + * -d Discard the checkpoint from a checkpointed + * --discard pool. * * Checkpoints the specified pool, by taking a "snapshot" of its * current state. A pool can only have one checkpoint at a time. */ int zpool_do_checkpoint(int argc, char **argv) { boolean_t discard; char *pool; zpool_handle_t *zhp; int c, err; struct option long_options[] = { {"discard", no_argument, NULL, 'd'}, {0, 0, 0, 0} }; discard = B_FALSE; while ((c = getopt_long(argc, argv, ":d", long_options, NULL)) != -1) { switch (c) { case 'd': discard = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open(g_zfs, pool)) == NULL) { /* As a special case, check for use of '/' in the name */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("'zpool checkpoint' " "doesn't work on datasets. To save the state " "of a dataset from a specific point in time " "please use 'zfs snapshot'\n")); return (1); } if (discard) err = (zpool_discard_checkpoint(zhp) != 0); else err = (zpool_checkpoint(zhp) != 0); zpool_close(zhp); return (err); } #define CHECKPOINT_OPT 1024 /* * zpool import [-d dir] [-D] * import [-o mntopts] [-o prop=value] ... [-R root] [-D] * [-d dir | -c cachefile] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] - * [-d dir | -c cachefile] [-f] [-n] [-F] [newpool] + * [-d dir | -c cachefile] [-f] [-n] [-F] [-t] + * [newpool] * - * -c Read pool information from a cachefile instead of searching + * -c Read pool information from a cachefile instead of searching * devices. * - * -d Scan in a specific directory, other than /dev/dsk. More than + * -d Scan in a specific directory, other than /dev/dsk. More than * one directory can be specified using multiple '-d' options. * - * -D Scan for previously destroyed pools or import all or only - * specified destroyed pools. + * -D Scan for previously destroyed pools or import all or only + * specified destroyed pools. * - * -R Temporarily import the pool, with all mountpoints relative to + * -R Temporarily import the pool, with all mountpoints relative to * the given root. The pool will remain exported when the machine * is rebooted. * - * -V Import even in the presence of faulted vdevs. This is an - * intentionally undocumented option for testing purposes, and - * treats the pool configuration as complete, leaving any bad + * -V Import even in the presence of faulted vdevs. This is an + * intentionally undocumented option for testing purposes, and + * treats the pool configuration as complete, leaving any bad * vdevs in the FAULTED state. In other words, it does verbatim * import. * - * -f Force import, even if it appears that the pool is active. + * -f Force import, even if it appears that the pool is active. * - * -F Attempt rewind if necessary. + * -F Attempt rewind if necessary. * - * -n See if rewind would work, but don't actually rewind. + * -n See if rewind would work, but don't actually rewind. * - * -N Import the pool but don't mount datasets. + * -N Import the pool but don't mount datasets. * - * -T Specify a starting txg to use for import. This option is - * intentionally undocumented option for testing purposes. + * -t Use newpool as a temporary pool name instead of renaming + * the pool. * - * -a Import all pools found. + * -T Specify a starting txg to use for import. This option is + * intentionally undocumented option for testing purposes. * - * -o Set property=value and/or temporary mount options (without '='). + * -a Import all pools found. * - * --rewind-to-checkpoint - * Import the pool and revert back to the checkpoint. + * -o Set property=value and/or temporary mount options (without '='). * + * --rewind-to-checkpoint + * Import the pool and revert back to the checkpoint. + * * The import command scans for pools to import, and import pools based on pool * name and GUID. The pool can also be renamed as part of the import process. */ int zpool_do_import(int argc, char **argv) { char **searchdirs = NULL; int nsearch = 0; int c; int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; nvpair_t *elem; nvlist_t *config; uint64_t searchguid = 0; char *searchname = NULL; char *propval; nvlist_t *found_config; nvlist_t *policy = NULL; nvlist_t *props = NULL; boolean_t first; int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; uint64_t pool_state, txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; char *endptr; struct option long_options[] = { {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT}, {0, 0, 0, 0} }; /* check options */ - while ((c = getopt_long(argc, argv, ":aCc:d:DEfFmnNo:rR:T:VX", + while ((c = getopt_long(argc, argv, ":aCc:d:DEfFmnNo:rR:tT:VX", long_options, NULL)) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'c': cachefile = optarg; break; case 'd': if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); } else { char **tmp = safe_malloc((nsearch + 1) * sizeof (char *)); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); free(searchdirs); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 'D': do_destroyed = B_TRUE; break; case 'f': flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; case 'm': flags |= ZFS_IMPORT_MISSING_LOG; break; case 'n': dryrun = B_TRUE; break; case 'N': flags |= ZFS_IMPORT_ONLY; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto error; } else { mntopts = optarg; } break; case 'R': if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto error; - if (nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), - &propval) == 0) - break; - if (add_prop_list(zpool_prop_to_name( + if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; + case 't': + flags |= ZFS_IMPORT_TEMP_NAME; + if (add_prop_list_default(zpool_prop_to_name( + ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto error; + break; case 'T': errno = 0; txg = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("invalid txg value\n")); usage(B_FALSE); } rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; break; case 'V': flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind = B_TRUE; break; case CHECKPOINT_OPT: flags |= ZFS_IMPORT_CHECKPOINT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (cachefile && nsearch != 0) { (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In the future, we can capture further policy and include it here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) goto error; if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); searchdirs[0] = ZFS_DISK_ROOT; nsearch = 1; } /* check argument count */ if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * Check for the SYS_CONFIG privilege. We do this explicitly * here because otherwise any attempt to discover pools will * silently fail. */ if (argc == 0 && !priv_ineffect(PRIV_SYS_CONFIG)) { (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); free(searchdirs); nvlist_free(policy); return (1); } } /* * Depending on the arguments given, we do one of the following: * * Iterate through all pools and display information about * each one. * * -a Iterate through all pools and try to import each one. * * Find the pool that corresponds to the given GUID/pool * name and import that one. * * -D Above options applies only to destroyed pools. */ if (argc != 0) { char *endptr; errno = 0; searchguid = strtoull(argv[0], &endptr, 10); if (errno != 0 || *endptr != '\0') { searchname = argv[0]; searchguid = 0; } found_config = NULL; /* * User specified a name or guid. Ensure it's unique. */ idata.unique = B_TRUE; } idata.path = searchdirs; idata.paths = nsearch; idata.poolname = searchname; idata.guid = searchguid; idata.cachefile = cachefile; idata.policy = policy; pools = zpool_search_import(g_zfs, &idata); if (pools != NULL && idata.exists && (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name already exists\n"), argv[0]); - (void) fprintf(stderr, gettext("use the form '%s " - " ' to give it a new name\n"), - "zpool import"); + (void) fprintf(stderr, gettext("use the form 'zpool import " + "[-t] ' to give it a new temporary " + "or permanent name\n")); err = 1; } else if (pools == NULL && idata.exists) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name is already created/imported,\n"), argv[0]); (void) fprintf(stderr, gettext("and no additional pools " "with that name were found\n")); err = 1; } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } err = 1; } if (err == 1) { free(searchdirs); nvlist_free(policy); return (1); } /* * At this point we have a list of import candidate configs. Even if * we were searching by pool name or guid, we still need to * post-process the list to deal with pool state and possible * duplicate names. */ err = 0; elem = NULL; first = B_TRUE; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, &config) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) continue; if (do_destroyed && pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, policy) == 0); if (argc == 0) { if (first) first = B_FALSE; else if (!do_all) (void) printf("\n"); if (do_all) { err |= do_import(config, NULL, mntopts, props, flags); } else { show_import(config); } } else if (searchname != NULL) { char *name; /* * We are searching for a pool based on name. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); if (strcmp(name, searchname) == 0) { if (found_config != NULL) { (void) fprintf(stderr, gettext( "cannot import '%s': more than " "one matching pool\n"), searchname); (void) fprintf(stderr, gettext( "import by numeric ID instead\n")); err = B_TRUE; } found_config = config; } } else { uint64_t guid; /* * Search for a pool by guid. */ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); if (guid == searchguid) found_config = config; } } /* * If we were searching for a specific pool, verify that we found a * pool, and then do the import. */ if (argc != 0 && err == 0) { if (found_config == NULL) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); err = B_TRUE; } else { err |= do_import(found_config, argc == 1 ? NULL : argv[1], mntopts, props, flags); } } /* * If we were just looking for pools, report an error if none were * found. */ if (argc == 0 && first) (void) fprintf(stderr, gettext("no pools available to import\n")); error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); free(searchdirs); return (err ? 1 : 0); } typedef struct iostat_cbdata { boolean_t cb_verbose; int cb_namewidth; int cb_iteration; zpool_list_t *cb_list; } iostat_cbdata_t; static void print_iostat_separator(iostat_cbdata_t *cb) { int i = 0; for (i = 0; i < cb->cb_namewidth; i++) (void) printf("-"); (void) printf(" ----- ----- ----- ----- ----- -----\n"); } static void print_iostat_header(iostat_cbdata_t *cb) { (void) printf("%*s capacity operations bandwidth\n", cb->cb_namewidth, ""); (void) printf("%-*s alloc free read write read write\n", cb->cb_namewidth, "pool"); print_iostat_separator(cb); } /* * Display a single statistic. */ static void print_one_stat(uint64_t value) { char buf[64]; zfs_nicenum(value, buf, sizeof (buf)); (void) printf(" %5s", buf); } /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. */ void print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; uint_t c, children; vdev_stat_t *oldvs, *newvs; vdev_stat_t zerovs = { 0 }; uint64_t tdelta; double scale; char *vname; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return; if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; if (tdelta == 0) scale = 1.0; else scale = (double)NANOSEC / tdelta; /* only toplevel vdevs have capacity stats */ if (newvs->vs_space == 0) { (void) printf(" - -"); } else { print_one_stat(newvs->vs_alloc); print_one_stat(newvs->vs_space - newvs->vs_alloc); } print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] - oldvs->vs_ops[ZIO_TYPE_READ]))); print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] - oldvs->vs_ops[ZIO_TYPE_WRITE]))); print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] - oldvs->vs_bytes[ZIO_TYPE_READ]))); print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] - oldvs->vs_bytes[ZIO_TYPE_WRITE]))); (void) printf("\n"); if (!cb->cb_verbose) return; if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) return; if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &c) != 0) return; for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, &ishole); (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (ishole || islog) continue; vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE); print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } /* * Log device section */ if (num_logs(newnv) > 0) { (void) printf("%-*s - - - - - " "-\n", cb->cb_namewidth, "logs"); for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (islog) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE); print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } } /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) return; if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &c) != 0) return; if (children > 0) { (void) printf("%-*s - - - - - " "-\n", cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE); print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } } static int refresh_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; boolean_t missing; /* * If the pool has disappeared, remove it from the list and continue. */ if (zpool_refresh_stats(zhp, &missing) != 0) return (-1); if (missing) pool_list_remove(cb->cb_list, zhp); return (0); } /* * Callback to print out the iostats for the given pool. */ int print_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; newconfig = zpool_get_config(zhp, &oldconfig); if (cb->cb_iteration == 1) oldconfig = NULL; verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, &newnvroot) == 0); if (oldconfig == NULL) oldnvroot = NULL; else verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); /* * Print out the statistics for the pool. */ print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); if (cb->cb_verbose) print_iostat_separator(cb); return (0); } int get_namewidth(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *config, *nvroot; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (!cb->cb_verbose) cb->cb_namewidth = strlen(zpool_get_name(zhp)); else cb->cb_namewidth = max_width(zhp, nvroot, 0, cb->cb_namewidth); } /* * The width must fall into the range [10,38]. The upper limit is the * maximum we can have and still fit in 80 columns. */ if (cb->cb_namewidth < 10) cb->cb_namewidth = 10; if (cb->cb_namewidth > 38) cb->cb_namewidth = 38; return (0); } /* * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void get_interval_count(int *argcp, char **argv, unsigned long *iv, unsigned long *cnt) { unsigned long interval = 0, count = 0; int argc = *argcp, errno; /* * Determine if the last argument is an integer or a pool name */ if (argc > 0 && isdigit(argv[argc - 1][0])) { char *end; errno = 0; interval = strtoul(argv[argc - 1], &end, 10); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { /* * If this is not a valid number, just plow on. The * user will get a more informative error message later * on. */ interval = 0; } } /* * If the last argument is also an integer, then we have both a count * and an interval. */ if (argc > 0 && isdigit(argv[argc - 1][0])) { char *end; errno = 0; count = interval; interval = strtoul(argv[argc - 1], &end, 10); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { interval = 0; } } *iv = interval; *cnt = count; *argcp = argc; } static void get_timestamp_arg(char c) { if (c == 'u') timestamp_fmt = UDATE; else if (c == 'd') timestamp_fmt = DDATE; else usage(B_FALSE); } /* * zpool iostat [-v] [-T d|u] [pool] ... [interval [count]] * * -v Display statistics for individual vdevs * -T Display a timestamp in date(1) or Unix format * * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely * on pool_list_update() to detect the addition of new pools. Configuration * changes are all handled within libzfs. */ int zpool_do_iostat(int argc, char **argv) { int c; int ret; int npools; unsigned long interval = 0, count = 0; zpool_list_t *list; boolean_t verbose = B_FALSE; iostat_cbdata_t cb; /* check options */ while ((c = getopt(argc, argv, "T:v")) != -1) { switch (c) { case 'T': get_timestamp_arg(*optarg); break; case 'v': verbose = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); /* * Construct the list of all interesting pools. */ ret = 0; if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { pool_list_free(list); return (1); } if (pool_list_count(list) == 0 && interval == 0) { pool_list_free(list); (void) fprintf(stderr, gettext("no pools available\n")); return (1); } /* * Enter the main iostat loop. */ cb.cb_list = list; cb.cb_verbose = verbose; cb.cb_iteration = 0; cb.cb_namewidth = 0; for (;;) { pool_list_update(list); if ((npools = pool_list_count(list)) == 0) break; /* * Refresh all statistics. This is done as an explicit step * before calculating the maximum name width, so that any * configuration changes are properly accounted for. */ (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); /* * If it's the first time, or verbose mode, print the header. */ if (++cb.cb_iteration == 1 || verbose) print_iostat_header(&cb); (void) pool_list_iter(list, B_FALSE, print_iostat, &cb); /* * If there's more than one pool, and we're not in verbose mode * (which prints a separator for us), then print a separator. */ if (npools > 1 && !verbose) print_iostat_separator(&cb); if (verbose) (void) printf("\n"); /* * Flush the output so that redirection to a file isn't buffered * indefinitely. */ (void) fflush(stdout); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) sleep(interval); } pool_list_free(list); return (ret); } typedef struct list_cbdata { boolean_t cb_verbose; int cb_namewidth; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_literal; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) (void) printf(" "); else first = B_FALSE; right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { header = zpool_prop_column_name(pl->pl_prop); right_justify = zpool_prop_align_right(pl->pl_prop); } else { int i; for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", width, header); else (void) printf("%-*s", width, header); } (void) printf("\n"); } /* * Given a pool and a list of properties, print out all the properties according * to the described layout. */ static void print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; char *propstr; boolean_t right_justify; size_t width; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) { if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { if (zpool_get_prop(zhp, pl->pl_prop, property, sizeof (property), NULL, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); } else if ((zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop)) && zpool_prop_get_feature(zhp, pl->pl_user_prop, property, sizeof (property)) == 0) { propstr = property; } else { propstr = "-"; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", width, propstr); else (void) printf("%-*s", width, propstr); } (void) printf("\n"); } static void print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted, boolean_t valid) { char propval[64]; boolean_t fixed; size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else zfs_nicenum(value, propval, sizeof (propval)); break; case ZPOOL_PROP_FRAGMENTATION: if (value == ZFS_FRAG_INVALID) { (void) strlcpy(propval, "-", sizeof (propval)); } else { (void) snprintf(propval, sizeof (propval), "%llu%%", value); } break; case ZPOOL_PROP_CAPACITY: (void) snprintf(propval, sizeof (propval), "%llu%%", value); break; default: zfs_nicenum(value, propval, sizeof (propval)); } if (!valid) (void) strlcpy(propval, "-", sizeof (propval)); if (scripted) (void) printf("\t%s", propval); else (void) printf(" %*s", width, propval); } void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth) { nvlist_t **child; vdev_stat_t *vs; uint_t c, children; char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; boolean_t haslog = B_FALSE; char *dashes = "%-*s - - - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (name != NULL) { boolean_t toplevel = (vs->vs_space != 0); uint64_t cap; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return; if (scripted) (void) printf("\t%s", name); else if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); /* * Print the properties for the individual vdevs. Some * properties are only applicable to toplevel vdevs. The * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. */ print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, scripted, toplevel); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, scripted, toplevel); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, scripted, toplevel); print_one_column(ZPOOL_PROP_CHECKPOINT, vs->vs_checkpoint_space, scripted, toplevel); print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted, B_TRUE); print_one_column(ZPOOL_PROP_FRAGMENTATION, vs->vs_fragmentation, scripted, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel)); cap = (vs->vs_space == 0) ? 0 : (vs->vs_alloc * 100 / vs->vs_space); print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel); (void) printf("\n"); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) continue; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { haslog = B_TRUE; continue; } vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } if (haslog == B_TRUE) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "log"); for (c = 0; c < children; c++) { if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) != 0 || !islog) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "spare"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } } } /* * Generic callback function to list a pool. */ int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; nvlist_t *config; nvlist_t *nvroot; config = zpool_get_config(zhp, NULL); print_pool(zhp, cbp); if (!cbp->cb_verbose) return (0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); print_list_stats(zhp, NULL, nvroot, cbp, 0); return (0); } /* * zpool list [-Hp] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of properties to display. Defaults to * "name,size,allocated,free,expandsize,fragmentation,capacity," * "dedupratio,health,altroot" - * -p Diplay values in parsable (exact) format. + * -p Diplay values in parsable (exact) format. * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. */ int zpool_do_list(int argc, char **argv) { int c; int ret; list_cbdata_t cb = { 0 }; static char default_props[] = "name,size,allocated,free,checkpoint,expandsize,fragmentation," "capacity,dedupratio,health,altroot"; char *props = default_props; unsigned long interval = 0, count = 0; zpool_list_t *list; boolean_t first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, ":Ho:pT:v")) != -1) { switch (c) { case 'H': cb.cb_scripted = B_TRUE; break; case 'o': props = optarg; break; case 'p': cb.cb_literal = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': cb.cb_verbose = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) break; cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (!cb.cb_scripted && (first || cb.cb_verbose)) { print_header(&cb); first = B_FALSE; } ret = pool_list_iter(list, B_TRUE, list_callback, &cb); if (interval == 0) break; if (count != 0 && --count == 0) break; pool_list_free(list); (void) sleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { (void) printf(gettext("no pools available\n")); ret = 0; } pool_list_free(list); zprop_free_list(cb.cb_proplist); return (ret); } static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; int c; nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; zpool_boot_label_t boot_type; uint64_t boot_size; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } old_disk = argv[1]; if (argc < 3) { if (!replacing) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } new_disk = old_disk; argc -= 1; argv += 1; } else { new_disk = argv[2]; argc -= 2; argv += 2; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if (zpool_get_config(zhp, NULL) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); return (1); } if (zpool_is_bootable(zhp)) boot_type = ZPOOL_COPY_BOOT_LABEL; else boot_type = ZPOOL_NO_BOOT_LABEL; boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL); nvroot = make_root_vdev(zhp, force, B_FALSE, replacing, B_FALSE, boot_type, boot_size, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); } ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool replace [-f] * * -f Force attach, even if appears to be in use. * * Replace with . */ /* ARGSUSED */ int zpool_do_replace(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); } /* * zpool attach [-f] * * -f Force attach, even if appears to be in use. * * Attach to the mirror containing . If is not * part of a mirror, then will be transformed into a mirror of * and . In either case, will begin life * with a DTL of [0, now], and will immediately begin to resilver itself. */ int zpool_do_attach(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); } /* * zpool detach [-f] * * -f Force detach of , even if DTLs argue against it * (not supported yet) * * Detach a device from a mirror. The operation will be refused if * is the last device in the mirror, or if the DTLs indicate that this device * has the only valid copy of some data. */ /* ARGSUSED */ int zpool_do_detach(int argc, char **argv) { int c; char *poolname, *path; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } poolname = argv[0]; path = argv[1]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_vdev_detach(zhp, path); zpool_close(zhp); return (ret); } /* * zpool split [-n] [-o prop=val] ... * [-o mntopt] ... * [-R altroot] [ ...] * * -n Do not split the pool, but display the resulting layout if * it were to be split. * -o Set property=value, or set mount options. * -R Mount the split-off pool under an alternate root. * * Splits the named pool and gives it the new pool name. Devices to be split * off may be listed, provided that no more than one device is specified * per top-level vdev mirror. The newly split pool is left in an exported * state unless -R is specified. * * Restrictions: the top-level of the pool pool must only be made up of * mirrors; all devices in the pool must be healthy; no device may be * undergoing a resilvering operation. */ int zpool_do_split(int argc, char **argv) { char *srcpool, *newpool, *propval; char *mntopts = NULL; splitflags_t flags; int c, ret = 0; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; flags.dryrun = B_FALSE; flags.import = B_FALSE; /* check options */ while ((c = getopt(argc, argv, ":R:no:")) != -1) { switch (c) { case 'R': flags.import = B_TRUE; if (add_prop_list( zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } break; case 'n': flags.dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } } else { mntopts = optarg; } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); break; } } if (!flags.import && mntopts != NULL) { (void) fprintf(stderr, gettext("setting mntopts is only " "valid when importing the pool\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("Missing new pool name\n")); usage(B_FALSE); } srcpool = argv[0]; newpool = argv[1]; argc -= 2; argv += 2; if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) return (1); config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); if (config == NULL) { ret = 1; } else { if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, B_FALSE); } nvlist_free(config); } zpool_close(zhp); if (ret != 0 || flags.dryrun || !flags.import) return (ret); /* * The split was successful. Now we need to open the new * pool and import it. */ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) return (1); if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && zpool_enable_datasets(zhp, mntopts, 0) != 0) { ret = 1; (void) fprintf(stderr, gettext("Split was successful, but " "the datasets could not all be mounted\n")); (void) fprintf(stderr, gettext("Try doing '%s' with a " "different altroot\n"), "zpool import"); } zpool_close(zhp); return (ret); } /* * zpool online ... */ int zpool_do_online(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; vdev_state_t newstate; int flags = 0; /* check options */ while ((c = getopt(argc, argv, "et")) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; case 't': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), argv[i]); if (newstate == VDEV_STATE_FAULTED) (void) printf(gettext("use 'zpool " "clear' to restore a faulted " "device\n")); else (void) printf(gettext("use 'zpool " "replace' to replace devices " "that are no longer present\n")); } } else { ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool offline [-ft] ... * * -f Force the device into the offline state, even if doing * so would appear to compromise pool availability. * (not supported yet) * * -t Only take the device off-line temporarily. The offline * state will not be persistent across reboots. */ /* ARGSUSED */ int zpool_do_offline(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; boolean_t istmp = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "ft")) != -1) { switch (c) { case 't': istmp = B_TRUE; break; case 'f': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; } zpool_close(zhp); return (ret); } /* * zpool clear [device] * * Clear all errors associated with a pool or a particular device. */ int zpool_do_clear(int argc, char **argv) { int c; int ret = 0; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; /* check options */ while ((c = getopt(argc, argv, "FnX")) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'X': xtreme_rewind = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In future, further rewind policy choices can be passed along here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) { return (1); } pool = argv[0]; device = argc == 2 ? argv[1] : NULL; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { nvlist_free(policy); return (1); } if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); nvlist_free(policy); return (ret); } /* * zpool reguid */ int zpool_do_reguid(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; int ret = 0; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_reguid(zhp); zpool_close(zhp); return (ret); } /* * zpool reopen * * Reopen the pool so that the kernel can update the sizes of all vdevs. */ int zpool_do_reopen(int argc, char **argv) { int c; int ret = 0; zpool_handle_t *zhp; char *pool; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc--; argv++; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) return (1); ret = zpool_reopen(zhp); zpool_close(zhp); return (ret); } typedef struct scrub_cbdata { int cb_type; int cb_argc; char **cb_argv; pool_scrub_cmd_t cb_scrub_cmd; } scrub_cbdata_t; static boolean_t zpool_has_checkpoint(zpool_handle_t *zhp) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); if (config != NULL) { pool_checkpoint_stat_t *pcs = NULL; uint_t c; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs == NULL || pcs->pcs_state == CS_NONE) return (B_FALSE); assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS || pcs->pcs_state == CS_CHECKPOINT_DISCARDING); return (B_TRUE); } return (B_FALSE); } int scrub_callback(zpool_handle_t *zhp, void *data) { scrub_cbdata_t *cb = data; int err; /* * Ignore faulted pools. */ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { (void) fprintf(stderr, gettext("cannot scrub '%s': pool is " "currently unavailable\n"), zpool_get_name(zhp)); return (1); } err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); if (err == 0 && zpool_has_checkpoint(zhp) && cb->cb_type == POOL_SCAN_SCRUB) { (void) printf(gettext("warning: will not scrub state that " "belongs to the checkpoint of pool '%s'\n"), zpool_get_name(zhp)); } return (err != 0); } /* * zpool scrub [-s | -p] ... * * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. */ int zpool_do_scrub(int argc, char **argv) { int c; scrub_cbdata_t cb; cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "sp")) != -1) { switch (c) { case 's': cb.cb_type = POOL_SCAN_NONE; break; case 'p': cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (cb.cb_type == POOL_SCAN_NONE && cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) { (void) fprintf(stderr, gettext("invalid option combination: " "-s and -p are mutually exclusive\n")); usage(B_FALSE); } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); } static void zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) { uint_t children = 0; nvlist_t **child; uint_t i; (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children); if (children == 0) { char *path = zpool_vdev_name(g_zfs, zhp, nvroot, B_FALSE); fnvlist_add_boolean(res, path); free(path); return; } for (i = 0; i < children; i++) { zpool_collect_leaves(zhp, child[i], res); } } /* * zpool initialize [-cs] [ ...] * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool * if none specified. * * -c Cancel. Ends active initializing. * -s Suspend. Initializing can then be restarted with no flags. */ int zpool_do_initialize(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; nvlist_t *vdevs; int err = 0; struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"suspend", no_argument, NULL, 's'}, {0, 0, 0, 0} }; pool_initialize_func_t cmd_type = POOL_INITIALIZE_DO; while ((c = getopt_long(argc, argv, "cs", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_INITIALIZE_DO) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_CANCEL; break; case 's': if (cmd_type != POOL_INITIALIZE_DO) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_SUSPEND; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } poolname = argv[0]; zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); } else { int i; for (i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } err = zpool_initialize(zhp, cmd_type, vdevs); fnvlist_free(vdevs); zpool_close(zhp); return (err); } typedef struct status_cbdata { int cb_count; boolean_t cb_allpools; boolean_t cb_verbose; boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; } status_cbdata_t; /* * Print out detailed scrub status. */ static void print_scan_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t elapsed, mins_left, hours_left; uint64_t pass_exam, examined, total; uint_t rate; double fraction_done; char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; (void) printf(gettext(" scan: ")); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } start = ps->pss_start_time; end = ps->pss_end_time; pause = ps->pss_pass_scrub_pause; zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || ps->pss_func == POOL_SCAN_RESILVER); /* * Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { uint64_t minutes_taken = (end - start) / 60; char *fmt = NULL; if (ps->pss_func == POOL_SCAN_SCRUB) { fmt = gettext("scrub repaired %s in %lluh%um with " "%llu errors on %s"); } else if (ps->pss_func == POOL_SCAN_RESILVER) { fmt = gettext("resilvered %s in %lluh%um with " "%llu errors on %s"); } /* LINTED */ (void) printf(fmt, processed_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), (u_longlong_t)ps->pss_errors, ctime((time_t *)&end)); return; } else if (ps->pss_state == DSS_CANCELED) { if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub canceled on %s"), ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); } return; } assert(ps->pss_state == DSS_SCANNING); /* * Scan is in progress. */ if (ps->pss_func == POOL_SCAN_SCRUB) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else { char buf[32]; struct tm *p = localtime(&pause); (void) strftime(buf, sizeof (buf), "%a %b %e %T %Y", p); (void) printf(gettext("scrub paused since %s\n"), buf); (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } examined = ps->pss_examined ? ps->pss_examined : 1; total = ps->pss_to_examine; fraction_done = (double)examined / total; /* elapsed time for this pass */ elapsed = time(NULL) - ps->pss_pass_start; elapsed -= ps->pss_pass_scrub_spent_paused; elapsed = elapsed ? elapsed : 1; pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1; rate = pass_exam / elapsed; rate = rate ? rate : 1; mins_left = ((total - examined) / rate) / 60; hours_left = mins_left / 60; zfs_nicenum(examined, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); /* * do not print estimated time if hours_left is more than 30 days * or we have a paused scrub */ if (pause == 0) { zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); (void) printf(gettext("\t%s scanned out of %s at %s/s"), examined_buf, total_buf, rate_buf); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); } else { (void) printf(gettext( ", (scan is slow, no estimated time)\n")); } } else { (void) printf(gettext("\t%s scanned out of %s\n"), examined_buf, total_buf); } if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext(" %s resilvered, %.2f%% done\n"), processed_buf, 100 * fraction_done); } else if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext(" %s repaired, %.2f%% done\n"), processed_buf, 100 * fraction_done); } } /* * As we don't scrub checkpointed blocks, we want to warn the * user that we skipped scanning some blocks if a checkpoint exists * or existed at any time during the scan. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) { if (ps == NULL || pcs == NULL) return; if (pcs->pcs_state == CS_NONE || pcs->pcs_state == CS_CHECKPOINT_DISCARDING) return; assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); if (ps->pss_state == DSS_NONE) return; if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && ps->pss_end_time < pcs->pcs_start_time) return; if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { (void) printf(gettext(" scan warning: skipped blocks " "that are only referenced by the checkpoint.\n")); } else { assert(ps->pss_state == DSS_SCANNING); (void) printf(gettext(" scan warning: skipping blocks " "that are only referenced by the checkpoint.\n")); } } /* * Print out detailed removal status. */ static void print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) { char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; time_t start, end; nvlist_t *config, *nvroot; nvlist_t **child; uint_t children; char *vdev_name; if (prs == NULL || prs->prs_state == DSS_NONE) return; /* * Determine name of vdev. */ config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(prs->prs_removing_vdev < children); vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); (void) printf(gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); /* * Removal is finished or canceled. */ if (prs->prs_state == DSS_FINISHED) { uint64_t minutes_taken = (end - start) / 60; (void) printf(gettext("Removal of vdev %llu copied %s " "in %lluh%um, completed on %s"), (longlong_t)prs->prs_removing_vdev, copied_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), ctime((time_t *)&end)); } else if (prs->prs_state == DSS_CANCELED) { (void) printf(gettext("Removal of %s canceled on %s"), vdev_name, ctime(&end)); } else { uint64_t copied, total, elapsed, mins_left, hours_left; double fraction_done; uint_t rate; assert(prs->prs_state == DSS_SCANNING); /* * Removal is in progress. */ (void) printf(gettext( "Evacuation of %s in progress since %s"), vdev_name, ctime(&start)); copied = prs->prs_copied > 0 ? prs->prs_copied : 1; total = prs->prs_to_copy; fraction_done = (double)copied / total; /* elapsed time for this pass */ elapsed = time(NULL) - prs->prs_start_time; elapsed = elapsed > 0 ? elapsed : 1; rate = copied / elapsed; rate = rate > 0 ? rate : 1; mins_left = ((total - copied) / rate) / 60; hours_left = mins_left / 60; zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than * 30 days */ (void) printf(gettext(" %s copied out of %s at %s/s, " "%.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); } else { (void) printf(gettext( ", (copy is slow, no estimated time)\n")); } } if (prs->prs_mapping_memory > 0) { char mem_buf[7]; zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); (void) printf(gettext(" %s memory used for " "removed device mappings\n"), mem_buf); } } static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { time_t start; char space_buf[7]; if (pcs == NULL || pcs->pcs_state == CS_NONE) return; (void) printf(gettext("checkpoint: ")); start = pcs->pcs_start_time; zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { char *date = ctime(&start); /* * ctime() adds a newline at the end of the generated * string, thus the weird format specifier and the * strlen() call used to chop it off from the output. */ (void) printf(gettext("created %.*s, consumes %s\n"), strlen(date) - 1, date, space_buf); return; } assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); (void) printf(gettext("discarding, %s remaining.\n"), space_buf); } static void print_error_log(zpool_handle_t *zhp) { nvlist_t *nverrlist = NULL; nvpair_t *elem; char *pathname; size_t len = MAXPATHLEN * 2; if (zpool_get_errlog(zhp, &nverrlist) != 0) { (void) printf("errors: List of errors unavailable " "(insufficient privileges)\n"); return; } (void) printf("errors: Permanent errors have been " "detected in the following files:\n\n"); pathname = safe_malloc(len); elem = NULL; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nvlist_t *nv; uint64_t dsobj, obj; verify(nvpair_value_nvlist(elem, &nv) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, &dsobj) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); zpool_obj_to_path(zhp, dsobj, obj, pathname, len); (void) printf("%7s %s\n", "", pathname); } free(pathname); nvlist_free(nverrlist); } static void print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares, int namewidth) { uint_t i; char *name; if (nspares == 0) return; (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], B_FALSE); print_status_config(zhp, name, spares[i], namewidth, 2, B_TRUE); free(name); } } static void print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache, int namewidth) { uint_t i; char *name; if (nl2cache == 0) return; (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], B_FALSE); print_status_config(zhp, name, l2cache[i], namewidth, 2, B_FALSE); free(name); } } static void print_dedup_stats(nvlist_t *config) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; /* * If the pool was faulted then we may not have been able to * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); (void) printf(gettext(" dedup: ")); if (ddo->ddo_count == 0) { (void) printf(gettext("no DDT entries\n")); return; } (void) printf("DDT entries %llu, size %llu on disk, %llu in core\n", (u_longlong_t)ddo->ddo_count, (u_longlong_t)ddo->ddo_dspace, (u_longlong_t)ddo->ddo_mspace); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t **)&ddh, &c) == 0); zpool_dump_ddt(dds, ddh); } /* * Display a summary of pool status. Displays a summary such as: * * pool: tank * status: DEGRADED * reason: One or more devices ... * see: http://illumos.org/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK * c2t0d0 UNAVAIL * * When given the '-v' option, we print out the complete config. If the '-e' * option is specified, then we print out error rate information as well. */ int status_callback(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; nvlist_t *config, *nvroot; char *msgid; int reason; const char *health; uint_t c; vdev_stat_t *vs; config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid); cbp->cb_count++; /* * If we were given 'zpool status -x', only report those pools with * problems. */ if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); if (cbp->cb_first) cbp->cb_first = B_FALSE; } return (0); } if (cbp->cb_first) cbp->cb_first = B_FALSE; else (void) printf("\n"); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); (void) printf(gettext(" pool: %s\n"), zpool_get_name(zhp)); (void) printf(gettext(" state: %s\n"), health); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: (void) printf(gettext("status: One or more devices could not " "be opened. Sufficient replicas exist for\n\tthe pool to " "continue functioning in a degraded state.\n")); (void) printf(gettext("action: Attach the missing device and " "online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_MISSING_DEV_NR: (void) printf(gettext("status: One or more devices could not " "be opened. There are insufficient\n\treplicas for the " "pool to continue functioning.\n")); (void) printf(gettext("action: Attach the missing device and " "online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: (void) printf(gettext("status: One or more devices could not " "be used because the label is missing or\n\tinvalid. " "Sufficient replicas exist for the pool to continue\n\t" "functioning in a degraded state.\n")); (void) printf(gettext("action: Replace the device using " "'zpool replace'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_NR: (void) printf(gettext("status: One or more devices could not " "be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_FAILING_DEV: (void) printf(gettext("status: One or more devices has " "experienced an unrecoverable error. An\n\tattempt was " "made to correct the error. Applications are " "unaffected.\n")); (void) printf(gettext("action: Determine if the device needs " "to be replaced, and clear the errors\n\tusing " "'zpool clear' or replace the device with 'zpool " "replace'.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: (void) printf(gettext("status: One or more devices has " "been taken offline by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); (void) printf(gettext("action: Online the device using " "'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_REMOVED_DEV: (void) printf(gettext("status: One or more devices has " "been removed by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); (void) printf(gettext("action: Online the device using " "'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_RESILVERING: (void) printf(gettext("status: One or more devices is " "currently being resilvered. The pool will\n\tcontinue " "to function, possibly in a degraded state.\n")); (void) printf(gettext("action: Wait for the resilver to " "complete.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf(gettext("status: One or more devices has " "experienced an error resulting in data\n\tcorruption. " "Applications may be affected.\n")); (void) printf(gettext("action: Restore the file in question " "if possible. Otherwise restore the\n\tentire pool from " "backup.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: (void) printf(gettext("status: The pool metadata is corrupted " "and the pool cannot be opened.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: (void) printf(gettext("status: The pool is formatted using a " "legacy on-disk format. The pool can\n\tstill be used, " "but some features are unavailable.\n")); (void) printf(gettext("action: Upgrade the pool using 'zpool " "upgrade'. Once this is done, the\n\tpool will no longer " "be accessible on software that does not support feature\n" "\tflags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext("status: The pool has been upgraded to a " "newer, incompatible on-disk version.\n\tThe pool cannot " "be accessed on this system.\n")); (void) printf(gettext("action: Access the pool from a system " "running more recent software, or\n\trestore the pool from " "backup.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: (void) printf(gettext("status: Some supported features are not " "enabled on the pool. The pool can\n\tstill be used, but " "some features are unavailable.\n")); (void) printf(gettext("action: Enable all features using " "'zpool upgrade'. Once this is done,\n\tthe pool may no " "longer be accessible by software that does not support\n\t" "the features. See zpool-features(5) for details.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("status: The pool cannot be accessed on " "this system because it uses the\n\tfollowing feature(s) " "not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); (void) printf(gettext("action: Access the pool from a system " "that supports the required feature(s),\n\tor restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: (void) printf(gettext("status: The pool can only be accessed " "in read-only mode on this system. It\n\tcannot be " "accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); (void) printf(gettext("action: The pool cannot be accessed in " "read-write mode. Import the pool with\n" "\t\"-o readonly=on\", access the pool from a system that " "supports the\n\trequired feature(s), or restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: (void) printf(gettext("status: One or more devices are " "faulted in response to persistent errors.\n\tSufficient " "replicas exist for the pool to continue functioning " "in a\n\tdegraded state.\n")); (void) printf(gettext("action: Replace the faulted device, " "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_NR: (void) printf(gettext("status: One or more devices are " "faulted in response to persistent errors. There are " "insufficient replicas for the pool to\n\tcontinue " "functioning.\n")); (void) printf(gettext("action: Destroy and re-create the pool " "from a backup source. Manually marking the device\n" "\trepaired using 'zpool clear' may allow some data " "to be recovered.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: case ZPOOL_STATUS_IO_FAILURE_CONTINUE: (void) printf(gettext("status: One or more devices are " "faulted in response to IO failures.\n")); (void) printf(gettext("action: Make sure the affected devices " "are connected, then run 'zpool clear'.\n")); break; case ZPOOL_STATUS_BAD_LOG: (void) printf(gettext("status: An intent log record " "could not be read.\n" "\tWaiting for adminstrator intervention to fix the " "faulted pool.\n")); (void) printf(gettext("action: Either restore the affected " "device(s) and run 'zpool online',\n" "\tor ignore the intent log records by running " "'zpool clear'.\n")); break; default: /* * The remaining errors can't actually be generated, yet. */ assert(reason == ZPOOL_STATUS_OK); } if (msgid != NULL) (void) printf(gettext(" see: http://illumos.org/msg/%s\n"), msgid); if (config != NULL) { int namewidth; uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; pool_removal_stat_t *prs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_scan_status(ps); print_checkpoint_scan_warning(ps, pcs); print_removal_status(zhp, prs); print_checkpoint_status(pcs); namewidth = max_width(zhp, nvroot, 0, 0); if (namewidth < 10) namewidth = 10; (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); print_status_config(zhp, zpool_get_name(zhp), nvroot, namewidth, 0, B_FALSE); if (num_logs(nvroot) > 0) print_logs(zhp, nvroot, namewidth, B_TRUE); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, l2cache, nl2cache, namewidth); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) print_spares(zhp, spares, nspares, namewidth); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { nvlist_t *nverrlist = NULL; /* * If the approximate error count is small, get a * precise count by fetching the entire log and * uniquifying the results. */ if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && zpool_get_errlog(zhp, &nverrlist) == 0) { nvpair_t *elem; elem = NULL; nerr = 0; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nerr++; } } nvlist_free(nverrlist); (void) printf("\n"); if (nerr == 0) (void) printf(gettext("errors: No known data " "errors\n")); else if (!cbp->cb_verbose) (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); else print_error_log(zhp); } if (cbp->cb_dedup_stats) print_dedup_stats(config); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); } return (0); } /* * zpool status [-vx] [-T d|u] [pool] ... [interval [count]] * * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) * -T Display a timestamp in date(1) or Unix format * * Describes the health status of all pools or some subset. */ int zpool_do_status(int argc, char **argv) { int c; int ret; unsigned long interval = 0, count = 0; status_cbdata_t cb = { 0 }; /* check options */ while ((c = getopt(argc, argv, "vxDT:")) != -1) { switch (c) { case 'v': cb.cb_verbose = B_TRUE; break; case 'x': cb.cb_explain = B_TRUE; break; case 'D': cb.cb_dedup_stats = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; cb.cb_first = B_TRUE; for (;;) { if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb); if (argc == 0 && cb.cb_count == 0) (void) printf(gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) (void) printf(gettext("all pools are healthy\n")); if (ret != 0) return (ret); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) sleep(interval); } return (0); } typedef struct upgrade_cbdata { int cb_first; int cb_argc; uint64_t cb_version; char **cb_argv; } upgrade_cbdata_t; static int upgrade_version(zpool_handle_t *zhp, uint64_t version) { int ret; nvlist_t *config; uint64_t oldversion; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &oldversion) == 0); assert(SPA_VERSION_IS_SUPPORTED(oldversion)); assert(oldversion < version); ret = zpool_upgrade(zhp, version); if (ret != 0) return (ret); if (version >= SPA_VERSION_FEATURES) { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to feature flags.\n"), zpool_get_name(zhp), oldversion); } else { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to version %llu.\n"), zpool_get_name(zhp), oldversion, version); } return (0); } static int upgrade_enable_all(zpool_handle_t *zhp, int *countp) { int i, ret, count; boolean_t firstff = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); count = 0; for (i = 0; i < SPA_FEATURES; i++) { const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; if (!nvlist_exists(enabled, fguid)) { char *propname; verify(-1 != asprintf(&propname, "feature@%s", fname)); ret = zpool_set_prop(zhp, propname, ZFS_FEATURE_ENABLED); if (ret != 0) { free(propname); return (ret); } count++; if (firstff) { (void) printf(gettext("Enabled the " "following features on '%s':\n"), zpool_get_name(zhp)); firstff = B_FALSE; } (void) printf(gettext(" %s\n"), fname); free(propname); } } if (countp != NULL) *countp = count; return (0); } static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; boolean_t printnl = B_FALSE; int ret; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < cbp->cb_version) { cbp->cb_first = B_FALSE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); printnl = B_TRUE; /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count > 0) { cbp->cb_first = B_FALSE; printnl = B_TRUE; } } if (printnl) { (void) printf(gettext("\n")); } return (0); } static int upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " "formatted with legacy version numbers and can\n" "be upgraded to use feature flags. After " "being upgraded, these pools\nwill no " "longer be accessible by software that does not " "support feature\nflags.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; } (void) printf("%2llu %s\n", (u_longlong_t)version, zpool_get_name(zhp)); } return (0); } static int upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if (version >= SPA_VERSION_FEATURES) { int i; boolean_t poolfirst = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; const char *fname = spa_feature_table[i].fi_uname; if (!nvlist_exists(enabled, fguid)) { if (cbp->cb_first) { (void) printf(gettext("\nSome " "supported features are not " "enabled on the following pools. " "Once a\nfeature is enabled the " "pool may become incompatible with " "software\nthat does not support " "the feature. See " "zpool-features(5) for " "details.\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" "---------\n")); cbp->cb_first = B_FALSE; } if (poolfirst) { (void) printf(gettext("%s\n"), zpool_get_name(zhp)); poolfirst = B_FALSE; } (void) printf(gettext(" %s\n"), fname); } } } return (0); } /* ARGSUSED */ static int upgrade_one(zpool_handle_t *zhp, void *data) { boolean_t printnl = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; if (strcmp("log", zpool_get_name(zhp)) == 0) { (void) printf(gettext("'log' is now a reserved word\n" "Pool 'log' must be renamed using export and import" " to upgrade.\n")); return (1); } cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using more current version '%llu'.\n\n"), zpool_get_name(zhp), cur_version); return (0); } if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using version %llu.\n\n"), zpool_get_name(zhp), cbp->cb_version); return (0); } if (cur_version != cbp->cb_version) { printnl = B_TRUE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count = 0; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count != 0) { printnl = B_TRUE; } else if (cur_version == SPA_VERSION) { (void) printf(gettext("Pool '%s' already has all " "supported features enabled.\n"), zpool_get_name(zhp)); } } if (printnl) { (void) printf(gettext("\n")); } return (0); } /* * zpool upgrade * zpool upgrade -v * zpool upgrade [-V version] <-a | pool ...> * * With no arguments, display downrev'd ZFS pool available for upgrade. * Individual pools can be upgraded by specifying the pool, and '-a' will * upgrade all pools. */ int zpool_do_upgrade(int argc, char **argv) { int c; upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; boolean_t upgradeall = B_FALSE; char *end; /* check options */ while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); if (*end != '\0' || !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; } else if (!upgradeall && argc == 0) { (void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); usage(B_FALSE); } } (void) printf(gettext("This system supports ZFS pool feature " "flags.\n\n")); if (showversions) { int i; (void) printf(gettext("The following features are " "supported:\n\n")); (void) printf(gettext("FEAT DESCRIPTION\n")); (void) printf("----------------------------------------------" "---------------\n"); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; const char *ro = (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? " (read-only compatible)" : ""; (void) printf("%-37s%s\n", fi->fi_uname, ro); (void) printf(" %s\n", fi->fi_desc); } (void) printf("\n"); (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS version\n")); (void) printf(gettext(" 2 Ditto blocks " "(replicated metadata)\n")); (void) printf(gettext(" 3 Hot spares and double parity " "RAID-Z\n")); (void) printf(gettext(" 4 zpool history\n")); (void) printf(gettext(" 5 Compression using the gzip " "algorithm\n")); (void) printf(gettext(" 6 bootfs pool property\n")); (void) printf(gettext(" 7 Separate intent log devices\n")); (void) printf(gettext(" 8 Delegated administration\n")); (void) printf(gettext(" 9 refquota and refreservation " "properties\n")); (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); (void) printf(gettext(" 14 passthrough-x aclinherit\n")); (void) printf(gettext(" 15 user/group space accounting\n")); (void) printf(gettext(" 16 stmf property support\n")); (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); (void) printf(gettext(" 18 Snapshot user holds\n")); (void) printf(gettext(" 19 Log device removal\n")); (void) printf(gettext(" 20 Compression using zle " "(zero-length encoding)\n")); (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext(" 25 Improved scrub stats\n")); (void) printf(gettext(" 26 Improved snapshot deletion " "performance\n")); (void) printf(gettext(" 27 Improved snapshot creation " "performance\n")); (void) printf(gettext(" 28 Multiple vdev replacements\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); } else if (argc == 0 && upgradeall) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); if (ret == 0 && cb.cb_first) { if (cb.cb_version == SPA_VERSION) { (void) printf(gettext("All pools are already " "formatted using feature flags.\n\n")); (void) printf(gettext("Every feature flags " "pool already has all supported features " "enabled.\n")); } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), cb.cb_version); } } } else if (argc == 0) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("All pools are formatted " "using feature flags.\n\n")); } else { (void) printf(gettext("\nUse 'zpool upgrade -v' " "for a list of available legacy versions.\n")); } cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("Every feature flags pool has " "all supported features enabled.\n")); } else { (void) printf(gettext("\n")); } } else { ret = for_each_pool(argc, argv, B_FALSE, NULL, upgrade_one, &cb); } return (ret); } typedef struct hist_cbdata { boolean_t first; boolean_t longfmt; boolean_t internal; } hist_cbdata_t; /* * Print out the command history for a specific pool. */ static int get_history_one(zpool_handle_t *zhp, void *data) { nvlist_t *nvhis; nvlist_t **records; uint_t numrecords; int ret, i; hist_cbdata_t *cb = (hist_cbdata_t *)data; cb->first = B_FALSE; (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); if ((ret = zpool_get_history(zhp, &nvhis)) != 0) return (ret); verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { nvlist_t *rec = records[i]; char tbuf[30] = ""; if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { (void) printf("%s %s", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { int ievent = fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); if (!cb->internal) continue; if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); continue; } (void) printf("%s [internal %s txg:%lld] %s", tbuf, zfs_history_event_names[ievent], fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { if (!cb->internal) continue; (void) printf("%s [txg:%lld] %s", tbuf, fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(rec, ZPOOL_HIST_DSNAME), fnvlist_lookup_uint64(rec, ZPOOL_HIST_DSID)); } (void) printf(" %s", fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { if (!cb->internal) continue; (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", fnvlist_lookup_int64(rec, ZPOOL_HIST_ERRNO)); } } else { if (!cb->internal) continue; (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); } if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); struct passwd *pwd = getpwuid(who); (void) printf("user %d ", (int)who); if (pwd != NULL) (void) printf("(%s) ", pwd->pw_name); } if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { (void) printf("on %s", fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); } if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { (void) printf(":%s", fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } (void) printf("]"); (void) printf("\n"); } (void) printf("\n"); nvlist_free(nvhis); return (ret); } /* * zpool history * * Displays the history of commands that modified pools. */ int zpool_do_history(int argc, char **argv) { hist_cbdata_t cbdata = { 0 }; int ret; int c; cbdata.first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': cbdata.longfmt = B_TRUE; break; case 'i': cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { (void) printf(gettext("no pools available\n")); return (0); } return (ret); } static int get_callback(zpool_handle_t *zhp, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[MAXNAMELEN]; zprop_source_t srctype; zprop_list_t *pl; for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * Skip the special fake placeholder. This will also skip * over the name property when 'all' is specified. */ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL && (zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop))) { srctype = ZPROP_SRC_LOCAL; if (zpool_prop_get_feature(zhp, pl->pl_user_prop, value, sizeof (value)) == 0) { zprop_print_one_property(zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL); } } else { if (zpool_get_prop(zhp, pl->pl_prop, value, sizeof (value), &srctype, cbp->cb_literal) != 0) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, NULL); } } return (0); } /* * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of columns to display. Defaults to * "name,property,value,source". - * -p Diplay values in parsable (exact) format. + * -p Diplay values in parsable (exact) format. * * Get properties of pools in the system. Output space statistics * for each one as well as other attributes. */ int zpool_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; zprop_list_t fake_name = { 0 }; int ret; int c, i; char *value; cb.cb_first = B_TRUE; /* * Set up default columns and sources. */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; /* check options */ while ((c = getopt(argc, argv, ":Hpo:")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'o': bzero(&cb.cb_columns, sizeof (cb.cb_columns)); i = 0; while (*optarg != '\0') { static char *col_subopts[] = { "name", "property", "value", "source", "all", NULL }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } switch (getsubopt(&optarg, col_subopts, &value)) { case 0: cb.cb_columns[i++] = GET_COL_NAME; break; case 1: cb.cb_columns[i++] = GET_COL_PROPERTY; break; case 2: cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: cb.cb_columns[i++] = GET_COL_SOURCE; break; case 4: if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; i = ZFS_GET_NCOLS; break; default: (void) fprintf(stderr, gettext("invalid column name " "'%s'\n"), value); usage(B_FALSE); } } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); argc--; argv++; if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZPOOL_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } typedef struct set_cbdata { char *cb_propname; char *cb_value; boolean_t cb_any_successful; } set_cbdata_t; int set_callback(zpool_handle_t *zhp, void *data) { int error; set_cbdata_t *cb = (set_cbdata_t *)data; error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); if (!error) cb->cb_any_successful = B_TRUE; return (error); } int zpool_do_set(int argc, char **argv) { set_cbdata_t cb = { 0 }; int error; if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing property=value " "argument\n")); usage(B_FALSE); } if (argc < 3) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many pool names\n")); usage(B_FALSE); } cb.cb_propname = argv[1]; cb.cb_value = strchr(cb.cb_propname, '='); if (cb.cb_value == NULL) { (void) fprintf(stderr, gettext("missing value in " "property=value argument\n")); usage(B_FALSE); } *(cb.cb_value) = '\0'; cb.cb_value++; error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, set_callback, &cb); return (error); } static int find_command_idx(char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } int main(int argc, char **argv) { int ret = 0; int i; char *cmdname; (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, gettext("internal error: failed to " "initialize ZFS library\n")); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * Special case '-?' */ if (strcmp(cmdname, "-?") == 0) usage(B_TRUE); zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Run the appropriate command. */ if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, argv + 1); } else if (strchr(cmdname, '=')) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, argv); } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { /* * 'freeze' is a vile debugging abomination, so we treat * it as such. */ char buf[16384]; int fd = open(ZFS_DEV, O_RDWR); (void) strcpy((void *)buf, argv[2]); return (!!ioctl(fd, ZFS_IOC_POOL_FREEZE, buf)); } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); } if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } Index: vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c =================================================================== --- vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c (revision 342531) +++ vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c (revision 342532) @@ -1,4425 +1,4436 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_comutil.h" #include "zfeature_common.h" static int read_efi_label(nvlist_t *, diskaddr_t *, boolean_t *); static boolean_t zpool_vdev_is_interior(const char *name); #define BACKUP_SLICE "s2" typedef struct prop_flags { int create:1; /* Validate property on creation */ int import:1; /* Validate property on import */ } prop_flags_t; /* * ==================================================================== * zpool property functions * ==================================================================== */ static int zpool_get_all_props(zpool_handle_t *zhp) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) return (-1); while (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { if (errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { zcmd_free_nvlists(&zc); return (-1); } } else { zcmd_free_nvlists(&zc); return (-1); } } if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) { zcmd_free_nvlists(&zc); return (-1); } zcmd_free_nvlists(&zc); return (0); } static int zpool_props_refresh(zpool_handle_t *zhp) { nvlist_t *old_props; old_props = zhp->zpool_props; if (zpool_get_all_props(zhp) != 0) return (-1); nvlist_free(old_props); return (0); } static char * zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) { nvlist_t *nv, *nvl; uint64_t ival; char *value; zprop_source_t source; nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0); source = ival; verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); } else { source = ZPROP_SRC_DEFAULT; if ((value = (char *)zpool_prop_default_string(prop)) == NULL) value = "-"; } if (src) *src = source; return (value); } uint64_t zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) { nvlist_t *nv, *nvl; uint64_t value; zprop_source_t source; if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) { /* * zpool_get_all_props() has most likely failed because * the pool is faulted, but if all we need is the top level * vdev's guid then get it from the zhp config nvlist. */ if ((prop == ZPOOL_PROP_GUID) && (nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) && (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value) == 0)) { return (value); } return (zpool_prop_default_numeric(prop)); } nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &value) == 0); source = value; verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); } else { source = ZPROP_SRC_DEFAULT; value = zpool_prop_default_numeric(prop); } if (src) *src = source; return (value); } /* * Map VDEV STATE to printed strings. */ const char * zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) { switch (state) { case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: return (gettext("OFFLINE")); case VDEV_STATE_REMOVED: return (gettext("REMOVED")); case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return (gettext("FAULTED")); else if (aux == VDEV_AUX_SPLIT_POOL) return (gettext("SPLIT")); else return (gettext("UNAVAIL")); case VDEV_STATE_FAULTED: return (gettext("FAULTED")); case VDEV_STATE_DEGRADED: return (gettext("DEGRADED")); case VDEV_STATE_HEALTHY: return (gettext("ONLINE")); default: break; } return (gettext("UNKNOWN")); } /* * Map POOL STATE to printed strings. */ const char * zpool_pool_state_to_name(pool_state_t state) { switch (state) { case POOL_STATE_ACTIVE: return (gettext("ACTIVE")); case POOL_STATE_EXPORTED: return (gettext("EXPORTED")); case POOL_STATE_DESTROYED: return (gettext("DESTROYED")); case POOL_STATE_SPARE: return (gettext("SPARE")); case POOL_STATE_L2CACHE: return (gettext("L2CACHE")); case POOL_STATE_UNINITIALIZED: return (gettext("UNINITIALIZED")); case POOL_STATE_UNAVAIL: return (gettext("UNAVAIL")); case POOL_STATE_POTENTIALLY_ACTIVE: return (gettext("POTENTIALLY_ACTIVE")); } return (gettext("UNKNOWN")); } /* * Get a zpool property value for 'prop' and return the value in * a pre-allocated buffer. */ int zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, zprop_source_t *srctype, boolean_t literal) { uint64_t intval; const char *strval; zprop_source_t src = ZPROP_SRC_NONE; nvlist_t *nvroot; vdev_stat_t *vs; uint_t vsc; if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { switch (prop) { case ZPOOL_PROP_NAME: (void) strlcpy(buf, zpool_get_name(zhp), len); break; case ZPOOL_PROP_HEALTH: (void) strlcpy(buf, "FAULTED", len); break; case ZPOOL_PROP_GUID: intval = zpool_get_prop_int(zhp, prop, &src); (void) snprintf(buf, len, "%llu", intval); break; case ZPOOL_PROP_ALTROOT: case ZPOOL_PROP_CACHEFILE: case ZPOOL_PROP_COMMENT: if (zhp->zpool_props != NULL || zpool_get_all_props(zhp) == 0) { (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), len); break; } /* FALLTHROUGH */ default: (void) strlcpy(buf, "-", len); break; } if (srctype != NULL) *srctype = src; return (0); } if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) && prop != ZPOOL_PROP_NAME) return (-1); switch (zpool_prop_get_type(prop)) { case PROP_TYPE_STRING: (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), len); break; case PROP_TYPE_NUMBER: intval = zpool_get_prop_int(zhp, prop, &src); switch (prop) { case ZPOOL_PROP_SIZE: case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_FREE: case ZPOOL_PROP_FREEING: case ZPOOL_PROP_LEAKED: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicenum(intval, buf, len); } break; case ZPOOL_PROP_BOOTSIZE: case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: if (intval == 0) { (void) strlcpy(buf, "-", len); } else if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicenum(intval, buf, len); } break; case ZPOOL_PROP_CAPACITY: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case ZPOOL_PROP_FRAGMENTATION: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case ZPOOL_PROP_DEDUPRATIO: (void) snprintf(buf, len, "%llu.%02llux", (u_longlong_t)(intval / 100), (u_longlong_t)(intval % 100)); break; case ZPOOL_PROP_HEALTH: verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); (void) strlcpy(buf, zpool_state_to_name(intval, vs->vs_aux), len); break; case ZPOOL_PROP_VERSION: if (intval >= SPA_VERSION_FEATURES) { (void) snprintf(buf, len, "-"); break; } /* FALLTHROUGH */ default: (void) snprintf(buf, len, "%llu", intval); } break; case PROP_TYPE_INDEX: intval = zpool_get_prop_int(zhp, prop, &src); if (zpool_prop_index_to_string(prop, intval, &strval) != 0) return (-1); (void) strlcpy(buf, strval, len); break; default: abort(); } if (srctype) *srctype = src; return (0); } /* * Check if the bootfs name has the same pool name as it is set to. * Assuming bootfs is a valid dataset name. */ static boolean_t bootfs_name_valid(const char *pool, char *bootfs) { int len = strlen(pool); if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT)) return (B_FALSE); if (strncmp(pool, bootfs, len) == 0 && (bootfs[len] == '/' || bootfs[len] == '\0')) return (B_TRUE); return (B_FALSE); } boolean_t zpool_is_bootable(zpool_handle_t *zhp) { char bootfs[ZFS_MAX_DATASET_NAME_LEN]; return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs, sizeof (bootfs), NULL, B_FALSE) == 0 && strncmp(bootfs, "-", sizeof (bootfs)) != 0); } /* * Given an nvlist of zpool properties to be set, validate that they are * correct, and parse any numeric properties (index, boolean, etc) if they are * specified as strings. */ static nvlist_t * zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf) { nvpair_t *elem; nvlist_t *retprops; zpool_prop_t prop; char *strval; uint64_t intval; char *slash, *check; struct stat64 statbuf; zpool_handle_t *zhp; if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { const char *propname = nvpair_name(elem); prop = zpool_name_to_prop(propname); if (prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname)) { int err; char *fname = strchr(propname, '@') + 1; err = zfeature_lookup_name(fname, NULL); if (err != 0) { ASSERT3U(err, ==, ENOENT); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid feature '%s'"), fname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set to " "'enabled'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvlist_add_uint64(retprops, propname, 0) != 0) { (void) no_memory(hdl); goto error; } continue; } /* * Make sure this property is valid and applies to this type. */ if (prop == ZPOOL_PROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zpool_prop_readonly(prop)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops, &strval, &intval, errbuf) != 0) goto error; /* * Perform additional checking for specific properties. */ switch (prop) { case ZPOOL_PROP_VERSION: if (intval < version || !SPA_VERSION_IS_SUPPORTED(intval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' number %d is invalid."), propname, intval); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); goto error; } break; case ZPOOL_PROP_BOOTSIZE: if (!flags.create) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set during pool " "creation"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_BOOTFS: if (flags.create || flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' cannot be set at creation " "or import time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (version < SPA_VERSION_BOOTFS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to support " "'%s' property"), propname); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); goto error; } /* * bootfs property value has to be a dataset name and * the dataset has to be in the same pool as it sets to. */ if (strval[0] != '\0' && !bootfs_name_valid(poolname, strval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is an invalid name"), strval); (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto error; } if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "could not open pool '%s'"), poolname); (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); goto error; } zpool_close(zhp); break; case ZPOOL_PROP_ALTROOT: if (!flags.create && !flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set during pool " "creation or import"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strval[0] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bad alternate root '%s'"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } break; case ZPOOL_PROP_CACHEFILE: if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' must be empty, an " "absolute path, or 'none'"), propname); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } slash = strrchr(strval, '/'); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is not a valid file"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } *slash = '\0'; if (strval[0] != '\0' && (stat64(strval, &statbuf) != 0 || !S_ISDIR(statbuf.st_mode))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is not a valid directory"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } *slash = '/'; break; case ZPOOL_PROP_COMMENT: for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "comment may only have printable " "characters")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } if (strlen(strval) > ZPROP_MAX_COMMENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "comment must not exceed %d characters"), ZPROP_MAX_COMMENT); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; + case ZPOOL_PROP_READONLY: if (!flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set at " "import time"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + + case ZPOOL_PROP_TNAME: + if (!flags.create) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set at " + "creation time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s'(%d) not defined"), propname, prop); break; } } return (retprops); error: nvlist_free(retprops); return (NULL); } /* * Set zpool property : propname=propval. */ int zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) { zfs_cmd_t zc = { 0 }; int ret = -1; char errbuf[1024]; nvlist_t *nvl = NULL; nvlist_t *realprops; uint64_t version; prop_flags_t flags = { 0 }; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zpool_name); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); if (nvlist_add_string(nvl, propname, propval) != 0) { nvlist_free(nvl); return (no_memory(zhp->zpool_hdl)); } version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) { nvlist_free(nvl); return (-1); } nvlist_free(nvl); nvl = realprops; /* * Execute the corresponding ioctl() to set this property. */ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl) != 0) { nvlist_free(nvl); return (-1); } ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc); zcmd_free_nvlists(&zc); nvlist_free(nvl); if (ret) (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); else (void) zpool_props_refresh(zhp); return (ret); } int zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) { libzfs_handle_t *hdl = zhp->zpool_hdl; zprop_list_t *entry; char buf[ZFS_MAXPROPLEN]; nvlist_t *features = NULL; zprop_list_t **last; boolean_t firstexpand = (NULL == *plp); if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0) return (-1); last = plp; while (*last != NULL) last = &(*last)->pl_next; if ((*plp)->pl_all) features = zpool_get_features(zhp); if ((*plp)->pl_all && firstexpand) { for (int i = 0; i < SPA_FEATURES; i++) { zprop_list_t *entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_INVAL; entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s", spa_feature_table[i].fi_uname); entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } } /* add any unsupported features */ for (nvpair_t *nvp = nvlist_next_nvpair(features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) { char *propname; boolean_t found; zprop_list_t *entry; if (zfeature_is_supported(nvpair_name(nvp))) continue; propname = zfs_asprintf(hdl, "unsupported@%s", nvpair_name(nvp)); /* * Before adding the property to the list make sure that no * other pool already added the same property. */ found = B_FALSE; entry = *plp; while (entry != NULL) { if (entry->pl_user_prop != NULL && strcmp(propname, entry->pl_user_prop) == 0) { found = B_TRUE; break; } entry = entry->pl_next; } if (found) { free(propname); continue; } entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_INVAL; entry->pl_user_prop = propname; entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed) continue; if (entry->pl_prop != ZPROP_INVAL && zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf), NULL, B_FALSE) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } } return (0); } /* * Get the state for the given feature on the given ZFS pool. */ int zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf, size_t len) { uint64_t refcount; boolean_t found = B_FALSE; nvlist_t *features = zpool_get_features(zhp); boolean_t supported; const char *feature = strchr(propname, '@') + 1; supported = zpool_prop_feature(propname); ASSERT(supported || zpool_prop_unsupported(propname)); /* * Convert from feature name to feature guid. This conversion is * unecessary for unsupported@... properties because they already * use guids. */ if (supported) { int ret; spa_feature_t fid; ret = zfeature_lookup_name(feature, &fid); if (ret != 0) { (void) strlcpy(buf, "-", len); return (ENOTSUP); } feature = spa_feature_table[fid].fi_guid; } if (nvlist_lookup_uint64(features, feature, &refcount) == 0) found = B_TRUE; if (supported) { if (!found) { (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len); } else { if (refcount == 0) (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len); else (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len); } } else { if (found) { if (refcount == 0) { (void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE); } else { (void) strcpy(buf, ZFS_UNSUPPORTED_READONLY); } } else { (void) strlcpy(buf, "-", len); return (ENOTSUP); } } return (0); } /* * Don't start the slice at the default block of 34; many storage * devices will use a stripe width of 128k, so start there instead. */ #define NEW_START_BLOCK 256 /* * Validate the given pool name, optionally putting an extended error message in * 'buf'. */ boolean_t zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) { namecheck_err_t why; char what; int ret; ret = pool_namecheck(pool, &why, &what); /* * The rules for reserved pool names were extended at a later point. * But we need to support users with existing pools that may now be * invalid. So we only check for this expanded set of names during a * create (or import), and only in userland. */ if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); return (B_FALSE); } if (ret != 0) { if (hdl != NULL) { switch (why) { case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is too long")); break; case NAME_ERR_INVALCHAR: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character " "'%c' in pool name"), what); break; case NAME_ERR_NOLETTER: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name must begin with a letter")); break; case NAME_ERR_RESERVED: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); break; case NAME_ERR_DISKLIKE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool name is reserved")); break; case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "leading slash in name")); break; case NAME_ERR_EMPTY_COMPONENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty component in name")); break; case NAME_ERR_TRAILING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "trailing slash in name")); break; case NAME_ERR_MULTIPLE_DELIMITERS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple '@' and/or '#' delimiters in " "name")); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } } return (B_FALSE); } return (B_TRUE); } /* * Open a handle to the given pool, even if the pool is currently in the FAULTED * state. */ zpool_handle_t * zpool_open_canfail(libzfs_handle_t *hdl, const char *pool) { zpool_handle_t *zhp; boolean_t missing; /* * Make sure the pool name is valid. */ if (!zpool_name_valid(hdl, B_TRUE, pool)) { (void) zfs_error_fmt(hdl, EZFS_INVALIDNAME, dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); return (NULL); } if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL) return (NULL); zhp->zpool_hdl = hdl; (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); if (zpool_refresh_stats(zhp, &missing) != 0) { zpool_close(zhp); return (NULL); } if (missing) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool")); (void) zfs_error_fmt(hdl, EZFS_NOENT, dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); zpool_close(zhp); return (NULL); } return (zhp); } /* * Like the above, but silent on error. Used when iterating over pools (because * the configuration cache may be out of date). */ int zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret) { zpool_handle_t *zhp; boolean_t missing; if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL) return (-1); zhp->zpool_hdl = hdl; (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); if (zpool_refresh_stats(zhp, &missing) != 0) { zpool_close(zhp); return (-1); } if (missing) { zpool_close(zhp); *ret = NULL; return (0); } *ret = zhp; return (0); } /* * Similar to zpool_open_canfail(), but refuses to open pools in the faulted * state. */ zpool_handle_t * zpool_open(libzfs_handle_t *hdl, const char *pool) { zpool_handle_t *zhp; if ((zhp = zpool_open_canfail(hdl, pool)) == NULL) return (NULL); if (zhp->zpool_state == POOL_STATE_UNAVAIL) { (void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name); zpool_close(zhp); return (NULL); } return (zhp); } /* * Close the handle. Simply frees the memory associated with the handle. */ void zpool_close(zpool_handle_t *zhp) { nvlist_free(zhp->zpool_config); nvlist_free(zhp->zpool_old_config); nvlist_free(zhp->zpool_props); free(zhp); } /* * Return the name of the pool. */ const char * zpool_get_name(zpool_handle_t *zhp) { return (zhp->zpool_name); } /* * Return the state of the pool (ACTIVE or UNAVAILABLE) */ int zpool_get_state(zpool_handle_t *zhp) { return (zhp->zpool_state); } /* * Create the named pool, using the provided vdev list. It is assumed * that the consumer has already validated the contents of the nvlist, so we * don't have to worry about error semantics. */ int zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *fsprops) { zfs_cmd_t zc = { 0 }; nvlist_t *zc_fsprops = NULL; nvlist_t *zc_props = NULL; char msg[1024]; int ret = -1; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot create '%s'"), pool); if (!zpool_name_valid(hdl, B_FALSE, pool)) return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) return (-1); if (props) { prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE }; if ((zc_props = zpool_valid_proplist(hdl, pool, props, SPA_VERSION_1, flags, msg)) == NULL) { goto create_failed; } } if (fsprops) { uint64_t zoned; char *zonestr; zoned = ((nvlist_lookup_string(fsprops, zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) && strcmp(zonestr, "on") == 0); if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM, fsprops, zoned, NULL, NULL, msg)) == NULL) { goto create_failed; } if (!zc_props && (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) { goto create_failed; } if (nvlist_add_nvlist(zc_props, ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) { goto create_failed; } } if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) goto create_failed; (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) { zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(zc_fsprops); switch (errno) { case EBUSY: /* * This can happen if the user has specified the same * device multiple times. We can't reliably detect this * until we try to add it and see we already have a * label. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more vdevs refer to the same device")); return (zfs_error(hdl, EZFS_BADDEV, msg)); case ERANGE: /* * This happens if the record size is smaller or larger * than the allowed size range, or not a power of 2. * * NOTE: although zfs_valid_proplist is called earlier, * this case may have slipped through since the * pool does not exist yet and it is therefore * impossible to read properties e.g. max blocksize * from the pool. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "record size invalid")); return (zfs_error(hdl, EZFS_BADPROP, msg)); case EOVERFLOW: /* * This occurs when one of the devices is below * SPA_MINDEVSIZE. Unfortunately, we can't detect which * device was the problem device since there's no * reliable way to determine device size from userland. */ { char buf[64]; zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is less than the " "minimum size (%s)"), buf); } return (zfs_error(hdl, EZFS_BADDEV, msg)); case ENOSPC: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is out of space")); return (zfs_error(hdl, EZFS_BADDEV, msg)); case ENOTBLK: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cache device must be a disk or disk slice")); return (zfs_error(hdl, EZFS_BADDEV, msg)); default: return (zpool_standard_error(hdl, errno, msg)); } } create_failed: zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(zc_fsprops); return (ret); } /* * Destroy the given pool. It is up to the caller to ensure that there are no * datasets left in the pool. */ int zpool_destroy(zpool_handle_t *zhp, const char *log_str) { zfs_cmd_t zc = { 0 }; zfs_handle_t *zfp = NULL; libzfs_handle_t *hdl = zhp->zpool_hdl; char msg[1024]; if (zhp->zpool_state == POOL_STATE_ACTIVE && (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL) return (-1); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zpool_name); if (errno == EROFS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is read only")); (void) zfs_error(hdl, EZFS_BADDEV, msg); } else { (void) zpool_standard_error(hdl, errno, msg); } if (zfp) zfs_close(zfp); return (-1); } if (zfp) { remove_mountpoint(zfp); zfs_close(zfp); } return (0); } /* * Create a checkpoint in the given pool. */ int zpool_checkpoint(zpool_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zpool_hdl; char msg[1024]; int error; error = lzc_pool_checkpoint(zhp->zpool_name); if (error != 0) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot checkpoint '%s'"), zhp->zpool_name); (void) zpool_standard_error(hdl, error, msg); return (-1); } return (0); } /* * Discard the checkpoint from the given pool. */ int zpool_discard_checkpoint(zpool_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zpool_hdl; char msg[1024]; int error; error = lzc_pool_checkpoint_discard(zhp->zpool_name); if (error != 0) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot discard checkpoint in '%s'"), zhp->zpool_name); (void) zpool_standard_error(hdl, error, msg); return (-1); } return (0); } /* * Add the given vdevs to the pool. The caller must have already performed the * necessary verification to ensure that the vdev specification is well-formed. */ int zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) { zfs_cmd_t zc = { 0 }; int ret; libzfs_handle_t *hdl = zhp->zpool_hdl; char msg[1024]; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot add to '%s'"), zhp->zpool_name); if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_SPARES && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " "upgraded to add hot spares")); return (zfs_error(hdl, EZFS_BADVERSION, msg)); } if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_L2CACHE && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " "upgraded to add cache devices")); return (zfs_error(hdl, EZFS_BADVERSION, msg)); } if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) return (-1); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { switch (errno) { case EBUSY: /* * This can happen if the user has specified the same * device multiple times. We can't reliably detect this * until we try to add it and see we already have a * label. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more vdevs refer to the same device")); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case EINVAL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; a pool with removing/removed " "vdevs does not support adding raidz vdevs")); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case EOVERFLOW: /* * This occurrs when one of the devices is below * SPA_MINDEVSIZE. Unfortunately, we can't detect which * device was the problem device since there's no * reliable way to determine device size from userland. */ { char buf[64]; zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device is less than the minimum " "size (%s)"), buf); } (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to add these vdevs")); (void) zfs_error(hdl, EZFS_BADVERSION, msg); break; case EDOM: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "root pool can not have multiple vdevs" " or separate logs")); (void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg); break; case ENOTBLK: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cache device must be a disk or disk slice")); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; default: (void) zpool_standard_error(hdl, errno, msg); } ret = -1; } else { ret = 0; } zcmd_free_nvlists(&zc); return (ret); } /* * Exports the pool from the system. The caller must ensure that there are no * mounted datasets in the pool. */ static int zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, const char *log_str) { zfs_cmd_t zc = { 0 }; char msg[1024]; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot export '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = force; zc.zc_guid = hardforce; zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { switch (errno) { case EXDEV: zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, "use '-f' to override the following errors:\n" "'%s' has an active shared spare which could be" " used by other pools once '%s' is exported."), zhp->zpool_name, zhp->zpool_name); return (zfs_error(zhp->zpool_hdl, EZFS_ACTIVE_SPARE, msg)); default: return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, msg)); } } return (0); } int zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) { return (zpool_export_common(zhp, force, B_FALSE, log_str)); } int zpool_export_force(zpool_handle_t *zhp, const char *log_str) { return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str)); } static void zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, nvlist_t *config) { nvlist_t *nv = NULL; uint64_t rewindto; int64_t loss = -1; struct tm t; char timestr[128]; if (!hdl->libzfs_printerr || config == NULL) return; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) { return; } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) return; (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); if (localtime_r((time_t *)&rewindto, &t) != NULL && strftime(timestr, 128, 0, &t) != 0) { if (dryrun) { (void) printf(dgettext(TEXT_DOMAIN, "Would be able to return %s " "to its state as of %s.\n"), name, timestr); } else { (void) printf(dgettext(TEXT_DOMAIN, "Pool %s returned to its state as of %s.\n"), name, timestr); } if (loss > 120) { (void) printf(dgettext(TEXT_DOMAIN, "%s approximately %lld "), dryrun ? "Would discard" : "Discarded", (loss + 30) / 60); (void) printf(dgettext(TEXT_DOMAIN, "minutes of transactions.\n")); } else if (loss > 0) { (void) printf(dgettext(TEXT_DOMAIN, "%s approximately %lld "), dryrun ? "Would discard" : "Discarded", loss); (void) printf(dgettext(TEXT_DOMAIN, "seconds of transactions.\n")); } } } void zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, nvlist_t *config) { nvlist_t *nv = NULL; int64_t loss = -1; uint64_t edata = UINT64_MAX; uint64_t rewindto; struct tm t; char timestr[128]; if (!hdl->libzfs_printerr) return; if (reason >= 0) (void) printf(dgettext(TEXT_DOMAIN, "action: ")); else (void) printf(dgettext(TEXT_DOMAIN, "\t")); /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 || nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) goto no_info; (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS, &edata); (void) printf(dgettext(TEXT_DOMAIN, "Recovery is possible, but will result in some data loss.\n")); if (localtime_r((time_t *)&rewindto, &t) != NULL && strftime(timestr, 128, 0, &t) != 0) { (void) printf(dgettext(TEXT_DOMAIN, "\tReturning the pool to its state as of %s\n" "\tshould correct the problem. "), timestr); } else { (void) printf(dgettext(TEXT_DOMAIN, "\tReverting the pool to an earlier state " "should correct the problem.\n\t")); } if (loss > 120) { (void) printf(dgettext(TEXT_DOMAIN, "Approximately %lld minutes of data\n" "\tmust be discarded, irreversibly. "), (loss + 30) / 60); } else if (loss > 0) { (void) printf(dgettext(TEXT_DOMAIN, "Approximately %lld seconds of data\n" "\tmust be discarded, irreversibly. "), loss); } if (edata != 0 && edata != UINT64_MAX) { if (edata == 1) { (void) printf(dgettext(TEXT_DOMAIN, "After rewind, at least\n" "\tone persistent user-data error will remain. ")); } else { (void) printf(dgettext(TEXT_DOMAIN, "After rewind, several\n" "\tpersistent user-data errors will remain. ")); } } (void) printf(dgettext(TEXT_DOMAIN, "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "), reason >= 0 ? "clear" : "import", name); (void) printf(dgettext(TEXT_DOMAIN, "A scrub of the pool\n" "\tis strongly recommended after recovery.\n")); return; no_info: (void) printf(dgettext(TEXT_DOMAIN, "Destroy and re-create the pool from\n\ta backup source.\n")); } /* * zpool_import() is a contracted interface. Should be kept the same * if possible. * * Applications should use zpool_import_props() to import a pool with * new properties value to be set. */ int zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, char *altroot) { nvlist_t *props = NULL; int ret; if (altroot != NULL) { if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { return (zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); } if (nvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 || nvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) { nvlist_free(props); return (zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); } } ret = zpool_import_props(hdl, config, newname, props, ZFS_IMPORT_NORMAL); nvlist_free(props); return (ret); } static void print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; char *vname; uint64_t is_log = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); if (name != NULL) (void) printf("\t%*s%s%s\n", indent, "", name, is_log ? " [log]" : ""); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE); print_vdev_tree(hdl, vname, child[c], indent + 2); free(vname); } } void zpool_print_unsup_feat(nvlist_t *config) { nvlist_t *nvinfo, *unsup_feat; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0); verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT, &unsup_feat) == 0); for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL; nvp = nvlist_next_nvpair(unsup_feat, nvp)) { char *desc; verify(nvpair_type(nvp) == DATA_TYPE_STRING); verify(nvpair_value_string(nvp, &desc) == 0); if (strlen(desc) > 0) (void) printf("\t%s (%s)\n", nvpair_name(nvp), desc); else (void) printf("\t%s\n", nvpair_name(nvp)); } } /* * Import the given pool using the known configuration and a list of * properties to be set. The configuration should have come from * zpool_find_import(). The 'newname' parameters control whether the pool * is imported with a different name. */ int zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, nvlist_t *props, int flags) { zfs_cmd_t zc = { 0 }; zpool_load_policy_t policy; nvlist_t *nv = NULL; nvlist_t *nvinfo = NULL; nvlist_t *missing = NULL; char *thename; char *origname; int ret; int error = 0; char errbuf[1024]; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &origname) == 0); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot import pool '%s'"), origname); if (newname != NULL) { if (!zpool_name_valid(hdl, B_FALSE, newname)) return (zfs_error_fmt(hdl, EZFS_INVALIDNAME, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); thename = (char *)newname; } else { thename = origname; } if (props != NULL) { uint64_t version; prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if ((props = zpool_valid_proplist(hdl, origname, props, version, flags, errbuf)) == NULL) return (-1); if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { nvlist_free(props); return (-1); } nvlist_free(props); } (void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name)); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &zc.zc_guid) == 0); if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) { zcmd_free_nvlists(&zc); return (-1); } if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) { zcmd_free_nvlists(&zc); return (-1); } zc.zc_cookie = flags; while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 && errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { zcmd_free_nvlists(&zc); return (-1); } } if (ret != 0) error = errno; (void) zcmd_read_dst_nvlist(hdl, &zc, &nv); zcmd_free_nvlists(&zc); zpool_get_load_policy(config, &policy); if (error) { char desc[1024]; /* * Dry-run failed, but we print out what success * looks like if we found a best txg */ if (policy.zlp_rewind & ZPOOL_TRY_REWIND) { zpool_rewind_exclaim(hdl, newname ? origname : thename, B_TRUE, nv); nvlist_free(nv); return (-1); } if (newname == NULL) (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s'"), thename); else (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"), origname, thename); switch (error) { case ENOTSUP: if (nv != NULL && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) { (void) printf(dgettext(TEXT_DOMAIN, "This " "pool uses the following feature(s) not " "supported by this system:\n")); zpool_print_unsup_feat(nv); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY)) { (void) printf(dgettext(TEXT_DOMAIN, "All unsupported features are only " "required for writing to the pool." "\nThe pool can be imported using " "'-o readonly=on'.\n")); } } /* * Unsupported version. */ (void) zfs_error(hdl, EZFS_BADVERSION, desc); break; case EINVAL: (void) zfs_error(hdl, EZFS_INVALCONFIG, desc); break; case EROFS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is read only")); (void) zfs_error(hdl, EZFS_BADDEV, desc); break; case ENXIO: if (nv && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) { (void) printf(dgettext(TEXT_DOMAIN, "The devices below are missing or " "corrupted, use '-m' to import the pool " "anyway:\n")); print_vdev_tree(hdl, NULL, missing, 2); (void) printf("\n"); } (void) zpool_standard_error(hdl, error, desc); break; case EEXIST: (void) zpool_standard_error(hdl, error, desc); break; case ENAMETOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new name of at least one dataset is longer than " "the maximum allowable length")); (void) zfs_error(hdl, EZFS_NAMETOOLONG, desc); break; default: (void) zpool_standard_error(hdl, error, desc); zpool_explain_recover(hdl, newname ? origname : thename, -error, nv); break; } nvlist_free(nv); ret = -1; } else { zpool_handle_t *zhp; /* * This should never fail, but play it safe anyway. */ if (zpool_open_silent(hdl, thename, &zhp) != 0) ret = -1; else if (zhp != NULL) zpool_close(zhp); if (policy.zlp_rewind & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { zpool_rewind_exclaim(hdl, newname ? origname : thename, ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nv); } nvlist_free(nv); return (0); } return (ret); } /* * Scan the pool. */ int zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) { zfs_cmd_t zc = { 0 }; char msg[1024]; int err; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = func; zc.zc_flags = cmd; if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0) return (0); err = errno; /* ECANCELED on a scrub means we resumed a paused scrub */ if (err == ECANCELED && func == POOL_SCAN_SCRUB && cmd == POOL_SCRUB_NORMAL) return (0); if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_NORMAL) return (0); if (func == POOL_SCAN_SCRUB) { if (cmd == POOL_SCRUB_PAUSE) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot pause scrubbing %s"), zc.zc_name); } else { assert(cmd == POOL_SCRUB_NORMAL); (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name); } } else if (func == POOL_SCAN_NONE) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"), zc.zc_name); } else { assert(!"unexpected result"); } if (err == EBUSY) { nvlist_t *nvroot; pool_scan_stat_t *ps = NULL; uint_t psc; verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); if (ps && ps->pss_func == POOL_SCAN_SCRUB) { if (cmd == POOL_SCRUB_PAUSE) return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg)); else return (zfs_error(hdl, EZFS_SCRUBBING, msg)); } else { return (zfs_error(hdl, EZFS_RESILVERING, msg)); } } else if (err == ENOENT) { return (zfs_error(hdl, EZFS_NO_SCRUB, msg)); } else { return (zpool_standard_error(hdl, err, msg)); } } static int xlate_init_err(int err) { switch (err) { case ENODEV: return (EZFS_NODEVICE); case EINVAL: case EROFS: return (EZFS_BADDEV); case EBUSY: return (EZFS_INITIALIZING); case ESRCH: return (EZFS_NO_INITIALIZE); } return (err); } /* * Begin, suspend, or cancel the initialization (initializing of all free * blocks) for the given vdevs in the given pool. */ int zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds) { char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *errlist; /* translate vdev names to guids */ nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *guids_to_paths = fnvlist_alloc(); boolean_t spare, cache; nvlist_t *tgt; nvpair_t *elem; for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL; elem = nvlist_next_nvpair(vds, elem)) { char *vd_path = nvpair_name(elem); tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL); if ((tgt == NULL) || cache || spare) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot initialize '%s'"), vd_path); int err = (tgt == NULL) ? EZFS_NODEVICE : (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE); fnvlist_free(vdev_guids); fnvlist_free(guids_to_paths); return (zfs_error(hdl, err, msg)); } uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); fnvlist_add_uint64(vdev_guids, vd_path, guid); (void) snprintf(msg, sizeof (msg), "%llu", guid); fnvlist_add_string(guids_to_paths, msg, vd_path); } int err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids, &errlist); fnvlist_free(vdev_guids); if (err == 0) { fnvlist_free(guids_to_paths); return (0); } nvlist_t *vd_errlist = NULL; if (errlist != NULL) { vd_errlist = fnvlist_lookup_nvlist(errlist, ZPOOL_INITIALIZE_VDEVS); } (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "operation failed")); for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL; elem = nvlist_next_nvpair(vd_errlist, elem)) { int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem)); char *path = fnvlist_lookup_string(guids_to_paths, nvpair_name(elem)); (void) zfs_error_fmt(hdl, vd_error, "cannot initialize '%s'", path); } fnvlist_free(guids_to_paths); if (vd_errlist != NULL) return (-1); return (zpool_standard_error(hdl, err, msg)); } /* * This provides a very minimal check whether a given string is likely a * c#t#d# style string. Users of this are expected to do their own * verification of the s# part. */ #define CTD_CHECK(str) (str && str[0] == 'c' && isdigit(str[1])) /* * More elaborate version for ones which may start with "/dev/dsk/" * and the like. */ static int ctd_check_path(char *str) { /* * If it starts with a slash, check the last component. */ if (str && str[0] == '/') { char *tmp = strrchr(str, '/'); /* * If it ends in "/old", check the second-to-last * component of the string instead. */ if (tmp != str && strcmp(tmp, "/old") == 0) { for (tmp--; *tmp != '/'; tmp--) ; } str = tmp + 1; } return (CTD_CHECK(str)); } /* * Find a vdev that matches the search criteria specified. We use the * the nvpair name to determine how we should look for the device. * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL * spare; but FALSE if its an INUSE spare. */ static nvlist_t * vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { uint_t c, children; nvlist_t **child; nvlist_t *ret; uint64_t is_log; char *srchkey; nvpair_t *pair = nvlist_next_nvpair(search, NULL); /* Nothing to look for */ if (search == NULL || pair == NULL) return (NULL); /* Obtain the key we will use to search */ srchkey = nvpair_name(pair); switch (nvpair_type(pair)) { case DATA_TYPE_UINT64: if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) { uint64_t srchval, theguid; verify(nvpair_value_uint64(pair, &srchval) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &theguid) == 0); if (theguid == srchval) return (nv); } break; case DATA_TYPE_STRING: { char *srchval, *val; verify(nvpair_value_string(pair, &srchval) == 0); if (nvlist_lookup_string(nv, srchkey, &val) != 0) break; /* * Search for the requested value. Special cases: * * - ZPOOL_CONFIG_PATH for whole disk entries. To support * UEFI boot, these end in "s0" or "s0/old" or "s1" or * "s1/old". The "s0" or "s1" part is hidden from the user, * but included in the string, so this matches around it. * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE). * * Otherwise, all other searches are simple string compares. */ if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && ctd_check_path(val)) { uint64_t wholedisk = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); if (wholedisk) { int slen = strlen(srchval); int vlen = strlen(val); if (slen != vlen - 2) break; /* * make_leaf_vdev() should only set * wholedisk for ZPOOL_CONFIG_PATHs which * will include "/dev/dsk/", giving plenty of * room for the indices used next. */ ASSERT(vlen >= 6); /* * strings identical except trailing "s0" */ if ((strcmp(&val[vlen - 2], "s0") == 0 || strcmp(&val[vlen - 2], "s1") == 0) && strncmp(srchval, val, slen) == 0) return (nv); /* * strings identical except trailing "s0/old" */ if ((strcmp(&val[vlen - 6], "s0/old") == 0 || strcmp(&val[vlen - 6], "s1/old") == 0) && strcmp(&srchval[slen - 4], "/old") == 0 && strncmp(srchval, val, slen - 4) == 0) return (nv); break; } } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) { char *type, *idx, *end, *p; uint64_t id, vdev_id; /* * Determine our vdev type, keeping in mind * that the srchval is composed of a type and * vdev id pair (i.e. mirror-4). */ if ((type = strdup(srchval)) == NULL) return (NULL); if ((p = strrchr(type, '-')) == NULL) { free(type); break; } idx = p + 1; *p = '\0'; /* * If the types don't match then keep looking. */ if (strncmp(val, type, strlen(val)) != 0) { free(type); break; } verify(zpool_vdev_is_interior(type)); verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &id) == 0); errno = 0; vdev_id = strtoull(idx, &end, 10); free(type); if (errno != 0) return (NULL); /* * Now verify that we have the correct vdev id. */ if (vdev_id == id) return (nv); } /* * Common case */ if (strcmp(srchval, val) == 0) return (nv); break; } default: break; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (NULL); for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { /* * The 'is_log' value is only set for the toplevel * vdev, not the leaf vdevs. So we always lookup the * log device from the root of the vdev tree (where * 'log' is non-NULL). */ if (log != NULL && nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log) == 0 && is_log) { *log = B_TRUE; } return (ret); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *avail_spare = B_TRUE; return (ret); } } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *l2cache = B_TRUE; return (ret); } } } return (NULL); } /* * Given a physical path (minus the "/devices" prefix), find the * associated vdev. */ nvlist_t * zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { nvlist_t *search, *nvroot, *ret; verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0); verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); nvlist_free(search); return (ret); } /* * Determine if we have an "interior" top-level vdev (i.e mirror/raidz). */ static boolean_t zpool_vdev_is_interior(const char *name) { if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 || strncmp(name, VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); return (B_FALSE); } nvlist_t * zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { char buf[MAXPATHLEN]; char *end; nvlist_t *nvroot, *search, *ret; uint64_t guid; verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); guid = strtoull(path, &end, 10); if (guid != 0 && *end == '\0') { verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0); } else if (zpool_vdev_is_interior(path)) { verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0); } else if (path[0] != '/') { (void) snprintf(buf, sizeof (buf), "%s/%s", ZFS_DISK_ROOT, path); verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0); } else { verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0); } verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); nvlist_free(search); return (ret); } static int vdev_online(nvlist_t *nv) { uint64_t ival; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) return (0); return (1); } /* * Helper function for zpool_get_physpaths(). */ static int vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size, size_t *bytes_written) { size_t bytes_left, pos, rsz; char *tmppath; const char *format; if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH, &tmppath) != 0) return (EZFS_NODEVICE); pos = *bytes_written; bytes_left = physpath_size - pos; format = (pos == 0) ? "%s" : " %s"; rsz = snprintf(physpath + pos, bytes_left, format, tmppath); *bytes_written += rsz; if (rsz >= bytes_left) { /* if physpath was not copied properly, clear it */ if (bytes_left != 0) { physpath[pos] = 0; } return (EZFS_NOSPC); } return (0); } static int vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size, size_t *rsz, boolean_t is_spare) { char *type; int ret; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (EZFS_INVALCONFIG); if (strcmp(type, VDEV_TYPE_DISK) == 0) { /* * An active spare device has ZPOOL_CONFIG_IS_SPARE set. * For a spare vdev, we only want to boot from the active * spare device. */ if (is_spare) { uint64_t spare = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare); if (!spare) return (EZFS_INVALCONFIG); } if (vdev_online(nv)) { if ((ret = vdev_get_one_physpath(nv, physpath, phypath_size, rsz)) != 0) return (ret); } } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 || strcmp(type, VDEV_TYPE_RAIDZ) == 0 || strcmp(type, VDEV_TYPE_REPLACING) == 0 || (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) { nvlist_t **child; uint_t count; int i, ret; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) return (EZFS_INVALCONFIG); for (i = 0; i < count; i++) { ret = vdev_get_physpaths(child[i], physpath, phypath_size, rsz, is_spare); if (ret == EZFS_NOSPC) return (ret); } } return (EZFS_POOL_INVALARG); } /* * Get phys_path for a root pool config. * Return 0 on success; non-zero on failure. */ static int zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size) { size_t rsz; nvlist_t *vdev_root; nvlist_t **child; uint_t count; char *type; rsz = 0; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev_root) != 0) return (EZFS_INVALCONFIG); if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 || nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) return (EZFS_INVALCONFIG); /* * root pool can only have a single top-level vdev. */ if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1) return (EZFS_POOL_INVALARG); (void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz, B_FALSE); /* No online devices */ if (rsz == 0) return (EZFS_NODEVICE); return (0); } /* * Get phys_path for a root pool * Return 0 on success; non-zero on failure. */ int zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size) { return (zpool_get_config_physpath(zhp->zpool_config, physpath, phypath_size)); } /* * If the device has being dynamically expanded then we need to relabel * the disk to use the new unallocated space. */ static int zpool_relabel_disk(libzfs_handle_t *hdl, const char *name) { char path[MAXPATHLEN]; char errbuf[1024]; int fd, error; int (*_efi_use_whole_disk)(int); if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT, "efi_use_whole_disk")) == NULL) return (-1); (void) snprintf(path, sizeof (path), "%s/%s", ZFS_RDISK_ROOT, name); if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " "relabel '%s': unable to open device"), name); return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); } /* * It's possible that we might encounter an error if the device * does not have any unallocated space left. If so, we simply * ignore that error and continue on. */ error = _efi_use_whole_disk(fd); (void) close(fd); if (error && error != VT_ENOSPC) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " "relabel '%s': unable to read disk capacity"), name); return (zfs_error(hdl, EZFS_NOCAP, errbuf)); } return (0); } /* * Bring the specified vdev online. The 'flags' parameter is a set of the * ZFS_ONLINE_* flags. */ int zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, vdev_state_t *newstate) { zfs_cmd_t zc = { 0 }; char msg[1024]; char *pathname; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; if (flags & ZFS_ONLINE_EXPAND) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot expand %s"), path); } else { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot online %s"), path); } (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); if ((flags & ZFS_ONLINE_EXPAND || zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) && nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0) { uint64_t wholedisk = 0; (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); /* * XXX - L2ARC 1.0 devices can't support expansion. */ if (l2cache) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot expand cache devices")); return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg)); } if (wholedisk) { pathname += strlen(ZFS_DISK_ROOT) + 1; (void) zpool_relabel_disk(hdl, pathname); } } zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_obj = flags; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { if (errno == EINVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split " "from this pool into a new one. Use '%s' " "instead"), "zpool detach"); return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg)); } return (zpool_standard_error(hdl, errno, msg)); } *newstate = zc.zc_cookie; return (0); } /* * Take the specified vdev offline */ int zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot offline %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); zc.zc_cookie = VDEV_STATE_OFFLINE; zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { case EBUSY: /* * There are no other replicas of this device. */ return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); case EEXIST: /* * The log device has unplayed logs */ return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg)); default: return (zpool_standard_error(hdl, errno, msg)); } } /* * Mark the given vdev faulted. */ int zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot fault %llu"), guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_FAULTED; zc.zc_obj = aux; if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { case EBUSY: /* * There are no other replicas of this device. */ return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); default: return (zpool_standard_error(hdl, errno, msg)); } } /* * Mark the given vdev degraded. */ int zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot degrade %llu"), guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_DEGRADED; zc.zc_obj = aux; if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); } /* * Returns TRUE if the given nvlist is a vdev that was originally swapped in as * a hot spare. */ static boolean_t is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) { nvlist_t **child; uint_t c, children; char *type; if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_SPARE) == 0 && children == 2 && child[which] == tgt) return (B_TRUE); for (c = 0; c < children; c++) if (is_replacing_spare(child[c], tgt, which)) return (B_TRUE); } return (B_FALSE); } /* * Attach new_disk (fully described by nvroot) to old_disk. * If 'replacing' is specified, the new disk will replace the old one. */ int zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing) { zfs_cmd_t zc = { 0 }; char msg[1024]; int ret; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; uint64_t val; char *newname; nvlist_t **child; uint_t children; nvlist_t *config_root; libzfs_handle_t *hdl = zhp->zpool_hdl; boolean_t rootpool = zpool_is_bootable(zhp); if (replacing) (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot replace %s with %s"), old_disk, new_disk); else (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot attach %s to %s"), new_disk, old_disk); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); if (l2cache) return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); zc.zc_cookie = replacing; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0 || children != 1) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device must be a single disk")); return (zfs_error(hdl, EZFS_INVALCONFIG, msg)); } verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0); if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL) return (-1); /* * If the target is a hot spare that has been swapped in, we can only * replace it with another hot spare. */ if (replacing && nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 && (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache, NULL) == NULL || !avail_spare) && is_replacing_spare(config_root, tgt, 1)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only be replaced by another hot spare")); free(newname); return (zfs_error(hdl, EZFS_BADTARGET, msg)); } free(newname); if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) return (-1); ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc); zcmd_free_nvlists(&zc); if (ret == 0) { if (rootpool) { /* * XXX need a better way to prevent user from * booting up a half-baked vdev. */ (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make " "sure to wait until resilver is done " "before rebooting.\n")); } return (0); } switch (errno) { case ENOTSUP: /* * Can't attach to or replace this type of vdev. */ if (replacing) { uint64_t version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); else if (version >= SPA_VERSION_MULTI_REPLACE) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " "for completion or use 'zpool detach'")); else zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a replacing device")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only attach to mirrors and top-level " "disks")); } (void) zfs_error(hdl, EZFS_BADTARGET, msg); break; case EINVAL: /* * The new device must be a single disk. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device must be a single disk")); (void) zfs_error(hdl, EZFS_INVALCONFIG, msg); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, " "or device removal is in progress"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case EOVERFLOW: /* * The new device is too small. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device is too small")); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case EDOM: /* * The new device has a different alignment requirement. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "devices have different sector alignment")); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case ENAMETOOLONG: /* * The resulting top-level vdev spec won't fit in the label. */ (void) zfs_error(hdl, EZFS_DEVOVERFLOW, msg); break; default: (void) zpool_standard_error(hdl, errno, msg); } return (-1); } /* * Detach the specified device. */ int zpool_vdev_detach(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot detach %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); if (l2cache) return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0) return (0); switch (errno) { case ENOTSUP: /* * Can't detach from this type of vdev. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only " "applicable to mirror and replacing vdevs")); (void) zfs_error(hdl, EZFS_BADTARGET, msg); break; case EBUSY: /* * There are no other replicas of this device. */ (void) zfs_error(hdl, EZFS_NOREPLICAS, msg); break; default: (void) zpool_standard_error(hdl, errno, msg); } return (-1); } /* * Find a mirror vdev in the source nvlist. * * The mchild array contains a list of disks in one of the top-level mirrors * of the source pool. The schild array contains a list of disks that the * user specified on the command line. We loop over the mchild array to * see if any entry in the schild array matches. * * If a disk in the mchild array is found in the schild array, we return * the index of that entry. Otherwise we return -1. */ static int find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren, nvlist_t **schild, uint_t schildren) { uint_t mc; for (mc = 0; mc < mchildren; mc++) { uint_t sc; char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp, mchild[mc], B_FALSE); for (sc = 0; sc < schildren; sc++) { char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp, schild[sc], B_FALSE); boolean_t result = (strcmp(mpath, spath) == 0); free(spath); if (result) { free(mpath); return (mc); } } free(mpath); } return (-1); } /* * Split a mirror pool. If newroot points to null, then a new nvlist * is generated and it is the responsibility of the caller to free it. */ int zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, nvlist_t *props, splitflags_t flags) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; nvlist_t **varray = NULL, *zc_props = NULL; uint_t c, children, newchildren, lastlog = 0, vcount, found = 0; libzfs_handle_t *hdl = zhp->zpool_hdl; uint64_t vers; boolean_t freelist = B_FALSE, memory_err = B_TRUE; int retval = 0; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name); if (!zpool_name_valid(hdl, B_FALSE, newname)) return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("Internal error: unable to " "retrieve pool configuration\n")); return (-1); } verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0); if (props) { prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name, props, vers, flags, msg)) == NULL) return (-1); } if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool is missing vdev tree")); nvlist_free(zc_props); return (-1); } varray = zfs_alloc(hdl, children * sizeof (nvlist_t *)); vcount = 0; if (*newroot == NULL || nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, &newchild, &newchildren) != 0) newchildren = 0; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; char *type; nvlist_t **mchild, *vdev; uint_t mchildren; int entry; /* * Unlike cache & spares, slogs are stored in the * ZPOOL_CONFIG_CHILDREN array. We filter them out here. */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_log || is_hole) { /* * Create a hole vdev and put it in the config. */ if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0) goto out; if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) != 0) goto out; if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE, 1) != 0) goto out; if (lastlog == 0) lastlog = vcount; varray[vcount++] = vdev; continue; } lastlog = 0; verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool must be composed only of mirrors\n")); retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); goto out; } verify(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); /* find or add an entry for this top-level vdev */ if (newchildren > 0 && (entry = find_vdev_entry(zhp, mchild, mchildren, newchild, newchildren)) >= 0) { /* We found a disk that the user specified. */ vdev = mchild[entry]; ++found; } else { /* User didn't specify a disk for this vdev. */ vdev = mchild[mchildren - 1]; } if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) goto out; } /* did we find every disk the user specified? */ if (found != newchildren) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must " "include at most one disk from each mirror")); retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); goto out; } /* Prepare the nvlist for populating. */ if (*newroot == NULL) { if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0) goto out; freelist = B_TRUE; if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0) goto out; } else { verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0); } /* Add all the children we found */ if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray, lastlog == 0 ? vcount : lastlog) != 0) goto out; /* * If we're just doing a dry run, exit now with success. */ if (flags.dryrun) { memory_err = B_FALSE; freelist = B_FALSE; goto out; } /* now build up the config list & call the ioctl */ if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0) goto out; if (nvlist_add_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 || nvlist_add_string(newconfig, ZPOOL_CONFIG_POOL_NAME, newname) != 0 || nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0) goto out; /* * The new pool is automatically part of the namespace unless we * explicitly export it. */ if (!flags.import) zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string)); if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0) goto out; if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) goto out; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) { retval = zpool_standard_error(hdl, errno, msg); goto out; } freelist = B_FALSE; memory_err = B_FALSE; out: if (varray != NULL) { int v; for (v = 0; v < vcount; v++) nvlist_free(varray[v]); free(varray); } zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(newconfig); if (freelist) { nvlist_free(*newroot); *newroot = NULL; } if (retval != 0) return (retval); if (memory_err) return (no_memory(hdl)); return (0); } /* * Remove the given device. */ int zpool_vdev_remove(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; uint64_t version; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog && version < SPA_VERSION_HOLES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to support log removal")); return (zfs_error(hdl, EZFS_BADVERSION, msg)); } if (!islog && !avail_spare && !l2cache && zpool_is_bootable(zhp)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "root pool can not have removed devices, " "because GRUB does not understand them")); return (zfs_error(hdl, EINVAL, msg)); } zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) return (0); switch (errno) { case EINVAL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; all top-level vdevs must " "have the same sector size and not be raidz.")); (void) zfs_error(hdl, EZFS_INVALCONFIG, msg); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Pool busy; removal may already be in progress")); (void) zfs_error(hdl, EZFS_BUSY, msg); break; default: (void) zpool_standard_error(hdl, errno, msg); } return (-1); } int zpool_vdev_remove_cancel(zpool_handle_t *zhp) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot cancel removal")); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = 1; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); } int zpool_vdev_indirect_size(zpool_handle_t *zhp, const char *path, uint64_t *sizep) { char msg[1024]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"), path); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); if (avail_spare || l2cache || islog) { *sizep = 0; return (0); } if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "indirect size not available")); return (zfs_error(hdl, EINVAL, msg)); } return (0); } /* * Clear the errors for the pool, or the particular device if specified. */ int zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; zpool_load_policy_t policy; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *nvi = NULL; int error; if (path) (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), path); else (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (path) { if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); /* * Don't allow error clearing for hot spares. Do allow * error clearing for l2cache devices. */ if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); } zpool_get_load_policy(rewindnvl, &policy); zc.zc_cookie = policy.zlp_rewind; if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0) return (-1); if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0) return (-1); while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 && errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { zcmd_free_nvlists(&zc); return (-1); } } if (!error || ((policy.zlp_rewind & ZPOOL_TRY_REWIND) && errno != EPERM && errno != EACCES)) { if (policy.zlp_rewind & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); zpool_rewind_exclaim(hdl, zc.zc_name, ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nvi); nvlist_free(nvi); } zcmd_free_nvlists(&zc); return (0); } zcmd_free_nvlists(&zc); return (zpool_standard_error(hdl, errno, msg)); } /* * Similar to zpool_clear(), but takes a GUID (used by fmd). */ int zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"), guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = ZPOOL_NO_REWIND; if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); } /* * Change the GUID for a pool. */ int zpool_reguid(zpool_handle_t *zhp) { char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; zfs_cmd_t zc = { 0 }; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); } /* * Reopen the pool. */ int zpool_reopen(zpool_handle_t *zhp) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot reopen '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_POOL_REOPEN, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); } /* * Convert from a devid string to a path. */ static char * devid_to_path(char *devid_str) { ddi_devid_t devid; char *minor; char *path; devid_nmlist_t *list = NULL; int ret; if (devid_str_decode(devid_str, &devid, &minor) != 0) return (NULL); ret = devid_deviceid_to_nmlist("/dev", devid, minor, &list); devid_str_free(minor); devid_free(devid); if (ret != 0) return (NULL); /* * In a case the strdup() fails, we will just return NULL below. */ path = strdup(list[0].devname); devid_free_nmlist(list); return (path); } /* * Convert from a path to a devid string. */ static char * path_to_devid(const char *path) { int fd; ddi_devid_t devid; char *minor, *ret; if ((fd = open(path, O_RDONLY)) < 0) return (NULL); minor = NULL; ret = NULL; if (devid_get(fd, &devid) == 0) { if (devid_get_minor_name(fd, &minor) == 0) ret = devid_str_encode(devid, minor); if (minor != NULL) devid_str_free(minor); devid_free(devid); } (void) close(fd); return (ret); } /* * Issue the necessary ioctl() to update the stored path value for the vdev. We * ignore any failure here, since a common case is for an unprivileged user to * type 'zpool status', and we'll display the correct information anyway. */ static void set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) { zfs_cmd_t zc = { 0 }; (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); (void) strncpy(zc.zc_value, path, sizeof (zc.zc_value)); verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); (void) ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SETPATH, &zc); } /* * Given a vdev, return the name to display in iostat. If the vdev has a path, * we use that, stripping off any leading "/dev/dsk/"; if not, we use the type. * We also check if this is a whole disk, in which case we strip off the * trailing 's0' slice name. * * This routine is also responsible for identifying when disks have been * reconfigured in a new location. The kernel will have opened the device by * devid, but the path will still refer to the old location. To catch this, we * first do a path -> devid translation (which is fast for the common case). If * the devid matches, we're done. If not, we do a reverse devid -> path * translation and issue the appropriate ioctl() to update the path of the vdev. * If 'zhp' is NULL, then this is an exported pool, and we don't need to do any * of these checks. */ char * zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, boolean_t verbose) { char *path, *devid; uint64_t value; char buf[64]; vdev_stat_t *vs; uint_t vsc; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value) == 0); (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value); path = buf; } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { /* * If the device is dead (faulted, offline, etc) then don't * bother opening it. Otherwise we may be forcing the user to * open a misbehaving device, which can have undesirable * effects. */ if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) != 0 || vs->vs_state >= VDEV_STATE_DEGRADED) && zhp != NULL && nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) { /* * Determine if the current path is correct. */ char *newdevid = path_to_devid(path); if (newdevid == NULL || strcmp(devid, newdevid) != 0) { char *newpath; if ((newpath = devid_to_path(devid)) != NULL) { /* * Update the path appropriately. */ set_path(zhp, nv, newpath); if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, newpath) == 0) verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); free(newpath); } } if (newdevid) devid_str_free(newdevid); } if (strncmp(path, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0) path += strlen(ZFS_DISK_ROOTD); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value) { int pathlen = strlen(path); char *tmp = zfs_strdup(hdl, path); /* * If it starts with c#, and ends with "s0" or "s1", * chop the slice off, or if it ends with "s0/old" or * "s1/old", remove the slice from the middle. */ if (CTD_CHECK(tmp)) { if (strcmp(&tmp[pathlen - 2], "s0") == 0 || strcmp(&tmp[pathlen - 2], "s1") == 0) { tmp[pathlen - 2] = '\0'; } else if (pathlen > 6 && (strcmp(&tmp[pathlen - 6], "s0/old") == 0 || strcmp(&tmp[pathlen - 6], "s1/old") == 0)) { (void) strcpy(&tmp[pathlen - 6], "/old"); } } return (tmp); } } else { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0); /* * If it's a raidz device, we need to stick in the parity level. */ if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &value) == 0); (void) snprintf(buf, sizeof (buf), "%s%llu", path, (u_longlong_t)value); path = buf; } /* * We identify each top-level vdev by using a * naming convention. */ if (verbose) { uint64_t id; verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &id) == 0); (void) snprintf(buf, sizeof (buf), "%s-%llu", path, (u_longlong_t)id); path = buf; } } return (zfs_strdup(hdl, path)); } static int zbookmark_mem_compare(const void *a, const void *b) { return (memcmp(a, b, sizeof (zbookmark_phys_t))); } /* * Retrieve the persistent error log, uniquify the members, and return to the * caller. */ int zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) { zfs_cmd_t zc = { 0 }; uint64_t count; zbookmark_phys_t *zb = NULL; int i; /* * Retrieve the raw error list from the kernel. If the number of errors * has increased, allocate more space and continue until we get the * entire list. */ verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT, &count) == 0); if (count == 0) return (0); if ((zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl, count * sizeof (zbookmark_phys_t))) == (uintptr_t)NULL) return (-1); zc.zc_nvlist_dst_size = count; (void) strcpy(zc.zc_name, zhp->zpool_name); for (;;) { if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_ERROR_LOG, &zc) != 0) { free((void *)(uintptr_t)zc.zc_nvlist_dst); if (errno == ENOMEM) { void *dst; count = zc.zc_nvlist_dst_size; dst = zfs_alloc(zhp->zpool_hdl, count * sizeof (zbookmark_phys_t)); if (dst == NULL) return (-1); zc.zc_nvlist_dst = (uintptr_t)dst; } else { return (-1); } } else { break; } } /* * Sort the resulting bookmarks. This is a little confusing due to the * implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last * to first, and 'zc_nvlist_dst_size' indicates the number of boomarks * _not_ copied as part of the process. So we point the start of our * array appropriate and decrement the total number of elements. */ zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) + zc.zc_nvlist_dst_size; count -= zc.zc_nvlist_dst_size; qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare); verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); /* * Fill in the nverrlistp with nvlist's of dataset and object numbers. */ for (i = 0; i < count; i++) { nvlist_t *nv; /* ignoring zb_blkid and zb_level for now */ if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset && zb[i-1].zb_object == zb[i].zb_object) continue; if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0) goto nomem; if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET, zb[i].zb_objset) != 0) { nvlist_free(nv); goto nomem; } if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT, zb[i].zb_object) != 0) { nvlist_free(nv); goto nomem; } if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) { nvlist_free(nv); goto nomem; } nvlist_free(nv); } free((void *)(uintptr_t)zc.zc_nvlist_dst); return (0); nomem: free((void *)(uintptr_t)zc.zc_nvlist_dst); return (no_memory(zhp->zpool_hdl)); } /* * Upgrade a ZFS pool to the latest on-disk version. */ int zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strcpy(zc.zc_name, zhp->zpool_name); zc.zc_cookie = new_version; if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0) return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"), zhp->zpool_name)); return (0); } void zfs_save_arguments(int argc, char **argv, char *string, int len) { (void) strlcpy(string, basename(argv[0]), len); for (int i = 1; i < argc; i++) { (void) strlcat(string, " ", len); (void) strlcat(string, argv[i], len); } } int zpool_log_history(libzfs_handle_t *hdl, const char *message) { zfs_cmd_t zc = { 0 }; nvlist_t *args; int err; args = fnvlist_alloc(); fnvlist_add_string(args, "message", message); err = zcmd_write_src_nvlist(hdl, &zc, args); if (err == 0) err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc); nvlist_free(args); zcmd_free_nvlists(&zc); return (err); } /* * Perform ioctl to get some command history of a pool. * * 'buf' is the buffer to fill up to 'len' bytes. 'off' is the * logical offset of the history buffer to start reading from. * * Upon return, 'off' is the next logical offset to read from and * 'len' is the actual amount of bytes read into 'buf'. */ static int get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_history = (uint64_t)(uintptr_t)buf; zc.zc_history_len = *len; zc.zc_history_offset = *off; if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) { switch (errno) { case EPERM: return (zfs_error_fmt(hdl, EZFS_PERM, dgettext(TEXT_DOMAIN, "cannot show history for pool '%s'"), zhp->zpool_name)); case ENOENT: return (zfs_error_fmt(hdl, EZFS_NOHISTORY, dgettext(TEXT_DOMAIN, "cannot get history for pool " "'%s'"), zhp->zpool_name)); case ENOTSUP: return (zfs_error_fmt(hdl, EZFS_BADVERSION, dgettext(TEXT_DOMAIN, "cannot get history for pool " "'%s', pool must be upgraded"), zhp->zpool_name)); default: return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get history for '%s'"), zhp->zpool_name)); } } *len = zc.zc_history_len; *off = zc.zc_history_offset; return (0); } /* * Process the buffer of nvlists, unpacking and storing each nvlist record * into 'records'. 'leftover' is set to the number of bytes that weren't * processed as there wasn't a complete record. */ int zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover, nvlist_t ***records, uint_t *numrecords) { uint64_t reclen; nvlist_t *nv; int i; while (bytes_read > sizeof (reclen)) { /* get length of packed record (stored as little endian) */ for (i = 0, reclen = 0; i < sizeof (reclen); i++) reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i); if (bytes_read < sizeof (reclen) + reclen) break; /* unpack record */ if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0) return (ENOMEM); bytes_read -= sizeof (reclen) + reclen; buf += sizeof (reclen) + reclen; /* add record to nvlist array */ (*numrecords)++; if (ISP2(*numrecords + 1)) { *records = realloc(*records, *numrecords * 2 * sizeof (nvlist_t *)); } (*records)[*numrecords - 1] = nv; } *leftover = bytes_read; return (0); } /* * Retrieve the command history of a pool. */ int zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp) { char *buf; int buflen = 128 * 1024; uint64_t off = 0; nvlist_t **records = NULL; uint_t numrecords = 0; int err, i; buf = malloc(buflen); if (buf == NULL) return (ENOMEM); do { uint64_t bytes_read = buflen; uint64_t leftover; if ((err = get_history(zhp, buf, &off, &bytes_read)) != 0) break; /* if nothing else was read in, we're at EOF, just return */ if (!bytes_read) break; if ((err = zpool_history_unpack(buf, bytes_read, &leftover, &records, &numrecords)) != 0) break; off -= leftover; if (leftover == bytes_read) { /* * no progress made, because buffer is not big enough * to hold this record; resize and retry. */ buflen *= 2; free(buf); buf = malloc(buflen); if (buf == NULL) return (ENOMEM); } /* CONSTCOND */ } while (1); free(buf); if (!err) { verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD, records, numrecords) == 0); } for (i = 0; i < numrecords; i++) nvlist_free(records[i]); free(records); return (err); } void zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len) { zfs_cmd_t zc = { 0 }; boolean_t mounted = B_FALSE; char *mntpnt = NULL; char dsname[ZFS_MAX_DATASET_NAME_LEN]; if (dsobj == 0) { /* special case for the MOS */ (void) snprintf(pathname, len, ":<0x%llx>", obj); return; } /* get the dataset's name */ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_obj = dsobj; if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) { /* just write out a path of two object numbers */ (void) snprintf(pathname, len, "<0x%llx>:<0x%llx>", dsobj, obj); return; } (void) strlcpy(dsname, zc.zc_value, sizeof (dsname)); /* find out if the dataset is mounted */ mounted = is_mounted(zhp->zpool_hdl, dsname, &mntpnt); /* get the corrupted object's path */ (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); zc.zc_obj = obj; if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_PATH, &zc) == 0) { if (mounted) { (void) snprintf(pathname, len, "%s%s", mntpnt, zc.zc_value); } else { (void) snprintf(pathname, len, "%s:%s", dsname, zc.zc_value); } } else { (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, obj); } free(mntpnt); } /* * Read the EFI label from the config, if a label does not exist then * pass back the error to the caller. If the caller has passed a non-NULL * diskaddr argument then we set it to the starting address of the EFI * partition. If the caller has passed a non-NULL boolean argument, then * we set it to indicate if the disk does have efi system partition. */ static int read_efi_label(nvlist_t *config, diskaddr_t *sb, boolean_t *system) { char *path; int fd; char diskname[MAXPATHLEN]; boolean_t boot = B_FALSE; int err = -1; int slice; if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0) return (err); (void) snprintf(diskname, sizeof (diskname), "%s%s", ZFS_RDISK_ROOT, strrchr(path, '/')); if ((fd = open(diskname, O_RDONLY|O_NDELAY)) >= 0) { struct dk_gpt *vtoc; if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) { for (slice = 0; slice < vtoc->efi_nparts; slice++) { if (vtoc->efi_parts[slice].p_tag == V_SYSTEM) boot = B_TRUE; if (vtoc->efi_parts[slice].p_tag == V_USR) break; } if (sb != NULL && vtoc->efi_parts[slice].p_tag == V_USR) *sb = vtoc->efi_parts[slice].p_start; if (system != NULL) *system = boot; efi_free(vtoc); } (void) close(fd); } return (err); } /* * determine where a partition starts on a disk in the current * configuration */ static diskaddr_t find_start_block(nvlist_t *config) { nvlist_t **child; uint_t c, children; diskaddr_t sb = MAXOFFSET_T; uint64_t wholedisk; if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk) != 0 || !wholedisk) { return (MAXOFFSET_T); } if (read_efi_label(config, &sb, NULL) < 0) sb = MAXOFFSET_T; return (sb); } for (c = 0; c < children; c++) { sb = find_start_block(child[c]); if (sb != MAXOFFSET_T) { return (sb); } } return (MAXOFFSET_T); } /* * Label an individual disk. The name provided is the short name, * stripped of any leading /dev path. */ int zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name, zpool_boot_label_t boot_type, uint64_t boot_size, int *slice) { char path[MAXPATHLEN]; struct dk_gpt *vtoc; int fd; size_t resv = EFI_MIN_RESV_SIZE; uint64_t slice_size; diskaddr_t start_block; char errbuf[1024]; /* prepare an error message just in case */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot label '%s'"), name); if (zhp) { nvlist_t *nvroot; verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (zhp->zpool_start_block == 0) start_block = find_start_block(nvroot); else start_block = zhp->zpool_start_block; zhp->zpool_start_block = start_block; } else { /* new pool */ start_block = NEW_START_BLOCK; } (void) snprintf(path, sizeof (path), "%s/%s%s", ZFS_RDISK_ROOT, name, BACKUP_SLICE); if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { /* * This shouldn't happen. We've long since verified that this * is a valid device. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to open device")); return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); } if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) { /* * The only way this can fail is if we run out of memory, or we * were unable to read the disk's capacity */ if (errno == ENOMEM) (void) no_memory(hdl); (void) close(fd); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to read disk capacity"), name); return (zfs_error(hdl, EZFS_NOCAP, errbuf)); } /* * Why we use V_USR: V_BACKUP confuses users, and is considered * disposable by some EFI utilities (since EFI doesn't have a backup * slice). V_UNASSIGNED is supposed to be used only for zero size * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, * etc. were all pretty specific. V_USR is as close to reality as we * can get, in the absence of V_OTHER. */ /* first fix the partition start block */ if (start_block == MAXOFFSET_T) start_block = NEW_START_BLOCK; /* * EFI System partition is using slice 0. * ZFS is on slice 1 and slice 8 is reserved. * We assume the GPT partition table without system * partition has zfs p_start == NEW_START_BLOCK. * If start_block != NEW_START_BLOCK, it means we have * system partition. Correct solution would be to query/cache vtoc * from existing vdev member. */ if (boot_type == ZPOOL_CREATE_BOOT_LABEL) { if (boot_size % vtoc->efi_lbasize != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "boot partition size must be a multiple of %d"), vtoc->efi_lbasize); (void) close(fd); efi_free(vtoc); return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); } /* * System partition size checks. * Note the 1MB is quite arbitrary value, since we * are creating dedicated pool, it should be enough * to hold fat + efi bootloader. May need to be * adjusted if the bootloader size will grow. */ if (boot_size < 1024 * 1024) { char buf[64]; zfs_nicenum(boot_size, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Specified size %s for EFI System partition is too " "small, the minimum size is 1MB."), buf); (void) close(fd); efi_free(vtoc); return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); } /* 33MB is tested with mkfs -F pcfs */ if (hdl->libzfs_printerr && ((vtoc->efi_lbasize == 512 && boot_size < 33 * 1024 * 1024) || (vtoc->efi_lbasize == 4096 && boot_size < 256 * 1024 * 1024))) { char buf[64]; zfs_nicenum(boot_size, buf, sizeof (buf)); (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: EFI System partition size %s is " "not allowing to create FAT32 file\nsystem, which " "may result in unbootable system.\n"), buf); } /* Adjust zfs partition start by size of system partition. */ start_block += boot_size / vtoc->efi_lbasize; } if (start_block == NEW_START_BLOCK) { /* * Use default layout. * ZFS is on slice 0 and slice 8 is reserved. */ slice_size = vtoc->efi_last_u_lba + 1; slice_size -= EFI_MIN_RESV_SIZE; slice_size -= start_block; if (slice != NULL) *slice = 0; vtoc->efi_parts[0].p_start = start_block; vtoc->efi_parts[0].p_size = slice_size; vtoc->efi_parts[0].p_tag = V_USR; (void) strcpy(vtoc->efi_parts[0].p_name, "zfs"); vtoc->efi_parts[8].p_start = slice_size + start_block; vtoc->efi_parts[8].p_size = resv; vtoc->efi_parts[8].p_tag = V_RESERVED; } else { slice_size = start_block - NEW_START_BLOCK; vtoc->efi_parts[0].p_start = NEW_START_BLOCK; vtoc->efi_parts[0].p_size = slice_size; vtoc->efi_parts[0].p_tag = V_SYSTEM; (void) strcpy(vtoc->efi_parts[0].p_name, "loader"); if (slice != NULL) *slice = 1; /* prepare slice 1 */ slice_size = vtoc->efi_last_u_lba + 1 - slice_size; slice_size -= resv; slice_size -= NEW_START_BLOCK; vtoc->efi_parts[1].p_start = start_block; vtoc->efi_parts[1].p_size = slice_size; vtoc->efi_parts[1].p_tag = V_USR; (void) strcpy(vtoc->efi_parts[1].p_name, "zfs"); vtoc->efi_parts[8].p_start = slice_size + start_block; vtoc->efi_parts[8].p_size = resv; vtoc->efi_parts[8].p_tag = V_RESERVED; } if (efi_write(fd, vtoc) != 0) { /* * Some block drivers (like pcata) may not support EFI * GPT labels. Print out a helpful error message dir- * ecting the user to manually label the disk and give * a specific slice. */ (void) close(fd); efi_free(vtoc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using fdisk(1M) and then provide a specific slice")); return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); } (void) close(fd); efi_free(vtoc); return (0); } static boolean_t supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf) { char *type; nvlist_t **child; uint_t children, c; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_FILE) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0 || strcmp(type, VDEV_TYPE_MISSING) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "vdev type '%s' is not supported"), type); (void) zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf); return (B_FALSE); } if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) { if (!supported_dump_vdev_type(hdl, child[c], errbuf)) return (B_FALSE); } } return (B_TRUE); } /* * Check if this zvol is allowable for use as a dump device; zero if * it is, > 0 if it isn't, < 0 if it isn't a zvol. * * Allowable storage configurations include mirrors, all raidz variants, and * pools with log, cache, and spare devices. Pools which are backed by files or * have missing/hole vdevs are not suitable. */ int zvol_check_dump_config(char *arg) { zpool_handle_t *zhp = NULL; nvlist_t *config, *nvroot; char *p, *volname; nvlist_t **top; uint_t toplevels; libzfs_handle_t *hdl; char errbuf[1024]; char poolname[ZFS_MAX_DATASET_NAME_LEN]; int pathlen = strlen(ZVOL_FULL_DEV_DIR); int ret = 1; if (strncmp(arg, ZVOL_FULL_DEV_DIR, pathlen)) { return (-1); } (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "dump is not supported on device '%s'"), arg); if ((hdl = libzfs_init()) == NULL) return (1); libzfs_print_on_error(hdl, B_TRUE); volname = arg + pathlen; /* check the configuration of the pool */ if ((p = strchr(volname, '/')) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "malformed dataset name")); (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); return (1); } else if (p - volname >= ZFS_MAX_DATASET_NAME_LEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset name is too long")); (void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf); return (1); } else { (void) strncpy(poolname, volname, p - volname); poolname[p - volname] = '\0'; } if ((zhp = zpool_open(hdl, poolname)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "could not open pool '%s'"), poolname); (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); goto out; } config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "could not obtain vdev configuration for '%s'"), poolname); (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf); goto out; } verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &top, &toplevels) == 0); if (!supported_dump_vdev_type(hdl, top[0], errbuf)) { goto out; } ret = 0; out: if (zhp) zpool_close(zhp); libzfs_fini(hdl); return (ret); } Index: vendor/illumos/dist/man/man1m/zpool.1m =================================================================== --- vendor/illumos/dist/man/man1m/zpool.1m (revision 342531) +++ vendor/illumos/dist/man/man1m/zpool.1m (revision 342532) @@ -1,2004 +1,2030 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2017 by Delphix. All rights reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2017 George Melikov. All Rights Reserved. .\" .Dd August 23, 2017 .Dt ZPOOL 1M .Os .Sh NAME .Nm zpool .Nd configure ZFS storage pools .Sh SYNOPSIS .Nm .Fl \? .Nm .Cm add .Op Fl fn .Ar pool vdev Ns ... .Nm .Cm attach .Op Fl f .Ar pool device new_device .Nm .Cm checkpoint .Op Fl d, -discard .Ar pool .Nm .Cm clear .Ar pool .Op Ar device .Nm .Cm create .Op Fl dfn .Op Fl B .Op Fl m Ar mountpoint .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Oo Fl O Ar file-system-property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root +.Op Fl t Ar tempname .Ar pool vdev Ns ... .Nm .Cm destroy .Op Fl f .Ar pool .Nm .Cm detach .Ar pool device .Nm .Cm export .Op Fl f .Ar pool Ns ... .Nm .Cm get .Op Fl Hp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... .Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns ... .Ar pool Ns ... .Nm .Cm history .Op Fl il .Oo Ar pool Oc Ns ... .Nm .Cm import .Op Fl D .Op Fl d Ar dir .Nm .Cm import .Fl a .Op Fl DfmN .Op Fl F Op Fl n .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Nm .Cm import -.Op Fl Dfm +.Op Fl Dfmt .Op Fl F Op Fl n .Op Fl -rewind-to-checkpoint .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Ar pool Ns | Ns Ar id .Op Ar newpool .Nm .Cm initialize .Op Fl cs .Ar pool .Op Ar device Ns ... .Nm .Cm iostat .Op Fl v .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count .Nm .Cm labelclear .Op Fl f .Ar device .Nm .Cm list .Op Fl Hpv .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count .Nm .Cm offline .Op Fl t .Ar pool Ar device Ns ... .Nm .Cm online .Op Fl e .Ar pool Ar device Ns ... .Nm .Cm reguid .Ar pool .Nm .Cm reopen .Ar pool .Nm .Cm remove .Op Fl np .Ar pool Ar device Ns ... .Nm .Cm remove .Fl s .Ar pool .Nm .Cm replace .Op Fl f .Ar pool Ar device Op Ar new_device .Nm .Cm scrub .Op Fl s | Fl p .Ar pool Ns ... .Nm .Cm set .Ar property Ns = Ns Ar value .Ar pool .Nm .Cm split .Op Fl n .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Ar pool newpool .Nm .Cm status .Op Fl Dvx .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count .Nm .Cm upgrade .Nm .Cm upgrade .Fl v .Nm .Cm upgrade .Op Fl V Ar version .Fl a Ns | Ns Ar pool Ns ... .Sh DESCRIPTION The .Nm command configures ZFS storage pools. A storage pool is a collection of devices that provides physical storage and data replication for ZFS datasets. All datasets within a storage pool share the same space. See .Xr zfs 1M for information on managing datasets. .Ss Virtual Devices (vdevs) A "virtual device" describes a single device or a collection of devices organized according to certain performance and fault characteristics. The following virtual devices are supported: .Bl -tag -width Ds .It Sy disk A block device, typically located under .Pa /dev/dsk . ZFS can use individual slices or partitions, though the recommended mode of operation is to use whole disks. A disk can be specified by a full path, or it can be a shorthand name .Po the relative portion of the path under .Pa /dev/dsk .Pc . A whole disk can be specified by omitting the slice or partition designation. For example, .Pa c0t0d0 is equivalent to .Pa /dev/dsk/c0t0d0s2 . When given a whole disk, ZFS automatically labels the disk, if necessary. .It Sy file A regular file. The use of files as a backing store is strongly discouraged. It is designed primarily for experimental purposes, as the fault tolerance of a file is only as good as the file system of which it is a part. A file must be specified by a full path. .It Sy mirror A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with N disks of size X can hold X bytes and can withstand (N-1) devices failing before data integrity is compromised. .It Sy raidz , raidz1 , raidz2 , raidz3 A variation on RAID-5 that allows for better distribution of parity and eliminates the RAID-5 .Qq write hole .Pq in which data and parity become inconsistent after a power loss . Data and parity is striped across all disks within a raidz group. .Pp A raidz group can have single-, double-, or triple-parity, meaning that the raidz group can sustain one, two, or three failures, respectively, without losing any data. The .Sy raidz1 vdev type specifies a single-parity raidz group; the .Sy raidz2 vdev type specifies a double-parity raidz group; and the .Sy raidz3 vdev type specifies a triple-parity raidz group. The .Sy raidz vdev type is an alias for .Sy raidz1 . .Pp A raidz group with N disks of size X with P parity disks can hold approximately (N-P)*X bytes and can withstand P device(s) failing before data integrity is compromised. The minimum number of devices in a raidz group is one more than the number of parity disks. The recommended number is between 3 and 9 to help increase performance. .It Sy spare A special pseudo-vdev which keeps track of available hot spares for a pool. For more information, see the .Sx Hot Spares section. .It Sy log A separate intent log device. If more than one log device is specified, then writes are load-balanced between devices. Log devices can be mirrored. However, raidz vdev types are not supported for the intent log. For more information, see the .Sx Intent Log section. .It Sy cache A device used to cache storage pool data. A cache device cannot be configured as a mirror or raidz group. For more information, see the .Sx Cache Devices section. .El .Pp Virtual devices cannot be nested, so a mirror or raidz virtual device can only contain files or disks. Mirrors of mirrors .Pq or other combinations are not allowed. .Pp A pool can have any number of virtual devices at the top of the configuration .Po known as .Qq root vdevs .Pc . Data is dynamically distributed across all top-level devices to balance data among devices. As new virtual devices are added, ZFS automatically places data on the newly available devices. .Pp Virtual devices are specified one at a time on the command line, separated by whitespace. The keywords .Sy mirror and .Sy raidz are used to distinguish where a group ends and another begins. For example, the following creates two root vdevs, each a mirror of two disks: .Bd -literal # zpool create mypool mirror c0t0d0 c0t1d0 mirror c1t0d0 c1t1d0 .Ed .Ss Device Failure and Recovery ZFS supports a rich set of mechanisms for handling device failure and data corruption. All metadata and data is checksummed, and ZFS automatically repairs bad data from a good copy when corruption is detected. .Pp In order to take advantage of these features, a pool must make use of some form of redundancy, using either mirrored or raidz groups. While ZFS supports running in a non-redundant configuration, where each root vdev is simply a disk or file, this is strongly discouraged. A single case of bit corruption can render some or all of your data unavailable. .Pp A pool's health status is described by one of three states: online, degraded, or faulted. An online pool has all devices operating normally. A degraded pool is one in which one or more devices have failed, but the data is still available due to a redundant configuration. A faulted pool has corrupted metadata, or one or more faulted devices, and insufficient replicas to continue functioning. .Pp The health of the top-level vdev, such as mirror or raidz device, is potentially impacted by the state of its associated vdevs, or component devices. A top-level vdev or component device is in one of the following states: .Bl -tag -width "DEGRADED" .It Sy DEGRADED One or more top-level vdevs is in the degraded state because one or more component devices are offline. Sufficient replicas exist to continue functioning. .Pp One or more component devices is in the degraded or faulted state, but sufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet .It The number of checksum errors exceeds acceptable levels and the device is degraded as an indication that something may be wrong. ZFS continues to use the device as necessary. .It The number of I/O errors exceeds acceptable levels. The device could not be marked as faulted because there are insufficient replicas to continue functioning. .El .It Sy FAULTED One or more top-level vdevs is in the faulted state because one or more component devices are offline. Insufficient replicas exist to continue functioning. .Pp One or more component devices is in the faulted state, and insufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet .It The device could be opened, but the contents did not match expected values. .It The number of I/O errors exceeds acceptable levels and the device is faulted to prevent further use of the device. .El .It Sy OFFLINE The device was explicitly taken offline by the .Nm zpool Cm offline command. .It Sy ONLINE The device is online and functioning. .It Sy REMOVED The device was physically removed while the system was running. Device removal detection is hardware-dependent and may not be supported on all platforms. .It Sy UNAVAIL The device could not be opened. If a pool is imported when a device was unavailable, then the device will be identified by a unique identifier instead of its path since the path was never correct in the first place. .El .Pp If a device is removed and later re-attached to the system, ZFS attempts to put the device online automatically. Device attach detection is hardware-dependent and might not be supported on all platforms. .Ss Hot Spares ZFS allows devices to be associated with pools as .Qq hot spares . These devices are not actively used in the pool, but when an active device fails, it is automatically replaced by a hot spare. To create a pool with hot spares, specify a .Sy spare vdev with any number of devices. For example, .Bd -literal # zpool create pool mirror c0d0 c1d0 spare c2d0 c3d0 .Ed .Pp Spares can be shared across multiple pools, and can be added with the .Nm zpool Cm add command and removed with the .Nm zpool Cm remove command. Once a spare replacement is initiated, a new .Sy spare vdev is created within the configuration that will remain there until the original device is replaced. At this point, the hot spare becomes available again if another device fails. .Pp If a pool has a shared spare that is currently being used, the pool can not be exported since other pools may use this shared spare, which may lead to potential data corruption. .Pp An in-progress spare replacement can be cancelled by detaching the hot spare. If the original faulted device is detached, then the hot spare assumes its place in the configuration, and is removed from the spare list of all active pools. .Pp Spares cannot replace log devices. .Ss Intent Log The ZFS Intent Log (ZIL) satisfies POSIX requirements for synchronous transactions. For instance, databases often require their transactions to be on stable storage devices when returning from a system call. NFS and other applications can also use .Xr fsync 3C to ensure data stability. By default, the intent log is allocated from blocks within the main pool. However, it might be possible to get better performance using separate intent log devices such as NVRAM or a dedicated disk. For example: .Bd -literal # zpool create pool c0d0 c1d0 log c2d0 .Ed .Pp Multiple log devices can also be specified, and they can be mirrored. See the .Sx EXAMPLES section for an example of mirroring multiple log devices. .Pp Log devices can be added, replaced, attached, detached, and imported and exported as part of the larger pool. Mirrored devices can be removed by specifying the top-level mirror vdev. .Ss Cache Devices Devices can be added to a storage pool as .Qq cache devices . These devices provide an additional layer of caching between main memory and disk. For read-heavy workloads, where the working set size is much larger than what can be cached in main memory, using cache devices allow much more of this working set to be served from low latency media. Using cache devices provides the greatest performance improvement for random read-workloads of mostly static content. .Pp To create a pool with cache devices, specify a .Sy cache vdev with any number of devices. For example: .Bd -literal # zpool create pool c0d0 c1d0 cache c2d0 c3d0 .Ed .Pp Cache devices cannot be mirrored or part of a raidz configuration. If a read error is encountered on a cache device, that read I/O is reissued to the original storage pool device, which might be part of a mirrored or raidz configuration. .Pp The content of the cache devices is considered volatile, as is the case with other system caches. .Ss Pool checkpoint Before starting critical procedures that include destructive actions (e.g .Nm zfs Cm destroy ), an administrator can checkpoint the pool's state and in the case of a mistake or failure, rewind the entire pool back to the checkpoint. Otherwise, the checkpoint can be discarded when the procedure has completed successfully. .Pp A pool checkpoint can be thought of as a pool-wide snapshot and should be used with care as it contains every part of the pool's state, from properties to vdev configuration. Thus, while a pool has a checkpoint certain operations are not allowed. Specifically, vdev removal/attach/detach, mirror splitting, and changing the pool's guid. Adding a new vdev is supported but in the case of a rewind it will have to be added again. Finally, users of this feature should keep in mind that scrubs in a pool that has a checkpoint do not repair checkpointed data. .Pp To create a checkpoint for a pool: .Bd -literal # zpool checkpoint pool .Ed .Pp To later rewind to its checkpointed state, you need to first export it and then rewind it during import: .Bd -literal # zpool export pool # zpool import --rewind-to-checkpoint pool .Ed .Pp To discard the checkpoint from a pool: .Bd -literal # zpool checkpoint -d pool .Ed .Pp Dataset reservations (controlled by the .Nm reservation or .Nm refreservation zfs properties) may be unenforceable while a checkpoint exists, because the checkpoint is allowed to consume the dataset's reservation. Finally, data that is part of the checkpoint but has been freed in the current state of the pool won't be scanned during a scrub. .Ss Properties Each pool has several properties associated with it. Some properties are read-only statistics while others are configurable and change the behavior of the pool. .Pp The following are read-only properties: .Bl -tag -width Ds .It Sy available Amount of storage available within the pool. This property can also be referred to by its shortened column name, .Sy avail . .It Sy bootsize The size of the system boot partition. This property can only be set at pool creation time and is read-only once pool is created. Setting this property implies using the .Fl B option. .It Sy capacity Percentage of pool space used. This property can also be referred to by its shortened column name, .Sy cap . .It Sy expandsize Amount of uninitialized space within the pool or device that can be used to increase the total capacity of the pool. Uninitialized space consists of any space on an EFI labeled vdev which has not been brought online .Po e.g, using .Nm zpool Cm online Fl e .Pc . This space occurs when a LUN is dynamically expanded. .It Sy fragmentation The amount of fragmentation in the pool. .It Sy free The amount of free space available in the pool. .It Sy freeing After a file system or snapshot is destroyed, the space it was using is returned to the pool asynchronously. .Sy freeing is the amount of space remaining to be reclaimed. Over time .Sy freeing will decrease while .Sy free increases. .It Sy health The current health of the pool. Health can be one of .Sy ONLINE , DEGRADED , FAULTED , OFFLINE, REMOVED , UNAVAIL . .It Sy guid A unique identifier for the pool. .It Sy size Total size of the storage pool. .It Sy unsupported@ Ns Em feature_guid Information about unsupported features that are enabled on the pool. See .Xr zpool-features 5 for details. .It Sy used Amount of storage space used within the pool. .El .Pp The space usage properties report actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a raidz configuration depends on the characteristics of the data being written. In addition, ZFS reserves some space for internal accounting that the .Xr zfs 1M command takes into account, but the .Nm command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable. .Pp The following property can be set at creation time and import time: .Bl -tag -width Ds .It Sy altroot Alternate root directory. If set, this directory is prepended to any mount points within the pool. This can be used when examining an unknown pool where the mount points cannot be trusted, or in an alternate boot environment, where the typical paths are not valid. .Sy altroot is not a persistent property. It is valid only while the system is up. Setting .Sy altroot defaults to using .Sy cachefile Ns = Ns Sy none , though this may be overridden using an explicit setting. .El .Pp The following property can be set only at import time: .Bl -tag -width Ds .It Sy readonly Ns = Ns Sy on Ns | Ns Sy off If set to .Sy on , the pool will be imported in read-only mode. This property can also be referred to by its shortened column name, .Sy rdonly . .El .Pp The following properties can be set at creation time and import time, and later changed with the .Nm zpool Cm set command: .Bl -tag -width Ds .It Sy autoexpand Ns = Ns Sy on Ns | Ns Sy off Controls automatic pool expansion when the underlying LUN is grown. If set to .Sy on , the pool will be resized according to the size of the expanded device. If the device is part of a mirror or raidz then all devices within that mirror/raidz group must be expanded before the new space is made available to the pool. The default behavior is .Sy off . This property can also be referred to by its shortened column name, .Sy expand . .It Sy autoreplace Ns = Ns Sy on Ns | Ns Sy off Controls automatic device replacement. If set to .Sy off , device replacement must be initiated by the administrator by using the .Nm zpool Cm replace command. If set to .Sy on , any new device, found in the same physical location as a device that previously belonged to the pool, is automatically formatted and replaced. The default behavior is .Sy off . This property can also be referred to by its shortened column name, .Sy replace . .It Sy bootfs Ns = Ns Ar pool Ns / Ns Ar dataset Identifies the default bootable dataset for the root pool. This property is expected to be set mainly by the installation and upgrade programs. .It Sy cachefile Ns = Ns Ar path Ns | Ns Sy none Controls the location of where the pool configuration is cached. Discovering all pools on system startup requires a cached copy of the configuration data that is stored on the root file system. All pools in this cache are automatically imported when the system boots. Some environments, such as install and clustering, need to cache this information in a different location so that pools are not automatically imported. Setting this property caches the pool configuration in a different location that can later be imported with .Nm zpool Cm import Fl c . Setting it to the special value .Sy none creates a temporary pool that is never cached, and the special value .Qq .Pq empty string uses the default location. .Pp Multiple pools can share the same cache file. Because the kernel destroys and recreates this file when pools are added and removed, care should be taken when attempting to access this file. When the last pool using a .Sy cachefile is exported or destroyed, the file is removed. .It Sy comment Ns = Ns Ar text A text string consisting of printable ASCII characters that will be stored such that it is available even if the pool becomes faulted. An administrator can provide additional information about a pool using this property. .It Sy dedupditto Ns = Ns Ar number Threshold for the number of block ditto copies. If the reference count for a deduplicated block increases above this number, a new ditto copy of this block is automatically stored. The default setting is .Sy 0 which causes no ditto copies to be created for deduplicated blocks. The minimum legal nonzero setting is .Sy 100 . .It Sy delegation Ns = Ns Sy on Ns | Ns Sy off Controls whether a non-privileged user is granted access based on the dataset permissions defined on the dataset. See .Xr zfs 1M for more information on ZFS delegated administration. .It Sy failmode Ns = Ns Sy wait Ns | Ns Sy continue Ns | Ns Sy panic Controls the system behavior in the event of catastrophic pool failure. This condition is typically a result of a loss of connectivity to the underlying storage device(s) or a failure of all devices within the pool. The behavior of such an event is determined as follows: .Bl -tag -width "continue" .It Sy wait Blocks all I/O access until the device connectivity is recovered and the errors are cleared. This is the default behavior. .It Sy continue Returns .Er EIO to any new write I/O requests but allows reads to any of the remaining healthy devices. Any write requests that have yet to be committed to disk would be blocked. .It Sy panic Prints out a message to the console and generates a system crash dump. .El .It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled The value of this property is the current state of .Ar feature_name . The only valid value when setting this property is .Sy enabled which moves .Ar feature_name to the enabled state. See .Xr zpool-features 5 for details on feature states. .It Sy listsnapshots Ns = Ns Sy on Ns | Ns Sy off Controls whether information about snapshots associated with this pool is output when .Nm zfs Cm list is run without the .Fl t option. The default value is .Sy off . This property can also be referred to by its shortened name, .Sy listsnaps . .It Sy version Ns = Ns Ar version The current on-disk version of the pool. This can be increased, but never decreased. The preferred method of updating pools is with the .Nm zpool Cm upgrade command, though this property can be used when a specific version is needed for backwards compatibility. Once feature flags are enabled on a pool this property will no longer have a value. .El .Ss Subcommands All subcommands that modify state are logged persistently to the pool in their original form. .Pp The .Nm command provides subcommands to create and destroy storage pools, add capacity to storage pools, and provide information about the storage pools. The following subcommands are supported: .Bl -tag -width Ds .It Xo .Nm .Fl \? .Xc Displays a help message. .It Xo .Nm .Cm add .Op Fl fn .Ar pool vdev Ns ... .Xc Adds the specified virtual devices to the given pool. The .Ar vdev specification is described in the .Sx Virtual Devices section. The behavior of the .Fl f option, and the device checks performed are described in the .Nm zpool Cm create subcommand. .Bl -tag -width Ds .It Fl f Forces use of .Ar vdev Ns s , even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner. .It Fl n Displays the configuration that would be used without actually adding the .Ar vdev Ns s . The actual pool creation can still fail due to insufficient privileges or device sharing. .El .It Xo .Nm .Cm attach .Op Fl f .Ar pool device new_device .Xc Attaches .Ar new_device to the existing .Ar device . The existing device cannot be part of a raidz configuration. If .Ar device is not currently part of a mirrored configuration, .Ar device automatically transforms into a two-way mirror of .Ar device and .Ar new_device . If .Ar device is part of a two-way mirror, attaching .Ar new_device creates a three-way mirror, and so on. In either case, .Ar new_device begins to resilver immediately. .Bl -tag -width Ds .It Fl f Forces use of .Ar new_device , even if its appears to be in use. Not all devices can be overridden in this manner. .El .It Xo .Nm .Cm checkpoint .Op Fl d, -discard .Ar pool .Xc Checkpoints the current state of .Ar pool , which can be later restored by .Nm zpool Cm import --rewind-to-checkpoint . The existence of a checkpoint in a pool prohibits the following .Nm zpool commands: .Cm remove , .Cm attach , .Cm detach , .Cm split , and .Cm reguid . In addition, it may break reservation boundaries if the pool lacks free space. The .Nm zpool Cm status command indicates the existence of a checkpoint or the progress of discarding a checkpoint from a pool. The .Nm zpool Cm list command reports how much space the checkpoint takes from the pool. .Bl -tag -width Ds .It Fl d, -discard Discards an existing checkpoint from .Ar pool . .El .It Xo .Nm .Cm clear .Ar pool .Op Ar device .Xc Clears device errors in a pool. If no arguments are specified, all device errors within the pool are cleared. If one or more devices is specified, only those errors associated with the specified device or devices are cleared. .It Xo .Nm .Cm create .Op Fl dfn .Op Fl B .Op Fl m Ar mountpoint .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Oo Fl O Ar file-system-property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root +.Op Fl t Ar tempname .Ar pool vdev Ns ... .Xc Creates a new storage pool containing the virtual devices specified on the command line. The pool name must begin with a letter, and can only contain alphanumeric characters as well as underscore .Pq Qq Sy _ , dash .Pq Qq Sy - , and period .Pq Qq Sy \&. . The pool names .Sy mirror , .Sy raidz , .Sy spare and .Sy log are reserved, as are names beginning with the pattern .Sy c[0-9] . The .Ar vdev specification is described in the .Sx Virtual Devices section. .Pp The command verifies that each device specified is accessible and not currently in use by another subsystem. There are some uses, such as being currently mounted, or specified as the dedicated dump device, that prevents a device from ever being used by ZFS. Other uses, such as having a preexisting UFS file system, can be overridden with the .Fl f option. .Pp The command also checks that the replication strategy for the pool is consistent. An attempt to combine redundant and non-redundant storage in a single pool, or to mix disks and files, results in an error unless .Fl f is specified. The use of differently sized devices within a single raidz or mirror group is also flagged as an error unless .Fl f is specified. .Pp Unless the .Fl R option is specified, the default mount point is .Pa / Ns Ar pool . The mount point must not exist or must be empty, or else the root dataset cannot be mounted. This can be overridden with the .Fl m option. .Pp By default all supported features are enabled on the new pool unless the .Fl d option is specified. .Bl -tag -width Ds .It Fl B Create whole disk pool with EFI System partition to support booting system with UEFI firmware. Default size is 256MB. To create boot partition with custom size, set the .Sy bootsize property with the .Fl o option. See the .Sx Properties section for details. .It Fl d Do not enable any features on the new pool. Individual features can be enabled by setting their corresponding properties to .Sy enabled with the .Fl o option. See .Xr zpool-features 5 for details about feature properties. .It Fl f Forces use of .Ar vdev Ns s , even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner. .It Fl m Ar mountpoint Sets the mount point for the root dataset. The default mount point is .Pa /pool or .Pa altroot/pool if .Ar altroot is specified. The mount point must be an absolute path, .Sy legacy , or .Sy none . For more information on dataset mount points, see .Xr zfs 1M . .It Fl n Displays the configuration that would be used without actually creating the pool. The actual pool creation can still fail due to insufficient privileges or device sharing. .It Fl o Ar property Ns = Ns Ar value Sets the given pool properties. See the .Sx Properties section for a list of valid properties that can be set. .It Fl O Ar file-system-property Ns = Ns Ar value Sets the given file system properties in the root file system of the pool. See the .Sx Properties section of .Xr zfs 1M for a list of valid properties that can be set. .It Fl R Ar root Equivalent to .Fl o Sy cachefile Ns = Ns Sy none Fl o Sy altroot Ns = Ns Ar root +.It Fl t Ar tempname +Sets the in-core pool name to +.Pa tempname +while the on-disk name will be the name specified as the pool name +.Pa pool . +This will set the default cachefile property to +.Sy none. +This is intended to handle name space collisions when creating pools +for other systems, such as virtual machines or physical machines +whose pools live on network block devices. .El .It Xo .Nm .Cm destroy .Op Fl f .Ar pool .Xc Destroys the given pool, freeing up any devices for other use. This command tries to unmount any active datasets before destroying the pool. .Bl -tag -width Ds .It Fl f Forces any active datasets contained within the pool to be unmounted. .El .It Xo .Nm .Cm detach .Ar pool device .Xc Detaches .Ar device from a mirror. The operation is refused if there are no other valid replicas of the data. .It Xo .Nm .Cm export .Op Fl f .Ar pool Ns ... .Xc Exports the given pools from the system. All devices are marked as exported, but are still considered in use by other subsystems. The devices can be moved between systems .Pq even those of different endianness and imported as long as a sufficient number of devices are present. .Pp Before exporting the pool, all datasets within the pool are unmounted. A pool can not be exported if it has a shared spare that is currently being used. .Pp For pools to be portable, you must give the .Nm command whole disks, not just slices, so that ZFS can label the disks with portable EFI labels. Otherwise, disk drivers on platforms of different endianness will not recognize the disks. .Bl -tag -width Ds .It Fl f Forcefully unmount all datasets, using the .Nm unmount Fl f command. .Pp This command will forcefully export the pool even if it has a shared spare that is currently being used. This may lead to potential data corruption. .El .It Xo .Nm .Cm get .Op Fl Hp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... .Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns ... .Ar pool Ns ... .Xc Retrieves the given list of properties .Po or all properties if .Sy all is used .Pc for the specified storage pool(s). These properties are displayed with the following fields: .Bd -literal name Name of storage pool property Property name value Property value source Property source, either 'default' or 'local'. .Ed .Pp See the .Sx Properties section for more information on the available pool properties. .Bl -tag -width Ds .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl o Ar field A comma-separated list of columns to display. .Sy name Ns \&, Ns Sy property Ns \&, Ns Sy value Ns \&, Ns Sy source is the default value. .It Fl p Display numbers in parsable (exact) values. .El .It Xo .Nm .Cm history .Op Fl il .Oo Ar pool Oc Ns ... .Xc Displays the command history of the specified pool(s) or all pools if no pool is specified. .Bl -tag -width Ds .It Fl i Displays internally logged ZFS events in addition to user initiated events. .It Fl l Displays log records in long format, which in addition to standard format includes, the user name, the hostname, and the zone in which the operation was performed. .El .It Xo .Nm .Cm import .Op Fl D .Op Fl d Ar dir .Xc Lists pools available to import. If the .Fl d option is not specified, this command searches for devices in .Pa /dev/dsk . The .Fl d option can be specified multiple times, and all directories are searched. If the device appears to be part of an exported pool, this command displays a summary of the pool with the name of the pool, a numeric identifier, as well as the vdev layout and current health of the device for each device or file. Destroyed pools, pools that were previously destroyed with the .Nm zpool Cm destroy command, are not listed unless the .Fl D option is specified. .Pp The numeric identifier is unique, and can be used instead of the pool name when multiple exported pools of the same name are available. .Bl -tag -width Ds .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. .It Fl D Lists destroyed pools only. .El .It Xo .Nm .Cm import .Fl a .Op Fl DfmN .Op Fl F Op Fl n .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Xc Imports all pools found in the search directories. Identical to the previous command, except that all pools with a sufficient number of devices available are imported. Destroyed pools, pools that were previously destroyed with the .Nm zpool Cm destroy command, will not be imported unless the .Fl D option is specified. .Bl -tag -width Ds .It Fl a Searches for and imports all pools found. .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. This option is incompatible with the .Fl c option. .It Fl D Imports destroyed pools only. The .Fl f option is also required. .It Fl f Forces import, even if the pool appears to be potentially active. .It Fl F Recovery mode for a non-importable pool. Attempt to return the pool to an importable state by discarding the last few transactions. Not all damaged pools can be recovered by using this option. If successful, the data from the discarded transactions is irretrievably lost. This option is ignored if the pool is importable or already imported. .It Fl m Allows a pool to import when there is a missing log device. Recent transactions can be lost because the log device will be discarded. .It Fl n Used with the .Fl F recovery option. Determines whether a non-importable pool can be made importable again, but does not actually perform the pool recovery. For more details about pool recovery mode, see the .Fl F option, above. .It Fl N Import the pool without mounting any file systems. .It Fl o Ar mntopts Comma-separated list of mount options to use when mounting datasets within the pool. See .Xr zfs 1M for a description of dataset properties and mount options. .It Fl o Ar property Ns = Ns Ar value Sets the specified property on the imported pool. See the .Sx Properties section for more information on the available pool properties. .It Fl R Ar root Sets the .Sy cachefile property to .Sy none and the .Sy altroot property to .Ar root . .El .It Xo .Nm .Cm import -.Op Fl Dfm +.Op Fl Dfmt .Op Fl F Op Fl n .Op Fl -rewind-to-checkpoint .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Ar pool Ns | Ns Ar id .Op Ar newpool .Xc Imports a specific pool. A pool can be identified by its name or the numeric identifier. If .Ar newpool is specified, the pool is imported using the name .Ar newpool . Otherwise, it is imported with the same name as its exported name. .Pp If a device is removed from a system without running .Nm zpool Cm export first, the device appears as potentially active. It cannot be determined if this was a failed export, or whether the device is really in use from another host. To import a pool in this state, the .Fl f option is required. .Bl -tag -width Ds .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. This option is incompatible with the .Fl c option. .It Fl D Imports destroyed pool. The .Fl f option is also required. .It Fl f Forces import, even if the pool appears to be potentially active. .It Fl F Recovery mode for a non-importable pool. Attempt to return the pool to an importable state by discarding the last few transactions. Not all damaged pools can be recovered by using this option. If successful, the data from the discarded transactions is irretrievably lost. This option is ignored if the pool is importable or already imported. .It Fl m Allows a pool to import when there is a missing log device. Recent transactions can be lost because the log device will be discarded. .It Fl n Used with the .Fl F recovery option. Determines whether a non-importable pool can be made importable again, but does not actually perform the pool recovery. For more details about pool recovery mode, see the .Fl F option, above. .It Fl o Ar mntopts Comma-separated list of mount options to use when mounting datasets within the pool. See .Xr zfs 1M for a description of dataset properties and mount options. .It Fl o Ar property Ns = Ns Ar value Sets the specified property on the imported pool. See the .Sx Properties section for more information on the available pool properties. .It Fl R Ar root Sets the .Sy cachefile property to .Sy none and the .Sy altroot property to .Ar root . +.It Fl t +Used with +.Ar newpool . +Specifies that +.Ar newpool +is temporary. +Temporary pool names last until export. +Ensures that the original pool name will be used in all label updates and +therefore is retained upon export. +Will also set +.Sy cachefile +property to +.Sy none +when not explicitly specified. .It Fl -rewind-to-checkpoint Rewinds pool to the checkpointed state. Once the pool is imported with this flag there is no way to undo the rewind. All changes and data that were written after the checkpoint are lost! The only exception is when the .Sy readonly mounting option is enabled. In this case, the checkpointed state of the pool is opened and an administrator can see how the pool would look like if they were to fully rewind. .El .It Xo .Nm .Cm initialize .Op Fl cs .Ar pool .Op Ar device Ns ... .Xc Begins initializing by writing to all unallocated regions on the specified devices, or all eligible devices in the pool if no individual devices are specified. Only leaf data or log devices may be initialized. .Bl -tag -width Ds .It Fl c, -cancel Cancel initializing on the specified devices, or all eligible devices if none are specified. If one or more target devices are invalid or are not currently being initialized, the command will fail and no cancellation will occur on any device. .It Fl s -suspend Suspend initializing on the specified devices, or all eligible devices if none are specified. If one or more target devices are invalid or are not currently being initialized, the command will fail and no suspension will occur on any device. Initializing can then be resumed by running .Nm zpool Cm initialize with no flags on the relevant target devices. .El .It Xo .Nm .Cm iostat .Op Fl v .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count .Xc Displays I/O statistics for the given pools. When given an .Ar interval , the statistics are printed every .Ar interval seconds until ^C is pressed. If no .Ar pool Ns s are specified, statistics for every pool in the system is shown. If .Ar count is specified, the command exits after .Ar count reports are printed. .Bl -tag -width Ds .It Fl T Sy u Ns | Ns Sy d Display a time stamp. Specify .Sy u for a printed representation of the internal representation of time. See .Xr time 2 . Specify .Sy d for standard date format. See .Xr date 1 . .It Fl v Verbose statistics Reports usage statistics for individual vdevs within the pool, in addition to the pool-wide statistics. .El .It Xo .Nm .Cm labelclear .Op Fl f .Ar device .Xc Removes ZFS label information from the specified .Ar device . The .Ar device must not be part of an active pool configuration. .Bl -tag -width Ds .It Fl f Treat exported or foreign devices as inactive. .El .It Xo .Nm .Cm list .Op Fl Hpv .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count .Xc Lists the given pools along with a health status and space usage. If no .Ar pool Ns s are specified, all pools in the system are listed. When given an .Ar interval , the information is printed every .Ar interval seconds until ^C is pressed. If .Ar count is specified, the command exits after .Ar count reports are printed. .Bl -tag -width Ds .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl o Ar property Comma-separated list of properties to display. See the .Sx Properties section for a list of valid properties. The default list is .Sy name , size , used , available , fragmentation , expandsize , capacity , .Sy dedupratio , health , altroot . .It Fl p Display numbers in parsable .Pq exact values. .It Fl T Sy u Ns | Ns Sy d Display a time stamp. Specify .Fl u for a printed representation of the internal representation of time. See .Xr time 2 . Specify .Fl d for standard date format. See .Xr date 1 . .It Fl v Verbose statistics. Reports usage statistics for individual vdevs within the pool, in addition to the pool-wise statistics. .El .It Xo .Nm .Cm offline .Op Fl t .Ar pool Ar device Ns ... .Xc Takes the specified physical device offline. While the .Ar device is offline, no attempt is made to read or write to the device. This command is not applicable to spares. .Bl -tag -width Ds .It Fl t Temporary. Upon reboot, the specified physical device reverts to its previous state. .El .It Xo .Nm .Cm online .Op Fl e .Ar pool Ar device Ns ... .Xc Brings the specified physical device online. This command is not applicable to spares. .Bl -tag -width Ds .It Fl e Expand the device to use all available space. If the device is part of a mirror or raidz then all devices must be expanded before the new space will become available to the pool. .El .It Xo .Nm .Cm reguid .Ar pool .Xc Generates a new unique identifier for the pool. You must ensure that all devices in this pool are online and healthy before performing this action. .It Xo .Nm .Cm reopen .Ar pool .Xc Reopen all the vdevs associated with the pool. .It Xo .Nm .Cm remove .Op Fl np .Ar pool Ar device Ns ... .Xc Removes the specified device from the pool. This command currently only supports removing hot spares, cache, log devices and mirrored top-level vdevs (mirror of leaf devices); but not raidz. .sp Removing a top-level vdev reduces the total amount of space in the storage pool. The specified device will be evacuated by copying all allocated space from it to the other devices in the pool. In this case, the .Nm zpool Cm remove command initiates the removal and returns, while the evacuation continues in the background. The removal progress can be monitored with .Nm zpool Cm status. This feature must be enabled to be used, see .Xr zpool-features 5 .Pp A mirrored top-level device (log or data) can be removed by specifying the top-level mirror for the same. Non-log devices or data devices that are part of a mirrored configuration can be removed using the .Nm zpool Cm detach command. .Bl -tag -width Ds .It Fl n Do not actually perform the removal ("no-op"). Instead, print the estimated amount of memory that will be used by the mapping table after the removal completes. This is nonzero only for top-level vdevs. .El .Bl -tag -width Ds .It Fl p Used in conjunction with the .Fl n flag, displays numbers as parsable (exact) values. .El .It Xo .Nm .Cm remove .Fl s .Ar pool .Xc Stops and cancels an in-progress removal of a top-level vdev. .It Xo .Nm .Cm replace .Op Fl f .Ar pool Ar device Op Ar new_device .Xc Replaces .Ar old_device with .Ar new_device . This is equivalent to attaching .Ar new_device , waiting for it to resilver, and then detaching .Ar old_device . .Pp The size of .Ar new_device must be greater than or equal to the minimum size of all the devices in a mirror or raidz configuration. .Pp .Ar new_device is required if the pool is not redundant. If .Ar new_device is not specified, it defaults to .Ar old_device . This form of replacement is useful after an existing disk has failed and has been physically replaced. In this case, the new disk may have the same .Pa /dev/dsk path as the old device, even though it is actually a different disk. ZFS recognizes this. .Bl -tag -width Ds .It Fl f Forces use of .Ar new_device , even if its appears to be in use. Not all devices can be overridden in this manner. .El .It Xo .Nm .Cm scrub .Op Fl s | Fl p .Ar pool Ns ... .Xc Begins a scrub or resumes a paused scrub. The scrub examines all data in the specified pools to verify that it checksums correctly. For replicated .Pq mirror or raidz devices, ZFS automatically repairs any damage discovered during the scrub. The .Nm zpool Cm status command reports the progress of the scrub and summarizes the results of the scrub upon completion. .Pp Scrubbing and resilvering are very similar operations. The difference is that resilvering only examines data that ZFS knows to be out of date .Po for example, when attaching a new device to a mirror or replacing an existing device .Pc , whereas scrubbing examines all data to discover silent errors due to hardware faults or disk failure. .Pp Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows one at a time. If a scrub is paused, the .Nm zpool Cm scrub resumes it. If a resilver is in progress, ZFS does not allow a scrub to be started until the resilver completes. .Bl -tag -width Ds .It Fl s Stop scrubbing. .El .Bl -tag -width Ds .It Fl p Pause scrubbing. Scrub pause state and progress are periodically synced to disk. If the system is restarted or pool is exported during a paused scrub, even after import, scrub will remain paused until it is resumed. Once resumed the scrub will pick up from the place where it was last checkpointed to disk. To resume a paused scrub issue .Nm zpool Cm scrub again. .El .It Xo .Nm .Cm set .Ar property Ns = Ns Ar value .Ar pool .Xc Sets the given property on the specified pool. See the .Sx Properties section for more information on what properties can be set and acceptable values. .It Xo .Nm .Cm split .Op Fl n .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Ar pool newpool .Xc Splits devices off .Ar pool creating .Ar newpool . All vdevs in .Ar pool must be mirrors. At the time of the split, .Ar newpool will be a replica of .Ar pool . .Bl -tag -width Ds .It Fl n Do dry run, do not actually perform the split. Print out the expected configuration of .Ar newpool . .It Fl o Ar property Ns = Ns Ar value Sets the specified property for .Ar newpool . See the .Sx Properties section for more information on the available pool properties. .It Fl R Ar root Set .Sy altroot for .Ar newpool to .Ar root and automatically import it. .El .It Xo .Nm .Cm status .Op Fl Dvx .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count .Xc Displays the detailed health status for the given pools. If no .Ar pool is specified, then the status of each pool in the system is displayed. For more information on pool and device health, see the .Sx Device Failure and Recovery section. .Pp If a scrub or resilver is in progress, this command reports the percentage done and the estimated time to completion. Both of these are only approximate, because the amount of data in the pool and the other workloads on the system can change. .Bl -tag -width Ds .It Fl D Display a histogram of deduplication statistics, showing the allocated .Pq physically present on disk and referenced .Pq logically referenced in the pool block counts and sizes by reference count. .It Fl T Sy u Ns | Ns Sy d Display a time stamp. Specify .Fl u for a printed representation of the internal representation of time. See .Xr time 2 . Specify .Fl d for standard date format. See .Xr date 1 . .It Fl v Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub. .It Fl x Only display status for pools that are exhibiting errors or are otherwise unavailable. Warnings about pools not using the latest on-disk format will not be included. .El .It Xo .Nm .Cm upgrade .Xc Displays pools which do not have all supported features enabled and pools formatted using a legacy ZFS version number. These pools can continue to be used, but some features may not be available. Use .Nm zpool Cm upgrade Fl a to enable all features on all pools. .It Xo .Nm .Cm upgrade .Fl v .Xc Displays legacy ZFS versions supported by the current software. See .Xr zpool-features 5 for a description of feature flags features supported by the current software. .It Xo .Nm .Cm upgrade .Op Fl V Ar version .Fl a Ns | Ns Ar pool Ns ... .Xc Enables all supported features on the given pool. Once this is done, the pool will no longer be accessible on systems that do not support feature flags. See .Xr zpool-features 5 for details on compatibility with systems that support feature flags, but do not support all features enabled on the pool. .Bl -tag -width Ds .It Fl a Enables all supported features on all pools. .It Fl V Ar version Upgrade to the specified legacy version. If the .Fl V flag is specified, no features will be enabled on the pool. This option can only be used to increase the version number up to the last supported legacy version number. .El .El .Sh EXIT STATUS The following exit values are returned: .Bl -tag -width Ds .It Sy 0 Successful completion. .It Sy 1 An error occurred. .It Sy 2 Invalid command line options were specified. .El .Sh EXAMPLES .Bl -tag -width Ds .It Sy Example 1 No Creating a RAID-Z Storage Pool The following command creates a pool with a single raidz root vdev that consists of six disks. .Bd -literal # zpool create tank raidz c0t0d0 c0t1d0 c0t2d0 c0t3d0 c0t4d0 c0t5d0 .Ed .It Sy Example 2 No Creating a Mirrored Storage Pool The following command creates a pool with two mirrors, where each mirror contains two disks. .Bd -literal # zpool create tank mirror c0t0d0 c0t1d0 mirror c0t2d0 c0t3d0 .Ed .It Sy Example 3 No Creating a ZFS Storage Pool by Using Slices The following command creates an unmirrored pool using two disk slices. .Bd -literal # zpool create tank /dev/dsk/c0t0d0s1 c0t1d0s4 .Ed .It Sy Example 4 No Creating a ZFS Storage Pool by Using Files The following command creates an unmirrored pool using files. While not recommended, a pool based on files can be useful for experimental purposes. .Bd -literal # zpool create tank /path/to/file/a /path/to/file/b .Ed .It Sy Example 5 No Adding a Mirror to a ZFS Storage Pool The following command adds two mirrored disks to the pool .Em tank , assuming the pool is already made up of two-way mirrors. The additional space is immediately available to any datasets within the pool. .Bd -literal # zpool add tank mirror c1t0d0 c1t1d0 .Ed .It Sy Example 6 No Listing Available ZFS Storage Pools The following command lists all available pools on the system. In this case, the pool .Em zion is faulted due to a missing device. The results from this command are similar to the following: .Bd -literal # zpool list NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT rpool 19.9G 8.43G 11.4G 33% - 42% 1.00x ONLINE - tank 61.5G 20.0G 41.5G 48% - 32% 1.00x ONLINE - zion - - - - - - - FAULTED - .Ed .It Sy Example 7 No Destroying a ZFS Storage Pool The following command destroys the pool .Em tank and any datasets contained within. .Bd -literal # zpool destroy -f tank .Ed .It Sy Example 8 No Exporting a ZFS Storage Pool The following command exports the devices in pool .Em tank so that they can be relocated or later imported. .Bd -literal # zpool export tank .Ed .It Sy Example 9 No Importing a ZFS Storage Pool The following command displays available pools, and then imports the pool .Em tank for use on the system. The results from this command are similar to the following: .Bd -literal # zpool import pool: tank id: 15451357997522795478 state: ONLINE action: The pool can be imported using its name or numeric identifier. config: tank ONLINE mirror ONLINE c1t2d0 ONLINE c1t3d0 ONLINE # zpool import tank .Ed .It Sy Example 10 No Upgrading All ZFS Storage Pools to the Current Version The following command upgrades all ZFS Storage pools to the current version of the software. .Bd -literal # zpool upgrade -a This system is currently running ZFS version 2. .Ed .It Sy Example 11 No Managing Hot Spares The following command creates a new pool with an available hot spare: .Bd -literal # zpool create tank mirror c0t0d0 c0t1d0 spare c0t2d0 .Ed .Pp If one of the disks were to fail, the pool would be reduced to the degraded state. The failed device can be replaced using the following command: .Bd -literal # zpool replace tank c0t0d0 c0t3d0 .Ed .Pp Once the data has been resilvered, the spare is automatically removed and is made available for use should another device fail. The hot spare can be permanently removed from the pool using the following command: .Bd -literal # zpool remove tank c0t2d0 .Ed .It Sy Example 12 No Creating a ZFS Pool with Mirrored Separate Intent Logs The following command creates a ZFS storage pool consisting of two, two-way mirrors and mirrored log devices: .Bd -literal # zpool create pool mirror c0d0 c1d0 mirror c2d0 c3d0 log mirror \e c4d0 c5d0 .Ed .It Sy Example 13 No Adding Cache Devices to a ZFS Pool The following command adds two disks for use as cache devices to a ZFS storage pool: .Bd -literal # zpool add pool cache c2d0 c3d0 .Ed .Pp Once added, the cache devices gradually fill with content from main memory. Depending on the size of your cache devices, it could take over an hour for them to fill. Capacity and reads can be monitored using the .Cm iostat option as follows: .Bd -literal # zpool iostat -v pool 5 .Ed .It Sy Example 14 No Removing a Mirrored top-level (Log or Data) Device The following commands remove the mirrored log device .Sy mirror-2 and mirrored top-level data device .Sy mirror-1 . .Pp Given this configuration: .Bd -literal pool: tank state: ONLINE scrub: none requested config: NAME STATE READ WRITE CKSUM tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 c6t0d0 ONLINE 0 0 0 c6t1d0 ONLINE 0 0 0 mirror-1 ONLINE 0 0 0 c6t2d0 ONLINE 0 0 0 c6t3d0 ONLINE 0 0 0 logs mirror-2 ONLINE 0 0 0 c4t0d0 ONLINE 0 0 0 c4t1d0 ONLINE 0 0 0 .Ed .Pp The command to remove the mirrored log .Sy mirror-2 is: .Bd -literal # zpool remove tank mirror-2 .Ed .Pp The command to remove the mirrored data .Sy mirror-1 is: .Bd -literal # zpool remove tank mirror-1 .Ed .It Sy Example 15 No Displaying expanded space on a device The following command displays the detailed information for the pool .Em data . This pool is comprised of a single raidz vdev where one of its devices increased its capacity by 10GB. In this example, the pool will not be able to utilize this extra capacity until all the devices under the raidz vdev have been expanded. .Bd -literal # zpool list -v data NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT data 23.9G 14.6G 9.30G 48% - 61% 1.00x ONLINE - raidz1 23.9G 14.6G 9.30G 48% - c1t1d0 - - - - - c1t2d0 - - - - 10G c1t3d0 - - - - - .Ed .El .Sh INTERFACE STABILITY .Sy Evolving .Sh SEE ALSO .Xr zfs 1M , .Xr attributes 5 , .Xr zpool-features 5 Index: vendor-sys/illumos/dist/common/zfs/zpool_prop.c =================================================================== --- vendor-sys/illumos/dist/common/zfs/zpool_prop.c (revision 342531) +++ vendor-sys/illumos/dist/common/zfs/zpool_prop.c (revision 342532) @@ -1,243 +1,245 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include "zfs_prop.h" #if defined(_KERNEL) #include #else #include #include #include #endif static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS]; zprop_desc_t * zpool_prop_get_table(void) { return (zpool_prop_table); } void zpool_prop_init(void) { static zprop_index_t boolean_table[] = { { "off", 0}, { "on", 1}, { NULL } }; static zprop_index_t failuremode_table[] = { { "wait", ZIO_FAILURE_MODE_WAIT }, { "continue", ZIO_FAILURE_MODE_CONTINUE }, { "panic", ZIO_FAILURE_MODE_PANIC }, { NULL } }; /* string properties */ zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "ALTROOT"); zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "BOOTFS"); zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, " | none", "CACHEFILE"); zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "COMMENT"); /* readonly number properties */ zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "SIZE"); zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FREE"); zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FREEING"); zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "CKPOINT"); zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "LEAKED"); zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "ALLOC"); zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "EXPANDSZ"); zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FRAG"); zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "CAP"); zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "GUID"); zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "HEALTH"); zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>", "DEDUP"); /* system partition size */ zprop_register_number(ZPOOL_PROP_BOOTSIZE, "bootsize", 0, PROP_ONETIME, ZFS_TYPE_POOL, "", "BOOTSIZE"); /* default number properties */ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, PROP_DEFAULT, ZFS_TYPE_POOL, "", "VERSION"); zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "", "DEDUPDITTO"); /* default index (boolean) properties */ zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION", boolean_table); zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table); zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table); zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table); /* default index properties */ zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL, "wait | continue | panic", "FAILMODE", failuremode_table); /* hidden properties */ zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_POOL, "NAME"); zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE"); + zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING, + PROP_ONETIME, ZFS_TYPE_POOL, "TNAME"); } /* * Given a property name and its type, returns the corresponding property ID. */ zpool_prop_t zpool_name_to_prop(const char *propname) { return (zprop_name_to_prop(propname, ZFS_TYPE_POOL)); } /* * Given a pool property ID, returns the corresponding name. * Assuming the pool propety ID is valid. */ const char * zpool_prop_to_name(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_name); } zprop_type_t zpool_prop_get_type(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_proptype); } boolean_t zpool_prop_readonly(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_attr == PROP_READONLY); } const char * zpool_prop_default_string(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_strdefault); } uint64_t zpool_prop_default_numeric(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_numdefault); } /* * Returns true if this is a valid feature@ property. */ boolean_t zpool_prop_feature(const char *name) { static const char *prefix = "feature@"; return (strncmp(name, prefix, strlen(prefix)) == 0); } /* * Returns true if this is a valid unsupported@ property. */ boolean_t zpool_prop_unsupported(const char *name) { static const char *prefix = "unsupported@"; return (strncmp(name, prefix, strlen(prefix)) == 0); } int zpool_prop_string_to_index(zpool_prop_t prop, const char *string, uint64_t *index) { return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL)); } int zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index, const char **string) { return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL)); } uint64_t zpool_prop_random_value(zpool_prop_t prop, uint64_t seed) { return (zprop_random_value(prop, seed, ZFS_TYPE_POOL)); } #ifndef _KERNEL const char * zpool_prop_values(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_values); } const char * zpool_prop_column_name(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_colname); } boolean_t zpool_prop_align_right(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_rightalign); } #endif Index: vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c (revision 342531) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c (revision 342532) @@ -1,8086 +1,8101 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2018 Joyent, Inc. * Copyright (c) 2017 Datto Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ /* * SPA: Storage Pool Allocator * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" /* * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) #define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { zti_modes_t zti_mode; uint_t zti_value; uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "issue", "issue_high", "intr", "intr_high" }; /* * This table defines the taskq settings for each ZFS I/O type. When * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE * macros. Other operations process a large amount of data; the ZTI_BATCH * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the * particular taskq is chosen at random. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */ { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ id_t zio_taskq_psrset_bind = PS_NONE; boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ extern int zfs_sync_pass_deferred_free; /* * Report any spa_load_verify errors found, but do not fail spa_load. * This is used by zdb to analyze non-idle pools. */ boolean_t spa_load_verify_dryrun = B_FALSE; /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * For debugging purposes: print out vdev tree during pool import. */ boolean_t spa_load_print_vdev_tree = B_FALSE; /* * A non-zero value for zfs_max_missing_tvds means that we allow importing * pools with missing top-level vdevs. This is strictly intended for advanced * pool recovery cases since missing data is almost inevitable. Pools with * missing devices can only be imported read-only for safety reasons, and their * fail-mode will be automatically set to "continue". * * With 1 missing vdev we should be able to import the pool and mount all * datasets. User data that was not modified after the missing device has been * added should be recoverable. This means that snapshots created prior to the * addition of that device should be completely intact. * * With 2 missing vdevs, some datasets may fail to mount since there are * dataset statistics that are stored as regular metadata. Some data might be * recoverable if those vdevs were added recently. * * With 3 or more missing vdevs, the pool is severely damaged and MOS entries * may be missing entirely. Chances of data recovery are very low. Note that * there are also risks of performing an inadvertent rewind as we might be * missing all the vdevs with the latest uberblocks. */ uint64_t zfs_max_missing_tvds = 0; /* * The parameters below are similar to zfs_max_missing_tvds but are only * intended for a preliminary open of the pool with an untrusted config which * might be incomplete or out-dated. * * We are more tolerant for pools opened from a cachefile since we could have * an out-dated cachefile where a device removal was not registered. * We could have set the limit arbitrarily high but in the case where devices * are really missing we would want to return the proper error codes; we chose * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available * and we get a chance to retrieve the trusted config. */ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; /* * In the case where config was assembled by scanning device paths (/dev/dsks * by default) we are less tolerant since all the existing devices should have * been detected and we want spa_load to return the right error codes. */ uint64_t zfs_max_missing_tvds_scan = 0; /* * Debugging aid that pauses spa_sync() towards the end. */ boolean_t zfs_pause_spa_sync = B_FALSE; /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); if (strval != NULL) VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); else VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. */ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(spa_normal_class(spa)); size = metaslab_class_get_space(spa_normal_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == FREAD), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); } if (pool != NULL) { /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 0, src); } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t **nvp) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; int err; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) { mutex_exit(&spa->spa_props_lock); return (0); } /* * Get properties from the MOS pool property object. */ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) continue; switch (za.za_integer_length) { case 8: /* integer property */ if (za.za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_pool_t *dp; dsl_dataset_t *ds = NULL; dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds); if (err != 0) { dsl_pool_config_exit(dp, FTAG); break; } strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); } else { strval = NULL; intval = za.za_first_integer; } spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } spa_prop_add_list(*nvp, prop, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); mutex_exit(&spa->spa_props_lock); out: if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; return (err); } return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. */ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPOOL_PROP_INVAL: if (!zpool_prop_feature(propname)) { error = SET_ERROR(EINVAL); break; } /* * Sanitize the input. */ if (nvpair_type(elem) != DATA_TYPE_UINT64) { error = SET_ERROR(EINVAL); break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = SET_ERROR(EINVAL); break; } if (intval != 0) { error = SET_ERROR(EINVAL); break; } fname = strchr(propname, '@') + 1; if (zfeature_lookup_name(fname, NULL) != 0) { error = SET_ERROR(EINVAL); break; } has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION_BEFORE_FEATURES || has_feature)) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = SET_ERROR(ENOTSUP); break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = SET_ERROR(ENOTSUP); break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { objset_t *os; uint64_t propval; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } error = dmu_objset_hold(strval, FTAG, &os); if (error != 0) break; /* * Must be ZPL, and its property settings * must be supported by GRUB (compression * is not gzip, and large blocks are not used). */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); } else if ((error = dsl_prop_get_int_ds(dmu_objset_ds(os), zfs_prop_to_name(ZFS_PROP_COMPRESSION), &propval)) == 0 && !BOOTFS_COMPRESS_VALID(propval)) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < ZIO_FAILURE_MODE_WAIT || intval > ZIO_FAILURE_MODE_PANIC)) error = SET_ERROR(EINVAL); /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = SET_ERROR(EIO); } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = SET_ERROR(EINVAL); break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_COMMENT: if ((error = nvpair_value_string(elem, &strval)) != 0) break; for (check = strval; *check != '\0'; check++) { /* * The kernel doesn't have an easy isprint() * check. For this kernel check, we merely * check ASCII apart from DEL. Fix this if * there is an easy-to-use kernel isprint(). */ if (*check >= 0x7f) { error = SET_ERROR(EINVAL); break; } } if (strlen(strval) > ZPROP_MAX_COMMENT) error = E2BIG; break; case ZPOOL_PROP_DEDUPDITTO: if (spa_version(spa) < SPA_VERSION_DEDUP) error = SET_ERROR(ENOTSUP); else error = nvpair_value_uint64(elem, &intval); if (error == 0 && intval != 0 && intval < ZIO_DEDUPDITTO_MIN) error = SET_ERROR(EINVAL); break; } if (error) break; } if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { uint64_t ver; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; need_sync = B_TRUE; } /* Save time if the version is already set. */ if (ver == spa_version(spa)) continue; /* * In addition to the pool directory object, we might * create the pool properties object, the features for * read object, the features for write object, or the * feature descriptions object. */ error = dsl_sync_task(spa->spa_name, NULL, spa_sync_version, &ver, 6, ZFS_SPACE_CHECK_RESERVED); if (error) return (error); continue; } need_sync = B_TRUE; break; } if (need_sync) { return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, nvp, 6, ZFS_SPACE_CHECK_RESERVED)); } return (0); } /* * If the bootfs property value is dsobj, clear it. */ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } /*ARGSUSED*/ static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { int error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (SET_ERROR(error)); } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); if (vdev_state != VDEV_STATE_HEALTHY) return (SET_ERROR(ENXIO)); ASSERT3U(spa_guid(spa), !=, *newguid); return (0); } static void spa_change_guid_sync(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; oldguid = spa_guid(spa); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); rvd->vdev_guid = *newguid; rvd->vdev_guid_sum += (*newguid - oldguid); vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", oldguid, *newguid); } /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify * the root vdev's guid, our own pool guid, and then mark all of our * vdevs dirty. Note that we must make sure that all our vdevs are * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. */ int spa_change_guid(spa_t *spa) { int error; uint64_t guid; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { spa_error_entry_t *sa = (spa_error_entry_t *)a; spa_error_entry_t *sb = (spa_error_entry_t *)b; int ret; ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. */ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static void spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; char name[32]; uint_t flags = 0; boolean_t batch = B_FALSE; if (mode == ZTI_MODE_NULL) { tqs->stqs_count = 0; tqs->stqs_taskq = NULL; return; } ASSERT3U(count, >, 0); tqs->stqs_count = count; tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >=, 1); value = MAX(value, 1); break; case ZTI_MODE_BATCH: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = zio_taskq_batch_pct; break; default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } for (uint_t i = 0; i < count; i++) { taskq_t *tq; if (count > 1) { (void) snprintf(name, sizeof (name), "%s_%s_%u", zio_type_name[t], zio_taskq_types[q], i); } else { (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); } if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly lower priority * than the other taskqs. */ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) pri--; tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); } tqs->stqs_taskq[i] = tq; } } static void spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; if (tqs->stqs_taskq == NULL) { ASSERT0(tqs->stqs_count); return; } for (uint_t i = 0; i < tqs->stqs_count; i++) { ASSERT3P(tqs->stqs_taskq[i], !=, NULL); taskq_destroy(tqs->stqs_taskq[i]); } kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); tqs->stqs_taskq = NULL; } /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. In that case we choose which taskq at random by using * the low bits of gethrtime(). */ void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; } taskq_dispatch_ent(tq, func, arg, flags, ent); } static void spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_init(spa, t, q); } } } #ifdef _KERNEL static void spa_thread(void *arg) { callb_cpr_t cprinfo; spa_t *spa = arg; user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ mutex_enter(&curproc->p_lock); lwp_exit(); } #endif /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, int mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; /* Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } for (size_t i = 0; i < TXG_SIZE; i++) { spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, spa, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } /* * Opposite of spa_activate(). */ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); spa_evicting_os_wait(spa); txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_fini(spa, t, q); } } for (size_t i = 0; i < TXG_SIZE; i++) { ASSERT3P(spa->spa_txg_zio[i], !=, NULL); VERIFY0(zio_wait(spa->spa_txg_zio[i])); spa->spa_txg_zio[i] = NULL; } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); if (spa->spa_proc_state != SPA_PROC_NONE) { ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); spa->spa_proc_state = SPA_PROC_DEACTIVATE; cv_broadcast(&spa->spa_proc_cv); while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ASSERT(spa->spa_proc != &p0); cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_GONE); spa->spa_proc_state = SPA_PROC_NONE; } ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath * it. */ if (spa->spa_did != 0) { thread_join(spa->spa_did); spa->spa_did = 0; } } /* * Verify a pool configuration, and construct the vdev tree appropriately. This * will create all the necessary vdevs in the appropriate layout, with each vdev * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. */ static int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (SET_ERROR(EINVAL)); } for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { int i; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_load_note(spa, "UNLOADING"); /* * Stop async tasks. */ spa_async_suspend(spa); if (spa->spa_root_vdev) { vdev_initialize_stop_all(spa->spa_root_vdev, VDEV_INITIALIZE_ACTIVE); } /* * Stop syncing. */ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * Even though vdev_free() also calls vdev_metaslab_fini, we need * to call it earlier, before we wait for async i/o to complete. * This ensures that there is no async metaslab prefetching, by * calling taskq_wait(mg_taskq). */ if (spa->spa_root_vdev != NULL) { spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); spa_config_exit(spa, SCL_ALL, spa); } /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { for (int i = 0; i < max_ncpus; i++) (void) zio_wait(spa->spa_async_zio_root[i]); kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } if (spa->spa_vdev_removal != NULL) { spa_vdev_removal_destroy(spa->spa_vdev_removal); spa->spa_vdev_removal = NULL; } if (spa->spa_condense_zthr != NULL) { ASSERT(!zthr_isrunning(spa->spa_condense_zthr)); zthr_destroy(spa->spa_condense_zthr); spa->spa_condense_zthr = NULL; } if (spa->spa_checkpoint_discard_zthr != NULL) { ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr)); zthr_destroy(spa->spa_checkpoint_discard_zthr); spa->spa_checkpoint_discard_zthr = NULL; } spa_condense_fini(spa); bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); for (i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; } if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; } spa->spa_spares.sav_count = 0; for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } if (spa->spa_l2cache.sav_vdevs) { kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; } spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; spa->spa_indirect_vdevs_loaded = B_FALSE; if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } spa_config_exit(spa, SCL_ALL, spa); } /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ void spa_load_spares(spa_t *spa) { nvlist_t **spares; uint_t nspares; int i; vdev_t *vd, *tvd; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As spare vdevs are shared among open pools, we skip loading * them when we load the checkpointed state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * First, close and free any existing spare vdevs. */ for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); if (spa->spa_spares.sav_config == NULL) nspares = 0; else VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; /* * Construct the array of vdevs, opening them to get status in the * process. For each spare, there is potentially two different vdev_t * structures associated with it: one in the list of spares (used only * for basic validation purposes) and one in the active vdev * configuration (if it's spared in). During this phase we open and * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); /* * We only mark the spare active if we were successfully * able to load the vdev. Otherwise, importing a pool * with a bad active spare would result in strange * behavior, because multiple pool would think the spare * is actively in use. * * There is a vulnerability here to an equally bizarre * circumstance, where a dead active spare is later * brought back to life (onlined or otherwise). Given * the rarity of this scenario, and the extra complexity * it adds, we ignore the possibility. */ if (!vdev_is_dead(tvd)) spa_spare_activate(tvd); } vd->vdev_top = vd; vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; if (vdev_validate_aux(vd) == 0) spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); } /* * Load (or re-load) the current list of vdevs describing the active l2cache for * this pool. When this is called, we have some form of basic information in * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. * Devices which are already active have their details maintained, and are * not re-opened. */ void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As L2 caches are part of the ARC which is shared among open * pools, we skip loading them when we load the checkpointed * state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (sav->sav_config != NULL) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); } else { nl2cache = 0; newvdevs = NULL; } oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; /* * Process new nvlist of vdevs. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { vd = oldvdevs[j]; if (vd != NULL && guid == vd->vdev_guid) { /* * Retain previous vdev for add/remove ops. */ newvdevs[i] = vd; oldvdevs[j] = NULL; break; } } if (newvdevs[i] == NULL) { /* * Create new vdev */ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, VDEV_ALLOC_L2CACHE) == 0); ASSERT(vd != NULL); newvdevs[i] = vd; /* * Commit this vdev as an l2cache device, * even if it fails to open. */ spa_l2cache_add(vd); vd->vdev_top = vd; vd->vdev_aux = sav; spa_l2cache_activate(vd); if (vdev_open(vd) != 0) continue; (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); } } /* * Purge vdevs that were dropped */ for (i = 0; i < oldnvdevs; i++) { uint64_t pool; vd = oldvdevs[i]; if (vd != NULL) { ASSERT(vd->vdev_isl2cache); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); vdev_clear_stats(vd); vdev_free(vd); } } if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); if (sav->sav_config == NULL) goto out; sav->sav_vdevs = newvdevs; sav->sav_count = (int)nl2cache; /* * Recompute the stashed list of l2cache devices, with status * information this time. */ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dmu_buf_t *db; char *packed = NULL; size_t nvsize = 0; int error; *value = NULL; error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); if (error != 0) return (error); nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); packed = kmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); return (error); } /* * Concrete top-level vdevs that are not missing and are not logs. At every * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. */ static uint64_t spa_healthy_core_tvds(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t tvds = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; if (vd->vdev_islog) continue; if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) tvds++; } return (tvds); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. */ static void spa_check_removed(vdev_t *vd) { for (uint64_t c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } } static int spa_check_for_missing_logs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; /* * If we're doing a normal import, then build up any additional * diagnostic information about missing log devices. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * We consider a device as missing only if it failed * to open (i.e. offline or faulted is not considered * as missing). */ if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { child[idx++] = vdev_config_generate(spa, tvd, B_FALSE, VDEV_CONFIG_MISSING); } } if (idx > 0) { fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, idx); fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv); for (uint64_t i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } } else { for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); break; } } } return (0); } /* * Check for missing log devices */ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); switch (spa->spa_log_state) { case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; } return (rv); } static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); if (!spa_has_slogs(spa)) return (B_FALSE); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { metaslab_group_passivate(mg); slog_found = B_TRUE; } } return (slog_found); } static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) metaslab_group_activate(mg); } } int spa_reset_logs(spa_t *spa) { int error; error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). */ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { for (int i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) spa->spa_claim_max_txg = zio->io_bp->blk_birth; mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; spa_t *spa = zio->io_spa; abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_inc_64(&sle->sle_meta_count); else atomic_inc_64(&sle->sle_data_count); } mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* * Maximum number of concurrent scrub i/os to create while verifying * a pool while importing it. */ int spa_load_verify_maxinflight = 10000; boolean_t spa_load_verify_metadata = B_TRUE; boolean_t spa_load_verify_data = B_TRUE; /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful * to manually set the flag after the traversal has begun. */ if (!spa_load_verify_metadata) return (0); if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } /* ARGSUSED */ int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_load_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); error = dmu_objset_find_dp(spa->spa_dsl_pool, spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, DS_FIND_CHILDREN); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); if (error != 0) return (error); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { if (spa->spa_extreme_rewind) { spa_load_note(spa, "performing a complete scan of the " "pool since extreme rewind is on. This may take " "a very long time.\n (spa_load_verify_data=%u, " "spa_load_verify_metadata=%u)", spa_load_verify_data, spa_load_verify_metadata); } error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, spa_load_verify_cb, rio); } (void) zio_wait(rio); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { spa_load_note(spa, "spa_load_verify found %llu metadata errors " "and %llu data errors", (u_longlong_t)sle.sle_meta_count, (u_longlong_t)sle.sle_data_count); } if (spa_load_verify_dryrun || (!error && sle.sle_meta_count <= policy.zlp_maxmeta && sle.sle_data_count <= policy.zlp_maxdata)) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); VERIFY(nvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (spa_load_verify_dryrun) return (0); if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val); if (error != 0 && (error != ENOENT || log_enoent)) { spa_load_failed(spa, "couldn't get '%s' value in MOS directory " "[error=%d]", name, error); } return (error); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (SET_ERROR(err)); } static void spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_start_indirect_condensing_thread(spa); ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); spa->spa_checkpoint_discard_zthr = zthr_create(spa_checkpoint_discard_thread_check, spa_checkpoint_discard_thread, spa); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. */ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { char *ereport = FM_EREPORT_ZFS_POOL; int error; spa->spa_load_state = state; gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; return (error); } /* * Count the number of per-vdev ZAPs associated with all of the vdevs in the * vdev tree rooted in the given vd, and ensure that each ZAP is present in the * spa's per-vdev ZAP list. */ static uint64_t vdev_count_verify_zaps(vdev_t *vd) { spa_t *spa = vd->vdev_spa; uint64_t total = 0; if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_top_zap)); } if (vd->vdev_leaf_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { total += vdev_count_verify_zaps(vd->vdev_child[i]); } return (total); } static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { uint64_t hostid; char *hostname; uint64_t myhostid = 0; if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { hostname = fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME); myhostid = zone_get_hostid(NULL); if (hostid != 0 && myhostid != 0 && hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%llx). " "See: http://illumos.org/msg/ZFS-8000-EY", spa_name(spa), hostname, (u_longlong_t)hostid); spa_load_failed(spa, "hostid verification failed: pool " "last accessed by host: %s (hostid: 0x%llx)", hostname, (u_longlong_t)hostid); return (SET_ERROR(EBADF)); } } return (0); } static int spa_ld_parse_config(spa_t *spa, spa_import_type_t type) { int error = 0; nvlist_t *nvtree, *nvl, *config = spa->spa_config; int parse; vdev_t *rvd; uint64_t pool_guid; char *comment; /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_POOL_GUID); return (SET_ERROR(EINVAL)); } /* * If we are doing an import, ensure that the pool is not already * imported by checking if its pool guid already exists in the * spa namespace. * * The only case that we allow an already imported pool to be * imported again, is when the pool is checkpointed and we want to * look at its checkpointed state from userland tools like zdb. */ #ifdef _KERNEL if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { #else if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0) && !spa_importing_readonly_checkpoint(spa)) { #endif spa_load_failed(spa, "a pool with guid %llu is already open", (u_longlong_t)pool_guid); return (SET_ERROR(EEXIST)); } spa->spa_config_guid = pool_guid; nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); ASSERT(spa->spa_comment == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) spa->spa_config_splitting = fnvlist_dup(nvl); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_VDEV_TREE); return (SET_ERROR(EINVAL)); } /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "unable to parse config [error=%d]", error); return (error); } ASSERT(spa->spa_root_vdev == rvd); ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); } return (0); } /* * Recursively open all vdevs in the vdev tree. This function is called twice: * first with the untrusted config, then with the trusted config. */ static int spa_ld_open_vdevs(spa_t *spa) { int error = 0; /* * spa_missing_tvds_allowed defines how many top-level vdevs can be * missing/unopenable for the root vdev to be still considered openable. */ if (spa->spa_trust_config) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; } else { spa->spa_missing_tvds_allowed = 0; } spa->spa_missing_tvds_allowed = MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "vdev tree has %lld missing top-level " "vdevs.", (u_longlong_t)spa->spa_missing_tvds); if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) { /* * Although theoretically we could allow users to open * incomplete pools in RW mode, we'd need to add a lot * of extra logic (e.g. adjust pool space to account * for missing vdevs). * This limitation also prevents users from accidentally * opening the pool in RW mode during data recovery and * damaging it further. */ spa_load_note(spa, "pools with missing top-level " "vdevs can only be opened in read-only mode."); error = SET_ERROR(ENXIO); } else { spa_load_note(spa, "current settings allow for maximum " "%lld missing top-level vdevs at this stage.", (u_longlong_t)spa->spa_missing_tvds_allowed); } } if (error != 0) { spa_load_failed(spa, "unable to open vdev tree [error=%d]", error); } if (spa->spa_missing_tvds != 0 || error != 0) vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); return (error); } /* * We need to validate the vdev labels against the configuration that * we have in hand. This function is called twice: first with an untrusted * config, then with a trusted config. The validation is more strict when the * config is trusted. */ static int spa_ld_validate_vdevs(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "vdev_validate failed [error=%d]", error); return (error); } if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { spa_load_failed(spa, "cannot open vdev tree after invalidating " "some vdevs"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } return (0); } static void spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) { spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; } static int spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { vdev_t *rvd = spa->spa_root_vdev; nvlist_t *label; uberblock_t *ub = &spa->spa_uberblock; /* * If we are opening the checkpointed state of the pool by * rewinding to it, at this point we will have written the * checkpointed uberblock to the vdev labels, so searching * the labels will find the right uberblock. However, if * we are opening the checkpointed state read-only, we have * not modified the labels. Therefore, we must ignore the * labels and continue using the spa_uberblock that was set * by spa_ld_checkpoint_rewind. * * Note that it would be fine to ignore the labels when * rewinding (opening writeable) as well. However, if we * crash just after writing the labels, we will end up * searching the labels. Doing so in the common case means * that this code path gets exercised normally, rather than * just in the edge case. */ if (ub->ub_checkpoint_txg != 0 && spa_importing_readonly_checkpoint(spa)) { spa_ld_select_uberblock_done(spa, ub); return (0); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { nvlist_free(label); spa_load_failed(spa, "no valid uberblock found"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); spa_load_failed(spa, "version %llu is not supported", (u_longlong_t)ub->ub_version); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *features; /* * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ if (label == NULL) { spa_load_failed(spa, "label config unavailable"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { nvlist_free(label); spa_load_failed(spa, "invalid label: '%s' missing", ZPOOL_CONFIG_FEATURES_FOR_READ); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * Update our in-core representation with the definitive values * from the label. */ nvlist_free(spa->spa_label_features); VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } nvlist_free(label); /* * Look through entries in the label nvlist's features_for_read. If * there is a feature listed there which we don't understand then we * cannot open a pool. */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { VERIFY(nvlist_add_string(unsup_feat, nvpair_name(nvp), "") == 0); } } if (!nvlist_empty(unsup_feat)) { VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); nvlist_free(unsup_feat); spa_load_failed(spa, "some features are unsupported"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } nvlist_free(unsup_feat); } if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_try_repair(spa, spa->spa_config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; } /* * Initialize internal SPA structures. */ spa_ld_select_uberblock_done(spa, ub); return (0); } static int spa_ld_open_rootbp(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error != 0) { spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; return (0); } static int spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t reloading) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv, *mos_config, *policy; int error = 0, copy_error; uint64_t healthy_tvds, healthy_tvds_mos; uint64_t mos_config_txg; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * If we're assembling a pool from a split, the config provided is * already trusted so there is nothing to do. */ if (type == SPA_IMPORT_ASSEMBLE) return (0); healthy_tvds = spa_healthy_core_tvds(spa); if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * If we are doing an open, pool owner wasn't verified yet, thus do * the verification here. */ if (spa->spa_load_state == SPA_LOAD_OPEN) { error = spa_verify_host(spa, mos_config); if (error != 0) { nvlist_free(mos_config); return (error); } } nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Build a new vdev tree from the trusted config */ VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); /* * Vdev paths in the MOS may be obsolete. If the untrusted config was * obtained by scanning /dev/dsk, then it will have the right vdev * paths. We update the trusted MOS config with this information. * We first try to copy the paths with vdev_copy_path_strict, which * succeeds only when both configs have exactly the same vdev tree. * If that fails, we fall back to a more flexible method that has a * best effort policy. */ copy_error = vdev_copy_path_strict(rvd, mrvd); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "provided vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); spa_load_note(spa, "MOS vdev tree:"); vdev_dbgmsg_print_tree(mrvd, 2); } if (copy_error != 0) { spa_load_note(spa, "vdev_copy_path_strict failed, falling " "back to vdev_copy_path_relaxed"); vdev_copy_path_relaxed(rvd, mrvd); } vdev_close(rvd); vdev_free(rvd); spa->spa_root_vdev = mrvd; rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to * pass settings on how to load the pool and is not stored in the MOS. * We copy it over to our new, trusted config. */ mos_config_txg = fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_POOL_TXG); nvlist_free(mos_config); mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, &policy) == 0) fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); spa_config_set(spa, mos_config); spa->spa_config_source = SPA_CONFIG_SRC_MOS; /* * Now that we got the config from the MOS, we should be more strict * in checking blkptrs and can make assumptions about the consistency * of the vdev tree. spa_trust_config must be set to true before opening * vdevs in order for them to be writeable. */ spa->spa_trust_config = B_TRUE; /* * Open and validate the new vdev tree */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "final vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); } if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { /* * Sanity check to make sure that we are indeed loading the * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds * in the config provided and they happened to be the only ones * to have the latest uberblock, we could involuntarily perform * an extreme rewind. */ healthy_tvds_mos = spa_healthy_core_tvds(spa); if (healthy_tvds_mos - healthy_tvds >= SPA_SYNC_MIN_VDEVS) { spa_load_note(spa, "config provided misses too many " "top-level vdevs compared to MOS (%lld vs %lld). ", (u_longlong_t)healthy_tvds, (u_longlong_t)healthy_tvds_mos); spa_load_note(spa, "vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); if (reloading) { spa_load_failed(spa, "config was already " "provided from MOS. Aborting."); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_load_note(spa, "spa must be reloaded using MOS " "config"); return (SET_ERROR(EAGAIN)); } } error = spa_check_for_missing_logs(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { spa_load_failed(spa, "uberblock guid sum doesn't match MOS " "guid sum (%llu != %llu)", (u_longlong_t)spa->spa_uberblock.ub_guid_sum, (u_longlong_t)rvd->vdev_guid_sum); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); } return (0); } static int spa_ld_open_indirect_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * Everything that we read before spa_remove_init() must be stored * on concreted vdevs. Therefore we do this as early as possible. */ error = spa_remove_init(spa); if (error != 0) { spa_load_failed(spa, "spa_remove_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Retrieve information needed to condense indirect vdev mappings. */ error = spa_condense_init(spa); if (error != 0) { spa_load_failed(spa, "spa_condense_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } return (0); } static int spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, &spa->spa_feat_for_write_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, &spa->spa_feat_desc_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } enabled_feat = fnvlist_alloc(); unsup_feat = fnvlist_alloc(); if (!spa_features_check(spa, B_FALSE, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || spa->spa_load_state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { *missing_feat_writep = B_TRUE; } } fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } fnvlist_free(enabled_feat); fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, ZPOOL_CONFIG_CAN_RDONLY); } /* * If the state is SPA_LOAD_TRYIMPORT, our objective is * twofold: to determine whether the pool is available for * import in read-write mode and (if it is not) whether the * pool is available for import in read-only mode. If the pool * is available for import in read-write mode, it is displayed * as available in userland; if it is not available for import * in read-only mode, it is displayed as unavailable in * userland. If the pool is available for import in read-only * mode but not read-write mode, it is displayed as unavailable * in userland with a special note that the pool is actually * available for open in read-only mode. * * As a result, if the state is SPA_LOAD_TRYIMPORT and we are * missing a feature for write, we must first determine whether * the pool can be opened read-only before returning to * userland in order to know whether to display the * abovementioned note. */ if (missing_feat_read || (*missing_feat_writep && spa_writeable(spa))) { spa_load_failed(spa, "pool uses unsupported features"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Load refcounts for ZFS features from disk into an in-memory * cache during SPA initialization. */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { uint64_t refcount; error = feature_get_refcount_from_disk(spa, &spa_feature_table[i], &refcount); if (error == 0) { spa->spa_feat_refcount_cache[i] = refcount; } else if (error == ENOTSUP) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { spa_load_failed(spa, "error getting refcount " "for feature %s [error=%d]", spa_feature_table[i].fi_guid, error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } } } if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_load_special_directories(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; if (error != 0) { spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_get_props(spa_t *spa) { int error = 0; uint64_t obj; vdev_t *rvd = spa->spa_root_vdev; /* Grab the secret checksum salt from the MOS. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes); if (error == ENOENT) { /* Generate a new salt for subsequent use */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); } else if (error != 0) { spa_load_failed(spa, "unable to retrieve checksum salt from " "MOS [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); if (error != 0) { spa_load_failed(spa, "error opening deferred-frees bpobj " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, &spa->spa_creation_version, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, &spa->spa_errlog_scrub, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the per-vdev ZAP map. If we have an older pool, this will not * be present; in this case, defer its creation to a later time to * avoid dirtying the MOS this early / out of sync context. See * spa_sync_config_object. */ /* The sentinel is only available in the MOS config. */ nvlist_t *mos_config; if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, &spa->spa_all_vdev_zaps, B_FALSE); if (error == ENOENT) { VERIFY(!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); spa->spa_avz_action = AVZ_ACTION_INITIALIZE; ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } else if (error != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* * An older version of ZFS overwrote the sentinel value, so * we have orphaned per-vdev ZAPs in the MOS. Defer their * destruction to later; see spa_sync_config_object. */ spa->spa_avz_action = AVZ_ACTION_DESTROY; /* * We're assuming that no vdevs have had their ZAPs created * before this. Better be sure of it. */ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } nvlist_free(mos_config); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, B_FALSE); if (error && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { uint64_t autoreplace; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, &spa->spa_dedup_ditto); spa->spa_autoreplace = (autoreplace != 0); } /* * If we are importing a pool with missing top-level vdevs, * we enforce that the pool doesn't panic or get suspended on * error since the likelihood of missing data is extremely high. */ if (spa->spa_missing_tvds > 0 && spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_load_note(spa, "forcing failmode to 'continue' " "as some top level vdevs are missing"); spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; } return (0); } static int spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. */ /* * Load any hot spares for this pool. */ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, &spa->spa_spares.sav_config) != 0) { spa_load_failed(spa, "error loading spares nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) { spa_load_failed(spa, "error loading l2cache nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } return (0); } static int spa_ld_load_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (spa->spa_load_state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. */ error = vdev_load(rvd); if (error != 0) { spa_load_failed(spa, "vdev_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); } static int spa_ld_load_dedup_tables(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = ddt_load(spa); if (error != 0) { spa_load_failed(spa, "ddt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) { vdev_t *rvd = spa->spa_root_vdev; if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { boolean_t missing = spa_check_logs(spa); if (missing) { if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "spa_check_logs failed " "so dropping the logs"); } else { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; spa_load_failed(spa, "spa_check_logs failed"); return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } } return (0); } static int spa_ld_verify_pool_data(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. */ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { error = spa_load_verify(spa); if (error != 0) { spa_load_failed(spa, "spa_load_verify failed " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } } return (0); } static void spa_ld_claim_log_blocks(spa_t *spa) { dmu_tx_t *tx; dsl_pool_t *dp = spa_get_dsl(spa); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); } static void spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, boolean_t update_config_cache) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (update_config_cache || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asychronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } static void spa_ld_prepare_for_reload(spa_t *spa) { int mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; spa_unload(spa); spa_deactivate(spa); spa_activate(spa, mode); /* * We save the value of spa_async_suspended as it gets reset to 0 by * spa_unload(). We want to restore it back to the original value before * returning as we might be calling spa_async_resume() later. */ spa->spa_async_suspended = async_suspended; } static int spa_ld_read_checkpoint_txg(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT0(spa->spa_checkpoint_txg); ASSERT(MUTEX_HELD(&spa_namespace_lock)); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT) return (0); if (error != 0) return (error); ASSERT3U(checkpoint.ub_txg, !=, 0); ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); ASSERT3U(checkpoint.ub_timestamp, !=, 0); spa->spa_checkpoint_txg = checkpoint.ub_txg; spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; return (0); } static int spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* * Never trust the config that is provided unless we are assembling * a pool following a split. * This means don't trust blkptrs and the vdev tree in general. This * also effectively puts the spa in read-only mode since * spa_writeable() checks for spa_trust_config to be true. * We will later load a trusted config from the MOS. */ if (type != SPA_IMPORT_ASSEMBLE) spa->spa_trust_config = B_FALSE; /* * Parse the config provided to create a vdev tree. */ error = spa_ld_parse_config(spa, type); if (error != 0) return (error); /* * Now that we have the vdev tree, try to open each vdev. This involves * opening the underlying physical device, retrieving its geometry and * probing the vdev with a dummy I/O. The state of each vdev will be set * based on the success of those operations. After this we'll be ready * to read from the vdevs. */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); /* * Read the label of each vdev and make sure that the GUIDs stored * there match the GUIDs in the config provided. * If we're assembling a new pool that's been split off from an * existing pool, the labels haven't yet been updated so we skip * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); } /* * Read all vdev labels to find the best uberblock (i.e. latest, * unless spa_load_max_txg is set) and store it in spa_uberblock. We * get the list of features required to read blkptrs in the MOS from * the vdev label with the best uberblock and verify that our version * of zfs supports them all. */ error = spa_ld_select_uberblock(spa, type); if (error != 0) return (error); /* * Pass that uberblock to the dsl_pool layer which will open the root * blkptr. This blkptr points to the latest version of the MOS and will * allow us to read its contents. */ error = spa_ld_open_rootbp(spa); if (error != 0) return (error); return (0); } static int spa_ld_checkpoint_rewind(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error != 0) { spa_load_failed(spa, "unable to retrieve checkpointed " "uberblock from the MOS config [error=%d]", error); if (error == ENOENT) error = ZFS_ERR_NO_CHECKPOINT; return (error); } ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); /* * We need to update the txg and timestamp of the checkpointed * uberblock to be higher than the latest one. This ensures that * the checkpointed uberblock is selected if we were to close and * reopen the pool right after we've written it in the vdev labels. * (also see block comment in vdev_uberblock_compare) */ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; checkpoint.ub_timestamp = gethrestime_sec(); /* * Set current uberblock to be the checkpointed uberblock. */ spa->spa_uberblock = checkpoint; /* * If we are doing a normal rewind, then the pool is open for * writing and we sync the "updated" checkpointed uberblock to * disk. Once this is done, we've basically rewound the whole * pool and there is no way back. * * There are cases when we don't want to attempt and sync the * checkpointed uberblock to disk because we are opening a * pool as read-only. Specifically, verifying the checkpointed * state with zdb, and importing the checkpointed state to get * a "preview" of its content. */ if (spa_writeable(spa)) { vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "failed to write checkpointed " "uberblock to the vdev labels [error=%d]", error); return (error); } } return (0); } static int spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t *update_config_cache) { int error; /* * Parse the config for pool, open and validate vdevs, * select an uberblock, and use that uberblock to open * the MOS. */ error = spa_ld_mos_init(spa, type); if (error != 0) return (error); /* * Retrieve the trusted config stored in the MOS and use it to create * a new, exact version of the vdev tree, then reopen all vdevs. */ error = spa_ld_trusted_config(spa, type, B_FALSE); if (error == EAGAIN) { if (update_config_cache != NULL) *update_config_cache = B_TRUE; /* * Redo the loading process with the trusted config if it is * too different from the untrusted config. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "RELOADING"); error = spa_ld_mos_init(spa, type); if (error != 0) return (error); error = spa_ld_trusted_config(spa, type, B_TRUE); if (error != 0) return (error); } else if (error != 0) { return (error); } return (0); } /* * Load an existing storage pool, using the config provided. This config * describes which vdevs are part of the pool and is later validated against * partial configs present in each vdev's label and an entire copy of the * config stored in the MOS. */ static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) { int error = 0; boolean_t missing_feat_write = B_FALSE; boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); spa_load_note(spa, "LOADING"); error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); if (error != 0) return (error); /* * If we are rewinding to the checkpoint then we need to repeat * everything we've done so far in this function but this time * selecting the checkpointed uberblock and using that to open * the MOS. */ if (checkpoint_rewind) { /* * If we are rewinding to the checkpoint update config cache * anyway. */ update_config_cache = B_TRUE; /* * Extract the checkpointed uberblock from the current MOS * and use this as the pool's uberblock from now on. If the * pool is imported as writeable we also write the checkpoint * uberblock to the labels, making the rewind permanent. */ error = spa_ld_checkpoint_rewind(spa); if (error != 0) return (error); /* * Redo the loading process process again with the * checkpointed uberblock. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "LOADING checkpointed uberblock"); error = spa_ld_mos_with_trusted_config(spa, type, NULL); if (error != 0) return (error); } /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ error = spa_ld_read_checkpoint_txg(spa); if (error != 0) return (error); /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed * from the pool and their contents were re-mapped to other vdevs. Note * that everything that we read before this step must have been * rewritten on concrete vdevs after the last device removal was * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) return (error); /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) return (error); /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ error = spa_ld_load_special_directories(spa); if (error != 0) return (error); /* * Retrieve pool properties from the MOS. */ error = spa_ld_get_props(spa); if (error != 0) return (error); /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) return (error); /* * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ error = spa_ld_load_vdev_metadata(spa); if (error != 0) return (error); error = spa_ld_load_dedup_tables(spa); if (error != 0) return (error); /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) return (error); if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Traverse the last txgs to make sure the pool was left off in a safe * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ error = spa_ld_verify_pool_data(spa); if (error != 0) return (error); /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ spa_update_dspace(spa); /* * We have now retrieved all the information we needed to open the * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ if (checkpoint_rewind) { spa_history_log_internal(spa, "checkpoint rewind", NULL, "rewound state to txg=%llu", (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } /* * Traverse the ZIL and claim all blocks. */ spa_ld_claim_log_blocks(spa); /* * Kick-off the syncing thread. */ spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by ZIL traversal operations * performed above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * Check if we need to request an update of the config. On the * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); /* * Check all DTLs to see if anything needs resilvering. */ if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ spa_history_log_version(spa, "open"); /* * Delete any inconsistent datasets. */ (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_restart_removal(spa); spa_spawn_aux_threads(spa); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_load_note(spa, "LOADED"); return (0); } static int spa_load_retry(spa_t *spa, spa_load_state_t state) { int mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; spa_activate(spa, mode); spa_async_suspend(spa); spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", (u_longlong_t)spa->spa_load_max_txg); return (spa_load(spa, state, SPA_IMPORT_EXISTING)); } /* * If spa_load() fails this function will try loading prior txg's. If * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this * function will not rewind the pool and will return the same error as * spa_load(). */ static int spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; uint64_t min_txg; if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; if (max_request != UINT64_MAX) spa->spa_extreme_rewind = B_TRUE; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); if (load_error == ZFS_ERR_NO_CHECKPOINT) { /* * When attempting checkpoint-rewind on a pool with no * checkpoint, we should not attempt to load uberblocks * from previous txgs when spa_load fails. */ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); return (load_error); } if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; if (rewind_flags & ZPOOL_NEVER_REWIND) { nvlist_free(config); return (load_error); } if (state == SPA_LOAD_RECOVER) { /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * If we aren't rolling back save the load info from our first * import attempt so that we can restore it after attempting * to rewind. */ loadinfo = spa->spa_load_info; spa->spa_load_info = fnvlist_alloc(); } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); else nvlist_free(config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); return (rewind_error); } else { /* Store the rewind info as part of the initial load info */ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, spa->spa_load_info); /* Restore the initial load info */ fnvlist_free(spa->spa_load_info); spa->spa_load_info = loadinfo; return (load_error); } } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. */ if (mutex_owner(&spa_namespace_lock) != curthread) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; zfs_dbgmsg("spa_open_common: opening %s", pool); error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. */ if (state == SPA_LOAD_RECOVER) { VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); } *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. */ for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. */ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); if (nl2cache != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); } } } static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; zap_cursor_t zc; zap_attribute_t za; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_read_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } if (spa->spa_feat_for_write_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_write_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features) == 0); nvlist_free(features); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; VERIFY(nvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); if (spa_suspended(spa)) VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use * checking. */ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = SET_ERROR(EINVAL); goto out; } /* * The L2ARC currently only supports disk devices in * kernel context. For user-level testing, we allow it. */ #ifdef _KERNEL if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { error = SET_ERROR(ENOTBLK); vdev_free(vd); goto out; } #endif vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatentating with the * current dev list. */ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs) == 0); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) VERIFY(nvlist_dup(olddevs[i], &newdevs[i], KM_SLEEP) == 0); for (i = 0; i < ndevs; i++) VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], KM_SLEEP) == 0); VERIFY(nvlist_remove(sav->sav_config, config, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, newdevs, ndevs + oldndevs) == 0); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs) == 0); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops) { spa_t *spa; char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; boolean_t has_features; + char *poolname; + nvlist_t *nvl; + if (nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) + poolname = (char *)pool; + /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); - if (spa_lookup(pool) != NULL) { + if (spa_lookup(poolname) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Allocate a new spa_t structure. */ + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, NULL, altroot); + spa = spa_add(poolname, nvl, altroot); + fnvlist_free(nvl); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } + + /* + * Temporary pool names should never be written to disk. + */ + if (poolname != pool) + spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; has_features = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) has_features = B_TRUE; } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (int c = 0; c < rvd->vdev_children; c++) { vdev_metaslab_set_size(rvd->vdev_child[c]); vdev_expand(rvd->vdev_child[c], txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (spa_version(spa) >= SPA_VERSION_FEATURES) spa_feature_create_zap_objects(spa, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY) spa_history_create_obj(spa, tx); /* * Generate some random noise for salted checksums to operate on. */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* * We explicitly wait for the first transaction to complete so that our * bean counters are appropriately updated. */ txg_wait_synced(spa->spa_dsl_pool, txg); spa_spawn_aux_threads(spa); spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create"); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; mutex_exit(&spa_namespace_lock); return (0); } #ifdef _KERNEL /* * Get the root pool information from the root disk, then import the root pool * during the system boot up time. */ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); static nvlist_t * spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) { nvlist_t *config; nvlist_t *nvtop, *nvroot; uint64_t pgid; if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) return (NULL); /* * Add this top-level vdev to the child array. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); /* * Put this pool's top-level vdevs into a root vdev. */ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &nvtop, 1) == 0); /* * Replace the existing vdev_tree with the new root vdev in * this pool's configuration (remove the old, add the new). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); return (config); } /* * Walk the vdev tree and see if we can find a device with "better" * configuration. A configuration is "better" if the label on that * device has a more recent txg. */ static void spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) { for (int c = 0; c < vd->vdev_children; c++) spa_alt_rootvdev(vd->vdev_child[c], avd, txg); if (vd->vdev_ops->vdev_op_leaf) { nvlist_t *label; uint64_t label_txg; if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, &label) != 0) return; VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg) == 0); /* * Do we have a better boot device? */ if (label_txg > *txg) { *txg = label_txg; *avd = vd; } nvlist_free(label); } } /* * Import a root pool. * * For x86. devpath_list will consist of devid and/or physpath name of * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). * The GRUB "findroot" command will return the vdev we should boot. * * For Sparc, devpath_list consists the physpath name of the booting device * no matter the rootpool is a single device pool or a mirrored pool. * e.g. * "/pci@1f,0/ide@d/disk@0,0:a" */ int spa_import_rootpool(char *devpath, char *devid) { spa_t *spa; vdev_t *rvd, *bvd, *avd = NULL; nvlist_t *config, *nvtop; uint64_t guid, txg; char *pname; int error; /* * Read the label from the boot device and generate a configuration. */ config = spa_generate_rootconf(devpath, devid, &guid); #if defined(_OBP) && defined(_KERNEL) if (config == NULL) { if (strstr(devpath, "/iscsi/ssd") != NULL) { /* iscsi boot */ get_iscsi_bootpath_phy(devpath); config = spa_generate_rootconf(devpath, devid, &guid); } } #endif if (config == NULL) { cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", devpath); return (SET_ERROR(EIO)); } VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pname)) != NULL) { /* * Remove the existing root pool from the namespace so that we * can replace it with the correct config we just read in. */ spa_remove(spa); } spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; /* * Build up a vdev tree based on the boot device's label config. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, VDEV_ALLOC_ROOTPOOL); spa_config_exit(spa, SCL_ALL, FTAG); if (error) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", pname); return (error); } /* * Get the boot vdev. */ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", (u_longlong_t)guid); error = SET_ERROR(ENOENT); goto out; } /* * Determine if there is a better boot device. */ avd = bvd; spa_alt_rootvdev(rvd, &avd, &txg); if (avd != bvd) { cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " "try booting from '%s'", avd->vdev_path); error = SET_ERROR(EINVAL); goto out; } /* * If the boot device is part of a spare vdev then ensure that * we're booting off the active spare. */ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && !bvd->vdev_isspare) { cmn_err(CE_NOTE, "The boot device is currently spared. Please " "try booting from '%s'", bvd->vdev_parent-> vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); error = SET_ERROR(EINVAL); goto out; } error = 0; out: spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); nvlist_free(config); return (error); } #endif /* * Import a non-root pool into the system. */ int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; uint64_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = FREAD; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_load_policy(config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; if (state != SPA_LOAD_RECOVER) { spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; zfs_dbgmsg("spa_import: importing %s", pool); } else { zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); } error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_L2CACHE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); spa_history_log_version(spa, "import"); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; zpool_load_policy_t policy; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, FREAD); /* * Rewind pool if a max txg was provided. */ zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_txg != UINT64_MAX) { spa->spa_load_max_txg = policy.zlp_txg; spa->spa_extreme_rewind = B_TRUE; zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", poolname, (longlong_t)policy.zlp_txg); } else { zfs_dbgmsg("spa_tryimport: importing %s", poolname); } if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) == 0) { zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; } else { spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. */ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. */ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname) == 0); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & FWRITE)) return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); /* * The pool will be in core if it's openable, * in which case we can modify its state. */ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { /* * Objsets may be open only because they're dirty, so we * have to force it to sync before checking spa_refcnt. */ txg_wait_synced(spa->spa_dsl_pool, 0); spa_evicting_os_wait(spa); /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EBUSY)); } /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EXDEV)); } /* * We're about to export or destroy this pool. Make sure * we stop all initializtion activity here before we * set the spa_final_txg. This will ensure that all * dirty data resulting from the initialization is * committed to disk before we unload the pool. */ if (spa->spa_root_vdev != NULL) { vdev_initialize_stop_all(spa->spa_root_vdev, VDEV_INITIALIZE_ACTIVE); } /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } } spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); return (0); } /* * Destroy a storage pool. */ int spa_destroy(char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg, id; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) != 0) nl2cache = 0; if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && (error = vdev_create(vd, txg, B_FALSE)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * We must validate the spares and l2cache devices after checking the * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect * vdevs, we can not add raidz toplevels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz */ if (tvd->vdev_ops == &vdev_raidz_ops) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* * Need the top level mirror to be * a mirror of leaf vdevs only */ if (tvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < tvd->vdev_children; cid++) { vdev_t *cvd = tvd->vdev_child[cid]; if (!cvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } } } } } for (int c = 0; c < vd->vdev_children; c++) { /* * Set the vdev id to the first hole, if one exists. */ for (id = 0; id < rvd->vdev_children; id++) { if (rvd->vdev_child[id]->vdev_ishole) { vdev_free(rvd->vdev_child[id]); break; } } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = id; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } if (nspares != 0) { spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ZPOOL_CONFIG_SPARES); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } if (nl2cache != 0) { spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ZPOOL_CONFIG_L2CACHE); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may * fail to open the pool because there are DVAs that the config cache * can't translate. Therefore, we first add the vdevs without * initializing metaslabs; sync the config cache (via spa_vdev_exit()); * and then let spa_config_update() initialize the new metaslabs. * * spa_load() checks for added-but-not-initialized vdevs, so that * if we lose power at any point in this sequence, the remaining * steps will be completed the next time we load the pool. */ (void) spa_vdev_exit(spa, vd, txg, 0); mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); } /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. If the path specifies * a device that is not mirrored, we automatically insert the mirror vdev. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!oldvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH)) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * Spares can't replace logs */ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). */ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); (void) sprintf(oldvd->vdev_path, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } /* mark the device being resilvered */ newvd->vdev_resilver_txg = txg; /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Schedule the resilver to restart in the future. We do this to * ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. */ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); return (0); } /* * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); /* * Besides being called directly from the userland through the * ioctl interface, spa_vdev_detach() can be potentially called * at the end of spa_vdev_resilver_done(). * * In the regular case, when we have a checkpoint this shouldn't * happen as we never empty the DTLs of a vdev during the scrub * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() * should never get here when we have a checkpoint. * * That said, even in a case when we checkpoint the pool exactly * as spa_vdev_resilver_done() calls this function everything * should be fine as the resilver will return right away. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a spare, then it implies * that the spare should become a real disk, and be removed from the * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0 && pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. */ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. */ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(spa, "detach", NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. */ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type) { /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping initialization. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the initializing operation. */ mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EROFS)); } mutex_enter(&vd->vdev_initialize_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate an initialize action we check to see * if the vdev_initialize_thread is NULL. We do this instead * of using the vdev_initialize_state since there might be * a previous initialization process which has completed but * the thread is not exited. */ if (cmd_type == POOL_INITIALIZE_DO && (vd->vdev_initialize_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_initialize_lock); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { mutex_exit(&vd->vdev_initialize_lock); mutex_exit(&spa_namespace_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_INITIALIZE_SUSPEND && vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); mutex_exit(&spa_namespace_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_INITIALIZE_DO: vdev_initialize(vd); break; case POOL_INITIALIZE_CANCEL: vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); break; case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_initialize_lock); /* Sync out the initializing state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); return (0); } /* * Split a set of devices from their mirrors, and create a new pool from them. */ int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || !vdev_is_concrete(vd)) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = SET_ERROR(EINVAL); break; } } /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = SET_ERROR(ENODEV); break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c])) { error = SET_ERROR(EBUSY); break; } /* we need certain info from the top level */ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, vml[c]->vdev_top->vdev_asize) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift) == 0); /* transfer per-vdev ZAPs */ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_TOP_ZAP, vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children) == 0); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl) == 0); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)) == 0); VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); for (c = 0; c < children; c++) { if (vml[c] != NULL) { /* * Temporarily stop the initializing activity. We set * the state to ACTIVE so that we know to resume * the initializing once the split has completed. */ mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE); mutex_exit(&vml[c]->vdev_initialize_lock); } } newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { VERIFY(nvlist_alloc(&newspa->spa_config_splitting, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL) { vdev_split(vml[c]); if (error == 0) spa_history_log_internal(spa, "detach", tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } spa->spa_avz_action = AVZ_ACTION_REBUILD; vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } /* restart initializing disks as necessary */ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be * the newest (see spa_vdev_attach() for how that works). In * the case where the newest vdev is faulted, we will not automatically * remove it after a resilver completes. This is OK as it will require * user intervention to determine which disk the admin wishes to keep. */ if (vd->vdev_ops == &vdev_replacing_ops) { ASSERT(vd->vdev_children > 1); newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } /* * Check for a completed resilver with the 'unspare' flag set. * Also potentially update faulted state. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; if (last->vdev_unspare) { oldvd = first; newvd = last; } else if (first->vdev_unspare) { oldvd = last; newvd = first; } else { oldvd = NULL; } if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); vdev_propagate_state(vd); /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to * attempt to free them up now so that they can be used * by other pools. Once we're back down to a single * disk+spare, we stop removing them. */ if (vd->vdev_children > 2) { newvd = vd->vdev_child[1]; if (newvd->vdev_isspare && last->vdev_isspare && vdev_dtl_empty(last, DTL_MISSING) && vdev_dtl_empty(last, DTL_OUTAGE) && !vdev_dtl_required(newvd)) return (newvd); } } return (NULL); } static void spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd, *pvd, *ppvd; uint64_t guid, sguid, pguid, ppguid; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { pvd = vd->vdev_parent; ppvd = pvd->vdev_parent; guid = vd->vdev_guid; pguid = pvd->vdev_guid; ppguid = ppvd->vdev_guid; sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); sguid = ppvd->vdev_child[1]->vdev_guid; } ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } spa_config_exit(spa, SCL_ALL, FTAG); } /* * Update the stored path or FRU for this vdev. */ int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); } int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } return (dsl_scan(spa->spa_dsl_pool, func)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } for (int c = 0; c < vd->vdev_children; c++) spa_async_probe(spa, vd->vdev_child[c]); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { sysevent_id_t eid; nvlist_t *attr; char *physpath; if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, ESC_DEV_DLE, attr, &eid, DDI_SLEEP); nvlist_free(attr); kmem_free(physpath, MAXPATHLEN); } static void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks = 0; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. */ if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), new_space, new_space - old_space); } } /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be probed. */ if (tasks & SPA_ASYNC_PROBE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE) spa_vdev_resilver_done(spa); /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER) dsl_resilver_restart(spa->spa_dsl_pool, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); spa_vdev_remove_suspend(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL && zthr_isrunning(condense_thread)) VERIFY0(zthr_cancel(condense_thread)); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL && zthr_isrunning(discard_thread)) VERIFY0(zthr_cancel(discard_thread)); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); spa_restart_removal(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL && !zthr_isrunning(condense_thread)) zthr_resume(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL && !zthr_isrunning(discard_thread)) zthr_resume(discard_thread); } static boolean_t spa_async_tasks_pending(spa_t *spa) { uint_t non_config_tasks; uint_t config_task; boolean_t config_task_suspended; non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; if (spa->spa_ccw_fail_time == 0) { config_task_suspended = B_FALSE; } else { config_task_suspended = (gethrtime() - spa->spa_ccw_fail_time) < (zfs_ccw_retry_interval * NANOSEC); } return (non_config_tasks || (config_task && !config_task_suspended)); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && spa->spa_async_thread == NULL && rootdir != NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, tx); return (0); } static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *zio = arg; zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, zio->io_flags)); return (0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. */ static void spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing deferred frees. */ static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = kmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); bzero(packed + nvsize, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); kmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. */ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (sav->sav_count == 0) { VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } /* * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. * The all-vdev ZAP must be empty. */ static void spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); } if (vd->vdev_leaf_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_leaf_zap, tx)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { spa_avz_build(vd->vdev_child[i], avz, tx); } } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; /* * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, * its config may not be dirty but we still need to build per-vdev ZAPs. * Similarly, if the pool is being assembled (e.g. after a split), we * need to rebuild the AVZ although the config may not be dirty. */ if (list_is_empty(&spa->spa_config_dirty_list) && spa->spa_avz_action == AVZ_ACTION_NONE) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || spa->spa_avz_action == AVZ_ACTION_INITIALIZE || spa->spa_all_vdev_zaps != 0); if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { /* Make and build the new AVZ */ uint64_t new_avz = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); spa_avz_build(spa->spa_root_vdev, new_avz, tx); /* Diff old AVZ with new one */ zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t vdzap = za.za_first_integer; if (zap_lookup_int(spa->spa_meta_objset, new_avz, vdzap) == ENOENT) { /* * ZAP is listed in old AVZ but not in new one; * destroy it */ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, tx)); } } zap_cursor_fini(&zc); /* Destroy the old AVZ */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); /* Replace the old AVZ in the dir obj with the new one */ VERIFY0(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, sizeof (new_avz), 1, &new_avz, tx)); spa->spa_all_vdev_zaps = new_avz; } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { zap_cursor_t zc; zap_attribute_t za; /* Walk through the AVZ and destroy all listed ZAPs */ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t zap = za.za_first_integer; VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); } zap_cursor_fini(&zc); /* Destroy and unlink the AVZ itself */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); spa->spa_all_vdev_zaps = 0; } if (spa->spa_all_vdev_zaps == 0) { spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx); } spa->spa_avz_action = AVZ_ACTION_NONE; /* Create ZAPs for vdevs that don't have them. */ vdev_construct_zaps(spa->spa_root_vdev, tx); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); /* * If we're upgrading the spa version then make sure that * the config object gets updated with the correct version. */ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa->spa_uberblock.ub_version); spa_config_exit(spa, SCL_STATE, FTAG); nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } static void spa_sync_version(void *arg, dmu_tx_t *tx) { uint64_t *versionp = arg; uint64_t version = *versionp; spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. */ ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "version=%lld", version); } /* * Set zpool properties. */ static void spa_sync_props(void *arg, dmu_tx_t *tx) { nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; char *strval, *fname; zpool_prop_t prop; const char *propname; zprop_type_t proptype; spa_feature_t fid; switch (prop = zpool_name_to_prop(nvpair_name(elem))) { case ZPOOL_PROP_INVAL: /* * We checked this earlier in spa_prop_validate(). */ ASSERT(zpool_prop_feature(nvpair_name(elem))); fname = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(fname, &fid)); spa_feature_enable(spa, fid, tx); spa_history_log_internal(spa, "set", tx, "%s=enabled", nvpair_name(elem)); break; case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* * The version is synced seperatly before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persisitent * properties. */ break; case ZPOOL_PROP_COMMENT: strval = fnvpair_value_string(elem); if (spa->spa_comment != NULL) spa_strfree(spa->spa_comment); spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs * so that their labels get updated. It's unnecessary * to do this for pool creation since the vdev's * configuratoin has already been dirtied. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { spa->spa_pool_props_object = zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, tx); } /* normalize the property name */ propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(zpool_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%lld", nvpair_name(elem), intval); } else { ASSERT(0); /* not allowed */ } switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_DEDUPDITTO: spa->spa_dedup_ditto = intval; break; default: break; } } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes. * Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { dsl_pool_t *dp = spa->spa_dsl_pool; ASSERT(spa->spa_sync_pass == 1); rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } /* * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable * when possibility to use lz4 compression for metadata was added * Old pools that have this feature enabled must be upgraded to have * this feature active */ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { boolean_t lz4_en = spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS); boolean_t lz4_ac = spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS); if (lz4_en && !lz4_ac) spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); } /* * If we haven't written the salt, do so now. Note that the * feature may not be activated yet, but that's fine since * the presence of this ZAP entry is backwards compatible. */ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT) == ENOENT) { VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes, tx)); } rrw_exit(&dp->dp_config_rwlock, FTAG); } static void vdev_indirect_state_sync_verify(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib = vd->vdev_indirect_births; if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(vim != NULL); ASSERT(vib != NULL); } if (vdev_obsolete_sm_object(vd) != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); ASSERT3U(vdev_obsolete_sm_object(vd), ==, space_map_object(vd->vdev_obsolete_sm)); ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, space_map_allocated(vd->vdev_obsolete_sm)); } ASSERT(vd->vdev_obsolete_segments != NULL); /* * Since frees / remaps to an indirect vdev can only * happen in syncing context, the obsolete segments * tree must be empty when we start syncing. */ ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; int error; uint32_t max_queue_depth = zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100; VERIFY(spa_writeable(spa)); /* * Wait for i/os issued in open context that need to complete * before this txg syncs. */ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_alloc_locks[i]); VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); mutex_exit(&spa->spa_alloc_locks[i]); } /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while (list_head(&spa->spa_state_dirty_list) != NULL) { /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree. */ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, spa->spa_sync_starttime + spa->spa_deadman_synctime)); /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY(0 == zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } /* * Set the top-level vdev's max queue depth. Evaluate each * top-level's async write queue depth in case it changed. * The max queue depth will not change in the middle of syncing * out this txg. */ uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || mg->mg_class != spa_normal_class(spa) || !metaslab_group_initialized(mg)) continue; /* * It is safe to do a lock-free check here because only async * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ for (int i = 0; i < spa->spa_alloc_count; i++) ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i]))); mg->mg_max_alloc_queue_depth = max_queue_depth; for (int i = 0; i < spa->spa_alloc_count; i++) { mg->mg_cur_max_alloc_queue_depth[i] = zfs_vdev_def_queue_depth; } slots_per_allocator += zfs_vdev_def_queue_depth; } metaslab_class_t *mc = spa_normal_class(spa); for (int i = 0; i < spa->spa_alloc_count; i++) { ASSERT0(refcount_count(&mc->mc_alloc_slots[i])); mc->mc_alloc_max_slots[i] = slots_per_allocator; } mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); if (vdev_indirect_should_condense(vd)) { spa_condense_indirect_start_sync(vd, tx); break; } } /* * Iterate to convergence. */ do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free) { spa_sync_frees(spa, free_bpl, tx); } else { /* * We can not defer frees in pass 1, because * we sync the deferred frees later in pass 1. */ ASSERT3U(pass, >, 1); bplist_iterate(free_bpl, bpobj_enqueue_cb, &spa->spa_deferred_bpobj, tx); } ddt_sync(spa, txg); dsl_scan_sync(dp, tx); if (spa->spa_vdev_removal != NULL) svr_sync(spa, tx); while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) vdev_sync(vd, txg); if (pass == 1) { spa_sync_upgrades(spa, tx); ASSERT3U(txg, >=, spa->spa_uberblock.ub_rootbp.blk_birth); /* * Note: We need to check if the MOS is dirty * because we could have marked the MOS dirty * without updating the uberblock (e.g. if we * have sync tasks but no dirty user data). We * need to check the uberblock's rootbp because * it is updated if we have synced out dirty * data (though in this case the MOS will most * likely also be dirty due to second order * effects, we don't want to rely on that here). */ if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, * therefore this TXG is a no-op. Avoid * syncing deferred frees, so that we * can keep this TXG as a no-op. */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); break; } spa_sync_deferred_frees(spa, tx); } } while (dmu_objset_is_dirty(mos, txg)); if (!list_is_empty(&spa->spa_config_dirty_list)) { /* * Make sure that the number of ZAPs for all the vdevs matches * the number of ZAPs in the per-vdev ZAP list. This only gets * called if the config is dirty; otherwise there may be * outstanding AVZ operations that weren't completed in * spa_sync_config_object. */ uint64_t all_vdev_zap_entry_count; ASSERT0(zap_count(spa->spa_meta_objset, spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, all_vdev_zap_entry_count); } if (spa->spa_vdev_removal != NULL) { ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); } /* * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few * random top-level vdevs that are known to be visible in the * config cache (see spa_vdev_add() for a complete description). * If there *are* dirty vdevs, sync the uberblock to all vdevs. */ for (;;) { /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, txg); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg); } if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL); zio_resume_wait(spa); } dmu_tx_commit(tx); VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } dsl_pool_sync_done(dp, txg); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_alloc_locks[i]); VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); mutex_exit(&spa->spa_alloc_locks[i]); } /* * Update usable space statistics. */ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); spa_update_dspace(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); while (zfs_pause_spa_sync) delay(1); spa->spa_sync_pass = 0; /* * Update the last synced uberblock here. We want to do this at * the end of spa_sync() so that consumers of spa_last_synced_txg() * will be guaranteed that all the processing associated with * that txg has been completed. */ spa->spa_ubsync = spa->spa_uberblock; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); } /* * Sync all pools. We don't want to hold the namespace lock across these * operations, so we take a reference on the spa_t and drop the lock during the * sync. */ void spa_sync_allpools(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } /* * ========================================================================== * Miscellaneous routines * ========================================================================== */ /* * Remove all pools in the system. */ void spa_evict_all(void) { spa_t *spa; /* * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach * a device that's been replaced, which requires grabbing * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } spa_remove(spa); } mutex_exit(&spa_namespace_lock); } vdev_t * spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } } return (NULL); } void spa_upgrade(spa_t *spa, uint64_t version) { ASSERT(spa_writeable(spa)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. */ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { int i; uint64_t spareguid; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); for (i = 0; i < sav->sav_npending; i++) { if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, &spareguid) == 0 && spareguid == guid) return (B_TRUE); } return (B_FALSE); } /* * Check if a pool has an active shared spare device. * Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; #ifdef _KERNEL sysevent_attr_list_t *attr = NULL; sysevent_value_t value; ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", SE_SLEEP); ASSERT(ev != NULL); value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = spa_name(spa); if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) goto done; value.value_type = SE_DATA_TYPE_UINT64; value.value.sv_uint64 = spa_guid(spa); if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) goto done; if (vd) { value.value_type = SE_DATA_TYPE_UINT64; value.value.sv_uint64 = vd->vdev_guid; if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, SE_SLEEP) != 0) goto done; if (vd->vdev_path) { value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = vd->vdev_path; if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, &value, SE_SLEEP) != 0) goto done; } } if (hist_nvl != NULL) { fnvlist_merge((nvlist_t *)attr, hist_nvl); } if (sysevent_attach_attributes(ev, attr) != 0) goto done; attr = NULL; done: if (attr) sysevent_free_attr(attr); #endif return (ev); } void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL sysevent_id_t eid; (void) log_sysevent(ev, SE_SLEEP, &eid); sysevent_free(ev); #endif } void spa_event_discard(sysevent_t *ev) { #ifdef _KERNEL sysevent_free(ev); #endif } /* * Post a sysevent corresponding to the given event. The 'name' must be one of * the event definitions in sys/sysevent/eventdefs.h. The payload will be * filled in from the spa and (optionally) the vdev and history nvl. This * doesn't do anything in the userland libzpool, as we don't want consumers to * misinterpret ztest or zdb as real changes. */ void spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/spa_config.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/spa_config.c (revision 342531) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa_config.c (revision 342532) @@ -1,547 +1,573 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Pool configuration repository. * * Pool configuration is stored as a packed nvlist on the filesystem. By * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot * (when the ZFS module is loaded). Pools can also have the 'cachefile' * property set that allows them to be stored in an alternate location until * the control of external software. * * For each cache file, we have a single nvlist which holds all the * configuration information. When the module loads, we read this information * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is * maintained independently in spa.c. Whenever the namespace is modified, or * the configuration of a pool is changed, we call spa_write_cachefile(), which * walks through all the active pools and writes the configuration to disk. */ static uint64_t spa_config_generation = 1; /* * This can be overridden in userland to preserve an alternate namespace for * userland pools when doing testing. */ const char *spa_config_path = ZPOOL_CACHE; /* * Called when the module is first loaded, this routine loads the configuration * file into the SPA namespace. It does not actually open or load the pools; it * only populates the namespace. */ void spa_config_load(void) { void *buf = NULL; nvlist_t *nvlist, *child; nvpair_t *nvpair; char *pathname; struct _buf *file; uint64_t fsize; /* * Open the configuration file. */ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(pathname, MAXPATHLEN, "%s%s", (rootdir != NULL) ? "./" : "", spa_config_path); file = kobj_open_file(pathname); kmem_free(pathname, MAXPATHLEN); if (file == (struct _buf *)-1) return; if (kobj_get_filesize(file, &fsize) != 0) goto out; buf = kmem_alloc(fsize, KM_SLEEP); /* * Read the nvlist from the file. */ if (kobj_read_file(file, buf, fsize, 0) < 0) goto out; /* * Unpack the nvlist. */ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) goto out; /* * Iterate over all elements in the nvlist, creating a new spa_t for * each one with the specified configuration. */ mutex_enter(&spa_namespace_lock); nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) continue; child = fnvpair_value_nvlist(nvpair); if (spa_lookup(nvpair_name(nvpair)) != NULL) continue; (void) spa_add(nvpair_name(nvpair), child, NULL); } mutex_exit(&spa_namespace_lock); nvlist_free(nvlist); out: if (buf != NULL) kmem_free(buf, fsize); kobj_close_file(file); } static int spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { size_t buflen; char *buf; vnode_t *vp; int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; char *temp; int err; /* * If the nvlist is empty (NULL), then remove the old cachefile. */ if (nvl == NULL) { err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); return (err); } /* * Pack the configuration into a buffer. */ buf = fnvlist_pack(nvl, &buflen); temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); /* * Write the configuration to disk. We need to do the traditional * 'write to temporary file, sync, move over original' to make sure we * always have a consistent view of the data. */ (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0); if (err == 0) { err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, NULL); if (err == 0) err = VOP_FSYNC(vp, FSYNC, kcred, NULL); if (err == 0) err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE); (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); VN_RELE(vp); } (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); fnvlist_pack_free(buf, buflen); kmem_free(temp, MAXPATHLEN); return (err); } /* * Synchronize pool configuration to disk. This must be called with the * namespace lock held. Synchronizing the pool cache is typically done after * the configuration has been synced to the MOS. This exposes a window where * the MOS config will have been updated but the cache file has not. If * the system were to crash at that instant then the cached config may not * contain the correct information to open the pool and an explicit import * would be required. */ void spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; boolean_t ccw_failure; int error; + char *pool_name; ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (rootdir == NULL || !(spa_mode_global & FWRITE)) return; /* * Iterate over all cachefiles for the pool, past or present. When the * cachefile is changed, the new one is pushed onto this list, allowing * us to update previous cachefiles that no longer contain this pool. */ ccw_failure = B_FALSE; for (dp = list_head(&target->spa_config_list); dp != NULL; dp = list_next(&target->spa_config_list, dp)) { spa_t *spa = NULL; if (dp->scd_path == NULL) continue; /* * Iterate over all pools, adding any matching pools to 'nvl'. */ nvl = NULL; while ((spa = spa_next(spa)) != NULL) { /* * Skip over our own pool if we're about to remove * ourselves from the spa namespace or any pool that * is readonly. Since we cannot guarantee that a * readonly pool would successfully import upon reboot, * we don't allow them to be written to the cache file. */ if ((spa == target && removing) || !spa_writeable(spa)) continue; mutex_enter(&spa->spa_props_lock); tdp = list_head(&spa->spa_config_list); if (spa->spa_config == NULL || tdp->scd_path == NULL || strcmp(tdp->scd_path, dp->scd_path) != 0) { mutex_exit(&spa->spa_props_lock); continue; } if (nvl == NULL) nvl = fnvlist_alloc(); - fnvlist_add_nvlist(nvl, spa->spa_name, + if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { + pool_name = fnvlist_lookup_string( + spa->spa_config, ZPOOL_CONFIG_POOL_NAME); + } else { + pool_name = spa_name(spa); + } + + fnvlist_add_nvlist(nvl, pool_name, spa->spa_config); mutex_exit(&spa->spa_props_lock); } error = spa_config_write(dp, nvl); if (error != 0) ccw_failure = B_TRUE; nvlist_free(nvl); } if (ccw_failure) { /* * Keep trying so that configuration data is * written if/when any temporary filesystem * resource issues are resolved. */ if (target->spa_ccw_fail_time == 0) { zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, target, NULL, NULL, 0, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); } else { /* * Do not rate limit future attempts to update * the config cache. */ target->spa_ccw_fail_time = 0; } /* * Remove any config entries older than the current one. */ dp = list_head(&target->spa_config_list); while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { list_remove(&target->spa_config_list, tdp); if (tdp->scd_path != NULL) spa_strfree(tdp->scd_path); kmem_free(tdp, sizeof (spa_config_dirent_t)); } spa_config_generation++; if (postsysevent) spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); } /* * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, * and we don't want to allow the local zone to see all the pools anyway. * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pool visible within the zone. */ nvlist_t * spa_all_configs(uint64_t *generation) { nvlist_t *pools; spa_t *spa = NULL; if (*generation == spa_config_generation) return (NULL); pools = fnvlist_alloc(); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); fnvlist_add_nvlist(pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } } *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); return (pools); } void spa_config_set(spa_t *spa, nvlist_t *config) { mutex_enter(&spa->spa_props_lock); if (spa->spa_config != NULL && spa->spa_config != config) nvlist_free(spa->spa_config); spa->spa_config = config; mutex_exit(&spa->spa_props_lock); } /* * Generate the pool's configuration based on the current in-core state. * * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. */ nvlist_t * spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) { nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; boolean_t locked = B_FALSE; uint64_t split_guid; + char *pool_name; if (vd == NULL) { vd = rvd; locked = B_TRUE; spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == (SCL_CONFIG | SCL_STATE)); /* * If txg is -1, report the current value of spa->spa_config_txg. */ if (txg == -1ULL) txg = spa->spa_config_txg; + /* + * Originally, users had to handle spa namespace collisions by either + * exporting the already imported pool or by specifying a new name for + * the pool with a conflicting name. In the case of root pools from + * virtual guests, neither approach to collision resolution is + * reasonable. This is addressed by extending the new name syntax with + * an option to specify that the new name is temporary. When specified, + * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us + * to use the previous name, which we do below. + */ + if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { + pool_name = fnvlist_lookup_string(spa->spa_config, + ZPOOL_CONFIG_POOL_NAME); + } else { + pool_name = spa_name(spa); + } + config = fnvlist_alloc(); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); - fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); + fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); if (spa->spa_comment != NULL) { fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); } hostid = zone_get_hostid(NULL); if (hostid != 0) { fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); } fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename); int config_gen_flags = 0; if (vd != rvd) { fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_isspare) { fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, 1ULL); } if (vd->vdev_islog) { fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 1ULL); } vd = vd->vdev_top; /* label contains top config */ } else { /* * Only add the (potentially large) split information * in the mos config, and not in the vdev labels */ if (spa->spa_config_splitting != NULL) fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, spa->spa_config_splitting); fnvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); config_gen_flags |= VDEV_CONFIG_MOS; } /* * Add the top-level config. We even add this on pools which * don't support holes in the namespace. */ vdev_top_config_generate(spa, config); /* * If we're splitting, record the original pool's guid. */ if (spa->spa_config_splitting != NULL && nvlist_lookup_uint64(spa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, split_guid); } nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); nvlist_free(nvroot); /* * Store what's necessary for reading the MOS in the label. */ fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, spa->spa_label_features); if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); kmem_free(ddh, sizeof (ddt_histogram_t)); ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); ddt_get_dedup_object_stats(spa, ddo); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); kmem_free(ddo, sizeof (ddt_object_t)); dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); ddt_get_dedup_stats(spa, dds); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); kmem_free(dds, sizeof (ddt_stat_t)); } if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; int c; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); txg = spa_last_synced_txg(spa) + 1; if (what == SPA_CONFIG_UPDATE_POOL) { vdev_config_dirty(rvd); } else { /* * If we have top-level vdevs that were added but have * not yet been prepared for allocation, do that now. * (It's safe now because the config cache is up to date, * so it will be able to translate the new DVAs.) * See comments in spa_vdev_add() for full details. */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * Explicitly skip vdevs that are indirect or * log vdevs that are being removed. The reason * is that both of those can have vdev_ms_array * set to 0 and we wouldn't want to change their * metaslab size nor call vdev_expand() on them. */ if (!vdev_is_concrete(tvd) || (tvd->vdev_islog && tvd->vdev_removing)) continue; if (tvd->vdev_ms_array == 0) vdev_metaslab_set_size(tvd); vdev_expand(tvd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); /* * Wait for the mosconfig to be regenerated and synced. */ txg_wait_synced(spa->spa_dsl_pool, txg); /* * Update the global config cache to reflect the new mosconfig. */ if (!spa->spa_is_root) { spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); } if (what == SPA_CONFIG_UPDATE_POOL) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c (revision 342531) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c (revision 342532) @@ -1,6561 +1,6567 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Datto Inc. */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * The ioctl numbers can change from release to release, because * the caller (libzfs) must be matched to the kernel. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include "lua.h" #include "lauxlib.h" extern struct modlfs zfs_modlfs; extern void zfs_init(void); extern void zfs_fini(void); ldi_ident_t zfs_li = NULL; dev_info_t *zfs_dip; uint_t zfs_fsyncer_key; extern uint_t rrw_tsd_key; static uint_t zfs_allow_log_key; typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); typedef enum { NO_NAME, POOL_NAME, DATASET_NAME } zfs_ioc_namecheck_t; typedef enum { POOL_CHECK_NONE = 1 << 0, POOL_CHECK_SUSPENDED = 1 << 1, POOL_CHECK_READONLY = 1 << 2, } zfs_ioc_poolcheck_t; typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature); /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; char buf[512]; va_list adx; /* * Get rid of annoying "../common/" prefix to filename. */ newfile = strrchr(file, '/'); if (newfile != NULL) { newfile = newfile + 1; /* Get rid of leading / */ } else { newfile = file; } va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); /* * To get this data, use the zfs-dprintf probe as so: * dtrace -q -n 'zfs-dprintf \ * /stringof(arg0) == "dbuf.c"/ \ * {printf("%s: %s", stringof(arg1), stringof(arg3))}' * arg0 = file name * arg1 = function name * arg2 = line number * arg3 = message */ DTRACE_PROBE4(zfs__dprintf, char *, newfile, char *, func, int, line, char *, buf); } static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == NULL) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Check to see if the named dataset is currently defined as bootable */ static boolean_t zfs_is_bootfs(const char *name) { objset_t *os; if (dmu_objset_hold(name, FTAG, &os) == 0) { boolean_t ret; ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); dmu_objset_rele(os, FTAG); return (ret); } return (B_FALSE); } /* * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ /* ARGSUSED */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ /* ARGSUSED */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, "zoned", &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; /* * First do a quick check for root in the global zone, which * is allowed to do all write_perms. This ensures that zfs_ioc_* * will get to handle nonexistent datasets. */ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) return (0); error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) { char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); } static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { char *strval; /* * Check permissions for special properties. */ switch (prop) { case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, "zoned", &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } /* ARGSUSED */ static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; error = zfs_dozonecheck(zc->zc_name, cr); if (error != 0) return (error); /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ return (0); } /* ARGSUSED */ static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } /* ARGSUSED */ static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *ds; char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. */ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } /* ARGSUSED */ static int zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { vnode_t *vp; int error; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ if (vp->v_vfsp->vfs_fstype != zfsfstype || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); return (SET_ERROR(EPERM)); } VN_RELE(vp); return (dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_SHARE, cr)); } int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); if (secpolicy_nfs(cr) == 0) { return (0); } else { return (zfs_secpolicy_deleg_share(zc, innvl, cr)); } } int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); if (secpolicy_smb(cr) == 0) { return (0); } else { return (zfs_secpolicy_deleg_share(zc, innvl, cr)); } } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strncpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } /* ARGSUSED */ static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ /* ARGSUSED */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). */ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } /* ARGSUSED */ static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } /* ARGSUSED */ static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; int error = 0; nvpair_t *pair; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_REMAP, cr)); } /* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ /* ARGSUSED */ static int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ /* ARGSUSED */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. */ /* ARGSUSED */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (secpolicy_zinject(cr)); } /* ARGSUSED */ static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else { if (groupmember(zc->zc_guid, cr)) return (0); } } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } /* ARGSUSED */ static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } /* ARGSUSED */ static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; nvlist_t *holds; int error; error = nvlist_lookup_nvlist(innvl, "holds", &holds); if (error != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } /* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. */ int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr)) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); return (error); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = kmem_alloc(size, KM_SLEEP); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { kmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { kmem_free(packed, size); return (error); } kmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { error = SET_ERROR(ENOMEM); } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } int getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) { int error = 0; if (dmu_objset_type(os) != DMU_OST_ZFS) { return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); if (*zfvp) { VFS_HOLD((*zfvp)->z_vfs); } else { error = SET_ERROR(ESRCH); } mutex_exit(&os->os_user_ptr_lock); return (error); } int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, zfvp); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_vfs will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all vnode ops from running. */ static int zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, zfvp); if (error == 0) { rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : RW_READER, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ rrm_exit(&(*zfvp)->z_teardown_lock, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) { rrm_exit(&zfsvfs->z_teardown_lock, tag); if (zfsvfs->z_vfs) { VFS_RELE(zfsvfs->z_vfs); } else { dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; + char *spa_name = zc->zc_name; if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; uint64_t version = SPA_VERSION; + char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) { nvlist_free(config); nvlist_free(props); return (error); } (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; + + if (nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) + spa_name = tname; } error = spa_create(zc->zc_name, config, props, zplprops); /* * Set the remaining root properties */ - if (!error && (error = zfs_set_prop_nvlist(zc->zc_name, + if (!error && (error = zfs_set_prop_nvlist(spa_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) - (void) spa_destroy(zc->zc_name); + (void) spa_destroy(spa_name); pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); if (error == 0) zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); if (error == 0) zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (SET_ERROR(EEXIST)); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if (zc->zc_flags == POOL_SCRUB_PAUSE) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); else if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = kmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); kmem_free(hist_buf, size); return (error); } static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele(os, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config, **l2cache, **spares; uint_t nl2cache = 0, nspares = 0; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, &spares, &nspares); /* * A root pool with concatenated devices is not supported. * Thus, can not add a device to a root pool. * * Intent log device can not be added to a rootpool because * during mountroot, zil is replayed, a seperated log device * can not be accessed during the mountroot time. * * l2cache and spare devices are ok to be added to a rootpool. */ if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { nvlist_free(config); spa_close(spa, FTAG); return (SET_ERROR(EDOM)); } if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_guid guid of vdev to remove * zc_cookie cancel removal */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); if (zc->zc_cookie != 0) { error = spa_vdev_remove_cancel(spa); } else { error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; int replacing = zc->zc_cookie; nvlist_t *config; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); VERIFY0(error); } error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. */ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if (err = dmu_objset_hold(zc->zc_name, FTAG, &os)) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. */ if (zc->zc_nvlist_dst != NULL && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } static boolean_t dataset_name_hidden(const char *name) { /* * Skip over datasets that are not visible in this zone, * internal datasets (which have a $ in their name), and * temporary datasets (which have a % in their name). */ if (strchr(name, '$') != NULL) return (B_TRUE); if (strchr(name, '%') != NULL) return (B_TRUE); if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL)) return (B_TRUE); return (B_FALSE); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. */ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * zc_simple when set, only name is requested * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? ESRCH : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == 0 && !zc->zc_simple) { dsl_dataset_t *ds; dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); if (error == 0) { objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); if (error == 0) error = zfs_ioc_objset_stats_impl(zc, ossnap); dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { error = SET_ERROR(ESRCH); } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *domain; char *dash; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. * * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). */ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; int err = -1; if (prop == ZPROP_INVAL) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) return (-1); VERIFY(0 == nvpair_value_uint64(pair, &intval)); switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dsname); (void) zfs_ioc_userspace_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; uint64_t intval; char *strval; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); int err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(EINVAL); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (!nvlist_empty(genericnvl) && dsl_props_set(dsname, source, genericnvl) != 0) { /* * If this fails, we still want to set as many properties as we * can, so try setting them individually. */ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); int err = 0; propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } } nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(const char *fsname, nvlist_t *nvl) { nvpair_t *pair = NULL; int error = 0; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_USERPROP, CRED())) return (error); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (E2BIG); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != NULL && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ if (received) { nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ if (prop == ZPROP_INVAL) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: nvlist_free(dummy); return (SET_ERROR(EINVAL)); } pair = nvlist_next_nvpair(dummy, NULL); err = zfs_prop_set_special(zc->zc_name, source, pair); nvlist_free(dummy); if (err != -1) return (err); /* special property already handled */ } else { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } /* property name has been validated by zfs_secpolicy_inherit_prop() */ return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props)) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. */ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_props(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *nvp = NULL; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) error = spa_prop_get(spa, &nvp); mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); spa_close(spa, FTAG); } if (error == 0 && zc->zc_nvlist_dst != NULL) error = put_nvlist(zc, nvp); else error = SET_ERROR(EFAULT); nvlist_free(nvp); return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } /* ARGSUSED */ static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; ASSERT(zplprops != NULL); if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) return (SET_ERROR(EINVAL)); /* * Pull out creator prop choices, if any. */ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. */ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[ZFS_MAX_DATASET_NAME_LEN]; char *cp; spa_t *spa; uint64_t spa_vers; int error; (void) strlcpy(parentname, dataset, sizeof (parentname)); cp = strrchr(parentname, '/'); ASSERT(cp != NULL); cp[0] = '\0'; if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. */ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); int32_t type32; dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; if (nvlist_lookup_int32(innvl, "type", &type32) != 0) return (SET_ERROR(EINVAL)); type = type32; (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize( volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); nvlist_free(zct.zct_zplprops); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; char *origin_name; if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0) return (SET_ERROR(EINVAL)); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); if (error != 0) return (error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } return (error); } /* ARGSUSED */ static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); return (dmu_objset_remap_indirects(fsname)); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if ((error = zfs_check_userprops(poolname, props)) != 0) return (error); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* This must be the only snap of this fs. */ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ /* ARGSUSED */ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { char *message; spa_t *spa; int error; char *poolname; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); strfree(poolname); if (error != 0) return (error); if (nvlist_lookup_string(innvl, "message", &message) != 0) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ void zfs_unmount_snap(const char *snapname) { vfs_t *vfsp = NULL; zfsvfs_t *zfsvfs = NULL; if (strchr(snapname, '@') == NULL) return; int err = getzfsvfs(snapname, &zfsvfs); if (err != 0) { ASSERT3P(zfsvfs, ==, NULL); return; } vfsp = zfsvfs->z_vfs; ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); err = vn_vfswlock(vfsp->vfs_vnodecovered); VFS_RELE(vfsp); if (err != 0) return; /* * Always force the unmount for snapshots. */ (void) dounmount(vfsp, MS_FORCE, kcred); } /* ARGSUSED */ static int zfs_unmount_snap_cb(const char *snapname, void *arg) { zfs_unmount_snap(snapname); return (0); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. This routine will do that unmount * if necessary. */ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) * */ /* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvpair_t *pair; boolean_t defer; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); defer = nvlist_exists(innvl, "defer"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { zfs_unmount_snap(nvpair_name(pair)); } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. Bookmark names are of the form #. * All bookmarks must be in the same pool. * * innvl: { * bookmark1 -> snapshot1, bookmark2 -> snapshot2 * } * * outnvl: bookmark -> error code (int32) * */ /* ARGSUSED */ static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *snap_name; /* * Verify the snapshot argument. */ if (nvpair_value_string(pair, &snap_name) != 0) return (SET_ERROR(EINVAL)); /* Verify that the keys (bookmarks) are unique */ for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) { if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) return (SET_ERROR(EINVAL)); } } return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; if (0 != nvlist_lookup_string(innvl, ZCP_ARG_PROGRAM, &program)) { return (EINVAL); } if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { instrlimit = ZCP_DEFAULT_INSTRLIMIT; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } if (0 != nvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST, &nvarg)) { return (EINVAL); } if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) return (EINVAL); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) return (EINVAL); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); } /* * innvl: unused * outnvl: empty */ /* ARGSUSED */ static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (spa_checkpoint(poolname)); } /* * innvl: unused * outnvl: empty */ /* ARGSUSED */ static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (spa_checkpoint_discard(poolname)); } /* * inputs: * zc_name name of dataset to destroy * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; int err; err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); if (ost == DMU_OST_ZFS) zfs_unmount_snap(zc->zc_name); if (strchr(zc->zc_name, '@')) err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); else err = dsl_destroy_head(zc->zc_name); if (ost == DMU_OST_ZVOL && err == 0) (void) zvol_remove_minor(zc->zc_name); return (err); } /* * innvl: { * vdevs: { * guid 1, guid 2, ... * }, * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND} * } * * outnvl: { * [func: EINVAL (if provided command type didn't make sense)], * [vdevs: { * guid1: errno, (see function body for possible errnos) * ... * }] * } * */ static int zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, &cmd_type) != 0) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_DO || cmd_type == POOL_INITIALIZE_SUSPEND)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, &vdev_guids) != 0) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = 0; for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); error = spa_vdev_initialize(spa, vdev_guid, cmd_type); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } if (fnvlist_size(vdev_errlist) > 0) { fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, vdev_errlist); } fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? EINVAL : 0); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot * * outnvl: "target" -> name of most recent snapshot * } */ /* ARGSUSED */ static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); if (target != NULL) { const char *cp = strchr(target, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); } if (getzfsvfs(fsname, &zfsvfs) == 0) { dsl_dataset_t *ds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, target, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? error : resume_err; } VFS_RELE(zfsvfs->z_vfs); } else { error = dsl_dataset_rollback(fsname, target, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char fullname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname); zfs_unmount_snap(fullname); return (0); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; boolean_t recursive = zc->zc_cookie & 1; char *at; int err; /* "zfs rename" from and to ...%recv datasets should both fail */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (ost == DMU_OST_ZFS) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { if (ost == DMU_OST_ZVOL) (void) zvol_remove_minor(zc->zc_name); return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; int err; if (prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr)) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else { /* USERUSED and GROUPUSED are read-only */ return (SET_ERROR(EINVAL)); } if (err = zfs_secpolicy_write_perms(dsname, perm, cr)) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. */ if (nvpair_value_uint64(pair, &intval) == 0) { if (intval >= ZIO_COMPRESS_GZIP_1 && intval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (intval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (intval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } /* * If this is a bootable dataset then * verify that the compression algorithm * is supported for booting. We must return * something other than ENOTSUP since it * implies a downrev pool version. */ if (zfs_is_bootfs(dsname) && !BOOTFS_COMPRESS_VALID(intval)) { return (SET_ERROR(ERANGE)); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_value_uint64(pair, &intval) != 0) return (SET_ERROR(EINVAL)); /* check prop value is enabled in features */ feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); break; } } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Checks for a race condition to make sure we don't increment a feature flag * multiple times. */ static int zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_feature_t *featurep = arg; if (!spa_feature_is_active(spa, *featurep)) return (0); else return (SET_ERROR(EBUSY)); } /* * The callback invoked on feature activation in the sync task caused by * zfs_prop_activate_feature. */ static void zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_feature_t *featurep = arg; spa_feature_incr(spa, *featurep, tx); } /* * Activates a feature on a pool in response to a property setting. This * creates a new sync task which modifies the pool to reflect the feature * as being active. */ static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature) { int err; /* EBUSY here indicates that the feature is already active */ err = dsl_sync_task(spa_name(spa), zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, &feature, 2, ZFS_SPACE_CHECK_RESERVED); if (err != 0 && err != EBUSY) return (err); else return (0); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. * * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dataset); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strcpy(zc->zc_value, nvpair_name(pair)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. * * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); VERIFY(nvlist_remove_nvpair(props, nvp) == 0); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } #ifdef DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * inputs: * zc_name name of containing filesystem * zc_nvlist_src{_size} nvlist of properties to apply * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * zc_cleanup_fd cleanup-on-exit file descriptor * zc_action_handle handle for this guid/ds mapping (or zero on first call) * zc_resumable if data is incomplete assume sender will resume * * outputs: * zc_cookie number of bytes read * zc_nvlist_dst{_size} error for each unapplied received property * zc_obj zprop_errflags_t * zc_action_handle handle for this guid/ds mapping */ static int zfs_ioc_recv(zfs_cmd_t *zc) { file_t *fp; dmu_recv_cookie_t drc; boolean_t force = (boolean_t)zc->zc_guid; int fd; int error = 0; int props_error = 0; nvlist_t *errors; offset_t off; nvlist_t *props = NULL; /* sent properties */ nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *delayprops = NULL; /* sent properties applied post-receive */ char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t first_recvd_props = B_FALSE; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); (void) strcpy(tofs, zc->zc_value); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != NULL && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props)) != 0) return (error); fd = zc->zc_cookie; fp = getf(fd); if (fp == NULL) { nvlist_free(props); return (SET_ERROR(EBADF)); } errors = fnvlist_alloc(); if (zc->zc_string[0]) origin = zc->zc_string; error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc); if (error != 0) goto out; /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (props != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, so stash * away the existing ones. */ if (dsl_prop_get_received(tofs, &origprops) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. */ if (!first_recvd_props) props_reduce(props, origprops); if (zfs_check_clearable(tofs, origprops, &errlist) != 0) (void) nvlist_merge(errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origprops, first_recvd_props ? NULL : props) != 0) zc->zc_obj |= ZPROP_ERR_NOCLEAR; } else { zc->zc_obj |= ZPROP_ERR_NOCLEAR; } } if (props != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { delayprops = extract_delay_props(props); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, props, errors); } } off = fp->f_offset; error = dmu_recv_stream(&drc, fp->f_vnode, &off, zc->zc_cleanup_fd, &zc->zc_action_handle); if (error == 0) { zfsvfs_t *zfsvfs = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int end_err; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); if (error == 0) error = zfs_resume_fs(zfsvfs, ds); error = error ? error : end_err; VFS_RELE(zfsvfs->z_vfs); } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. */ if (delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, delayprops, errors); } } if (delayprops != NULL) { /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. */ ASSERT(nvlist_merge(props, delayprops, 0) == 0); nvlist_free(delayprops); } /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. */ props_error = SET_ERROR(EINVAL); } zc->zc_cookie = off - fp->f_offset; if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; #ifdef DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif /* * On error, restore the original props. */ if (error != 0 && props != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, props, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origprops == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explictly if we're restoring local properties cleared in the * first new-style receive. */ if (origprops != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origprops, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } } out: nvlist_free(props); nvlist_free(origprops); nvlist_free(errors); releasef(fd); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set */ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate(tosnap, fromsnap, compressok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { file_t *fp = getf(zc->zc_cookie); if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, zc->zc_cookie, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; releasef(zc->zc_cookie); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendarg_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dsa_outfd == zc->zc_cookie && dsp->dsa_proc == curproc) break; } if (dsp != NULL) zc->zc_cookie = *(dsp->dsa_off); else error = SET_ERROR(ENOENT); mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; size_t count = (size_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &count); if (error == 0) zc->zc_nvlist_dst_size = count; else zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == NULL) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { (void) spa_vdev_state_exit(spa, NULL, ENODEV); spa_close(spa, FTAG); return (SET_ERROR(ENODEV)); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, NULL, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_reopen(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If a resilver is already in progress then set the * spa_scrub_reopen flag to B_TRUE so that we don't restart * the scan as a side effect of the reopen. Otherwise, let * vdev_open() decided if a resilver is required. */ spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds, *ods; char origin[ZFS_MAX_DATASET_NAME_LEN]; char *cp; int error; zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || strchr(zc->zc_name, '%')) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); if (error != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ods, origin); dsl_dataset_rele(ods, FTAG); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(origin, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(origin, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. "S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = kmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size); } kmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { objset_t *os; int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ dsl_dataset_t *ds, *newds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } if (error == 0) error = dmu_objset_userspace_upgrade(zfsvfs->z_os); VFS_RELE(zfsvfs->z_vfs); } else { /* XXX kind of reading contents without owning */ error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_objset_userspace_upgrade(os); dmu_objset_rele(os, FTAG); } return (error); } /* * We don't want to have a hard dependency * against some special symbols in sharefs * nfs, and smbsrv. Determine them if needed when * the first file system is shared. * Neither sharefs, nfs or smbsrv are unloadable modules. */ int (*znfsexport_fs)(void *arg); int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); int (*zsmbexport_fs)(void *arg, boolean_t add_share); int zfs_nfsshare_inited; int zfs_smbshare_inited; ddi_modhandle_t nfs_mod; ddi_modhandle_t sharefs_mod; ddi_modhandle_t smbsrv_mod; kmutex_t zfs_share_lock; static int zfs_init_sharefs() { int error; ASSERT(MUTEX_HELD(&zfs_share_lock)); /* Both NFS and SMB shares also require sharetab support. */ if (sharefs_mod == NULL && ((sharefs_mod = ddi_modopen("fs/sharefs", KRTLD_MODE_FIRST, &error)) == NULL)) { return (SET_ERROR(ENOSYS)); } if (zshare_fs == NULL && ((zshare_fs = (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { return (SET_ERROR(ENOSYS)); } return (0); } static int zfs_ioc_share(zfs_cmd_t *zc) { int error; int opcode; switch (zc->zc_share.z_sharetype) { case ZFS_SHARE_NFS: case ZFS_UNSHARE_NFS: if (zfs_nfsshare_inited == 0) { mutex_enter(&zfs_share_lock); if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } if (znfsexport_fs == NULL && ((znfsexport_fs = (int (*)(void *)) ddi_modsym(nfs_mod, "nfs_export", &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } error = zfs_init_sharefs(); if (error != 0) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } zfs_nfsshare_inited = 1; mutex_exit(&zfs_share_lock); } break; case ZFS_SHARE_SMB: case ZFS_UNSHARE_SMB: if (zfs_smbshare_inited == 0) { mutex_enter(&zfs_share_lock); if (smbsrv_mod == NULL && ((smbsrv_mod = ddi_modopen("drv/smbsrv", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } if (zsmbexport_fs == NULL && ((zsmbexport_fs = (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, "smb_server_share", &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } error = zfs_init_sharefs(); if (error != 0) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } zfs_smbshare_inited = 1; mutex_exit(&zfs_share_lock); } break; default: return (SET_ERROR(EINVAL)); } switch (zc->zc_share.z_sharetype) { case ZFS_SHARE_NFS: case ZFS_UNSHARE_NFS: if (error = znfsexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata)) return (error); break; case ZFS_SHARE_SMB: case ZFS_UNSHARE_SMB: if (error = zsmbexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata, zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? B_TRUE: B_FALSE)) { return (error); } break; } opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? SHAREFS_ADD : SHAREFS_REMOVE; /* * Add or remove share from sharetab */ error = zshare_fs(opcode, (void *)(uintptr_t)zc->zc_share.z_sharedata, zc->zc_share.z_sharemax); return (error); } ace_t full_access[] = { {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} }; /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; int error; minor_t minor; error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (error != 0) return (error); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strcpy(zc->zc_value, snap_name); strfree(snap_name); strfree(hold_name); zfs_onexit_fd_rele(zc->zc_cleanup_fd); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { file_t *fp; offset_t off; int error; fp = getf(zc->zc_cookie); if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; releasef(zc->zc_cookie); return (error); } /* * Remove all ACL files in shares dir */ static int zfs_smb_acl_purge(znode_t *dzp) { zap_cursor_t zc; zap_attribute_t zap; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int error; for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); (error = zap_cursor_retrieve(&zc, &zap)) == 0; zap_cursor_advance(&zc)) { if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, NULL, 0)) != 0) break; } zap_cursor_fini(&zc); return (error); } static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { vnode_t *vp; znode_t *dzp; vnode_t *resourcevp = NULL; znode_t *sharedir; zfsvfs_t *zfsvfs; nvlist_t *nvlist; char *src, *target; vattr_t vattr; vsecattr_t vsec; int error = 0; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ if (vp->v_vfsp->vfs_fstype != zfsfstype || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); return (SET_ERROR(EINVAL)); } dzp = VTOZ(vp); zfsvfs = dzp->z_zfsvfs; ZFS_ENTER(zfsvfs); /* * Create share dir if its missing. */ mutex_enter(&zfsvfs->z_lock); if (zfsvfs->z_shares_dir == 0) { dmu_tx_t *tx; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, ZFS_SHARES_DIR); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { error = zfs_create_share_dir(zfsvfs, tx); dmu_tx_commit(tx); } if (error != 0) { mutex_exit(&zfsvfs->z_lock); VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } } mutex_exit(&zfsvfs->z_lock); ASSERT(zfsvfs->z_shares_dir); if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) { VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } switch (zc->zc_cookie) { case ZFS_SMB_ACL_ADD: vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VREG; vattr.va_mode = S_IFREG|0777; vattr.va_uid = 0; vattr.va_gid = 0; vsec.vsa_mask = VSA_ACE; vsec.vsa_aclentp = &full_access; vsec.vsa_aclentsz = sizeof (full_access); vsec.vsa_aclcnt = 1; error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); if (resourcevp) VN_RELE(resourcevp); break; case ZFS_SMB_ACL_REMOVE: error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, NULL, 0); break; case ZFS_SMB_ACL_RENAME: if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); return (error); } if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, &target)) { VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); nvlist_free(nvlist); return (error); } error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, kcred, NULL, 0); nvlist_free(nvlist); break; case ZFS_SMB_ACL_PURGE: error = zfs_smb_acl_purge(sharedir); break; default: error = SET_ERROR(EINVAL); break; } VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); return (error); } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ /* ARGSUSED */ static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; error = nvlist_lookup_nvlist(args, "holds", &holds); if (error != 0) return (SET_ERROR(EINVAL)); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { error = zfs_onexit_fd_hold(cleanup_fd, &minor); if (error != 0) return (error); } error = dsl_dataset_user_hold(holds, minor, errlist); if (minor != 0) zfs_onexit_fd_rele(cleanup_fd); return (error); } /* * innvl is not used. * * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... * } */ /* ARGSUSED */ static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ /* ARGSUSED */ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_name name of new filesystem or snapshot * zc_value full name of old snapshot * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; char *firstsnap; uint64_t used, comp, uncomp; if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * } * * outnvl is unused */ /* ARGSUSED */ static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { int error; offset_t off; char *fromname = NULL; int fd; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) return (SET_ERROR(EINVAL)); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); file_t *fp = getf(fd); if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, fd, resumeobj, resumeoff, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; releasef(fd); return (error); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; int error; char *fromname; boolean_t compressok; uint64_t space; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } compressok = nvlist_exists(innvl, "compressok"); error = nvlist_lookup_string(innvl, "from", &fromname); if (error == 0) { if (strchr(fromname, '@') != NULL) { /* * If from is a snapshot, hold it and use the more * efficient dmu_send_estimate to estimate send space * size using deadlists. */ dsl_dataset_t *fromsnap; error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) goto out; error = dmu_send_estimate(tosnap, fromsnap, compressok, &space); dsl_dataset_rele(fromsnap, FTAG); } else if (strchr(fromname, '#') != NULL) { /* * If from is a bookmark, fetch the creation TXG of the * snapshot it was created from and use that to find * blocks that were born after it. */ zfs_bookmark_phys_t frombm; error = dsl_bookmark_lookup(dp, fromname, tosnap, &frombm); if (error != 0) goto out; error = dmu_send_estimate_from_txg(tosnap, frombm.zbm_creation_txg, compressok, &space); } else { /* * from is not properly formatted as a snapshot or * bookmark */ error = SET_ERROR(EINVAL); goto out; } } else { /* * If estimating the size of a full send, use dmu_send_estimate. */ error = dmu_send_estimate(tosnap, NULL, compressok, &space); } fnvlist_add_uint64(outnvl, "space", space); out: dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the beginning of this file for details on * each argument to this function. */ static void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } static void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("remap", ZFS_IOC_REMAP, zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("zpool_discard_checkpoint", ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. */ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } /* * Find a free minor number. */ minor_t zfsdev_minor_alloc(void) { static minor_t last_minor; minor_t m; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (ddi_get_soft_state(zfsdev_state, m) == NULL) { last_minor = m; return (m); } } return (0); } static int zfs_ctldev_init(dev_t *devp) { minor_t minor; zfs_soft_state_t *zs; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); ASSERT(getminor(*devp) == 0); minor = zfsdev_minor_alloc(); if (minor == 0) return (SET_ERROR(ENXIO)); if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) return (SET_ERROR(EAGAIN)); *devp = makedevice(getemajor(*devp), minor); zs = ddi_get_soft_state(zfsdev_state, minor); zs->zss_type = ZSST_CTLDEV; zfs_onexit_init((zfs_onexit_t **)&zs->zss_data); return (0); } static void zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor) { ASSERT(MUTEX_HELD(&zfsdev_state_lock)); zfs_onexit_destroy(zo); ddi_soft_state_free(zfsdev_state, minor); } void * zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which) { zfs_soft_state_t *zp; zp = ddi_get_soft_state(zfsdev_state, minor); if (zp == NULL || zp->zss_type != which) return (NULL); return (zp->zss_data); } static int zfsdev_open(dev_t *devp, int flag, int otyp, cred_t *cr) { int error = 0; if (getminor(*devp) != 0) return (zvol_open(devp, flag, otyp, cr)); /* This is the control device. Allocate a new minor if requested. */ if (flag & FEXCL) { mutex_enter(&zfsdev_state_lock); error = zfs_ctldev_init(devp); mutex_exit(&zfsdev_state_lock); } return (error); } static int zfsdev_close(dev_t dev, int flag, int otyp, cred_t *cr) { zfs_onexit_t *zo; minor_t minor = getminor(dev); if (minor == 0) return (0); mutex_enter(&zfsdev_state_lock); zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); if (zo == NULL) { mutex_exit(&zfsdev_state_lock); return (zvol_close(dev, flag, otyp, cr)); } zfs_ctldev_destroy(zo, minor); mutex_exit(&zfsdev_state_lock); return (0); } static int zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) { zfs_cmd_t *zc; uint_t vecnum; int error, rc, len; minor_t minor = getminor(dev); const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; nvlist_t *innvl = NULL; if (minor != 0 && zfsdev_get_soft_state(minor, ZSST_CTLDEV) == NULL) return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp)); vecnum = cmd - ZFS_IOC_FIRST; ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(EINVAL)); vec = &zfs_ioc_vec[vecnum]; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); if (error != 0) { error = SET_ERROR(EFAULT); goto out; } zc->zc_iflags = flag & FKIOCTL; if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case NO_NAME: break; } if (error == 0) error = vec->zvec_secpolicy(zc, innvl, cr); if (error != 0) goto out; /* legacy ioctls can modify zc_name */ len = strcspn(zc->zc_name, "/@#") + 1; saved_poolname = kmem_alloc(len, KM_SLEEP); (void) strlcpy(saved_poolname, zc->zc_name, len); if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); /* * Some commands can partially execute, modfiy state, and still * return an error. In these cases, attempt to record what * was modified. */ if ((error == 0 || (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { error = vec->zvec_legacy_func(zc); } out: nvlist_free(innvl); rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) strfree(s); (void) tsd_set(zfs_allow_log_key, saved_poolname); } else { if (saved_poolname != NULL) strfree(saved_poolname); } kmem_free(zc, sizeof (zfs_cmd_t)); return (error); } static int zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { if (cmd != DDI_ATTACH) return (DDI_FAILURE); if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, DDI_PSEUDO, 0) == DDI_FAILURE) return (DDI_FAILURE); zfs_dip = dip; ddi_report_dev(dip); return (DDI_SUCCESS); } static int zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { if (spa_busy() || zfs_busy() || zvol_busy()) return (DDI_FAILURE); if (cmd != DDI_DETACH) return (DDI_FAILURE); zfs_dip = NULL; ddi_prop_remove_all(dip); ddi_remove_minor_node(dip, NULL); return (DDI_SUCCESS); } /*ARGSUSED*/ static int zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { switch (infocmd) { case DDI_INFO_DEVT2DEVINFO: *result = zfs_dip; return (DDI_SUCCESS); case DDI_INFO_DEVT2INSTANCE: *result = (void *)0; return (DDI_SUCCESS); } return (DDI_FAILURE); } /* * OK, so this is a little weird. * * /dev/zfs is the control node, i.e. minor 0. * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. * * /dev/zfs has basically nothing to do except serve up ioctls, * so most of the standard driver entry points are in zvol.c. */ static struct cb_ops zfs_cb_ops = { zfsdev_open, /* open */ zfsdev_close, /* close */ zvol_strategy, /* strategy */ nodev, /* print */ zvol_dump, /* dump */ zvol_read, /* read */ zvol_write, /* write */ zfsdev_ioctl, /* ioctl */ nodev, /* devmap */ nodev, /* mmap */ nodev, /* segmap */ nochpoll, /* poll */ ddi_prop_op, /* prop_op */ NULL, /* streamtab */ D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ CB_REV, /* version */ nodev, /* async read */ nodev, /* async write */ }; static struct dev_ops zfs_dev_ops = { DEVO_REV, /* version */ 0, /* refcnt */ zfs_info, /* info */ nulldev, /* identify */ nulldev, /* probe */ zfs_attach, /* attach */ zfs_detach, /* detach */ nodev, /* reset */ &zfs_cb_ops, /* driver operations */ NULL, /* no bus operations */ NULL, /* power */ ddi_quiesce_not_needed, /* quiesce */ }; static struct modldrv zfs_modldrv = { &mod_driverops, "ZFS storage pool", &zfs_dev_ops }; static struct modlinkage modlinkage = { MODREV_1, (void *)&zfs_modlfs, (void *)&zfs_modldrv, NULL }; static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; strfree(poolname); } int _init(void) { int error; spa_init(FREAD | FWRITE); zfs_init(); zvol_init(); zfs_ioctl_init(); if ((error = mod_install(&modlinkage)) != 0) { zvol_fini(); zfs_fini(); spa_fini(); return (error); } tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); error = ldi_ident_from_mod(&modlinkage, &zfs_li); ASSERT(error == 0); mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } int _fini(void) { int error; if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) return (SET_ERROR(EBUSY)); if ((error = mod_remove(&modlinkage)) != 0) return (error); zvol_fini(); zfs_fini(); spa_fini(); if (zfs_nfsshare_inited) (void) ddi_modclose(nfs_mod); if (zfs_smbshare_inited) (void) ddi_modclose(smbsrv_mod); if (zfs_nfsshare_inited || zfs_smbshare_inited) (void) ddi_modclose(sharefs_mod); tsd_destroy(&zfs_fsyncer_key); ldi_ident_release(zfs_li); zfs_li = NULL; mutex_destroy(&zfs_share_lock); return (error); } int _info(struct modinfo *modinfop) { return (mod_info(&modlinkage, modinfop)); } Index: vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h =================================================================== --- vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h (revision 342531) +++ vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h (revision 342532) @@ -1,1148 +1,1150 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 Datto Inc. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H #include #ifdef __cplusplus extern "C" { #endif /* * Types and constants shared between userland and the kernel. */ /* * Each dataset can be one of the following types. These constants can be * combined into masks that can be passed to various functions. */ typedef enum { ZFS_TYPE_FILESYSTEM = (1 << 0), ZFS_TYPE_SNAPSHOT = (1 << 1), ZFS_TYPE_VOLUME = (1 << 2), ZFS_TYPE_POOL = (1 << 3), ZFS_TYPE_BOOKMARK = (1 << 4) } zfs_type_t; /* * NB: lzc_dataset_type should be updated whenever a new objset type is added, * if it represents a real type of a dataset that can be created from userland. */ typedef enum dmu_objset_type { DMU_OST_NONE, DMU_OST_META, DMU_OST_ZFS, DMU_OST_ZVOL, DMU_OST_OTHER, /* For testing only! */ DMU_OST_ANY, /* Be careful! */ DMU_OST_NUMTYPES } dmu_objset_type_t; #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) /* * All of these include the terminating NUL byte. */ #define ZAP_MAXNAMELEN 256 #define ZAP_MAXVALUELEN (1024 * 8) #define ZAP_OLDMAXVALUELEN 1024 #define ZFS_MAX_DATASET_NAME_LEN 256 /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zfs_prop.c. */ typedef enum { ZPROP_CONT = -2, ZPROP_INVAL = -1, ZFS_PROP_TYPE = 0, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, ZFS_PROP_REFERENCED, ZFS_PROP_COMPRESSRATIO, ZFS_PROP_MOUNTED, ZFS_PROP_ORIGIN, ZFS_PROP_QUOTA, ZFS_PROP_RESERVATION, ZFS_PROP_VOLSIZE, ZFS_PROP_VOLBLOCKSIZE, ZFS_PROP_RECORDSIZE, ZFS_PROP_MOUNTPOINT, ZFS_PROP_SHARENFS, ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_ATIME, ZFS_PROP_DEVICES, ZFS_PROP_EXEC, ZFS_PROP_SETUID, ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, ZFS_PROP_NORMALIZE, ZFS_PROP_CASE, ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, ZFS_PROP_REFQUOTA, ZFS_PROP_REFRESERVATION, ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, ZFS_PROP_USEDSNAP, ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, ZFS_PROP_UNIQUE, /* not exposed to the user */ ZFS_PROP_OBJSETID, /* not exposed to the user */ ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ ZFS_PROP_FILESYSTEM_LIMIT, ZFS_PROP_SNAPSHOT_LIMIT, ZFS_PROP_FILESYSTEM_COUNT, ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_REDUNDANT_METADATA, ZFS_PROP_PREV_SNAP, ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_PROP_REMAPTXG, /* not exposed to the user */ ZFS_NUM_PROPS } zfs_prop_t; typedef enum { ZFS_PROP_USERUSED, ZFS_PROP_USERQUOTA, ZFS_PROP_GROUPUSED, ZFS_PROP_GROUPQUOTA, ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zpool_prop.c. */ typedef enum { ZPOOL_PROP_INVAL = -1, ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, ZPOOL_PROP_GUID, ZPOOL_PROP_VERSION, ZPOOL_PROP_BOOTFS, ZPOOL_PROP_DELEGATION, ZPOOL_PROP_AUTOREPLACE, ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_AUTOEXPAND, ZPOOL_PROP_DEDUPDITTO, ZPOOL_PROP_DEDUPRATIO, ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, ZPOOL_PROP_FRAGMENTATION, ZPOOL_PROP_LEAKED, ZPOOL_PROP_MAXBLOCKSIZE, ZPOOL_PROP_BOOTSIZE, ZPOOL_PROP_CHECKPOINT, + ZPOOL_PROP_TNAME, ZPOOL_NUM_PROPS } zpool_prop_t; /* Small enough to not hog a whole line of printout in zpool(1M). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" typedef enum { ZPROP_SRC_NONE = 0x1, ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, ZPROP_SRC_INHERITED = 0x10, ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; #define ZPROP_SRC_ALL 0x3f #define ZPROP_SOURCE_VAL_RECVD "$recvd" #define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" /* * Dataset flag implemented as a special entry in the props zap object * indicating that the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties * just as it did in earlier versions, and thereafter, local properties are * preserved. */ #define ZPROP_HAS_RECVD "$hasrecvd" typedef enum { ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ } zprop_errflags_t; typedef int (*zprop_func)(int, void *); /* * Properties to be set on the root file system of a new pool * are stuffed into their own nvlist, which is then included in * the properties nvlist with the pool properties. */ #define ZPOOL_ROOTFS_PROPS "root-props-nvl" /* * Length of 'written@' and 'written#' */ #define ZFS_WRITTEN_PROP_PREFIX_LEN 8 /* * Dataset property functions shared between libzfs and kernel. */ const char *zfs_prop_default_string(zfs_prop_t); uint64_t zfs_prop_default_numeric(zfs_prop_t); boolean_t zfs_prop_readonly(zfs_prop_t); boolean_t zfs_prop_visible(zfs_prop_t prop); boolean_t zfs_prop_inheritable(zfs_prop_t); boolean_t zfs_prop_setonce(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); boolean_t zfs_prop_userquota(const char *); boolean_t zfs_prop_written(const char *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); boolean_t zfs_prop_valid_for_type(int, zfs_type_t); /* * Pool property functions shared between libzfs and kernel. */ zpool_prop_t zpool_name_to_prop(const char *); const char *zpool_prop_to_name(zpool_prop_t); const char *zpool_prop_default_string(zpool_prop_t); uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); boolean_t zpool_prop_feature(const char *); boolean_t zpool_prop_unsupported(const char *name); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * Definitions for the Delegation. */ typedef enum { ZFS_DELEG_WHO_UNKNOWN = 0, ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U', ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G', ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E', ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C', ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S' } zfs_deleg_who_type_t; typedef enum { ZFS_DELEG_NONE = 0, ZFS_DELEG_PERM_LOCAL = 1, ZFS_DELEG_PERM_DESCENDENT = 2, ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ZFS_DELEG_PERM_CREATE = 4 } zfs_deleg_inherit_t; #define ZFS_DELEG_PERM_UID "uid" #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_MLSLABEL_DEFAULT "none" #define ZFS_SMB_ACL_SRC "src" #define ZFS_SMB_ACL_TARGET "target" typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; typedef enum { ZFS_LOGBIAS_LATENCY = 0, ZFS_LOGBIAS_THROUGHPUT = 1 } zfs_logbias_op_t; typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, ZFS_SHARE_SMB = 2, ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; typedef enum zfs_smb_acl_op { ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE, ZFS_SMB_ACL_RENAME, ZFS_SMB_ACL_PURGE } zfs_smb_acl_op_t; typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, ZFS_CACHE_ALL = 2 } zfs_cache_type_t; typedef enum { ZFS_SYNC_STANDARD = 0, ZFS_SYNC_ALWAYS = 1, ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; typedef enum { ZFS_REDUNDANT_METADATA_ALL, ZFS_REDUNDANT_METADATA_MOST } zfs_redundant_metadata_type_t; /* * On-disk version number. */ #define SPA_VERSION_1 1ULL #define SPA_VERSION_2 2ULL #define SPA_VERSION_3 3ULL #define SPA_VERSION_4 4ULL #define SPA_VERSION_5 5ULL #define SPA_VERSION_6 6ULL #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL #define SPA_VERSION_17 17ULL #define SPA_VERSION_18 18ULL #define SPA_VERSION_19 19ULL #define SPA_VERSION_20 20ULL #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL #define SPA_VERSION_5000 5000ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ #define SPA_VERSION SPA_VERSION_5000 #define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. * Used in the code when checking for presence or absence of a feature. * Feel free to define multiple symbolic names for each version if there * were multiple changes to on-disk structures during that version. * * NOTE: When checking the current SPA_VERSION in your code, be sure * to use spa_version() since it reports the version of the * last synced uberblock. Checking the in-flight version can * be dangerous in some cases. */ #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 #define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 #define SPA_VERSION_BOOTFS SPA_VERSION_6 #define SPA_VERSION_SLOGS SPA_VERSION_7 #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 #define SPA_VERSION_L2CACHE SPA_VERSION_10 #define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 #define SPA_VERSION_RAIDZ3 SPA_VERSION_17 #define SPA_VERSION_USERREFS SPA_VERSION_18 #define SPA_VERSION_HOLES SPA_VERSION_19 #define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 #define SPA_VERSION_DIR_CLONES SPA_VERSION_26 #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 #define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 #define SPA_VERSION_FEATURES SPA_VERSION_5000 #define SPA_VERSION_IS_SUPPORTED(v) \ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk format change * occurs. This is independent of SPA/DMU/ZAP versioning. You must * also update the version_table[] and help message in zfs_prop.c. * * When changing, be sure to teach GRUB how to read the new format! * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL #define ZPL_VERSION_5 5ULL #define ZPL_VERSION ZPL_VERSION_5 #define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 /* Rewind policy information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ #define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ #define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ #define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ #define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ #define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ typedef struct zpool_load_policy { uint32_t zlp_rewind; /* rewind policy requested */ uint64_t zlp_maxmeta; /* max acceptable meta-data errors */ uint64_t zlp_maxdata; /* max acceptable data errors */ uint64_t zlp_txg; /* specific txg to load */ } zpool_load_policy_t; /* * The following are configuration names used in the nvlist describing a pool's * configuration. New on-disk names should be prefixed with ":" * (e.g. "org.open-zfs:") to avoid conflicting names being developed * independently. */ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" #define ZPOOL_CONFIG_POOL_STATE "state" #define ZPOOL_CONFIG_POOL_TXG "txg" #define ZPOOL_CONFIG_POOL_GUID "pool_guid" #define ZPOOL_CONFIG_CREATE_TXG "create_txg" #define ZPOOL_CONFIG_TOP_GUID "top_guid" #define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" #define ZPOOL_CONFIG_TYPE "type" #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" #define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" #define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" #define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" #define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" #define ZPOOL_CONFIG_IS_HOLE "is_hole" #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" #define ZPOOL_CONFIG_SPLIT "splitcfg" #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ #define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" #define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such * as offline and degraded. */ #define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" /* Pool load policy parameters */ #define ZPOOL_LOAD_POLICY "load-policy" #define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy" #define ZPOOL_LOAD_REQUEST_TXG "load-request-txg" #define ZPOOL_LOAD_META_THRESH "load-meta-thresh" #define ZPOOL_LOAD_DATA_THRESH "load-data-thresh" /* Rewind data discovered */ #define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" #define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ "com.delphix:obsolete_counts_are_precise" #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" #define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ "com.delphix:next_offset_to_initialize" #define VDEV_LEAF_ZAP_INITIALIZE_STATE \ "com.delphix:vdev_initialize_state" #define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ "com.delphix:vdev_initialize_action_time" /* * This is needed in userland to report the minimum necessary device size. * * Note that the zfs test suite uses 64MB vdevs. */ #define SPA_MINDEVSIZE (64ULL << 20) /* * Set if the fragmentation has not yet been calculated. This can happen * because the space maps have not been upgraded or the histogram feature * is not enabled. */ #define ZFS_FRAG_INVALID UINT64_MAX /* * The location of the pool configuration repository, shared between kernel and * userland. */ #define ZPOOL_CACHE "/etc/zfs/zpool.cache" /* * vdev states are ordered from least to most healthy. * A vdev that's CANT_OPEN or below is considered unusable. */ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; #define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. */ typedef enum vdev_aux { VDEV_AUX_NONE, /* no error */ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis */ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_CHILDREN_OFFLINE /* all children are offline */ } vdev_aux_t; /* * pool state. The following states are written to disk as part of the normal * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining * states are software abstractions used at various levels to communicate * pool state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ } pool_state_t; /* * Scan Functions. */ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, POOL_SCAN_FUNCS } pool_scan_func_t; /* * Used to control scrub pause and resume. */ typedef enum pool_scrub_cmd { POOL_SCRUB_NORMAL = 0, POOL_SCRUB_PAUSE, POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; /* * Initialize functions. */ typedef enum pool_initialize_func { POOL_INITIALIZE_DO, POOL_INITIALIZE_CANCEL, POOL_INITIALIZE_SUSPEND, POOL_INITIALIZE_FUNCS } pool_initialize_func_t; /* * ZIO types. Needed to interpret vdev statistics below. */ typedef enum zio_type { ZIO_TYPE_NULL = 0, ZIO_TYPE_READ, ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, ZIO_TYPE_IOCTL, ZIO_TYPES } zio_type_t; /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct pool_scan_stat { /* values stored on disk */ uint64_t pss_func; /* pool_scan_func_t */ uint64_t pss_state; /* dsl_scan_state_t */ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total examined bytes */ uint64_t pss_to_process; /* total bytes to process */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; } pool_scan_stat_t; typedef struct pool_removal_stat { uint64_t prs_state; /* dsl_scan_state_t */ uint64_t prs_removing_vdev; uint64_t prs_start_time; uint64_t prs_end_time; uint64_t prs_to_copy; /* bytes that need to be copied */ uint64_t prs_copied; /* bytes copied so far */ /* * bytes of memory used for indirect mappings. * This includes all removed vdevs. */ uint64_t prs_mapping_memory; } pool_removal_stat_t; typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, DSS_NUM_STATES } dsl_scan_state_t; typedef enum { CS_NONE, CS_CHECKPOINT_EXISTS, CS_CHECKPOINT_DISCARDING, CS_NUM_STATES } checkpoint_state_t; typedef struct pool_checkpoint_stat { uint64_t pcs_state; /* checkpoint_state_t */ uint64_t pcs_start_time; /* time checkpoint/discard started */ uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; typedef enum { VDEV_INITIALIZE_NONE, VDEV_INITIALIZE_ACTIVE, VDEV_INITIALIZE_CANCELED, VDEV_INITIALIZE_SUSPENDED, VDEV_INITIALIZE_COMPLETE } vdev_initializing_state_t; /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct vdev_stat { hrtime_t vs_timestamp; /* time since vdev load */ uint64_t vs_state; /* vdev state */ uint64_t vs_aux; /* see vdev_aux_t */ uint64_t vs_alloc; /* space allocated */ uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_initialize_errors; /* initializing errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_initialize_bytes_done; /* bytes initialized */ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ uint64_t vs_initialize_state; /* vdev_initialzing_state_t */ uint64_t vs_initialize_action_time; /* time_t */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ } vdev_stat_t; /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct ddt_object { - uint64_t ddo_count; /* number of elments in ddt */ + uint64_t ddo_count; /* number of elments in ddt */ uint64_t ddo_dspace; /* size of ddt on disk */ uint64_t ddo_mspace; /* size of ddt in-core */ } ddt_object_t; typedef struct ddt_stat { uint64_t dds_blocks; /* blocks */ uint64_t dds_lsize; /* logical size */ uint64_t dds_psize; /* physical size */ uint64_t dds_dsize; /* deflated allocated size */ uint64_t dds_ref_blocks; /* referenced blocks */ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ uint64_t dds_ref_psize; /* referenced psize * refcnt */ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ } ddt_stat_t; typedef struct ddt_histogram { ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ } ddt_histogram_t; #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" #define ZFS_DISK_ROOT "/dev/dsk" #define ZFS_DISK_ROOTD ZFS_DISK_ROOT "/" #define ZFS_RDISK_ROOT "/dev/rdsk" #define ZFS_RDISK_ROOTD ZFS_RDISK_ROOT "/" /* general zvol path */ #define ZVOL_DIR "/dev/zvol" /* expansion */ #define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:" /* for dump and swap */ #define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/" #define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/" #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 8192 /* * /dev/zfs ioctl numbers. */ typedef enum zfs_ioc { ZFS_IOC_FIRST = ('Z' << 8), ZFS_IOC = ZFS_IOC_FIRST, ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, ZFS_IOC_POOL_DESTROY, ZFS_IOC_POOL_IMPORT, ZFS_IOC_POOL_EXPORT, ZFS_IOC_POOL_CONFIGS, ZFS_IOC_POOL_STATS, ZFS_IOC_POOL_TRYIMPORT, ZFS_IOC_POOL_SCAN, ZFS_IOC_POOL_FREEZE, ZFS_IOC_POOL_UPGRADE, ZFS_IOC_POOL_GET_HISTORY, ZFS_IOC_VDEV_ADD, ZFS_IOC_VDEV_REMOVE, ZFS_IOC_VDEV_SET_STATE, ZFS_IOC_VDEV_ATTACH, ZFS_IOC_VDEV_DETACH, ZFS_IOC_VDEV_SETPATH, ZFS_IOC_VDEV_SETFRU, ZFS_IOC_OBJSET_STATS, ZFS_IOC_OBJSET_ZPLPROPS, ZFS_IOC_DATASET_LIST_NEXT, ZFS_IOC_SNAPSHOT_LIST_NEXT, ZFS_IOC_SET_PROP, ZFS_IOC_CREATE, ZFS_IOC_DESTROY, ZFS_IOC_ROLLBACK, ZFS_IOC_RENAME, ZFS_IOC_RECV, ZFS_IOC_SEND, ZFS_IOC_INJECT_FAULT, ZFS_IOC_CLEAR_FAULT, ZFS_IOC_INJECT_LIST_NEXT, ZFS_IOC_ERROR_LOG, ZFS_IOC_CLEAR, ZFS_IOC_PROMOTE, ZFS_IOC_SNAPSHOT, ZFS_IOC_DSOBJ_TO_DSNAME, ZFS_IOC_OBJ_TO_PATH, ZFS_IOC_POOL_SET_PROPS, ZFS_IOC_POOL_GET_PROPS, ZFS_IOC_SET_FSACL, ZFS_IOC_GET_FSACL, ZFS_IOC_SHARE, ZFS_IOC_INHERIT_PROP, ZFS_IOC_SMB_ACL, ZFS_IOC_USERSPACE_ONE, ZFS_IOC_USERSPACE_MANY, ZFS_IOC_USERSPACE_UPGRADE, ZFS_IOC_HOLD, ZFS_IOC_RELEASE, ZFS_IOC_GET_HOLDS, ZFS_IOC_OBJSET_RECVD_PROPS, ZFS_IOC_VDEV_SPLIT, ZFS_IOC_NEXT_OBJ, ZFS_IOC_DIFF, ZFS_IOC_TMP_SNAPSHOT, ZFS_IOC_OBJ_TO_STATS, ZFS_IOC_SPACE_WRITTEN, ZFS_IOC_SPACE_SNAPS, ZFS_IOC_DESTROY_SNAPS, ZFS_IOC_POOL_REGUID, ZFS_IOC_POOL_REOPEN, ZFS_IOC_SEND_PROGRESS, ZFS_IOC_LOG_HISTORY, ZFS_IOC_SEND_NEW, ZFS_IOC_SEND_SPACE, ZFS_IOC_CLONE, ZFS_IOC_BOOKMARK, ZFS_IOC_GET_BOOKMARKS, ZFS_IOC_DESTROY_BOOKMARKS, ZFS_IOC_CHANNEL_PROGRAM, ZFS_IOC_REMAP, ZFS_IOC_POOL_CHECKPOINT, ZFS_IOC_POOL_DISCARD_CHECKPOINT, ZFS_IOC_POOL_INITIALIZE, ZFS_IOC_LAST } zfs_ioc_t; /* * ZFS-specific error codes used for returning descriptive errors * to the userland through zfs ioctls. * * The enum implicitly includes all the error codes from errno.h. * New code should use and extend this enum for errors that are * not described precisely by generic errno codes. */ typedef enum { ZFS_ERR_CHECKPOINT_EXISTS = 1024, ZFS_ERR_DISCARDING_CHECKPOINT, ZFS_ERR_NO_CHECKPOINT, ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG } zfs_errno_t; /* * Internal SPA load state. Used by FMA diagnosis engine. */ typedef enum { SPA_LOAD_NONE, /* no load in progress */ SPA_LOAD_OPEN, /* normal open */ SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ SPA_LOAD_ERROR, /* load failed */ SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; /* * Bookmark name values. */ #define ZPOOL_ERR_LIST "error list" #define ZPOOL_ERR_DATASET "dataset" #define ZPOOL_ERR_OBJECT "object" #define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) /* * The following are names used in the nvlist describing * the pool's history log. */ #define ZPOOL_HIST_RECORD "history record" #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" #define ZPOOL_HIST_WHO "history who" #define ZPOOL_HIST_ZONE "history zone" #define ZPOOL_HIST_HOST "history hostname" #define ZPOOL_HIST_TXG "history txg" #define ZPOOL_HIST_INT_EVENT "history internal event" #define ZPOOL_HIST_INT_STR "history internal str" #define ZPOOL_HIST_INT_NAME "internal_name" #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" #define ZPOOL_HIST_ERRNO "errno" /* * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. */ #define ZPOOL_INITIALIZE_COMMAND "initialize_command" #define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" /* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_EXPAND 0x8 #define ZFS_OFFLINE_TEMPORARY 0x1 /* * Flags for ZFS_IOC_POOL_IMPORT */ #define ZFS_IMPORT_NORMAL 0x0 #define ZFS_IMPORT_VERBATIM 0x1 #define ZFS_IMPORT_ANY_HOST 0x2 #define ZFS_IMPORT_MISSING_LOG 0x4 #define ZFS_IMPORT_ONLY 0x8 #define ZFS_IMPORT_CHECKPOINT 0x10 +#define ZFS_IMPORT_TEMP_NAME 0x20 /* * Channel program argument/return nvlist keys and defaults. */ #define ZCP_ARG_PROGRAM "program" #define ZCP_ARG_ARGLIST "arg" #define ZCP_ARG_SYNC "sync" #define ZCP_ARG_INSTRLIMIT "instrlimit" #define ZCP_ARG_MEMLIMIT "memlimit" #define ZCP_ARG_CLIARGV "argv" #define ZCP_RET_ERROR "error" #define ZCP_RET_RETURN "return" #define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) #define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) #define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) #define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) /* * Sysevent payload members. ZFS will generate the following sysevents with the * given payloads: * * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_END * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * * ESC_ZFS_VDEV_REMOVE * ESC_ZFS_VDEV_CLEAR * ESC_ZFS_VDEV_CHECK * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 * * ESC_ZFS_HISTORY_EVENT * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) * * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the * history log nvlist. The keynames will be free of any spaces or other * characters that could be potentially unexpected to consumers of the * sysevents. */ #define ZFS_EV_POOL_NAME "pool_name" #define ZFS_EV_POOL_GUID "pool_guid" #define ZFS_EV_VDEV_PATH "vdev_path" #define ZFS_EV_VDEV_GUID "vdev_guid" #define ZFS_EV_HIST_TIME "history_time" #define ZFS_EV_HIST_CMD "history_command" #define ZFS_EV_HIST_WHO "history_who" #define ZFS_EV_HIST_ZONE "history_zone" #define ZFS_EV_HIST_HOST "history_hostname" #define ZFS_EV_HIST_TXG "history_txg" #define ZFS_EV_HIST_INT_EVENT "history_internal_event" #define ZFS_EV_HIST_INT_STR "history_internal_str" #define ZFS_EV_HIST_INT_NAME "history_internal_name" #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_H */